import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from lightgbm import LGBMRegressor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly
#import plotly.plotly as py
#import plotly.graph_objs as go
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import random
import warnings
warnings.filterwarnings("ignore")
# Data source (Kaggle competition):
# https://www.kaggle.com/c/pubg-finish-placement-prediction/data
train = pd.read_csv('./data/train_V2.csv')
# show every column when displaying wide DataFrames
pd.options.display.max_columns = None
train.head()   # notebook-style peek; value is discarded when run as a plain script
train.info()   # column dtypes and non-null counts
# --- Utility functions ---
def train_test_split(df, test_size=0.3):
    """Split *df* into train/test partitions at the match level.

    Sampling whole matches (rather than rows) keeps every player of a
    given match in the same partition, preventing leakage between the
    two sets.

    Args:
        df: DataFrame containing a 'matchId' column.
        test_size: fraction of unique matches assigned to the test set.

    Returns:
        (train, test) tuple of DataFrames.
    """
    match_ids = df['matchId'].unique().tolist()
    train_size = int(len(match_ids) * (1 - test_size))
    train_match_ids = random.sample(match_ids, train_size)
    in_train = df['matchId'].isin(train_match_ids)
    # BUG FIX: the original used unary minus (`-mask`) to negate the
    # boolean mask, which raises a TypeError on modern pandas/numpy;
    # `~` is the correct boolean negation. The redundant function-local
    # `import random` was also dropped (it is imported at module level).
    train = df[in_train]
    test = df[~in_train]
    return train, test
def outlier_range(var, perc, df):
    """Return the *perc*-th percentile of column *var* in *df*."""
    # Series.quantile with linear interpolation matches np.percentile.
    return df[var].quantile(perc / 100.0)
def show_distplot(var, title, data=None):
    """Plot a histogram/KDE of one column.

    Args:
        var: column name to plot.
        title: figure title.
        data: DataFrame to read from. Defaults to the module-level ``df``
            — the original implementation silently relied on that global,
            which made the function unusable on any other frame. The new
            parameter is backward-compatible.
    """
    data = df if data is None else data
    plt.figure(figsize=(10, 8))
    sns.distplot(data[var], bins=50)
    plt.title(title, fontsize=15)
    plt.show()
# --- data cleaning and feature engineering ---
# keep only the six standard match types (drop event/custom modes)
df = train[(train['matchType'].isin(['solo','solo-fpp','duo','duo-fpp','squad','squad-fpp']))]
# number of players in each match / players in each team
df['numJoined'] = df.groupby('matchId')['matchId'].transform('count')
df['teamNum'] = df.groupby('groupId')['groupId'].transform('count')
df['totalDistance'] = df['rideDistance'] + df['swimDistance'] + df['walkDistance']
df['headshotPerc'] = df['headshotKills'] / df['kills']  # NaN when kills == 0 (filled below)
# matches have different player counts, so scale kills by match size
normalize = (100 - df['numJoined']) / 100 + 1
df['killsNorm'] = df['kills'] * normalize
# BUG FIX: the division guard was written as 1e31 (a huge constant)
# instead of a tiny epsilon, which crushed every per-distance / per-kill
# ratio to ~0 and made those features useless.
EPS = 1e-31
df['killsPerDistance'] = df['killsNorm'] / (df['totalDistance'] + EPS)
df['weaponPerDistance'] = df['weaponsAcquired'] / (df['totalDistance'] + EPS)
# heals and boosts let a player stay outside the safe zone longer
df['healsAndBoosts'] = df['heals'] + df['boosts']
df['healsAndBoostsPerWalkDistance'] = df['healsAndBoosts'] / (df['walkDistance'] + EPS)
df['healsAndBoostsPerKill'] = df['healsAndBoosts'] / (df['killsNorm'] + EPS)
# BUG FIX: replace() is not in-place by default, so the original call
# discarded its result and +/-inf values survived into the features.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)
# --- feature importance via LightGBM ---
target = 'winPlacePerc'
# drop identifiers, the target, and raw columns that were already folded
# into engineered features (kills -> killsNorm, the three distance
# columns -> totalDistance, heals/boosts -> healsAndBoosts, ...)
cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target, 'killPlace','walkDistance','numJoined','teamNum',
                'matchDuration','kills','maxPlace','winPoints','boosts','heals','rideDistance','swimDistance',
                'rankPoints','revives','teamKills','numGroups','totalDistance']
cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
# NOTE(review): this rebinds `train`, shadowing the raw CSV DataFrame
# loaded at the top of the script.
train, val = train_test_split(df, 0.3)
params = {
    'n_estimators': 100,
    'learning_rate': 0.3,
    'num_leaves': 20,
    'objective': 'regression_l2',  # least-squares regression
    'metric': 'mae',
    'verbose': -1,                 # silence LightGBM logging
}
model = LGBMRegressor(**params)
model.fit(
    train[cols_to_fit], train[target],
    eval_set=[(val[cols_to_fit], val[target])],
    eval_metric='mae',
    verbose=-1,
)
# pair each importance with its feature name, sorted ascending by value
feature_importance = pd.DataFrame(sorted(zip(model.feature_importances_, cols_to_fit)), columns=['Value','Feature'])
plt.figure(figsize=(12, 8))
sns.barplot(x="Value", y="Feature", data=feature_importance.sort_values(by="Value", ascending=False))
plt.title('Feature Importance with LightGBM')
plt.tight_layout()
# --- box plot: kills vs. final rank percentile ---
random.seed(831)
# keep ~1% of matches so the plot stays responsive
data, test = train_test_split(df, 0.99)
data = data[['killsNorm', 'winPlacePerc']]
data['killsNorm'] = data['killsNorm'].astype('int')
N = 30
# evenly spaced HSL hues, one per box trace
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, N)]
# BUG FIX: `list(set(...))` has arbitrary ordering, so the boxes (and
# their colors) appeared in random order along the x-axis; sort them.
kill_num = sorted(set(data['killsNorm']))
graph_data = [{
    'y': data['winPlacePerc'].loc[data['killsNorm'] == kill_num[i]],
    'name': kill_num[i],
    'type': 'box',
    'marker': {'color': c[i]}
} for i in range(len(kill_num))]
layout = {'xaxis': {'showgrid':True,'zeroline':False, 'tickangle':60,'showticklabels':True,'title':'Number of Kills'},
          'yaxis': {'zeroline':False,'gridcolor':'white','title':'Rank Percentile'},
          #'paper_bgcolor': 'rgb(233,233,233)',
          #'plot_bgcolor': 'rgb(233,233,233)',
          'title': 'Kills vs. Final Rank Percentile'
          }
init_notebook_mode(connected=True)
fig = go.Figure(data=graph_data, layout=layout)
iplot(fig)
# Share of players with zero kills / at most two kills, as percentages.
# (mean of a boolean mask == count(True)/len, same value as the len-based form)
non_killing_perc = round((df['killsNorm'] == 0).mean() * 100, 2)
two_killing_perc = round((df['killsNorm'] <= 2).mean() * 100, 2)
max_kill = int(df['killsNorm'].max())
outlier_kill = outlier_range('killsNorm', 99, df)
# print(non_killing_perc)
# print(two_killing_perc)
# print(max_kill)
# print(outlier_kill)
show_distplot('headshotPerc','Headshot Percentage Distribution')
# highest kill count achieved with a 100% headshot ratio
max_headshot = df.loc[df['headshotPerc'] == 1, 'kills'].max()
#print(max_headshot)
# top road-kill rows (notebook-style inspection; value discarded in a script)
df[['Id','roadKills','kills','winPlacePerc']].sort_values('roadKills').tail(3)
has_roadkill = round(100 - (df['roadKills'] == 0).mean() * 100, 2)
max_roadkill = int(df['roadKills'].max())
# --- prepare hover labels and bubble sizes for the weapons/distance chart ---
hover_text, bubble_size = [], []
# sample ~0.01% of matches so the scatter stays light
data, test = train_test_split(df, 0.9999)
# collapse the six match types into solo / duo / squad
data['is_team'] = np.where(data['matchType'].isin(['solo','solo-fpp']),'solo',
              np.where(data['matchType'].isin(['duo','duo-fpp']),'duo','squad'))
for _, row in data.iterrows():
    hover_text.append(
        ('Match Type: {}<br>'
         'Rank Percentile: {}<br>'
         'Total Distance Traveled: {}<br>'
         'Weapons Acquired: {}<br>').format(row['is_team'], row['winPlacePerc'],
                                            row['totalDistance'], row['weaponsAcquired'])
    )
    # bubble area proportional to weapons picked up
    bubble_size.append(row['weaponsAcquired'] * 500)
data['text'] = hover_text
data['size'] = bubble_size
sizeref = data['weaponsAcquired'].max()
match_types = list(set(data['is_team']))
graph_data = list()
# one scatter trace per match category (solo / duo / squad)
for match_type in match_types:
    trace = go.Scatter(
        x=data['totalDistance'][data['is_team'] == match_type],
        y=data['winPlacePerc'][data['is_team'] == match_type ],
        mode='markers',
        name=match_type,
        text=data['text'][data['is_team'] == match_type],  # hover labels built above
        marker=dict(
            symbol='circle',
            sizemode='area',   # marker area (not diameter) scales with 'size'
            sizeref=sizeref,
            size=data['size'][data['is_team'] == match_type],
            line=dict(
                width=2
            ),
        )
    )
    graph_data.append(trace)
# layout for the weapons-per-distance bubble chart
layout = go.Layout(
    title='Rank Percentile based on Weapons acquired per Distance',
    xaxis=dict(
        title='Total Distance Traveled',
        gridcolor='rgb(255, 255, 255)',
        range=[0, max(data['totalDistance'])],
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
    yaxis=dict(
        title='Rank Percentile',
        gridcolor='rgb(255, 255, 255)',
        range=[0, 1.05],  # winPlacePerc lies in [0, 1]; 1.05 adds headroom
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
)
fig = go.Figure(data=graph_data, layout=layout)
iplot(fig)
# percent of players who grabbed >10 weapons without moving at all
stationary_looters = (df['weaponsAcquired'] > 10) & (df['totalDistance'] == 0)
outlier_weapon = round(stationary_looters.mean() * 100, 2)
# most weapons acquired by any player who never moved
max_weapon = df.loc[df['totalDistance'] == 0, 'weaponsAcquired'].max()
# print(outlier_weapon)
# print(max_weapon)
# --- hover labels for the 3D kills / supplies / rank scatter ---
hover_text = []
# keep ~1% of matches for the 3D plot
data, test = train_test_split(df, 0.99)
for _, row in data.iterrows():
    hover_text.append(
        ('Number of Kills: {}<br>'
         'Heals and Boosts: {}<br>'
         'Rank Percentile: {}<br>').format(row['killsNorm'], row['healsAndBoosts'],
                                           row['winPlacePerc'])
    )
data['text'] = hover_text
# 3D scatter: kills vs. medical supplies vs. final rank percentile
trace1 = go.Scatter3d(
    x=data['killsNorm'],
    y=data['healsAndBoosts'],
    z=data['winPlacePerc'],
    mode='markers',
    text = data['text'],  # hover labels built above
    marker=dict(
        size=5,
        color = 'pink',
        line = dict(color='lightblue', width = 0.5),
        colorscale='Jet',  # NOTE(review): no visible effect with a fixed color string
        opacity=0.8
    )
)
graph_data = [trace1]
layout = go.Layout(
    title='Number of Kills v. Medical Supplies v. Rank Percentile',
    autosize=False,
    # NOTE(review): go.Scene/go.XAxis/... are legacy plotly aliases removed
    # in plotly >= 4; newer code would use scene=dict(xaxis=dict(...), ...)
    scene=go.Scene(
        xaxis=go.XAxis(title='Number of Kills'),
        yaxis=go.YAxis(title='Heals and Boosts'),
        zaxis=go.ZAxis(title='Rank Percentile')),
    margin=dict(
        l=65,
        r=50,
        b=65,
        t=90
    )
)
fig = go.Figure(data=graph_data, layout=layout)
iplot(fig)
# players with many kills but zero medical supplies (suspicious; inspection only)
df[['healsAndBoosts','kills','winPlacePerc']][df['healsAndBoosts']==0].sort_values('kills').tail(3)
# subset the data from ~4 million rows to ~0.1 million to save computation
# time, then apply the heuristics explored above to flag likely cheaters
data_sub, try_ = train_test_split(df, 0.977)
id_set = set()
# top 1% kill counts
id_set.update(data_sub['Id'].loc[data_sub['killsNorm'] >= np.percentile(data_sub['killsNorm'],99)])
# extreme road-kill counts (top 0.1%)
id_set.update(data_sub[data_sub['roadKills']>=int(outlier_range('roadKills',99.9,data_sub))]['Id'])
# many weapons acquired while never moving
id_set.update(data_sub[(data_sub['weaponsAcquired'] > outlier_range('weaponsAcquired',99,data_sub)) & (data_sub['totalDistance'] == 0)]['Id'])
# 100% headshot ratio combined with an outlier kill count
id_set.update(data_sub[(data_sub['headshotPerc']==1)&(data_sub['killsNorm']>outlier_range('killsNorm',99,data_sub))]['Id'])
# outlier kill count achieved without any heals/boosts
id_set.update(data_sub[(data_sub['healsAndBoosts']==0)&(data_sub['killsNorm'] > outlier_range('killsNorm',99,data_sub))]['Id'])
# FIX: id_set is already a set — wrapping it in set() again was redundant
outlier_len = len(id_set)
#print(outlier_len)
# --- data preparation for the anomaly detectors ---
data = data_sub[['roadKills','killsNorm','weaponPerDistance','healsAndBoostsPerKill','headshotPerc']]
# FIX: the variable was misleadingly named `min_max_scaler` although it is
# a StandardScaler (zero mean / unit variance), not min-max scaling.
scaler = preprocessing.StandardScaler()
np_scaled = scaler.fit_transform(data)
data = pd.DataFrame(np_scaled)
# PCA with n_components equal to the feature count (5 in, 5 out) keeps all
# variance and only rotates the data; the ratio print shows each
# component's share.
# NOTE(review): pca_data is never used below — the models fit on `data`.
pca = PCA(n_components=5)
pca_data = pca.fit_transform(data)
print(pca.explained_variance_ratio_)  # every component explains a part of data
outlier_lst = [0.01, 0.05, 0.1]
# Try several assumed proportions of cheaters; for each, fit a One-Class
# SVM and an Isolation Forest, then compare their flags against the
# heuristic cheater list in id_set.
# NOTE(review): indentation was lost in this file; the loop body below is
# reconstructed (IsolationForest reads `outliers_fraction`, so it must be
# inside the loop) — confirm against the original notebook.
for outliers_fraction in outlier_lst:
    # nu upper-bounds the fraction of training errors / support vectors
    model = OneClassSVM(nu=0.95 * outliers_fraction,verbose=False)
    model.fit(data)
    # predict() returns 1 for inliers, -1 for outliers; remap to 0/1
    data_sub['anomaly_svm'] = model.predict(data)
    data_sub['anomaly_svm'] = data_sub['anomaly_svm'].map({1: 0, -1: 1})
    cheater_id = data_sub['Id'].loc[data_sub['anomaly_svm']==1]
    # share of the heuristic cheater list that the SVM also flags
    precision = round(len(set(cheater_id) & id_set) / len(id_set) * 100,2)
    print(f'Assumption: There are {str(outliers_fraction*100)}% of cheaters in the games:')
    print('Count of cheaters: 0 - Normal players, 1 - Cheaters')
    print(data_sub['anomaly_svm'].value_counts())
    print(f"""{str(precision)}% of the {str(outlier_len)} player we identified as cheaters using above assumptions and data visualization methods
are also labeled as cheaters using the One-Class SVM model.\n""")
    # Isolation Forest with the same assumed contamination level
    model = IsolationForest(contamination = outliers_fraction)
    model.fit(data)
    # add the data to the main frame (again remapped to 0/1)
    data_sub['anomaly_if'] = model.predict(data)
    data_sub['anomaly_if'] = data_sub['anomaly_if'].map( {1: 0, -1: 1} )
    print(data_sub['anomaly_if'].value_counts())
    cheater_id2 = data_sub['Id'].loc[data_sub['anomaly_if']==1]
    precision2 = round(len(set(cheater_id2) & id_set) / len(id_set) * 100,2)
    #print(precision2)
# Overlap between the two models' flagged players. Presumably runs after
# the loop, i.e. on the last fraction's columns (they are overwritten each
# iteration) — TODO confirm placement against the original notebook.
overlap_len = len(set(data_sub['Id'].loc[data_sub['anomaly_if']==1]) & set(data_sub['Id'].loc[data_sub['anomaly_svm']==1]))
min_len = min(len(data_sub['Id'].loc[data_sub['anomaly_if']==1]),len(data_sub['Id'].loc[data_sub['anomaly_svm']==1]))
overlap_coeff = round(overlap_len/min_len *100,2)
# print(overlap_len)
# print(overlap_coeff)