대회설명: Dacon 웹 로그 데이터를 활용하여 앞으로 한 달 간 사용자의 로그인 여부를 예측
대회일자: 2019.08.05 ~ 2019.09.05
주관: Dacon
수상실적: 1위

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
plt.style.use('fivethirtyeight')
warnings.filterwarnings('ignore')

데이터

import os
# NOTE(review): machine-specific absolute path. The relative CSV reads below and
# the later relative submission write depend on this working directory, so
# adjust it when running on another machine.
os.chdir('C:\\Users\\Kim\\Desktop\\TNT\\웹데이터')

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Stack train and test so that preprocessing is applied to both at once.
# sort=True pins the alphabetical column order seen in the recorded outputs;
# without it the order depends on the pandas version whenever the two frames
# do not have identical columns (presumably test.csv has no `login` column).
TNT = pd.concat([train, test], axis = 0, sort = True)

TNT.head()  # Inspect the features

	Sex	apple_rat	email_type	past_1_month_login	past_1_week_login	past_login_total	person_id	phone_rat
0	male	1.0	naver	0.0	0.0	1.0	1015	0.0
1	female	1.0	other	0.0	0.0	2.0	1940	1.0
2	male	0.0	other	0.0	0.0	1.0	1356	1.0
3	male	1.0	other	0.0	0.0	2.0	1535	0.0
4	female	0.0	naver	NaN	NaN	NaN	216	0.0

# Count the missing values in each column
TNT.isna().sum()

Sex                     0
apple_rat               0
email_type              0
login                 682
past_1_month_login    227
past_1_week_login     227
past_login_total      227
person_id               0
phone_rat               0
sub_size                0
dtype: int64

# Visualize where the missing values are located in the combined frame
import missingno as msno
msno.matrix(TNT)
plt.show()

# Inspect column dtypes and non-null counts
TNT.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2182 entries, 0 to 681
Data columns (total 10 columns):
Sex                   2182 non-null object
apple_rat             2182 non-null float64
email_type            2182 non-null object
login                 1500 non-null float64
past_1_month_login    1955 non-null float64
past_1_week_login     1955 non-null float64
past_login_total      1955 non-null float64
person_id             2182 non-null int64
phone_rat             2182 non-null float64
sub_size              2182 non-null float64
dtypes: float64(7), int64(1), object(2)
memory usage: 187.5+ KB

# Summary statistics for the numeric columns
TNT.describe()

	apple_rat	login	past_1_month_login	past_1_week_login	past_login_total	person_id	phone_rat	sub_size
count	2182.000000	1500.000000	1955.000000	1955.000000	1955.000000	2182.000000	2182.000000	2182.000000
mean	0.220114	0.099333	0.607161	0.257801	7.702813	1091.489918	0.125673	2.976169
std	0.397746	0.299209	3.215901	1.301143	21.546863	630.050764	0.295775	15.481671
min	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000
25%	0.000000	0.000000	0.000000	0.000000	1.000000	546.250000	0.000000	0.000000
50%	0.000000	0.000000	0.000000	0.000000	2.000000	1091.500000	0.000000	0.000000
75%	0.126995	0.000000	0.000000	0.000000	5.000000	1636.750000	0.000000	0.000000
max	1.000000	1.000000	93.000000	23.000000	503.000000	2182.000000	1.000000	358.000000

# Split the combined frame by dtype so each group can be inspected separately.
string_data = TNT.select_dtypes(include=['object'])
int_data = TNT.select_dtypes(include=['int64'])
float_data = TNT.select_dtypes(include=['float64'])

# Show the distinct values of every string (object) column
for column_name, values in string_data.items():
    print('Unique values for {0}:\n{1}\n'.format(column_name, values.unique()))

Unique values for Sex:
['male' 'female']

Unique values for email_type:
['naver' 'other' 'gmail' 'nate' 'hanmail']

# Show the distinct values of every integer column
for column_name, values in int_data.items():
    print('Unique values for {0}:\n{1}\n'.format(column_name, values.unique()))

Unique values for person_id:
[1015 1940 1356 ...  289 1590  572]

# Show the distinct values of every float column
for column_name, values in float_data.items():
    print('Unique values for {0}:\n{1}\n'.format(column_name, values.unique()))

Unique values for apple_rat:
[1.         0.         0.85416667 0.8        0.5        0.25
 0.08333333 0.01666667 0.4        0.76923077 0.875      0.2
 0.66666667 0.57142857 0.13157895 0.125      0.81818182 0.55555556
 0.86170213 0.21052632 0.88235294 0.16666667 0.03225806 0.92
 0.85714286 0.33333333 0.09090909 0.93877551 0.625      0.71428571
 0.75       0.26666667 0.52631579 0.70588235 0.375      0.0625
 0.91304348 0.42857143 0.13333333 0.03333333 0.15       0.83333333
 0.03703704 0.3        0.35       0.95238095 0.28571429 0.91666667
 0.11764706 0.68181818 0.6        0.07692308 0.92307692 0.05405405
 0.88888889 0.12765957 0.47368421 0.44444444 0.54545455 0.13043478
 0.07462687 0.35714286 0.14285714 0.1509434  0.90566038 0.06666667
 0.14864865 0.07142857 0.11111111 0.01630435 0.03448276 0.12922465
 0.1875     0.96666667 0.18181818 0.53333333 0.20454545 0.86666667
 0.7037037  0.27659574 0.04       0.98863636 0.91111111 0.12903226
 0.08108108 0.94736842]

Unique values for login:
[ 0.  1. nan]

Unique values for past_1_month_login:
[ 0. nan  1.  2.  5.  3.  4. 12. 18.  9. 13.  6.  7.  8. 21. 17. 50. 19.
29. 93. 26. 20. 27. 10.]

Unique values for past_1_week_login:
[ 0. nan  5.  2.  4.  1.  3.  9. 11.  6. 12. 18.  8.  7. 17. 20. 23.]

Unique values for past_login_total:
[  1.   2.  nan   4.  48.   5.  14.   3.   8.   9. 234.  12.  58.   7.
 10.  24.  65.  26.  61. 139.   6.  13.  23.  20.  11.  18.  25.
 38.  56.  15.  94.  19.  35.  27.  17.  31.  32.  30.  33.  62.
 21.  88.  45.  76.  40.  77.  28. 115.  60.  52. 236.  22.  54.
132.  50.  37.  64. 119.  34.  39.  47.  41. 112. 117. 101.  29.
 73.  53.  36.  74. 184.  81. 503.  86.  87.  44.  42.  72. 111.
 68.]

Unique values for phone_rat:
[0.         1.         0.0625     0.07142857 0.5        0.25
 0.77777778 0.06837607 0.08333333 0.01666667 0.01538462 0.1
 0.4        0.07913669 0.15384615 0.15       0.42857143 0.05555556
 0.2        0.33333333 0.14285714 0.13157895 0.61538462 0.8
 0.64285714 0.86170213 0.22222222 0.08571429 0.21052632 0.23529412
 0.6        0.07692308 0.09090909 0.03225806 0.11111111 0.16666667
 0.03278689 0.01020408 0.08695652 0.03333333 0.11363636 0.75
 0.26666667 0.01315789 0.01136364 0.37037037 0.05263158 0.35
 0.1875     0.66666667 0.21875    0.09375    0.12987013 0.30434783
 0.00869565 0.09230769 0.12121212 0.03703704 0.00423729 0.05
 0.26315789 0.12962963 0.92       0.78571429 0.28571429 0.17857143
 0.015625   0.11764706 0.02272727 0.43589744 0.04878049 0.125
 0.73684211 0.04       0.92307692 0.05405405 0.03418803 0.03960396
 0.18181818 0.05714286 0.12765957 0.47368421 0.3        0.27586207
 0.04347826 0.27272727 0.17164179 0.19491525 0.08219178 0.03448276
 0.1509434  0.06666667 0.83333333 0.02941176 0.13888889 0.14864865
 0.04761905 0.95454545 0.44444444 0.13793103 0.35643564 0.76190476
 0.90909091 0.09940358 0.96666667 0.375      0.20930233 0.55263158
 0.22641509 0.03947368 0.03030303 0.0952381  0.07407407 0.4893617
 0.48611111 0.07207207 0.55       0.02857143 0.54411765 0.23076923
 0.09677419 0.52380952 0.17142857 0.30769231 0.11538462 0.71428571
 0.14814815 0.08108108 0.12068966]

Unique values for sub_size:
[  0.  28. 139.   3.   1. 114.   9.   6.  95.  56.   5. 163.  25.  17.
  8.   7.   2.  76.  20.   4.  46. 100.  19.  15.  34.  12.  49.
149.  32.  43.  30.  13.  11.  80.  81.  23.  37.  82.  62. 166.
 16.  18.  26.  41.  50.  96.  55. 110.  33.  21.  91.  14.  29.
282.  99.  63.  27.  36.  44.  68.  53.  45. 101.  64.  78.  22.
 38.  35.  48.  59.]

EDA

참고(데이터 특징별 그래프 선택 방법): http://newsjel.ly/archives/newsjelly-report/visualization-report/8136

# Kernel density estimate of apple_rat on a wide canvas
plt.figure(figsize=[15, 8])
sns.kdeplot(TNT['apple_rat'], label='apple_rat')
plt.show()

# Histogram of the same column
TNT['apple_rat'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0xa383ed2710>

# Kernel density estimate of phone_rat. The plotted column now matches the
# label; previously apple_rat was drawn under the 'phone_rat' label.
plt.figure(figsize = [15,8])
sns.kdeplot(TNT.phone_rat, label = 'phone_rat')
plt.show()

# Histogram of the same column
TNT.phone_rat.hist()

<matplotlib.axes._subplots.AxesSubplot at 0xa382897d30>

# Kernel density estimate of sub_size. The plotted column now matches the
# label; previously apple_rat was drawn under the 'sub_size' label.
plt.figure(figsize = [15,8])
sns.kdeplot(TNT.sub_size, label = 'sub_size')
plt.show()

# Histogram of the same column
TNT.sub_size.hist()

<matplotlib.axes._subplots.AxesSubplot at 0xa383df1cc0>

# Frequency of each sub_size value. The column is passed as x= explicitly
# because newer seaborn releases reserve the first positional argument for `data`.
sns.countplot(x=TNT['sub_size'])

<matplotlib.axes._subplots.AxesSubplot at 0xa383d9a710>

# Kernel density estimate of past_1_month_login on a wide canvas
plt.figure(figsize=[15, 8])
sns.kdeplot(TNT['past_1_month_login'], label='past_1_month_login')
plt.show()

# Histogram of the same column
TNT['past_1_month_login'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0xa383e28320>

# Frequency of each past_1_month_login value. The column is passed as x= explicitly
# because newer seaborn releases reserve the first positional argument for `data`.
sns.countplot(x=TNT['past_1_month_login'])

<matplotlib.axes._subplots.AxesSubplot at 0xa383e90470>

# Kernel density estimate of past_1_week_login on a wide canvas
plt.figure(figsize=[15, 8])
sns.kdeplot(TNT['past_1_week_login'], label='past_1_week_login')
plt.show()

# Histogram of the same column
TNT['past_1_week_login'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0xa3840c7a58>

# Frequency of each past_1_week_login value. The column is passed as x= explicitly
# because newer seaborn releases reserve the first positional argument for `data`.
sns.countplot(x=TNT['past_1_week_login'])

<matplotlib.axes._subplots.AxesSubplot at 0xa38412d630>

# Kernel density estimate of past_login_total on a wide canvas
plt.figure(figsize=[15, 8])
sns.kdeplot(TNT['past_login_total'], label='past_login_total')
plt.show()

# Histogram of the same column
TNT['past_login_total'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0xa384415be0>

# Frequency of each past_login_total value on an extra-large canvas. The column
# is passed as x= explicitly because newer seaborn releases reserve the first
# positional argument for `data`.
plt.figure(figsize = [30,15])
sns.countplot(x=TNT['past_login_total'])

<matplotlib.axes._subplots.AxesSubplot at 0xa38444b550>

전처리

from sklearn.preprocessing import LabelEncoder

# Replace each categorical string column with integer codes. The single encoder
# is refit for every column, so after the loop `label` only holds the classes
# of the last column processed.
label = LabelEncoder()
for categorical_col in ('Sex', 'email_type'):
    TNT[categorical_col] = label.fit_transform(TNT[categorical_col])

# Frequency of each encoded Sex value. The column is passed as x= explicitly
# because newer seaborn releases reserve the first positional argument for `data`.
sns.countplot(x=TNT['Sex'])

<matplotlib.axes._subplots.AxesSubplot at 0x14e3cccd68>

# Frequency of each encoded email_type value. The column is passed as x= explicitly
# because newer seaborn releases reserve the first positional argument for `data`.
sns.countplot(x=TNT['email_type'])

<matplotlib.axes._subplots.AxesSubplot at 0x9432400908>

# Correlation heatmap across the columns of the combined frame.
sns.heatmap(TNT.corr(),annot=True,cmap='Blues',linewidths=0.5) # annot: write each correlation coefficient into its cell
# Resize the current figure after drawing, then render it.
fig=plt.gcf()
fig.set_size_inches(10,8)
plt.show()

# Preview the frame after label encoding
TNT.head()

	Sex	apple_rat	email_type	past_1_month_login	past_1_week_login	past_login_total	person_id	phone_rat
0	1	1.0	3	0.0	0.0	1.0	1015	0.0
1	0	1.0	4	0.0	0.0	2.0	1940	1.0
2	1	0.0	4	0.0	0.0	1.0	1356	1.0
3	1	1.0	4	0.0	0.0	2.0	1535	0.0
4	0	0.0	3	NaN	NaN	NaN	216	0.0

——————————————————-

# Signed difference between the two ratio features, stored as one new column.
TNT['MAC_OS'] = TNT['phone_rat'] - TNT['apple_rat']

# Drop the columns that are not fed to the model.
TNT.drop(columns=['phone_rat', 'past_login_total'], inplace=True)

# Fill missing recent-login counts with fixed constants (presumably the rounded
# column means, which describe() reports as about 0.607 and 0.258).
# NOTE(review): the recorded train.info() output further down still lists
# missing values in both columns, so this imputation may not have been part of
# the final run - confirm before relying on it.
for login_col, fill_value in (('past_1_month_login', 0.6), ('past_1_week_login', 0.25)):
    TNT[login_col] = TNT[login_col].fillna(fill_value)

——————————————————

결측치를 바꾸거나 Feature Engineering을 시도했었는데 결과가 좋지 않아 사용하지 않았음

# Split the combined frame back apart: rows with a known `login` label form the
# training set, rows without one form the test set.
has_label = TNT['login'].notna()
train = TNT.loc[has_label]
test = TNT.loc[~has_label]

# Check the training frame that goes into the model
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1500 entries, 0 to 1499
Data columns (total 9 columns):
Sex                   1500 non-null int64
apple_rat             1500 non-null float64
email_type            1500 non-null int64
login                 1500 non-null float64
past_1_month_login    1340 non-null float64
past_1_week_login     1340 non-null float64
person_id             1500 non-null int64
sub_size              1500 non-null float64
MAC_OS                1500 non-null float64
dtypes: float64(6), int64(3)
memory usage: 117.2 KB

모델링

참고 https://www.kaggle.com/lifesailor/xgboost

from sklearn.model_selection import GridSearchCV
import xgboost as xgb 
from xgboost.sklearn import XGBRegressor
from sklearn import metrics

def modelfit(alg, x_train, y_train, useTrainCV=True, cv_folds=5, early_stopping_rounds=100):
    """Optionally tune the number of boosting rounds by CV, then fit ``alg``.

    When ``useTrainCV`` is true, ``xgb.cv`` is run with the estimator's own
    booster parameters, using its current ``n_estimators`` as the maximum
    number of rounds, RMSE as the metric and early stopping. The estimator's
    ``n_estimators`` is then overwritten with the number of rows in the CV
    result and the estimator is printed. In either case the estimator is then
    fitted on the full training data and its mean squared error on that same
    data is printed.

    Args:
        alg: xgboost scikit-learn style estimator; it is modified in place
            (``n_estimators`` may be changed, and it is fitted).
        x_train: training features.
        y_train: training targets.
        useTrainCV: whether to run the cross-validation step first.
        cv_folds: number of folds passed to ``xgb.cv``.
        early_stopping_rounds: early-stopping patience passed to ``xgb.cv``.

    Returns:
        None. Results are reported with ``print``.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(x_train, label=y_train)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='rmse',
            early_stopping_rounds=early_stopping_rounds,
        )
        # One row per boosting round kept by the early-stopped CV run.
        alg.set_params(n_estimators=cvresult.shape[0])
        print(alg)

    # No eval_set is supplied, so an eval_metric argument had nothing to
    # evaluate; it is omitted because newer xgboost releases no longer accept
    # it in fit().
    alg.fit(x_train, y_train)

    dtrain_predictions = alg.predict(x_train)

    # Print model report. The value is a mean squared error (lower is better),
    # so it is labelled as such rather than as an accuracy.
    print("\nModel Report")
    print("Training MSE : %.4g" % metrics.mean_squared_error(y_train, dtrain_predictions))

# Tuning notes from the original experiments:
#   1) observed the change after first setting min_child_weight to 5
#   2) observed the change after setting scale_pos_weight to 5
#   3) observed the change after setting scale_pos_weight to 6
xgb1_params = dict(
    learning_rate=0.2,            # learning rate (typically 0.01 - 0.2)
    n_estimators=5000,
    max_depth=2,                  # tree depth
    min_child_weight=5,           # threshold for further splitting (large values underfit)
    gamma=0,                      # minimum loss reduction required to make a split
    subsample=0.6,                # fraction of rows sampled (0.5 - 1)
    colsample_bytree=0.6,         # fraction of columns sampled (0.5 - 1)
    objective="binary:logistic",
    nthread=-1,                   # controls parallel processing
    scale_pos_weight=6,           # weighting of positive vs. negative samples
    seed=2018,                    # fixed because sampling results can change between runs
)
xgb1 = XGBRegressor(**xgb1_params)
modelfit(xgb1, train.drop(columns=['login']), train['login'])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.6, gamma=0,
       importance_type='gain', learning_rate=0.2, max_delta_step=0,
       max_depth=2, min_child_weight=5, missing=None, n_estimators=55,
       n_jobs=1, nthread=-1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=6, seed=2018,
       silent=None, subsample=0.6, verbosity=1)

Model Report
Training Accuracy : 0.0905

제출

# Score the unlabeled rows with the fitted model and write the submission file.
test_features = test.drop(columns=['login'])
submission = pd.DataFrame({
    'person_id': test['person_id'],
    'login': xgb1.predict(test_features),
})
submission.to_csv('submission9.csv', index=False)

TNT

[대회] Dacon Camp 웹데이터 대회

데이터

EDA

전처리

——————————————————-

——————————————————

모델링

참고 https://www.kaggle.com/lifesailor/xgboost

제출

—————————————————–