- 대회설명: Dacon 웹 로그 데이터를 활용하여 앞으로 한 달 간 사용자의 로그인 여부를 예측
- 대회일자: 2019.08.05 ~ 2019.09.05
- 주관: Dacon
- 수상실적: 1위
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
plt.style.use('fivethirtyeight')
warnings.filterwarnings('ignore')
데이터
import os

# Work from the folder that holds the competition CSV files.
os.chdir('C:\\Users\\Kim\\Desktop\\TNT\\웹데이터')
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Stack train and test row-wise; the combined frame is preprocessed as one
# and split again later on whether `login` is present.
TNT = pd.concat([train, test], axis=0)
TNT.head()  # Inspect the features
| Sex | apple_rat | email_type | login | past_1_month_login | past_1_week_login | past_login_total | person_id | phone_rat | sub_size | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | male | 1.0 | naver | 0.0 | 0.0 | 0.0 | 1.0 | 1015 | 0.0 | 0.0 |
| 1 | female | 1.0 | other | 0.0 | 0.0 | 0.0 | 2.0 | 1940 | 1.0 | 0.0 |
| 2 | male | 0.0 | other | 0.0 | 0.0 | 0.0 | 1.0 | 1356 | 1.0 | 0.0 |
| 3 | male | 1.0 | other | 0.0 | 0.0 | 0.0 | 2.0 | 1535 | 0.0 | 0.0 |
| 4 | female | 0.0 | naver | 0.0 | NaN | NaN | NaN | 216 | 0.0 | 0.0 |
# Count missing (NaN) values per column
TNT.isnull().sum()
Sex 0
apple_rat 0
email_type 0
login 682
past_1_month_login 227
past_1_week_login 227
past_login_total 227
person_id 0
phone_rat 0
sub_size 0
dtype: int64
# Visualise where the missing (NaN) values are
import missingno as msno
msno.matrix(TNT)
plt.show()

# Inspect column dtypes and non-null counts
TNT.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2182 entries, 0 to 681
Data columns (total 10 columns):
Sex 2182 non-null object
apple_rat 2182 non-null float64
email_type 2182 non-null object
login 1500 non-null float64
past_1_month_login 1955 non-null float64
past_1_week_login 1955 non-null float64
past_login_total 1955 non-null float64
person_id 2182 non-null int64
phone_rat 2182 non-null float64
sub_size 2182 non-null float64
dtypes: float64(7), int64(1), object(2)
memory usage: 187.5+ KB
# Summary statistics for the numeric columns
TNT.describe()
| apple_rat | login | past_1_month_login | past_1_week_login | past_login_total | person_id | phone_rat | sub_size | |
|---|---|---|---|---|---|---|---|---|
| count | 2182.000000 | 1500.000000 | 1955.000000 | 1955.000000 | 1955.000000 | 2182.000000 | 2182.000000 | 2182.000000 |
| mean | 0.220114 | 0.099333 | 0.607161 | 0.257801 | 7.702813 | 1091.489918 | 0.125673 | 2.976169 |
| std | 0.397746 | 0.299209 | 3.215901 | 1.301143 | 21.546863 | 630.050764 | 0.295775 | 15.481671 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 546.250000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 1091.500000 | 0.000000 | 0.000000 |
| 75% | 0.126995 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 1636.750000 | 0.000000 | 0.000000 |
| max | 1.000000 | 1.000000 | 93.000000 | 23.000000 | 503.000000 | 2182.000000 | 1.000000 | 358.000000 |
# Split the columns by dtype so each group can be inspected separately.
string_data = TNT.select_dtypes(include = ['object'])
int_data = TNT.select_dtypes(include = ['int64'])
float_data = TNT.select_dtypes(include = ['float64'])
# Unique values of the string (object) columns
for col in string_data.columns:
    print('Unique values for {0}:\n{1}\n'.format(col, string_data[col].unique()))
Unique values for Sex:
['male' 'female']
Unique values for email_type:
['naver' 'other' 'gmail' 'nate' 'hanmail']
# Unique values of the integer columns
for col in int_data.columns:
    print('Unique values for {0}:\n{1}\n'.format(col, int_data[col].unique()))
Unique values for person_id:
[1015 1940 1356 ... 289 1590 572]
# Unique values of the float columns
for col in float_data.columns:
    print('Unique values for {0}:\n{1}\n'.format(col, float_data[col].unique()))
Unique values for apple_rat:
[1. 0. 0.85416667 0.8 0.5 0.25
0.08333333 0.01666667 0.4 0.76923077 0.875 0.2
0.66666667 0.57142857 0.13157895 0.125 0.81818182 0.55555556
0.86170213 0.21052632 0.88235294 0.16666667 0.03225806 0.92
0.85714286 0.33333333 0.09090909 0.93877551 0.625 0.71428571
0.75 0.26666667 0.52631579 0.70588235 0.375 0.0625
0.91304348 0.42857143 0.13333333 0.03333333 0.15 0.83333333
0.03703704 0.3 0.35 0.95238095 0.28571429 0.91666667
0.11764706 0.68181818 0.6 0.07692308 0.92307692 0.05405405
0.88888889 0.12765957 0.47368421 0.44444444 0.54545455 0.13043478
0.07462687 0.35714286 0.14285714 0.1509434 0.90566038 0.06666667
0.14864865 0.07142857 0.11111111 0.01630435 0.03448276 0.12922465
0.1875 0.96666667 0.18181818 0.53333333 0.20454545 0.86666667
0.7037037 0.27659574 0.04 0.98863636 0.91111111 0.12903226
0.08108108 0.94736842]
Unique values for login:
[ 0. 1. nan]
Unique values for past_1_month_login:
[ 0. nan 1. 2. 5. 3. 4. 12. 18. 9. 13. 6. 7. 8. 21. 17. 50. 19.
11. 29. 93. 26. 20. 27. 10.]
Unique values for past_1_week_login:
[ 0. nan 5. 2. 4. 1. 3. 9. 11. 6. 12. 18. 8. 7. 17. 20. 23.]
Unique values for past_login_total:
[ 1. 2. nan 4. 48. 5. 14. 3. 8. 9. 234. 12. 58. 7.
240. 10. 24. 65. 26. 61. 139. 6. 13. 23. 20. 11. 18. 25.
16. 38. 56. 15. 94. 19. 35. 27. 17. 31. 32. 30. 33. 62.
98. 21. 88. 45. 76. 40. 77. 28. 115. 60. 52. 236. 22. 54.
125. 132. 50. 37. 64. 119. 34. 39. 47. 41. 112. 117. 101. 29.
134. 73. 53. 36. 74. 184. 81. 503. 86. 87. 44. 42. 72. 111.
93. 68.]
Unique values for phone_rat:
[0. 1. 0.0625 0.07142857 0.5 0.25
0.77777778 0.06837607 0.08333333 0.01666667 0.01538462 0.1
0.4 0.07913669 0.15384615 0.15 0.42857143 0.05555556
0.2 0.33333333 0.14285714 0.13157895 0.61538462 0.8
0.64285714 0.86170213 0.22222222 0.08571429 0.21052632 0.23529412
0.6 0.07692308 0.09090909 0.03225806 0.11111111 0.16666667
0.03278689 0.01020408 0.08695652 0.03333333 0.11363636 0.75
0.26666667 0.01315789 0.01136364 0.37037037 0.05263158 0.35
0.1875 0.66666667 0.21875 0.09375 0.12987013 0.30434783
0.00869565 0.09230769 0.12121212 0.03703704 0.00423729 0.05
0.26315789 0.12962963 0.92 0.78571429 0.28571429 0.17857143
0.015625 0.11764706 0.02272727 0.43589744 0.04878049 0.125
0.73684211 0.04 0.92307692 0.05405405 0.03418803 0.03960396
0.18181818 0.05714286 0.12765957 0.47368421 0.3 0.27586207
0.04347826 0.27272727 0.17164179 0.19491525 0.08219178 0.03448276
0.1509434 0.06666667 0.83333333 0.02941176 0.13888889 0.14864865
0.04761905 0.95454545 0.44444444 0.13793103 0.35643564 0.76190476
0.90909091 0.09940358 0.96666667 0.375 0.20930233 0.55263158
0.22641509 0.03947368 0.03030303 0.0952381 0.07407407 0.4893617
0.48611111 0.07207207 0.55 0.02857143 0.54411765 0.23076923
0.09677419 0.52380952 0.17142857 0.30769231 0.11538462 0.71428571
0.14814815 0.08108108 0.12068966]
Unique values for sub_size:
[ 0. 28. 139. 3. 1. 114. 9. 6. 95. 56. 5. 163. 25. 17.
10. 8. 7. 2. 76. 20. 4. 46. 100. 19. 15. 34. 12. 49.
358. 149. 32. 43. 30. 13. 11. 80. 81. 23. 37. 82. 62. 166.
24. 16. 18. 26. 41. 50. 96. 55. 110. 33. 21. 91. 14. 29.
39. 282. 99. 63. 27. 36. 44. 68. 53. 45. 101. 64. 78. 22.
140. 38. 35. 48. 59.]
EDA
참고: http://newsjel.ly/archives/newsjelly-report/visualization-report/8136 (데이터 특징별 그래프 선택 방법)
plt.figure(figsize = [15,8])
sns.kdeplot(TNT.apple_rat, label = 'apple_rat')
plt.show()

TNT.apple_rat.hist()
<matplotlib.axes._subplots.AxesSubplot at 0xa383ed2710>

plt.figure(figsize = [15,8])
# Plot phone_rat itself so the curve matches its label
# (the cell previously drew apple_rat under the 'phone_rat' label).
sns.kdeplot(TNT.phone_rat, label = 'phone_rat')
plt.show()

TNT.phone_rat.hist()
<matplotlib.axes._subplots.AxesSubplot at 0xa382897d30>

plt.figure(figsize = [15,8])
# Plot sub_size itself so the curve matches its label
# (the cell previously drew apple_rat under the 'sub_size' label).
sns.kdeplot(TNT.sub_size, label = 'sub_size')
plt.show()

TNT.sub_size.hist()
<matplotlib.axes._subplots.AxesSubplot at 0xa383df1cc0>

# Pass the column as `x=`: newer seaborn releases reserve the first
# positional argument for `data`.
sns.countplot(x=TNT['sub_size'])
<matplotlib.axes._subplots.AxesSubplot at 0xa383d9a710>

plt.figure(figsize = [15,8])
sns.kdeplot(TNT.past_1_month_login, label = 'past_1_month_login')
plt.show()

TNT.past_1_month_login.hist()
<matplotlib.axes._subplots.AxesSubplot at 0xa383e28320>

# Pass the column as `x=`: newer seaborn releases reserve the first
# positional argument for `data`.
sns.countplot(x=TNT['past_1_month_login'])
<matplotlib.axes._subplots.AxesSubplot at 0xa383e90470>

plt.figure(figsize = [15,8])
sns.kdeplot(TNT.past_1_week_login, label = 'past_1_week_login')
plt.show()

TNT.past_1_week_login.hist()
<matplotlib.axes._subplots.AxesSubplot at 0xa3840c7a58>

# Pass the column as `x=`: newer seaborn releases reserve the first
# positional argument for `data`.
sns.countplot(x=TNT['past_1_week_login'])
<matplotlib.axes._subplots.AxesSubplot at 0xa38412d630>

plt.figure(figsize = [15,8])
sns.kdeplot(TNT.past_login_total, label = 'past_login_total')
plt.show()

TNT.past_login_total.hist()
<matplotlib.axes._subplots.AxesSubplot at 0xa384415be0>

# Wide figure: past_login_total has many distinct values on the x axis.
plt.figure(figsize = [30,15])
# Pass the column as `x=`: newer seaborn releases reserve the first
# positional argument for `data`.
sns.countplot(x=TNT['past_login_total'])
<matplotlib.axes._subplots.AxesSubplot at 0xa38444b550>

전처리
from sklearn.preprocessing import LabelEncoder

# Integer-encode the two string columns. One encoder object is reused and
# refit for each column, so its fitted classes only reflect the last column.
label = LabelEncoder()
for col in ["Sex", 'email_type']:
    TNT[col] = label.fit_transform(TNT[col])
# Pass the column as `x=`: newer seaborn releases reserve the first
# positional argument for `data`.
sns.countplot(x=TNT['Sex'])
<matplotlib.axes._subplots.AxesSubplot at 0x14e3cccd68>

# Pass the column as `x=`: newer seaborn releases reserve the first
# positional argument for `data`.
sns.countplot(x=TNT['email_type'])
<matplotlib.axes._subplots.AxesSubplot at 0x9432400908>

# Correlation heatmap over the columns of TNT
sns.heatmap(TNT.corr(),annot=True,cmap='Blues',linewidths=0.5) #annot - writes the correlation coefficient into each cell
fig=plt.gcf()
fig.set_size_inches(10,8)
plt.show()

TNT.head()
| Sex | apple_rat | email_type | login | past_1_month_login | past_1_week_login | past_login_total | person_id | phone_rat | sub_size | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1.0 | 3 | 0.0 | 0.0 | 0.0 | 1.0 | 1015 | 0.0 | 0.0 |
| 1 | 0 | 1.0 | 4 | 0.0 | 0.0 | 0.0 | 2.0 | 1940 | 1.0 | 0.0 |
| 2 | 1 | 0.0 | 4 | 0.0 | 0.0 | 0.0 | 1.0 | 1356 | 1.0 | 0.0 |
| 3 | 1 | 1.0 | 4 | 0.0 | 0.0 | 0.0 | 2.0 | 1535 | 0.0 | 0.0 |
| 4 | 0 | 0.0 | 3 | 0.0 | NaN | NaN | NaN | 216 | 0.0 | 0.0 |
——————————————————-
# New feature: phone_rat minus apple_rat, stored as 'MAC_OS'.
# NOTE(review): the column name suggests an OS share, but the value is only
# this difference — confirm the intended meaning.
TNT['MAC_OS'] = TNT['phone_rat']-TNT['apple_rat']
TNT.drop(['phone_rat', 'past_login_total' ], axis = 1, inplace = True) # drop these columns in place
# Fill missing login counts with constants; 0.6 and 0.25 are close to the
# column means reported by TNT.describe() (about 0.607 and 0.258).
TNT['past_1_month_login'] = TNT['past_1_month_login'].fillna(0.6)
TNT['past_1_week_login'] = TNT['past_1_week_login'].fillna(0.25)
——————————————————
결측치를 바꾸거나 Feature Engineering을 시도했었는데, 결과가 좋지 않아 사용하지 않았음
# Split the combined frame back apart: rows with a `login` label form the
# training set, rows without one form the test set.
has_label = TNT['login'].notna()
train = TNT.loc[has_label]
test = TNT.loc[~has_label]
# Check the training frame that will be fed to the model
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1500 entries, 0 to 1499
Data columns (total 9 columns):
Sex 1500 non-null int64
apple_rat 1500 non-null float64
email_type 1500 non-null int64
login 1500 non-null float64
past_1_month_login 1340 non-null float64
past_1_week_login 1340 non-null float64
person_id 1500 non-null int64
sub_size 1500 non-null float64
MAC_OS 1500 non-null float64
dtypes: float64(6), int64(3)
memory usage: 117.2 KB
모델링
참고 https://www.kaggle.com/lifesailor/xgboost
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn import metrics
def modelfit(alg, x_train, y_train,useTrainCV=True, cv_folds=5, early_stopping_rounds=100):
    """Optionally pick n_estimators by cross-validation, then fit and report.

    Parameters
    ----------
    alg : xgboost scikit-learn estimator
        Modified in place: ``n_estimators`` may be overwritten and the
        estimator is fitted on the training data.
    x_train, y_train : training features and target.
    useTrainCV : bool
        When true, run ``xgb.cv`` first and set ``n_estimators`` to the
        number of rows in its result.
    cv_folds : int
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None. The estimator and a short training report are printed.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(x_train, label=y_train)
        # The estimator's current n_estimators is the upper bound on rounds.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='rmse', early_stopping_rounds=early_stopping_rounds)
        # Use the number of rows in the CV history as the final round count.
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit and report regardless of whether the CV step ran.
    print(alg)
    alg.fit(x_train, y_train, eval_metric='rmse')
    dtrain_predictions = alg.predict(x_train)
    # Print model report. The value is the mean squared error on the training
    # data, so it is labelled as MSE rather than accuracy.
    print("\nModel Report")
    print("Training MSE : %.4g" % metrics.mean_squared_error(y_train, dtrain_predictions))
# Effect observed when min_child_weight was first set to 5
# Effect observed next when scale_pos_weight was set to 5
# Effect observed when scale_pos_weight was set to 6
xgb1 = XGBRegressor(
learning_rate =0.2, # Learning rate (typically 0.01 - 0.2)
n_estimators=5000,
max_depth=2, # tree depth
min_child_weight=5, # threshold for deciding further splits (large values underfit)
gamma=0, # minimum loss reduction required to make a split
subsample=0.6, # fraction of rows sampled (0.5 - 1)
colsample_bytree=0.6, # fraction of columns sampled (0.5 - 1)
objective= "binary:logistic",
nthread=-1, # controls parallel processing
scale_pos_weight=6, # weighting of positive vs. negative samples
seed=2018 # fixed because sampling results may differ from run to run
)
# NOTE(review): only 'login' is dropped, so person_id stays in the feature
# matrix — confirm that is intended.
modelfit(xgb1, train.drop(['login'],axis = 1), train['login'])
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=0.6, gamma=0,
importance_type='gain', learning_rate=0.2, max_delta_step=0,
max_depth=2, min_child_weight=5, missing=None, n_estimators=55,
n_jobs=1, nthread=-1, objective='binary:logistic', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=6, seed=2018,
silent=None, subsample=0.6, verbosity=1)
Model Report
Training Accuracy : 0.0905
제출
# Predict on the unlabelled rows using every column except `login`.
test_features = test.drop(['login'], axis=1)
login_pred = xgb1.predict(test_features)
# One row per person: the id plus the predicted login score.
submission = pd.DataFrame({'person_id': test['person_id'], 'login': login_pred})
submission.to_csv('submission9.csv', index=False)