import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Apply the FiveThirtyEight matplotlib theme to every plot in this notebook.
plt.style.use('fivethirtyeight')
# NOTE(review): this silences ALL warnings globally; consider narrowing the
# filter (e.g. by category) so real problems are not hidden.
warnings.filterwarnings('ignore')
import os
# Hard-coded, Windows-specific working directory ("웹데이터" = "web data");
# all relative file reads/writes below resolve against this path.
# TODO(review): parameterize or use pathlib so the notebook runs elsewhere.
os.chdir('C:\\Users\\Kim\\Desktop\\TNT\\웹데이터')
Sex 0
apple_rat 0
email_type 0
login 682
past_1_month_login 227
past_1_week_login 227
past_login_total 227
person_id 0
phone_rat 0
sub_size 0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2182 entries, 0 to 681
Data columns (total 10 columns):
Sex 2182 non-null object
apple_rat 2182 non-null float64
email_type 2182 non-null object
login 1500 non-null float64
past_1_month_login 1955 non-null float64
past_1_week_login 1955 non-null float64
past_login_total 1955 non-null float64
person_id 2182 non-null int64
phone_rat 2182 non-null float64
sub_size 2182 non-null float64
dtypes: float64(7), int64(1), object(2)
memory usage: 187.5+ KB
Unique values for Sex:
['male' 'female']
Unique values for email_type:
['naver' 'other' 'gmail' 'nate' 'hanmail']
Unique values for person_id:
[1015 1940 1356 ... 289 1590 572]
Unique values for apple_rat:
[1. 0. 0.85416667 0.8 0.5 0.25
0.08333333 0.01666667 0.4 0.76923077 0.875 0.2
0.66666667 0.57142857 0.13157895 0.125 0.81818182 0.55555556
0.86170213 0.21052632 0.88235294 0.16666667 0.03225806 0.92
0.85714286 0.33333333 0.09090909 0.93877551 0.625 0.71428571
0.75 0.26666667 0.52631579 0.70588235 0.375 0.0625
0.91304348 0.42857143 0.13333333 0.03333333 0.15 0.83333333
0.03703704 0.3 0.35 0.95238095 0.28571429 0.91666667
0.11764706 0.68181818 0.6 0.07692308 0.92307692 0.05405405
0.88888889 0.12765957 0.47368421 0.44444444 0.54545455 0.13043478
0.07462687 0.35714286 0.14285714 0.1509434 0.90566038 0.06666667
0.14864865 0.07142857 0.11111111 0.01630435 0.03448276 0.12922465
0.1875 0.96666667 0.18181818 0.53333333 0.20454545 0.86666667
0.7037037 0.27659574 0.04 0.98863636 0.91111111 0.12903226
0.08108108 0.94736842]
Unique values for login:
[ 0. 1. nan]
Unique values for past_1_month_login:
[ 0. nan 1. 2. 5. 3. 4. 12. 18. 9. 13. 6. 7. 8. 21. 17. 50. 19.
11. 29. 93. 26. 20. 27. 10.]
Unique values for past_1_week_login:
[ 0. nan 5. 2. 4. 1. 3. 9. 11. 6. 12. 18. 8. 7. 17. 20. 23.]
Unique values for past_login_total:
[ 1. 2. nan 4. 48. 5. 14. 3. 8. 9. 234. 12. 58. 7.
240. 10. 24. 65. 26. 61. 139. 6. 13. 23. 20. 11. 18. 25.
16. 38. 56. 15. 94. 19. 35. 27. 17. 31. 32. 30. 33. 62.
98. 21. 88. 45. 76. 40. 77. 28. 115. 60. 52. 236. 22. 54.
125. 132. 50. 37. 64. 119. 34. 39. 47. 41. 112. 117. 101. 29.
134. 73. 53. 36. 74. 184. 81. 503. 86. 87. 44. 42. 72. 111.
93. 68.]
Unique values for phone_rat:
[0. 1. 0.0625 0.07142857 0.5 0.25
0.77777778 0.06837607 0.08333333 0.01666667 0.01538462 0.1
0.4 0.07913669 0.15384615 0.15 0.42857143 0.05555556
0.2 0.33333333 0.14285714 0.13157895 0.61538462 0.8
0.64285714 0.86170213 0.22222222 0.08571429 0.21052632 0.23529412
0.6 0.07692308 0.09090909 0.03225806 0.11111111 0.16666667
0.03278689 0.01020408 0.08695652 0.03333333 0.11363636 0.75
0.26666667 0.01315789 0.01136364 0.37037037 0.05263158 0.35
0.1875 0.66666667 0.21875 0.09375 0.12987013 0.30434783
0.00869565 0.09230769 0.12121212 0.03703704 0.00423729 0.05
0.26315789 0.12962963 0.92 0.78571429 0.28571429 0.17857143
0.015625 0.11764706 0.02272727 0.43589744 0.04878049 0.125
0.73684211 0.04 0.92307692 0.05405405 0.03418803 0.03960396
0.18181818 0.05714286 0.12765957 0.47368421 0.3 0.27586207
0.04347826 0.27272727 0.17164179 0.19491525 0.08219178 0.03448276
0.1509434 0.06666667 0.83333333 0.02941176 0.13888889 0.14864865
0.04761905 0.95454545 0.44444444 0.13793103 0.35643564 0.76190476
0.90909091 0.09940358 0.96666667 0.375 0.20930233 0.55263158
0.22641509 0.03947368 0.03030303 0.0952381 0.07407407 0.4893617
0.48611111 0.07207207 0.55 0.02857143 0.54411765 0.23076923
0.09677419 0.52380952 0.17142857 0.30769231 0.11538462 0.71428571
0.14814815 0.08108108 0.12068966]
Unique values for sub_size:
[ 0. 28. 139. 3. 1. 114. 9. 6. 95. 56. 5. 163. 25. 17.
10. 8. 7. 2. 76. 20. 4. 46. 100. 19. 15. 34. 12. 49.
358. 149. 32. 43. 30. 13. 11. 80. 81. 23. 37. 82. 62. 166.
24. 16. 18. 26. 41. 50. 96. 55. 110. 33. 21. 91. 14. 29.
39. 282. 99. 63. 27. 36. 44. 68. 53. 45. 101. 64. 78. 22.
140. 38. 35. 48. 59.]
http://newsjel.ly/archives/newsjelly-report/visualization-report/8136 (how to choose a chart type based on data characteristics)
<matplotlib.axes._subplots.AxesSubplot at 0xa383ed2710>
<matplotlib.axes._subplots.AxesSubplot at 0xa382897d30>
<matplotlib.axes._subplots.AxesSubplot at 0xa383df1cc0>
<matplotlib.axes._subplots.AxesSubplot at 0xa383d9a710>
<matplotlib.axes._subplots.AxesSubplot at 0xa383e28320>
<matplotlib.axes._subplots.AxesSubplot at 0xa383e90470>
<matplotlib.axes._subplots.AxesSubplot at 0xa3840c7a58>
<matplotlib.axes._subplots.AxesSubplot at 0xa38412d630>
<matplotlib.axes._subplots.AxesSubplot at 0xa384415be0>
<matplotlib.axes._subplots.AxesSubplot at 0xa38444b550>
<matplotlib.axes._subplots.AxesSubplot at 0x14e3cccd68>
<matplotlib.axes._subplots.AxesSubplot at 0x9432400908>
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1500 entries, 0 to 1499
Data columns (total 9 columns):
Sex 1500 non-null int64
apple_rat 1500 non-null float64
email_type 1500 non-null int64
login 1500 non-null float64
past_1_month_login 1340 non-null float64
past_1_week_login 1340 non-null float64
person_id 1500 non-null int64
sub_size 1500 non-null float64
MAC_OS 1500 non-null float64
dtypes: float64(6), int64(3)
memory usage: 117.2 KB
def modelfit(alg, x_train, y_train, useTrainCV=True, cv_folds=5, early_stopping_rounds=100):
    """Fit an XGBoost model, optionally tuning n_estimators via xgb.cv first.

    Parameters
    ----------
    alg : xgboost sklearn-API estimator (e.g. XGBRegressor)
        Estimator to tune and fit; mutated in place (n_estimators may change).
    x_train, y_train : array-like / DataFrame
        Training features and target.
    useTrainCV : bool, default True
        If True, run xgboost's built-in cross-validation with early stopping
        to pick the number of boosting rounds before the final fit.
    cv_folds : int, default 5
        Number of CV folds for xgb.cv.
    early_stopping_rounds : int, default 100
        Stop CV when the RMSE metric has not improved for this many rounds.

    Side effects: prints the (possibly re-tuned) estimator and its training MSE.
    """
    if useTrainCV:
        # Run native-API CV with early stopping; the number of rows in the CV
        # result equals the best boosting-round count, so pin n_estimators to it.
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(x_train, label=y_train)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='rmse',
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    print(alg)
    alg.fit(x_train, y_train, eval_metric='rmse')
    dtrain_predictions = alg.predict(x_train)
    # Print model report:
    print("\nModel Report")
    # BUGFIX: this value is mean squared error, not accuracy — label it honestly.
    print("Training MSE : %.4g" % metrics.mean_squared_error(y_train, dtrain_predictions))
# Change observed when min_child_weight was first set to 5
# Change observed when scale_pos_weight was then set to 5
# Change observed when scale_pos_weight was set to 6
xgb1 = XGBRegressor(
learning_rate =0.2, # learning rate (typically 0.01 - 0.2)
n_estimators=5000,
max_depth=2, # maximum tree depth
min_child_weight=5, # minimum child weight required to split further (larger => underfitting)
gamma=0, # minimum loss reduction required to make a split
subsample=0.6, # fraction of rows sampled per tree (0.5 - 1)
colsample_bytree=0.6, # fraction of columns sampled per tree (0.5 - 1)
objective= "binary:logistic", # NOTE(review): classification objective on a *Regressor* while CV uses rmse — confirm this is intentional
nthread=-1, # parallelism control (-1 = use all cores)
scale_pos_weight=6, # weight of positive vs. negative examples
seed=2018 # fix the seed so sampling is reproducible across runs
)
# 'train' is defined in an earlier (not visible) cell; target is 'login'.
modelfit(xgb1, train.drop(['login'],axis = 1), train['login'])
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=0.6, gamma=0,
importance_type='gain', learning_rate=0.2, max_delta_step=0,
max_depth=2, min_child_weight=5, missing=None, n_estimators=55,
n_jobs=1, nthread=-1, objective='binary:logistic', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=6, seed=2018,
silent=None, subsample=0.6, verbosity=1)
Model Report
Training Accuracy : 0.0905