[대회] Dacon Camp 따릉이 수요 예측 대회

대회설명: 1시간 전의 기상상황을 가지고 1시간 후의 따릉이 대여수를 예측
대회일자: 2019.09.09 ~ 2019.10.05
주관: Dacon
수상실적: 1위

모듈 및 파일 로드

# Mount Google Drive at /gdrive (Colab only); the CSV files are read from there below.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

# Load the raw train / test tables from the mounted Drive folder.
train_raw = pd.read_csv(
    '/gdrive/My Drive/train.csv',
    encoding='utf-8',
)
test_raw = pd.read_csv(
    '/gdrive/My Drive/test.csv',
    encoding='utf-8',
)

## EDA 및 Data Preprocessing

# Preview the first five rows of the raw training table.
train_raw.head(5)

	id	hour	hour_bef_temperature	hour_bef_precipitation	hour_bef_windspeed	hour_bef_humidity	hour_bef_visibility	hour_bef_ozone	hour_bef_pm10	hour_bef_pm2.5	count
0	3	20	16.3	1.0	1.5	89.0	576.0	0.027	76.0	33.0	49.0
1	6	13	20.1	0.0	1.4	48.0	916.0	0.042	73.0	40.0	159.0
2	7	6	13.9	0.0	0.7	79.0	1382.0	0.033	32.0	19.0	26.0
3	8	23	8.1	0.0	2.7	54.0	946.0	0.040	75.0	64.0	57.0
4	9	18	29.5	0.0	4.8	7.0	2000.0	0.057	27.0	11.0	431.0

# Count missing values in each column of the raw training table.
train_raw.isnull().sum()

id                          0
hour                        0
hour_bef_temperature        2
hour_bef_precipitation      2
hour_bef_windspeed          9
hour_bef_humidity           2
hour_bef_visibility         2
hour_bef_ozone             76
hour_bef_pm10              90
hour_bef_pm2.5            117
count                       0
dtype: int64

# Missing-value analysis: show the rows whose temperature is missing.
train_raw.loc[train_raw['hour_bef_temperature'].isnull()]

	id	hour	hour_bef_temperature	hour_bef_precipitation	hour_bef_windspeed	hour_bef_humidity	hour_bef_visibility	hour_bef_ozone	hour_bef_pm10	hour_bef_pm2.5	count
934	1420	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	39.0
1035	1553	18	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1.0

# The rows with a missing temperature carry no weather information at all,
# so they are removed from the training data.
# .copy() makes `train` an independent DataFrame rather than a slice of
# `train_raw`; without it the later column assignments and in-place fills
# raise SettingWithCopyWarning and may not be applied reliably.
train = train_raw.loc[train_raw['hour_bef_temperature'].notnull()].copy()
train.isnull().sum()

id                          0
hour                        0
hour_bef_temperature        0
hour_bef_precipitation      0
hour_bef_windspeed          7
hour_bef_humidity           0
hour_bef_visibility         0
hour_bef_ozone             74
hour_bef_pm10              88
hour_bef_pm2.5            115
count                       0
dtype: int64

# Summary statistics for every column of the cleaned training table.
train.describe()

	id	hour	hour_bef_temperature	hour_bef_precipitation	hour_bef_windspeed	hour_bef_humidity	hour_bef_visibility	hour_bef_ozone	hour_bef_pm10	hour_bef_pm2.5	count
count	1457.000000	1457.000000	1457.000000	1457.000000	1450.000000	1457.000000	1457.000000	1383.000000	1369.000000	1342.000000	1457.000000
mean	1105.391901	11.496911	16.717433	0.031572	2.479034	52.231297	1405.216884	0.039149	57.168736	30.327124	108.684969
std	631.609634	6.918890	5.239150	0.174917	1.378265	20.370387	583.131708	0.019509	31.771019	14.713252	82.620202
min	3.000000	0.000000	3.100000	0.000000	0.000000	7.000000	78.000000	0.003000	9.000000	8.000000	1.000000
25%	555.000000	6.000000	12.800000	0.000000	1.400000	36.000000	879.000000	0.025500	36.000000	20.000000	37.000000
50%	1113.000000	11.000000	16.600000	0.000000	2.300000	51.000000	1577.000000	0.039000	51.000000	26.000000	96.000000
75%	1652.000000	17.000000	20.100000	0.000000	3.400000	69.000000	1994.000000	0.052000	69.000000	37.000000	150.000000
max	2179.000000	23.000000	30.000000	1.000000	8.000000	99.000000	2000.000000	0.125000	269.000000	90.000000	431.000000

# Column dtypes and non-null counts of the cleaned training table.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1457 entries, 0 to 1458
Data columns (total 11 columns):
id                        1457 non-null int64
hour                      1457 non-null int64
hour_bef_temperature      1457 non-null float64
hour_bef_precipitation    1457 non-null float64
hour_bef_windspeed        1450 non-null float64
hour_bef_humidity         1457 non-null float64
hour_bef_visibility       1457 non-null float64
hour_bef_ozone            1383 non-null float64
hour_bef_pm10             1369 non-null float64
hour_bef_pm2.5            1342 non-null float64
count                     1457 non-null float64
dtypes: float64(9), int64(2)
memory usage: 136.6 KB

# Visualise the mean rental count for each hour of the day as a bar chart.
hourly_mean_count = train.groupby(['hour'], as_index=False)['count'].mean()
hourly_mean_count.plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x7f0b6a378128>

# Usage versus temperature one hour earlier: scatter plot and Pearson correlation.
plt.scatter(x=train['hour_bef_temperature'], y=train['count'])
train.loc[:, ['hour_bef_temperature', 'count']].corr(method='pearson')

	hour_bef_temperature	count
hour_bef_temperature	1.000000	0.619404
count	0.619404	1.000000

# Usage by precipitation flag: mean rental count for each value of the flag.
train.groupby(['hour_bef_precipitation'])['count'].mean()

hour_bef_precipitation
0.0    111.130404
1.0     33.673913
Name: count, dtype: float64

# Usage versus wind speed: scatter plot and Pearson correlation.
plt.scatter(x=train['hour_bef_windspeed'], y=train['count'])
train.loc[:, ['hour_bef_windspeed', 'count']].corr(method='pearson')

	hour_bef_windspeed	count
hour_bef_windspeed	1.000000	0.459906
count	0.459906	1.000000

# Usage versus humidity: scatter plot and Pearson correlation.
plt.scatter(x=train['hour_bef_humidity'], y=train['count'])
train.loc[:, ['hour_bef_humidity', 'count']].corr(method='pearson')

	hour_bef_humidity	count
hour_bef_humidity	1.000000	-0.471142
count	-0.471142	1.000000

# Usage versus visibility: scatter plot and Pearson correlation.
plt.scatter(x=train['hour_bef_visibility'], y=train['count'])
train.loc[:, ['hour_bef_visibility', 'count']].corr(method='pearson')

	hour_bef_visibility	count
hour_bef_visibility	1.000000	0.299094
count	0.299094	1.000000

# Usage versus ozone level: scatter plot and Pearson correlation.
plt.scatter(x=train['hour_bef_ozone'], y=train['count'])
train.loc[:, ['hour_bef_ozone', 'count']].corr(method='pearson')

	hour_bef_ozone	count
hour_bef_ozone	1.000000	0.477614
count	0.477614	1.000000

# Usage versus PM10: scatter plot and Pearson correlation.
plt.scatter(x=train['hour_bef_pm10'], y=train['count'])
train.loc[:, ['hour_bef_pm10', 'count']].corr(method='pearson')

	hour_bef_pm10	count
hour_bef_pm10	1.000000	-0.114288
count	-0.114288	1.000000

# Usage versus PM2.5: scatter plot and Pearson correlation.
plt.scatter(x=train['hour_bef_pm2.5'], y=train['count'])
train.loc[:, ['hour_bef_pm2.5', 'count']].corr(method='pearson')

	hour_bef_pm2.5	count
hour_bef_pm2.5	1.000000	-0.134293
count	-0.134293	1.000000

# Replace the ozone concentration with a class 0-3, following the
# good / moderate / bad / very bad grades published by the weather agency.
# Bins are right-closed: <=0.03 -> 0, (0.03, 0.09] -> 1, (0.09, 0.151] -> 2,
# >0.151 -> 3. Missing values stay NaN.
ozone_edges = [-np.inf, 0.03, 0.09, 0.151, np.inf]
train['hour_bef_ozone'] = pd.cut(train['hour_bef_ozone'], bins=ozone_edges, labels=False)

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

# Replace the PM10 value with a class 0-3, following the
# good / moderate / bad / very bad grades published by the weather agency.
# Bins are right-closed: <=30 -> 0, (30, 80] -> 1, (80, 150] -> 2, >150 -> 3.
# Missing values stay NaN.
pm10_edges = [-np.inf, 30, 80, 150, np.inf]
train['hour_bef_pm10'] = pd.cut(train['hour_bef_pm10'], bins=pm10_edges, labels=False)

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

# Replace the PM2.5 value with a class 0-3, following the
# good / moderate / bad / very bad grades published by the weather agency.
# Bins are right-closed: <=15 -> 0, (15, 35] -> 1, (35, 75] -> 2, >75 -> 3.
# Missing values stay NaN.
pm25_edges = [-np.inf, 15, 35, 75, np.inf]
train['hour_bef_pm2.5'] = pd.cut(train['hour_bef_pm2.5'], bins=pm25_edges, labels=False)

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

# Per-column medians of the training table.
train.median(axis = 0)

id                        1113.0
hour                        11.0
hour_bef_temperature        16.6
hour_bef_precipitation       0.0
hour_bef_windspeed           2.3
hour_bef_humidity           51.0
hour_bef_visibility       1577.0
hour_bef_ozone               1.0
hour_bef_pm10                1.0
hour_bef_pm2.5               1.0
count                       96.0
dtype: float64

# Impute the remaining missing values with each column's median.
# The result is rebound to `train` instead of filling in place: an in-place
# fill on a frame derived from a slice raises SettingWithCopyWarning.
train = train.fillna(train.median())

/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py:6287: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)

# Check the training data: no missing values should remain.
train.isnull().sum()

id                        0
hour                      0
hour_bef_temperature      0
hour_bef_precipitation    0
hour_bef_windspeed        0
hour_bef_humidity         0
hour_bef_visibility       0
hour_bef_ozone            0
hour_bef_pm10             0
hour_bef_pm2.5            0
count                     0
dtype: int64

# Check the training data: preview the first three processed rows.
train.head(3)

	id	hour	hour_bef_temperature	hour_bef_precipitation	hour_bef_windspeed	hour_bef_humidity	hour_bef_visibility	hour_bef_ozone	hour_bef_pm10	hour_bef_pm2.5	count
0	3	20	16.3	1.0	1.5	89.0	576.0	0.0	1.0	1.0	49.0
1	6	13	20.1	0.0	1.4	48.0	916.0	1.0	1.0	2.0	159.0
2	7	6	13.9	0.0	0.7	79.0	1382.0	1.0	1.0	1.0	26.0

# Build the model inputs: drop the `id` column, separate the `count` target,
# and convert both to NumPy arrays.
# The target must be `count`; the previous positional index 0 selected the
# `id` column, so the models were fitted to predict the row id.
# Selecting by name also keeps this correct if the column order changes.
train_x = np.array(train.drop(columns=['id', 'count']))
train_y = np.array(train['count'])

## Modeling

# Import the modeling libraries
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Report the cross-validated RMSE of a model.
def rmse(model):
  """Print the mean RMSE of `model` over a shuffled 10-fold CV.

  The model is evaluated on the module-level `train_x` / `train_y` arrays.
  Each fold's RMSE is the square root of its negated
  'neg_mean_squared_error' score; the mean over folds is printed.
  Nothing is returned.
  """
  folds = KFold(10, shuffle=True, random_state=42)
  neg_mse_per_fold = cross_val_score(
      model, train_x, train_y,
      scoring='neg_mean_squared_error',
      cv=folds,
  )
  rmse_per_fold = np.sqrt(-neg_mse_per_fold)
  print(np.mean(rmse_per_fold))

# Report the RMSE of a model for every hyperparameter combination.
def girds(model, hyperparameters):
  """Grid-search `hyperparameters` for `model` and print one line per candidate.

  Uses a shuffled 10-fold CV on the module-level `train_x` / `train_y`.
  Each printed line is the square root of the negated mean test score
  (i.e. the root of the mean MSE across folds) followed by the parameter dict.
  Nothing is returned.
  """
  folds = KFold(10, shuffle=True, random_state=42)
  search = GridSearchCV(
      model,
      param_grid=hyperparameters,
      scoring='neg_mean_squared_error',
      cv=folds,
  )
  search.fit(train_x, train_y)
  results = search.cv_results_
  for idx, candidate in enumerate(results['params']):
    print(np.sqrt(-results['mean_test_score'][idx]), candidate)

# Write the final submission file.
def submission(prediction):
  """Save `prediction` next to the test ids as 'submission.csv'.

  The output has two columns, 'id' (taken from the module-level `test`
  frame) and 'pred', and is written without the index.
  """
  sub = test[['id']].assign(pred=prediction)
  sub.to_csv('submission.csv', index=False)

# Lasso on robust-scaled features: compare several regularisation strengths.
lasso_params = {'lasso__alpha': [0.0001, 0.001, 0.01, 0.1]}
lasso = make_pipeline(
    RobustScaler(),
    Lasso(random_state=3),
)
girds(lasso, lasso_params)

75776537898072 {'lasso__alpha': 0.0001}
75762990190659 {'lasso__alpha': 0.001}
75632230762074 {'lasso__alpha': 0.01}
74969915001244 {'lasso__alpha': 0.1}

# Gradient boosting with Huber loss; print its cross-validated RMSE.
gboost_settings = dict(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5,
)
model_GBoost = GradientBoostingRegressor(**gboost_settings)
rmse(model_GBoost)

39.93306739247525

# Random forest: compare several forest sizes.
rf_params = {'n_estimators': [3, 10, 30, 60, 90]}
model_rf = RandomForestRegressor()
girds(model_rf, rf_params)

/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)


87376948585613 {'n_estimators': 3}
77442736314237 {'n_estimators': 10}
145458200601915 {'n_estimators': 30}
71933540674437 {'n_estimators': 60}
703951845692636 {'n_estimators': 90}

# LightGBM regressor with default settings; print its cross-validated RMSE.
model_lgb = lgb.LGBMRegressor()
rmse(model_lgb)

38.22539983547282

# XGBoost regressor with default settings; print its cross-validated RMSE.
model_xgb = xgb.XGBRegressor()
rmse(model_xgb)

[07:38:56] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[07:38:56] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[07:38:56] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[07:38:56] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[07:38:57] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[07:38:57] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[07:38:57] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[07:38:57] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[07:38:57] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[07:38:57] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
39.275307376492854

# Fit the three best-performing models on the full training set.
# NOTE(review): the random forest has no random_state, so its fit (and the
# final ensemble) is not reproducible between runs - confirm this is intended.
model_rf = RandomForestRegressor(n_estimators=90)
model_xgb = xgb.XGBRegressor()
model_lgb = lgb.LGBMRegressor()
for fitted_model in (model_rf, model_xgb, model_lgb):
    fitted_model.fit(train_x, train_y)

# Pre-process the test set: bin the three air-quality columns into classes 0-3
# (right-closed bins, NaN kept as NaN), fill missing values with the training
# medians, and build the feature array without the leading id column.
grade_cutoffs = {
    'hour_bef_ozone': [0.03, 0.09, 0.151],
    'hour_bef_pm10': [30, 80, 150],
    'hour_bef_pm2.5': [15, 35, 75],
}
for column, cutoffs in grade_cutoffs.items():
    edges = [-np.inf, *cutoffs, np.inf]
    test_raw[column] = pd.cut(test_raw[column], bins=edges, labels=False)

train_medians = train.median()
test = test_raw.fillna(train_medians)
test_features = test.iloc[:, 1:]
test_x = np.array(test_features)
print(test_x[0, 0])

# Blend the three models' predictions with fixed weights and write the submission.
weighted_models = [
    (model_rf, 0.25),
    (model_lgb, 0.5),
    (model_xgb, 0.25),
]
ensemble = sum(model.predict(test_x) * weight for model, weight in weighted_models)
submission(ensemble)