• 대회설명: 1시간 전의 기상 상황을 바탕으로 1시간 후의 따릉이 대여 수를 예측
• 대회일자: 2019.09.09 ~ 2019.10.05
• 주관: Dacon
• 수상실적: 1위

모듈 및 파일 로드

# Mount Google Drive inside the Colab runtime so the CSV files under
# '/gdrive/My Drive' can be read; force_remount=True is passed so the mount is
# redone even when the cell is re-run.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
# Load the raw training and test tables; they are kept under the *_raw names
# and the cleaned versions are derived from them later.
train_raw = pd.read_csv('/gdrive/My Drive/train.csv', encoding='utf-8') 
test_raw = pd.read_csv('/gdrive/My Drive/test.csv', encoding = 'utf-8')

## EDA 및 Data Preprocessing

id hour hour_bef_temperature hour_bef_precipitation hour_bef_windspeed hour_bef_humidity hour_bef_visibility hour_bef_ozone hour_bef_pm10 hour_bef_pm2.5 count
0 3 20 16.3 1.0 1.5 89.0 576.0 0.027 76.0 33.0 49.0
1 6 13 20.1 0.0 1.4 48.0 916.0 0.042 73.0 40.0 159.0
2 7 6 13.9 0.0 0.7 79.0 1382.0 0.033 32.0 19.0 26.0
3 8 23 8.1 0.0 2.7 54.0 946.0 0.040 75.0 64.0 57.0
4 9 18 29.5 0.0 4.8 7.0 2000.0 0.057 27.0 11.0 431.0
id                          0
hour                        0
hour_bef_temperature        2
hour_bef_precipitation      2
hour_bef_windspeed          9
hour_bef_humidity           2
hour_bef_visibility         2
hour_bef_ozone             76
hour_bef_pm10              90
hour_bef_pm2.5            117
count                       0
dtype: int64
# 결측치 분석
id hour hour_bef_temperature hour_bef_precipitation hour_bef_windspeed hour_bef_humidity hour_bef_visibility hour_bef_ozone hour_bef_pm10 hour_bef_pm2.5 count
934 1420 0 NaN NaN NaN NaN NaN NaN NaN NaN 39.0
1035 1553 18 NaN NaN NaN NaN NaN NaN NaN NaN 1.0
# The missing-value analysis showed that the rows without a temperature reading
# carry no weather information at all, so they are dropped.
# .copy() makes `train` an independent DataFrame: columns of `train` are
# overwritten and filled in place later on, and doing that on a filtered
# selection of `train_raw` raises SettingWithCopyWarning.
train = train_raw.loc[train_raw['hour_bef_temperature'].notnull()].copy()
id                          0
hour                        0
hour_bef_temperature        0
hour_bef_precipitation      0
hour_bef_windspeed          7
hour_bef_humidity           0
hour_bef_visibility         0
hour_bef_ozone             74
hour_bef_pm10              88
hour_bef_pm2.5            115
count                       0
dtype: int64
id hour hour_bef_temperature hour_bef_precipitation hour_bef_windspeed hour_bef_humidity hour_bef_visibility hour_bef_ozone hour_bef_pm10 hour_bef_pm2.5 count
count 1457.000000 1457.000000 1457.000000 1457.000000 1450.000000 1457.000000 1457.000000 1383.000000 1369.000000 1342.000000 1457.000000
mean 1105.391901 11.496911 16.717433 0.031572 2.479034 52.231297 1405.216884 0.039149 57.168736 30.327124 108.684969
std 631.609634 6.918890 5.239150 0.174917 1.378265 20.370387 583.131708 0.019509 31.771019 14.713252 82.620202
min 3.000000 0.000000 3.100000 0.000000 0.000000 7.000000 78.000000 0.003000 9.000000 8.000000 1.000000
25% 555.000000 6.000000 12.800000 0.000000 1.400000 36.000000 879.000000 0.025500 36.000000 20.000000 37.000000
50% 1113.000000 11.000000 16.600000 0.000000 2.300000 51.000000 1577.000000 0.039000 51.000000 26.000000 96.000000
75% 1652.000000 17.000000 20.100000 0.000000 3.400000 69.000000 1994.000000 0.052000 69.000000 37.000000 150.000000
max 2179.000000 23.000000 30.000000 1.000000 8.000000 99.000000 2000.000000 0.125000 269.000000 90.000000 431.000000
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1457 entries, 0 to 1458
Data columns (total 11 columns):
id                        1457 non-null int64
hour                      1457 non-null int64
hour_bef_temperature      1457 non-null float64
hour_bef_precipitation    1457 non-null float64
hour_bef_windspeed        1450 non-null float64
hour_bef_humidity         1457 non-null float64
hour_bef_visibility       1457 non-null float64
hour_bef_ozone            1383 non-null float64
hour_bef_pm10             1369 non-null float64
hour_bef_pm2.5            1342 non-null float64
count                     1457 non-null float64
dtypes: float64(9), int64(2)
memory usage: 136.6 KB
# Visualize the mean rental count for each hour of the day.
# `hour` is kept as the group index (no as_index=False) so that it labels the
# x-axis; as a regular column it would be drawn as a second bar series next to
# the mean count.
train.groupby('hour')['count'].mean().plot.bar()
# Usage versus the temperature one hour earlier: scatter plot, then the
# Pearson correlation matrix of the two columns.
temperature_vs_count = train[['hour_bef_temperature', 'count']]
plt.scatter(temperature_vs_count['hour_bef_temperature'], temperature_vs_count['count'])
temperature_vs_count.corr(method='pearson')
hour_bef_temperature count
hour_bef_temperature 1.000000 0.619404
count 0.619404 1.000000

# 강수 유무에 따른 이용량 분석
0.0    111.130404
1.0     33.673913
Name: count, dtype: float64
# Usage versus wind speed: scatter plot, then the Pearson correlation matrix
# of the two columns.
windspeed_vs_count = train[['hour_bef_windspeed', 'count']]
plt.scatter(windspeed_vs_count['hour_bef_windspeed'], windspeed_vs_count['count'])
windspeed_vs_count.corr(method='pearson')
hour_bef_windspeed count
hour_bef_windspeed 1.000000 0.459906
count 0.459906 1.000000

# Usage versus humidity: scatter plot, then the Pearson correlation matrix of
# the two columns.
humidity_vs_count = train[['hour_bef_humidity', 'count']]
plt.scatter(humidity_vs_count['hour_bef_humidity'], humidity_vs_count['count'])
humidity_vs_count.corr(method='pearson')
hour_bef_humidity count
hour_bef_humidity 1.000000 -0.471142
count -0.471142 1.000000

# Usage versus visibility: scatter plot, then the Pearson correlation matrix
# of the two columns.
visibility_vs_count = train[['hour_bef_visibility', 'count']]
plt.scatter(visibility_vs_count['hour_bef_visibility'], visibility_vs_count['count'])
visibility_vs_count.corr(method='pearson')
hour_bef_visibility count
hour_bef_visibility 1.000000 0.299094
count 0.299094 1.000000

# Usage versus ozone level: scatter plot, then the Pearson correlation matrix
# of the two columns.
ozone_vs_count = train[['hour_bef_ozone', 'count']]
plt.scatter(ozone_vs_count['hour_bef_ozone'], ozone_vs_count['count'])
ozone_vs_count.corr(method='pearson')
hour_bef_ozone count
hour_bef_ozone 1.000000 0.477614
count 0.477614 1.000000

# Usage versus PM10: scatter plot, then the Pearson correlation matrix of the
# two columns.
pm10_vs_count = train[['hour_bef_pm10', 'count']]
plt.scatter(pm10_vs_count['hour_bef_pm10'], pm10_vs_count['count'])
pm10_vs_count.corr(method='pearson')
hour_bef_pm10 count
hour_bef_pm10 1.000000 -0.114288
count -0.114288 1.000000

# Usage versus PM2.5: scatter plot, then the Pearson correlation matrix of the
# two columns.
pm25_vs_count = train[['hour_bef_pm2.5', 'count']]
plt.scatter(pm25_vs_count['hour_bef_pm2.5'], pm25_vs_count['count'])
pm25_vs_count.corr(method='pearson')
hour_bef_pm2.5 count
hour_bef_pm2.5 1.000000 -0.134293
count -0.134293 1.000000

# Replace the raw ozone, PM10 (fine dust) and PM2.5 (ultra-fine dust) readings
# with an ordinal class 0-3, following the good / normal / bad / very bad
# grades announced by the weather agency.
# Each tuple holds the inclusive upper bounds of classes 0, 1 and 2; anything
# above the last bound is class 3.
# NOTE(review): the ozone class-2 bound is 0.151 inclusive - confirm against
# the official grade table.
_train_grade_bounds = {
    'hour_bef_ozone': (0.03, 0.09, 0.151),
    'hour_bef_pm10': (30, 80, 150),
    'hour_bef_pm2.5': (15, 35, 75),
}
for _column, (_good_max, _normal_max, _bad_max) in _train_grade_bounds.items():
    # Every comparison is False for NaN, so a missing reading falls through
    # to the final `else` and stays NaN (it is filled with the median later).
    train[_column] = train[_column].apply(
        lambda x, a=_good_max, b=_normal_max, c=_bad_max:
            0 if x <= a else
            1 if x <= b else
            2 if x <= c else
            3 if x > c else x
    )
# Display the per-column medians of the training frame (axis = 0 reduces over
# rows); these are the values used to fill the remaining gaps.
train.median(axis = 0)
id                        1113.0
hour                        11.0
hour_bef_temperature        16.6
hour_bef_precipitation       0.0
hour_bef_windspeed           2.3
hour_bef_humidity           51.0
hour_bef_visibility       1577.0
hour_bef_ozone               1.0
hour_bef_pm10                1.0
hour_bef_pm2.5               1.0
count                       96.0
dtype: float64
# Fill every remaining missing value with the median of its own column,
# modifying `train` in place.
column_medians = train.median()
train.fillna(column_medians, inplace=True)
# train data 확인
id                        0
hour                      0
hour_bef_temperature      0
hour_bef_precipitation    0
hour_bef_windspeed        0
hour_bef_humidity         0
hour_bef_visibility       0
hour_bef_ozone            0
hour_bef_pm10             0
hour_bef_pm2.5            0
count                     0
dtype: int64
# train data 확인
id hour hour_bef_temperature hour_bef_precipitation hour_bef_windspeed hour_bef_humidity hour_bef_visibility hour_bef_ozone hour_bef_pm10 hour_bef_pm2.5 count
0 3 20 16.3 1.0 1.5 89.0 576.0 0.0 1.0 1.0 49.0
1 6 13 20.1 0.0 1.4 48.0 916.0 1.0 1.0 2.0 159.0
2 7 6 13.9 0.0 0.7 79.0 1382.0 1.0 1.0 1.0 26.0
# Split the training frame into model inputs and target, as NumPy arrays.
# Features: every column except the `id` identifier and the `count` target.
train_x = train.drop(columns=['id', 'count']).to_numpy()
# Target: the rental `count`. It is selected by name rather than by position
# because column 0 is `id`, and fitting on it would train every model to
# predict the row identifier instead of the rental count.
train_y = train['count'].to_numpy()


# Import the modeling, preprocessing and evaluation libraries
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
# Cross-validated RMSE of a single model
def rmse(model):
  """Return the per-fold RMSE of `model` under 10-fold cross-validation.

  The model is evaluated on the module-level `train_x` / `train_y` arrays
  with a shuffled KFold (random_state=42, so the fold split is repeatable).

  Args:
    model: an unfitted scikit-learn compatible regressor or pipeline.

  Returns:
    A NumPy array with one RMSE value per fold; take `.mean()` for a single
    summary score.
  """
  kf = KFold(10, shuffle = True, random_state = 42)
  # The scorer reports the *negative* MSE, so negate before the square root.
  neg_mse = cross_val_score(model, train_x, train_y, scoring = 'neg_mean_squared_error', cv = kf)
  return np.sqrt(-neg_mse)
# Print the cross-validated RMSE of a model for every hyperparameter combination
def girds(model, hyperparameters):
  """Grid-search `hyperparameters` for `model` and print one RMSE per candidate.

  The search runs on the module-level `train_x` / `train_y` arrays with a
  shuffled 10-fold split (random_state=42). Nothing is returned; each line
  printed is "<rmse> <params dict>".

  Args:
    model: an unfitted scikit-learn compatible estimator or pipeline.
    hyperparameters: the parameter grid handed to GridSearchCV as `param_grid`.
  """
  folds = KFold(10, shuffle=True, random_state=42)
  search = GridSearchCV(
      model,
      param_grid=hyperparameters,
      scoring='neg_mean_squared_error',
      cv=folds,
  )
  search.fit(train_x, train_y)
  results = search.cv_results_
  # The mean test score is a negative MSE; flip the sign before the root.
  for candidate, neg_mse in zip(results['params'], results['mean_test_score']):
    print(np.sqrt(-neg_mse), candidate)
# Build the final submission file
def submission(prediction):
  """Write `prediction` alongside the test ids to 'submission.csv'.

  The `id` column is read from the module-level `test` frame; the output has
  the two columns `id` and `pred` and no index column.

  Args:
    prediction: the predicted values, one per row of `test`.
  """
  # Start from a copy of the id column so that `test` itself is not modified.
  sub = test[['id']].copy()
  sub['pred'] = prediction
  sub.to_csv('submission.csv', index=False)
# Lasso regression behind a RobustScaler, combined into one pipeline.
lasso = make_pipeline(RobustScaler(), Lasso(random_state=3))
# Candidate regularization strengths; the 'lasso__' prefix routes `alpha` to
# the Lasso step of the pipeline.
lasso_params = {'lasso__alpha' : [0.0001, 0.001, 0.01, 0.1]}
# Print the cross-validated RMSE for each alpha.
girds(lasso, lasso_params)
52.75776537898072 {'lasso__alpha': 0.0001}
52.75762990190659 {'lasso__alpha': 0.001}
52.75632230762074 {'lasso__alpha': 0.01}
52.74969915001244 {'lasso__alpha': 0.1}
# Gradient boosting regressor with Huber loss. It is only configured here; it
# is not fitted or evaluated in the code visible in this section.
model_GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
# Random forest: compare the cross-validated RMSE for several forest sizes.
# NOTE(review): no random_state is set, so the scores vary between runs.
model_rf = RandomForestRegressor()
rf_params = {'n_estimators': [3, 10, 30, 60, 90]}
girds(model_rf, rf_params)
43.87376948585613 {'n_estimators': 3}
39.77442736314237 {'n_estimators': 10}
39.145458200601915 {'n_estimators': 30}
38.71933540674437 {'n_estimators': 60}
38.703951845692636 {'n_estimators': 90}
# Default-configured LightGBM and XGBoost regressors.
model_lgb = lgb.LGBMRegressor()
model_xgb = xgb.XGBRegressor()
# Train the three best-performing models on the full training set.
# fit() hands back the estimator itself, so construction and training are
# chained; the fitting order (random forest, XGBoost, LightGBM) is unchanged.
model_rf = RandomForestRegressor(n_estimators=90).fit(train_x, train_y)
model_xgb = xgb.XGBRegressor().fit(train_x, train_y)
model_lgb = lgb.LGBMRegressor().fit(train_x, train_y)
# Preprocess the test set the same way as the training set.
# Step 1: turn the raw ozone / PM10 / PM2.5 readings into classes 0-3. Each
# tuple holds the inclusive upper bounds of classes 0, 1 and 2; anything above
# the last bound is class 3. The columns of `test_raw` are overwritten.
_test_grade_bounds = {
    'hour_bef_ozone': (0.03, 0.09, 0.151),
    'hour_bef_pm10': (30, 80, 150),
    'hour_bef_pm2.5': (15, 35, 75),
}
for _column, (_good_max, _normal_max, _bad_max) in _test_grade_bounds.items():
    # Every comparison is False for NaN, so a missing reading falls through
    # to the final `else` and stays NaN until the fill below.
    test_raw[_column] = test_raw[_column].apply(
        lambda x, a=_good_max, b=_normal_max, c=_bad_max:
            0 if x <= a else
            1 if x <= b else
            2 if x <= c else
            3 if x > c else x
    )
# Step 2: fill the gaps with the medians of the *training* frame.
training_medians = train.median()
test = test_raw.fillna(training_medians)
# Step 3: drop the leading `id` column and convert to a NumPy array.
test_x = np.array(test.iloc[:, 1:])
# Blend the three models' test predictions with fixed weights:
# random forest 0.25, LightGBM 0.5, XGBoost 0.25.
rf_part = model_rf.predict(test_x) * 0.25
lgb_part = model_lgb.predict(test_x) * 0.5
xgb_part = model_xgb.predict(test_x) * 0.25
ensemble = rf_part + lgb_part + xgb_part