Upload for_py.py
Browse files
for_py.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import seaborn as sns
|
| 3 |
+
import numpy as np
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import matplotlib
|
| 6 |
+
import sklearn
|
| 7 |
+
import xgboost
|
| 8 |
+
from xgboost import XGBRegressor
|
| 9 |
+
from sklearn.model_selection import KFold
|
| 10 |
+
import math
|
| 11 |
+
import time
|
| 12 |
+
import random as rn
|
| 13 |
+
from datetime import datetime
import warnings

# Seed the global NumPy and standard-library random generators.
# NOTE: RANDOM_SEED is assigned a different value again later in this script,
# so the value set here only governs the two seeding calls below.
RANDOM_SEED = 2025
np.random.seed(RANDOM_SEED)
rn.seed(RANDOM_SEED)

# Suppress every warning and let pandas display all columns of wide frames.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
|
| 21 |
+
def smape(gt, preds):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``2 * |pred - gt| / (|pred| + |gt|)``; the mean
    of those terms is multiplied by 100.

    Args:
        gt: Ground-truth values (scalar or array-like).
        preds: Predicted values, same shape as ``gt``.

    Returns:
        The SMAPE score as a float in the range [0, 200].
    """
    gt = np.atleast_1d(np.asarray(gt, dtype=float))
    preds = np.atleast_1d(np.asarray(preds, dtype=float))

    numerator = 2.0 * np.abs(preds - gt)
    denominator = np.abs(preds) + np.abs(gt)

    # When truth and prediction are both zero the term is 0/0; the prediction
    # is exact, so count it as zero error instead of letting NaN poison the mean.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(terms) * 100
|
| 27 |
+
|
| 28 |
+
def weighted_mse(alpha = 1):
    """Build an asymmetric squared-error objective returning (grad, hess).

    The returned callable takes ``(label, pred)`` arrays. Elements where
    ``label - pred`` is positive (the prediction is below the label) have
    their gradient and hessian scaled by ``alpha``; all other elements use
    the plain squared-error derivatives.
    """
    def weighted_mse_fixed(label, pred):
        error = (label - pred).astype("float")
        # Per-element weight: alpha where the model under-predicts, 1 elsewhere.
        weight = np.where(error > 0, alpha, 1)
        gradient = -2 * weight * error
        hessian = 2.0 * weight
        return gradient, hessian
    return weighted_mse_fixed
|
| 35 |
+
|
| 36 |
+
def custom_smape(preds, dtrain):
    """Evaluation metric: SMAPE (in percent) between ``preds`` and the labels.

    Args:
        preds: Array of predictions.
        dtrain: Object exposing ``get_label()`` that returns the true values.

    Returns:
        A ``(metric_name, value)`` tuple.
    """
    truth = dtrain.get_label()
    ratio = 2 * np.abs(preds - truth) / (np.abs(preds) + np.abs(truth))
    return 'custom_smape', np.mean(ratio) * 100
|
| 39 |
+
train = pd.read_csv('train.csv')
|
| 40 |
+
test = pd.read_csv('test.csv')
|
| 41 |
+
building_info = pd.read_csv('building_info.csv')
|
| 42 |
+
train = train.rename(columns={
|
| 43 |
+
'๊ฑด๋ฌผ๋ฒํธ': 'building_number',
|
| 44 |
+
'์ผ์': 'date_time',
|
| 45 |
+
'๊ธฐ์จ(ยฐC)': 'temperature',
|
| 46 |
+
'๊ฐ์๋(mm)': 'rainfall',
|
| 47 |
+
'ํ์(m/s)': 'windspeed',
|
| 48 |
+
'์ต๋(%)': 'humidity',
|
| 49 |
+
'์ผ์กฐ(hr)': 'sunshine',
|
| 50 |
+
'์ผ์ฌ(MJ/m2)': 'solar_radiation',
|
| 51 |
+
'์ ๋ ฅ์๋น๋(kWh)': 'power_consumption'
|
| 52 |
+
})
|
| 53 |
+
train.drop('num_date_time', axis = 1, inplace=True)
|
| 54 |
+
|
| 55 |
+
test = test.rename(columns={
|
| 56 |
+
'๊ฑด๋ฌผ๋ฒํธ': 'building_number',
|
| 57 |
+
'์ผ์': 'date_time',
|
| 58 |
+
'๊ธฐ์จ(ยฐC)': 'temperature',
|
| 59 |
+
'๊ฐ์๋(mm)': 'rainfall',
|
| 60 |
+
'ํ์(m/s)': 'windspeed',
|
| 61 |
+
'์ต๋(%)': 'humidity',
|
| 62 |
+
'์ผ์กฐ(hr)': 'sunshine',
|
| 63 |
+
'์ผ์ฌ(MJ/m2)': 'solar_radiation',
|
| 64 |
+
'์ ๋ ฅ์๋น๋(kWh)': 'power_consumption'
|
| 65 |
+
})
|
| 66 |
+
test.drop('num_date_time', axis = 1, inplace=True)
|
| 67 |
+
|
| 68 |
+
building_info = building_info.rename(columns={
|
| 69 |
+
'๊ฑด๋ฌผ๋ฒํธ': 'building_number',
|
| 70 |
+
'๊ฑด๋ฌผ์ ํ': 'building_type',
|
| 71 |
+
'์ฐ๋ฉด์ (m2)': 'total_area',
|
| 72 |
+
'๋๋ฐฉ๋ฉด์ (m2)': 'cooling_area',
|
| 73 |
+
'ํ์๊ด์ฉ๋(kW)': 'solar_power_capacity',
|
| 74 |
+
'ESS์ ์ฅ์ฉ๋(kWh)': 'ess_capacity',
|
| 75 |
+
'PCS์ฉ๋(kW)': 'pcs_capacity'
|
| 76 |
+
})
|
| 77 |
+
|
| 78 |
+
translation_dict = {
|
| 79 |
+
'๊ฑด๋ฌผ๊ธฐํ': 'Other Buildings',
|
| 80 |
+
'๊ณต๊ณต': 'Public',
|
| 81 |
+
'ํ๊ต': 'University',
|
| 82 |
+
'๋ฐฑํ์ ': 'Department Store',
|
| 83 |
+
'๋ณ์': 'Hospital',
|
| 84 |
+
'์์ฉ': 'Commercial',
|
| 85 |
+
'์ํํธ': 'Apartment',
|
| 86 |
+
'์ฐ๊ตฌ์': 'Research Institute',
|
| 87 |
+
'IDC(์ ํ๊ตญ)': 'IDC',
|
| 88 |
+
'ํธํ
': 'Hotel'
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
building_info['building_type'] = building_info['building_type'].replace(translation_dict)
|
| 92 |
+
|
| 93 |
+
building_info['solar_power_utility'] = np.where(building_info.solar_power_capacity !='-',1,0)
|
| 94 |
+
building_info['ess_utility'] = np.where(building_info.ess_capacity !='-',1,0)
|
| 95 |
+
|
| 96 |
+
train = pd.merge(train, building_info, on='building_number', how='left')
|
| 97 |
+
test = pd.merge(test, building_info, on='building_number', how='left')
|
| 98 |
+
# Parse the 'YYYYMMDD HH' timestamps and derive calendar features.
# Both frames receive exactly the same columns, in the same order.
for frame in (train, test):
    frame['date_time'] = pd.to_datetime(frame['date_time'], format='%Y%m%d %H')

    stamp = frame['date_time'].dt
    frame['hour'] = stamp.hour
    frame['day'] = stamp.day
    frame['month'] = stamp.month
    frame['day_of_week'] = stamp.dayofweek  # day of the week, Monday == 0
|
| 114 |
+
def calculate_day_values(dataframe, target_column, output_column, aggregation_func):
    """Add a per-building, per-calendar-day aggregate of a column, in place.

    Rows are grouped by ``(building_number, month, day)``; every row receives
    the aggregate of ``target_column`` over its group in ``output_column``.

    Args:
        dataframe: Frame with ``building_number``, ``month``, ``day`` and
            ``target_column`` columns. It is modified in place.
        target_column: Name of the column to aggregate.
        output_column: Name of the column to create or overwrite.
        aggregation_func: Aggregation accepted by pandas (e.g. ``'max'``,
            ``'mean'``, ``'min'`` or a callable reducing a Series to a scalar).

    Returns:
        None.
    """
    day_keys = ['building_number', 'month', 'day']
    # transform() broadcasts each group's aggregate back onto its rows, which
    # replaces the previous nested-dict lookup driven by a slow iterrows() loop.
    dataframe[output_column] = (
        dataframe.groupby(day_keys)[target_column].transform(aggregation_func)
    )
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# Placeholder columns on the training frame; both are overwritten below.
train['day_max_temperature'] = 0.0
train['day_mean_temperature'] = 0.0

# (output column, aggregation) pairs computed per building and calendar day.
daily_temperature_stats = (
    ('day_max_temperature', 'max'),
    ('day_mean_temperature', 'mean'),
    ('day_min_temperature', 'min'),
)

for frame in (train, test):
    for stat_column, stat_name in daily_temperature_stats:
        calculate_day_values(frame, 'temperature', stat_column, stat_name)

    # Daily spread between the warmest and the coldest hour.
    frame['day_temperature_range'] = (
        frame['day_max_temperature'] - frame['day_min_temperature']
    )
|
| 142 |
+
# 1) Collect the rows whose recorded power consumption is exactly zero.
zero_power = train['power_consumption'] == 0
outlier_idx = train.index[zero_power].tolist()
# The log messages were mis-encoded and are restored to readable Korean.
# NOTE(review): one syllable of the first message was unrecoverable in the
# original text; "제거할" ("to be removed") is a reconstruction - confirm.
print("제거할 행 개수:", len(outlier_idx))
print("인덱스 예시:", outlier_idx[:10])

# 2) Drop those rows.
train.drop(index=outlier_idx, inplace=True)

# 3) Report how many rows remain after the drop.
print("남은 행 개수:", train.shape[0])
|
| 151 |
+
# Dates flagged as holidays in addition to weekends.
holi_weekday = ['2024-06-06', '2024-08-15']

# holiday = 1 when day_of_week >= 5 or the date is in the list above.
for frame in (train, test):
    is_weekend = frame.day_of_week >= 5
    is_listed_holiday = frame.date_time.dt.strftime('%Y-%m-%d').isin(holi_weekday)
    frame['holiday'] = np.where(is_weekend | is_listed_holiday, 1, 0)
|
| 155 |
+
#์๊ฐ
|
| 156 |
+
# Sine/cosine encodings of the calendar fields, built identically for both frames.
for frame in (train, test):
    # Hour of day.
    # NOTE(review): the divisor is 23, so hours 0 and 23 map to the same point;
    # kept as-is because changing it would alter the model's features.
    hour_angle = 2 * np.pi * frame['hour']/23.0
    frame['sin_hour'] = np.sin(hour_angle)
    frame['cos_hour'] = np.cos(hour_angle)

    # Position within the year: month plus the fraction of the month elapsed.
    date_angle = 2 * np.pi * (frame['month']+frame['day']/31)/12
    frame['sin_date'] = -np.sin(date_angle)
    frame['cos_date'] = -np.cos(date_angle)

    # Month.
    month_angle = 2 * np.pi * frame['month']/12.0
    frame['sin_month'] = -np.sin(month_angle)
    frame['cos_month'] = -np.cos(month_angle)

    # Day of the week (shifted to 1..7 before encoding).
    weekday_angle = 2 * np.pi * (frame['day_of_week']+1)/7.0
    frame['sin_dayofweek'] = -np.sin(weekday_angle)
    frame['cos_dayofweek'] = -np.cos(weekday_angle)
|
| 178 |
+
def CDH(xs, base_temperature=26, window=11):
    """Return the trailing-window sum of temperature excess over a base.

    For every position the result is the sum of ``xs - base_temperature``
    over the most recent ``window`` samples (fewer at the start of the
    series, where the window is not yet full).

    Args:
        xs: One-dimensional sequence of temperatures (array, list, Series).
        base_temperature: Value subtracted from every sample; defaults to 26.
        window: Number of trailing samples to sum; defaults to 11.

    Returns:
        A NumPy array with the same length as ``xs``.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    # Coerce to an ndarray so slicing is positional even for pandas Series
    # (label alignment would otherwise corrupt the subtraction below).
    excess = np.asarray(xs) - base_temperature
    running = np.cumsum(excess)
    # A difference of cumulative sums yields the rolling sum once the window is full.
    return np.concatenate((running[:window], running[window:] - running[:-window]))
|
| 181 |
+
|
| 182 |
+
def calculate_and_add_cdh(dataframe):
    """Compute the CDH feature separately for every building.

    Args:
        dataframe: Frame with ``building_number`` and ``temperature`` columns.

    Returns:
        A NumPy array of CDH values aligned with the rows of ``dataframe``.
    """
    # Group by whatever building numbers are present instead of assuming the
    # ids 1..100, and let transform() keep each value on its own row so the
    # result stays aligned even when rows are not sorted by building.
    per_building = dataframe.groupby('building_number', sort=False)['temperature']
    cdh = per_building.transform(lambda temps: CDH(temps.to_numpy()))
    return cdh.to_numpy()
|
| 189 |
+
|
| 190 |
+
# Weather-derived comfort features, built identically for both frames.
for frame in (train, test):
    frame['CDH'] = calculate_and_add_cdh(frame)

    temperature = frame['temperature']
    humidity = frame['humidity']
    wind_factor = frame['windspeed']**0.16

    # Temperature-humidity index.
    # NOTE(review): the last factor uses humidity where the usual THI formula
    # uses temperature; kept unchanged to preserve the existing features - confirm.
    frame['THI'] = 9/5*temperature - 0.55*(1-humidity/100)*(9/5*humidity-26)+32

    # Wind-chill temperature.
    frame['WCT'] = (13.12 + 0.6125*temperature - 11.37*wind_factor
                    + 0.3965*wind_factor*temperature)
|
| 198 |
+
def _power_stat(frame, keys, aggfunc, column_name):
    """Aggregate power_consumption over ``keys`` and name the result column."""
    table = pd.pivot_table(frame, values='power_consumption', index=keys, aggfunc=aggfunc).reset_index()
    table.columns = keys + [column_name]
    return table


# Grouping keys: (building, hour, weekday) and (building, hour).
_by_weekday_hour = ['building_number', 'hour', 'day_of_week']
_by_hour = ['building_number', 'hour']

# All four tables are computed from the training rows before any merge.
# Note that these statistics use every training row, including rows that
# later fall into validation folds.
power_mean = _power_stat(train, _by_weekday_hour, np.mean, 'day_hour_mean')
power_std = _power_stat(train, _by_weekday_hour, np.std, 'day_hour_std')
power_hour_mean = _power_stat(train, _by_hour, np.mean, 'hour_mean')
power_hour_std = _power_stat(train, _by_hour, np.std, 'hour_std')

# Attach each statistic to both frames; every column except the last is a join key.
for stat_table in (power_mean, power_std, power_hour_mean, power_hour_std):
    join_keys = list(stat_table.columns[:-1])
    train = train.merge(stat_table, on=join_keys, how='left')
    test = test.merge(stat_table, on=join_keys, how='left')

train = train.reset_index(drop=True)
|
| 228 |
+
# Columns removed from both feature matrices.
raw_capacity_cols = ['solar_power_capacity', 'ess_capacity', 'pcs_capacity']
raw_calendar_cols = ['hour', 'day', 'month', 'day_of_week', 'date_time']

# Training features: additionally drop the target and three weather columns.
X = train.drop(
    columns=raw_capacity_cols
    + ['power_consumption', 'rainfall', 'sunshine', 'solar_radiation']
    + raw_calendar_cols
)

# Target, kept together with building_type so it can be filtered per type.
Y = train[['building_type', 'power_consumption']]

# Test features: only rainfall is dropped from the weather columns here.
test_X = test.drop(columns=raw_capacity_cols + ['rainfall'] + raw_calendar_cols)
|
| 236 |
+
# Seed and fold count used by the cross-validation splitter and the models.
RANDOM_SEED = 42
KFOLD_SPLITS = 2

# Building types in order of first appearance. (A row-by-row Python loop that
# built the same list was removed: its result was overwritten by this line.)
type_list = X["building_type"].unique()

# Frames that collect the results: test predictions and out-of-fold predictions.
answer_df = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df = pd.DataFrame(index=X.index, columns=["pred"], dtype=float)

kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_SEED)
|
| 253 |
+
|
| 254 |
+
# Hyper-parameters shared by every per-type, per-fold model.
# (GPU training via tree_method="gpu_hist" / gpu_id=0 is left disabled.)
xgb_params = dict(
    learning_rate=0.0013292918943162175,
    n_estimators=4759,
    max_depth=23,
    subsample=0.7993292420985183,
    colsample_bytree=0.5780093202212182,
    min_child_weight=2,
    gamma=0.2904180608409973,
    reg_alpha=0.6245760287469893,
    reg_lambda=0.002570603566117598,
    random_state=RANDOM_SEED,
    n_jobs=-1,
    early_stopping_rounds=100,
)

# One set of K-fold models is trained per building type.
for btype in type_list:
    start_proj = time.time()

    # Rows belonging to this building type.
    x = X[X['building_type'] == btype].copy()
    y = Y.loc[Y['building_type'] == btype, 'power_consumption'].copy()
    xt = test_X[test_X['building_type'] == btype].copy()

    # One-hot encode the building id, then force the test frame onto the
    # training columns (missing columns are filled with 0).
    x = pd.get_dummies(x, columns=["building_number"], drop_first=False)
    xt = pd.get_dummies(xt, columns=["building_number"], drop_first=False)
    xt = xt.reindex(columns=x.columns, fill_value=0)

    x = x.drop(columns=["building_type"])
    xt = xt.drop(columns=["building_type"])

    preds_valid = pd.Series(index=y.index, dtype=float)
    preds_test = []
    fold_scores = []

    x_values = x.values
    y_values = y.values

    for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values), 1):
        X_tr, X_va = x_values[tr_idx], x_values[va_idx]
        y_tr, y_va = y_values[tr_idx], y_values[va_idx]

        # The models are fitted on the natural log of the target.
        y_tr_log = np.log(y_tr)
        y_va_log = np.log(y_va)

        model = XGBRegressor(objective=weighted_mse(3), **xgb_params)

        # The evaluation metric is computed on the log-scale validation targets.
        # NOTE(review): passing eval_metric to fit() is not accepted by every
        # xgboost release - confirm against the installed version.
        model.fit(
            X_tr, y_tr_log,
            eval_set=[(X_va, y_va_log)],
            eval_metric=custom_smape,
            verbose=False,
        )

        # Out-of-fold predictions, mapped back to the original scale.
        va_pred = np.exp(model.predict(X_va))
        preds_valid.iloc[va_idx] = va_pred
        fold_scores.append(smape(y_va, va_pred))

        # Test predictions from this fold's model.
        preds_test.append(np.exp(model.predict(xt.values)))
        # (A feature-importance bar chart for the last fold was disabled here.)

    # Store the validation predictions.
    pred_df.loc[preds_valid.index, "pred"] = preds_valid

    # Store the test predictions, averaged over the folds.
    answer_df.loc[xt.index, "answer"] = np.mean(preds_test, axis=0)
    end_proj = time.time()
    print(f"Building type = {btype} : XGB SMAPE = {np.mean(fold_scores):.4f} : {end_proj-start_proj:5f} sec")
|
| 331 |
+
|
| 332 |
+
# Overall SMAPE over all out-of-fold predictions, with both sides ordered by index.
actual_power = Y.sort_index()["power_consumption"].values
oof_power = pred_df.sort_index()["pred"].values
total_smape = smape(actual_power, oof_power)
print(f"Total SMAPE = {total_smape:.4f}")
|
| 337 |
+
# Fill the sample submission with the averaged test predictions and save it.
submit = pd.read_csv("sample_submission.csv")
submit["answer"] = answer_df.loc[submit.index, "answer"]

# The output filename and the arrow in the log message were mis-encoded;
# both are restored, and the name is kept in one place so they cannot drift.
output_path = "private_재현_3.csv"
submit.to_csv(output_path, index=False)
print(f"Saved → {output_path}")
|