jwheo committed on
Commit
7ebbdcc
·
verified ·
1 Parent(s): f127d78

Upload for_py.py

Browse files
Files changed (1) hide show
  1. for_py.py +340 -0
for_py.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import seaborn as sns
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import matplotlib
6
+ import sklearn
7
+ import xgboost
8
+ from xgboost import XGBRegressor
9
+ from sklearn.model_selection import KFold
10
+ import math
11
+ import time
12
+ import random as rn
13
+ RANDOM_SEED = 2025
14
+ np.random.seed(RANDOM_SEED)
15
+ rn.seed(RANDOM_SEED)
16
+ from datetime import datetime
17
+ import warnings
18
+
19
+ warnings.filterwarnings('ignore')
20
+ pd.set_option('display.max_columns', None)
21
def smape(gt, preds):
    """Return the symmetric mean absolute percentage error, in percent.

    Args:
        gt: Ground-truth values (any array-like).
        preds: Predicted values, broadcastable against ``gt``.

    Returns:
        The mean of ``2 * |preds - gt| / (|preds| + |gt|)`` times 100.
        Elements where prediction and truth are both zero contribute 0
        instead of the NaN that a 0/0 division would produce.
    """
    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)
    numerator = 2 * np.abs(preds - gt)
    denominator = np.abs(preds) + np.abs(gt)
    # Divide only where the denominator is non-zero; the rest keeps the 0
    # pre-filled in `out`, so a perfect zero prediction is not scored as NaN.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100
27
+
28
def weighted_mse(alpha=1):
    """Build an asymmetric squared-error objective.

    Args:
        alpha: Multiplier applied to the gradient and hessian of samples
            whose residual ``label - pred`` is positive.

    Returns:
        A function ``(label, pred) -> (grad, hess)`` returning the per-sample
        gradient and hessian arrays.
    """
    def weighted_mse_fixed(label, pred):
        diff = (label - pred).astype("float")
        under_predicted = diff > 0
        # Positive residuals (prediction below the label) are scaled by alpha;
        # the remaining samples use the plain squared-error derivatives.
        gradient = np.where(under_predicted, -2 * alpha * diff, -2 * diff)
        hessian = np.where(under_predicted, 2 * alpha, 2.0)
        return gradient, hessian

    return weighted_mse_fixed
35
+
36
def custom_smape(preds, dtrain):
    """Evaluation metric: SMAPE (in percent) between predictions and labels.

    Args:
        preds: Array of predicted values.
        dtrain: Object exposing ``get_label()`` that returns the true values.

    Returns:
        A ``(metric_name, value)`` tuple; the name is ``'custom_smape'``.
    """
    labels = dtrain.get_label()
    abs_error = abs(preds - labels)
    scale = abs(preds) + abs(labels)
    return 'custom_smape', np.mean(2 * abs_error / scale) * 100
39
# Korean column headers of the raw weather/consumption CSVs mapped to the
# English identifiers used by the rest of the script. The keys must match the
# CSV headers exactly, otherwise `rename` silently leaves a column untouched.
WEATHER_COLUMN_NAMES = {
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(°C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption',
}

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
building_info = pd.read_csv('building_info.csv')

# Both frames share one mapping; keys absent from a frame are ignored by rename.
train = train.rename(columns=WEATHER_COLUMN_NAMES)
test = test.rename(columns=WEATHER_COLUMN_NAMES)

# Drop the 'num_date_time' column from both frames.
train.drop('num_date_time', axis=1, inplace=True)
test.drop('num_date_time', axis=1, inplace=True)
67
+
68
+ building_info = building_info.rename(columns={
69
+ '๊ฑด๋ฌผ๋ฒˆํ˜ธ': 'building_number',
70
+ '๊ฑด๋ฌผ์œ ํ˜•': 'building_type',
71
+ '์—ฐ๋ฉด์ (m2)': 'total_area',
72
+ '๋ƒ‰๋ฐฉ๋ฉด์ (m2)': 'cooling_area',
73
+ 'ํƒœ์–‘๊ด‘์šฉ๋Ÿ‰(kW)': 'solar_power_capacity',
74
+ 'ESS์ €์žฅ์šฉ๋Ÿ‰(kWh)': 'ess_capacity',
75
+ 'PCS์šฉ๋Ÿ‰(kW)': 'pcs_capacity'
76
+ })
77
+
78
+ translation_dict = {
79
+ '๊ฑด๋ฌผ๊ธฐํƒ€': 'Other Buildings',
80
+ '๊ณต๊ณต': 'Public',
81
+ 'ํ•™๊ต': 'University',
82
+ '๋ฐฑํ™”์ ': 'Department Store',
83
+ '๋ณ‘์›': 'Hospital',
84
+ '์ƒ์šฉ': 'Commercial',
85
+ '์•„ํŒŒํŠธ': 'Apartment',
86
+ '์—ฐ๊ตฌ์†Œ': 'Research Institute',
87
+ 'IDC(์ „ํ™”๊ตญ)': 'IDC',
88
+ 'ํ˜ธํ…”': 'Hotel'
89
+ }
90
+
91
+ building_info['building_type'] = building_info['building_type'].replace(translation_dict)
92
+
93
+ building_info['solar_power_utility'] = np.where(building_info.solar_power_capacity !='-',1,0)
94
+ building_info['ess_utility'] = np.where(building_info.ess_capacity !='-',1,0)
95
+
96
+ train = pd.merge(train, building_info, on='building_number', how='left')
97
+ test = pd.merge(test, building_info, on='building_number', how='left')
98
# Parse the timestamp column and derive calendar features on both frames.
for frame in (train, test):
    frame['date_time'] = pd.to_datetime(frame['date_time'], format='%Y%m%d %H')

    stamps = frame['date_time'].dt
    frame['hour'] = stamps.hour
    frame['day'] = stamps.day
    frame['month'] = stamps.month
    frame['day_of_week'] = stamps.dayofweek  # day of the week
114
def calculate_day_values(dataframe, target_column, output_column, aggregation_func):
    """Add a per-building, per-day aggregate of a column to ``dataframe``.

    Rows are grouped by ``building_number``, ``month`` and ``day``; every row
    receives the aggregate of ``target_column`` over its own group.

    Args:
        dataframe: Frame containing ``building_number``, ``month``, ``day``
            and ``target_column``. It is modified in place.
        target_column: Name of the column to aggregate.
        output_column: Name of the column that receives the result.
        aggregation_func: Aggregation accepted by pandas groupby
            (e.g. ``'max'``, ``'mean'``, ``'min'``).

    Returns:
        None.
    """
    day_keys = ['building_number', 'month', 'day']
    # `transform` broadcasts each group's aggregate back onto its rows, which
    # replaces the former row-by-row dictionary lookup over `iterrows`.
    dataframe[output_column] = (
        dataframe.groupby(day_keys)[target_column].transform(aggregation_func)
    )
126
+
127
+
128
# Daily max / mean / min temperature per building, plus the daily range,
# computed the same way for the train and test frames.
for frame in (train, test):
    for stat_column, stat_func in (
        ('day_max_temperature', 'max'),
        ('day_mean_temperature', 'mean'),
        ('day_min_temperature', 'min'),
    ):
        calculate_day_values(frame, 'temperature', stat_column, stat_func)

    frame['day_temperature_range'] = (
        frame['day_max_temperature'] - frame['day_min_temperature']
    )
142
# 1) Collect the index labels of rows whose recorded consumption is exactly 0.
outlier_idx = train.index[train['power_consumption'] == 0].tolist()
print("제거될 행 개수:", len(outlier_idx))
print("인덱스 예시:", outlier_idx[:10])

# 2) Drop those rows from the training frame in place.
train.drop(index=outlier_idx, inplace=True)

# 3) Report how many rows remain after the drop.
print("남은 행 개수:", train.shape[0])
151
# Dates treated as holidays in addition to weekends.
holi_weekday = ['2024-06-06', '2024-08-15']

# holiday = 1 when day_of_week >= 5 or the date is listed above, else 0.
for frame in (train, test):
    is_weekend = frame.day_of_week >= 5
    is_listed_holiday = frame.date_time.dt.strftime('%Y-%m-%d').isin(holi_weekday)
    frame['holiday'] = np.where(is_weekend | is_listed_holiday, 1, 0)
155
+ #์‹œ๊ฐ„
156
# Sine/cosine encodings of the calendar features, applied to both frames.
# NOTE(review): the hour angle is scaled by 23 rather than 24, so hours 0 and
# 23 end up with the same encoding; kept as-is here, confirm it is intended.
for frame in (train, test):
    # hour of day
    hour_angle = 2 * np.pi * frame['hour'] / 23.0
    frame['sin_hour'] = np.sin(hour_angle)
    frame['cos_hour'] = np.cos(hour_angle)

    # date (month plus fractional day)
    date_angle = 2 * np.pi * (frame['month'] + frame['day'] / 31) / 12
    frame['sin_date'] = -np.sin(date_angle)
    frame['cos_date'] = -np.cos(date_angle)

    # month
    month_angle = 2 * np.pi * frame['month'] / 12.0
    frame['sin_month'] = -np.sin(month_angle)
    frame['cos_month'] = -np.cos(month_angle)

    # day of week (shifted to 1..7)
    weekday_angle = 2 * np.pi * (frame['day_of_week'] + 1) / 7.0
    frame['sin_dayofweek'] = -np.sin(weekday_angle)
    frame['cos_dayofweek'] = -np.cos(weekday_angle)
178
def CDH(xs):
    """Return a windowed running sum of ``xs - 26``.

    The first 11 outputs are the cumulative sum from the start of ``xs``; each
    later output is the sum of ``xs - 26`` over the 11 most recent elements.

    Args:
        xs: 1-D NumPy array of values (temperatures in this script).

    Returns:
        NumPy array with the same length as ``xs``.
    """
    window = 11
    running = np.cumsum(xs - 26)
    head = running[:window]
    # Difference of running totals `window` positions apart = windowed sum.
    tail = running[window:] - running[:-window]
    return np.concatenate((head, tail))


def calculate_and_add_cdh(dataframe):
    """Compute ``CDH`` per building, aligned with the rows of ``dataframe``.

    Each building's temperatures are processed in their row order, and the
    results are written back to the positions those rows occupy. The output
    therefore lines up with ``dataframe`` whatever the row ordering and
    whichever building numbers are present.

    Args:
        dataframe: Frame with ``building_number`` and ``temperature`` columns.

    Returns:
        Float NumPy array of length ``len(dataframe)``; rows whose building
        number is missing are left as NaN.
    """
    temperatures = dataframe['temperature'].to_numpy()
    result = np.full(len(dataframe), np.nan, dtype=float)
    # `indices` maps every building to the positional row numbers it occupies.
    row_positions = dataframe.groupby('building_number').indices
    for positions in row_positions.values():
        result[positions] = CDH(temperatures[positions])
    return result
189
+
190
# Weather-derived columns (CDH, THI, WCT), identical for train and test.
for frame in (train, test):
    frame['CDH'] = calculate_and_add_cdh(frame)

    # NOTE(review): the last factor is built from humidity; the usual
    # discomfort-index formula presumably uses temperature there — confirm.
    frame['THI'] = (
        9/5*frame['temperature']
        - 0.55*(1-frame['humidity']/100)*(9/5*frame['humidity']-26)
        + 32
    )

    # Wind term shared by the two windspeed-dependent parts of WCT.
    wind_term = frame['windspeed']**0.16
    frame['WCT'] = (
        13.12
        + 0.6125*frame['temperature']
        - 11.37*wind_term
        + 0.3965*wind_term*frame['temperature']
    )
198
def _power_profile(keys, aggfunc, feature_name):
    """Aggregate train power consumption per ``keys`` into one feature column."""
    table = pd.pivot_table(
        train, values='power_consumption', index=keys, aggfunc=aggfunc
    ).reset_index()
    table.columns = keys + [feature_name]
    return table


_weekly_keys = ['building_number', 'hour', 'day_of_week']
_hourly_keys = ['building_number', 'hour']

# Aggregations are passed by name ('mean' / 'std'): pandas deprecates NumPy
# callables here, and the string form pins the pandas implementation.
power_mean = _power_profile(_weekly_keys, 'mean', 'day_hour_mean')
power_std = _power_profile(_weekly_keys, 'std', 'day_hour_std')
power_hour_mean = _power_profile(_hourly_keys, 'mean', 'hour_mean')
power_hour_std = _power_profile(_hourly_keys, 'std', 'hour_std')

# Merge the train-derived profiles into both the train and test frames.
for _profile, _keys in (
    (power_mean, _weekly_keys),
    (power_std, _weekly_keys),
    (power_hour_mean, _hourly_keys),
    (power_hour_std, _hourly_keys),
):
    train = train.merge(_profile, on=_keys, how='left')
    test = test.merge(_profile, on=_keys, how='left')

train = train.reset_index(drop=True)
228
# Columns removed from both the training and the test feature matrices.
_common_dropped = [
    'solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'rainfall',
    'hour', 'day', 'month', 'day_of_week', 'date_time',
]
# Columns additionally removed from the training features only.
_train_only_dropped = ['power_consumption', 'sunshine', 'solar_radiation']

X = train.drop(columns=_common_dropped + _train_only_dropped)

# The target is kept next to building_type so it can be filtered per type.
Y = train[['building_type', 'power_consumption']]

test_X = test.drop(columns=_common_dropped)
236
# NOTE: this rebinds RANDOM_SEED, which was set to 2025 for the NumPy/random
# seeding near the imports; the value below drives KFold and XGBoost only.
RANDOM_SEED = 42
KFOLD_SPLITS = 2

# Hyper-parameters shared by every fold of every building type.
XGB_PARAMS = dict(
    learning_rate=0.0013292918943162175,
    n_estimators=4759,
    max_depth=23,
    subsample=0.7993292420985183,
    colsample_bytree=0.5780093202212182,
    min_child_weight=2,
    gamma=0.2904180608409973,
    reg_alpha=0.6245760287469893,
    reg_lambda=0.002570603566117598,
    random_state=RANDOM_SEED,
    n_jobs=-1,
    objective=weighted_mse(3),
    early_stopping_rounds=100,
)

# Building types in order of first appearance.
type_list = X["building_type"].unique()

# Frames collecting the test predictions and the out-of-fold predictions.
answer_df = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df = pd.DataFrame(index=X.index, columns=["pred"], dtype=float)

kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_SEED)

# One model family per building type: K-fold training on the log target,
# out-of-fold validation predictions, and fold-averaged test predictions.
for btype in type_list:
    start_proj = time.time()
    x = X[X['building_type'] == btype].copy()
    y = Y[Y['building_type'] == btype]['power_consumption'].copy()
    xt = test_X[test_X['building_type'] == btype].copy()

    # One-hot encode the building id, then give the test frame exactly the
    # training columns (same order, absent ones filled with 0).
    x = pd.get_dummies(x, columns=["building_number"], drop_first=False)
    xt = pd.get_dummies(xt, columns=["building_number"], drop_first=False)
    xt = xt.reindex(columns=x.columns, fill_value=0)

    drop_cols = ["building_type"]
    x = x.drop(columns=drop_cols)
    xt = xt.drop(columns=drop_cols)

    preds_valid = pd.Series(index=y.index, dtype=float)
    preds_test = []

    x_values = x.values
    y_values = y.values

    fold_scores = []
    for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values), 1):
        X_tr, X_va = x_values[tr_idx], x_values[va_idx]
        y_tr, y_va = y_values[tr_idx], y_values[va_idx]

        # The model is fitted on log(target); predictions are mapped back
        # with exp() before scoring.
        y_tr_log = np.log(y_tr)
        y_va_log = np.log(y_va)

        model = XGBRegressor(**XGB_PARAMS)
        model.fit(
            X_tr, y_tr_log,
            eval_set=[(X_va, y_va_log)],
            eval_metric=custom_smape,
            verbose=False,
        )

        va_pred = np.exp(model.predict(X_va))
        preds_valid.iloc[va_idx] = va_pred

        fold_smape = smape(y_va, va_pred)
        fold_scores.append(fold_smape)

        # Test prediction from this fold's model.
        preds_test.append(np.exp(model.predict(xt.values)))

    # Store the out-of-fold validation predictions.
    pred_df.loc[preds_valid.index, "pred"] = preds_valid

    # Store the test prediction (mean over the fold models).
    answer_df.loc[xt.index, "answer"] = np.mean(preds_test, axis=0)
    end_proj = time.time()
    print(f"Building type = {btype} : XGB SMAPE = {np.mean(fold_scores):.4f} : {end_proj-start_proj:5f} sec")

# Overall out-of-fold score across all building types.
total_smape = smape(
    Y.sort_index()["power_consumption"].values,
    pred_df.sort_index()["pred"].values
)
print(f"Total SMAPE = {total_smape:.4f}")
337
# Fill the sample submission with the averaged test predictions and save it.
# Rows are matched on index labels, so sample_submission.csv must share the
# default row index of the test frame.
submit = pd.read_csv("sample_submission.csv")
submit["answer"] = answer_df.loc[submit.index, "answer"]
submit.to_csv("private_재현_3.csv", index=False)
print("Saved → private_재현_3.csv")