| import pandas as pd |
| import seaborn as sns |
| import numpy as np |
| import matplotlib.pyplot as plt |
| import matplotlib |
| import sklearn |
| import xgboost |
| from xgboost import XGBRegressor |
| from sklearn.model_selection import KFold |
| import math |
| import time |
| import random as rn |
# Seed both the stdlib and the NumPy global generators with one shared value.
# NOTE: RANDOM_SEED is reassigned further down, before the cross-validation
# splitter and the models are created.
RANDOM_SEED = 2025
rn.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
| from datetime import datetime |
| import warnings |
|
|
# Suppress every warning category for the rest of the run and make pandas
# print all columns of wide frames instead of truncating them.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None
def smape(gt, preds):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``2 * |pred - gt| / (|pred| + |gt|)``; the mean
    of those terms is multiplied by 100.

    Args:
        gt: Array-like of ground-truth values.
        preds: Array-like of predictions, broadcastable against ``gt``.

    Returns:
        The SMAPE score as a NumPy float (0 for a perfect match, 200 at most).
    """
    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)

    abs_error = 2.0 * np.abs(preds - gt)
    scale = np.abs(preds) + np.abs(gt)

    # A pair where both values are exactly 0 is a perfect prediction; the raw
    # formula would evaluate 0/0 = NaN there and poison the whole mean, so
    # those positions keep the pre-filled 0 instead.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio) * 100
|
|
def weighted_mse(alpha = 1):
    """Build an asymmetric squared-error objective.

    The returned callable takes ``(label, pred)`` and returns the gradient and
    hessian of a squared error whose weight is ``alpha`` wherever the label
    exceeds the prediction (positive residual) and 1 elsewhere.

    Args:
        alpha: Weight applied to positive residuals.

    Returns:
        A function ``(label, pred) -> (grad, hess)``.
    """
    def weighted_mse_fixed(label, pred):
        error = (label - pred).astype("float")
        # Per-element weight: alpha where the prediction is too low, else 1.
        weight = np.where(error > 0, alpha, 1)
        grad = -2 * weight * error
        hess = 2.0 * weight
        return grad, hess
    return weighted_mse_fixed
|
|
def custom_smape(preds, dtrain):
    """Evaluation metric returning ``('custom_smape', score)``.

    Args:
        preds: Array of predictions.
        dtrain: Object exposing ``get_label()`` that returns the targets.

    Returns:
        Tuple of the metric name and the SMAPE (in percent) between ``preds``
        and the labels, computed on whatever scale both are given in.
    """
    actual = dtrain.get_label()
    abs_error = abs(preds - actual)
    scale = abs(preds) + abs(actual)
    score = np.mean(2 * abs_error / scale) * 100
    return 'custom_smape', score
# Korean -> English header mapping shared by the train and test tables.
# Keys that are absent from a table are simply ignored by ``rename``.
_KOREAN_TO_ENGLISH = {
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(°C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption',
}

# Load the three raw tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
building_info = pd.read_csv('building_info.csv')

# Translate the headers and discard the 'num_date_time' column.
train = train.rename(columns=_KOREAN_TO_ENGLISH).drop(columns='num_date_time')
test = test.rename(columns=_KOREAN_TO_ENGLISH).drop(columns='num_date_time')
|
|
# Translate the building metadata headers to English.
building_info = building_info.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity',
})

# Korean building-type labels -> English labels used for the rest of the run.
translation_dict = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '학교': 'University',
    '백화점': 'Department Store',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    'IDC(전화국)': 'IDC',
    '호텔': 'Hotel',
}
building_info['building_type'] = building_info['building_type'].replace(translation_dict)

# Binary flags: 1 when the capacity field holds anything other than '-'.
_has_solar = building_info['solar_power_capacity'] != '-'
_has_ess = building_info['ess_capacity'] != '-'
building_info['solar_power_utility'] = np.where(_has_solar, 1, 0)
building_info['ess_utility'] = np.where(_has_ess, 1, 0)

# Attach the building metadata to every train / test row.
train = train.merge(building_info, on='building_number', how='left')
test = test.merge(building_info, on='building_number', how='left')
# Parse the 'YYYYMMDD HH' timestamps and split them into calendar parts,
# applying the same steps to the train table first and the test table second.
for _frame in (train, test):
    _frame['date_time'] = pd.to_datetime(_frame['date_time'], format='%Y%m%d %H')
    _stamp = _frame['date_time'].dt
    _frame['hour'] = _stamp.hour
    _frame['day'] = _stamp.day
    _frame['month'] = _stamp.month
    _frame['day_of_week'] = _stamp.dayofweek
def calculate_day_values(dataframe, target_column, output_column, aggregation_func):
    """Add a per-building, per-day aggregate of ``target_column`` in place.

    Rows are grouped by ``('building_number', 'month', 'day')``; every row
    receives the aggregate of its own group in ``output_column``.

    Args:
        dataframe: Frame with 'building_number', 'month', 'day' and
            ``target_column`` columns. It is modified in place.
        target_column: Name of the column to aggregate.
        output_column: Name of the column to create or overwrite.
        aggregation_func: Aggregation accepted by pandas group-by
            (e.g. ``'max'``, ``'mean'``, ``'min'``).

    Returns:
        None.
    """
    # transform() broadcasts each group's aggregate back onto its rows, which
    # replaces the former nested-dict lookup performed row by row with
    # iterrows() by a single vectorised pass yielding the same values.
    day_groups = dataframe.groupby(['building_number', 'month', 'day'])[target_column]
    dataframe[output_column] = day_groups.transform(aggregation_func)
|
|
|
|
# (output column, aggregation) pairs, in the order the columns are created.
_DAILY_TEMPERATURE_STATS = (
    ('day_max_temperature', 'max'),
    ('day_mean_temperature', 'mean'),
    ('day_min_temperature', 'min'),
)

# Placeholder columns on the train table; both are overwritten just below.
train['day_max_temperature'] = 0.0
train['day_mean_temperature'] = 0.0

# Daily max / mean / min temperature per building, then the daily range.
for _frame in (train, test):
    for _column, _aggregation in _DAILY_TEMPERATURE_STATS:
        calculate_day_values(_frame, 'temperature', _column, _aggregation)
    _frame['day_temperature_range'] = (
        _frame['day_max_temperature'] - _frame['day_min_temperature']
    )
# Rows whose recorded consumption is exactly 0 are treated as outliers.
_is_zero_power = train['power_consumption'] == 0
outlier_idx = train.index[_is_zero_power].tolist()
print("제거할 행 개수:", len(outlier_idx))
print("인덱스 예시:", outlier_idx[:10])

# Remove them from the training table in place.
train.drop(index=outlier_idx, inplace=True)

# Report how many rows are left.
print("남은 행 개수:", train.shape[0])
# Dates flagged as holidays in addition to weekends.
holi_weekday = ['2024-06-06', '2024-08-15']

# holiday = 1 for day_of_week >= 5 or for a listed date, else 0.
for _frame in (train, test):
    _is_weekend = _frame['day_of_week'] >= 5
    _is_listed_date = _frame['date_time'].dt.strftime('%Y-%m-%d').isin(holi_weekday)
    _frame['holiday'] = np.where(_is_weekend | _is_listed_date, 1, 0)
| |
# Cyclical (sin / cos) encodings of the calendar parts, for train and test.
#
# The hour takes the 24 values 0..23, so its period is 24. Dividing by 23
# placed hour 0 and hour 23 on the same point of the circle, making the two
# indistinguishable in this encoding.
# NOTE(review): this changes the sin_hour / cos_hour values, so models tuned
# on the previous encoding will not reproduce bit-for-bit.
_HOURS_PER_DAY = 24.0

for _frame in (train, test):
    # Hour of day.
    _hour_angle = 2 * np.pi * _frame['hour'] / _HOURS_PER_DAY
    _frame['sin_hour'] = np.sin(_hour_angle)
    _frame['cos_hour'] = np.cos(_hour_angle)

    # Position within the year: month plus the elapsed fraction of the month.
    _date_angle = 2 * np.pi * (_frame['month'] + _frame['day'] / 31) / 12
    _frame['sin_date'] = -np.sin(_date_angle)
    _frame['cos_date'] = -np.cos(_date_angle)

    # Month alone.
    _month_angle = 2 * np.pi * _frame['month'] / 12.0
    _frame['sin_month'] = -np.sin(_month_angle)
    _frame['cos_month'] = -np.cos(_month_angle)

    # Day of week, shifted from 0..6 to 1..7 before scaling.
    _weekday_angle = 2 * np.pi * (_frame['day_of_week'] + 1) / 7.0
    _frame['sin_dayofweek'] = -np.sin(_weekday_angle)
    _frame['cos_dayofweek'] = -np.cos(_weekday_angle)
def CDH(xs, base=26, window=11):
    """Return a rolling sum of ``xs - base`` over the last ``window`` values.

    The first ``window`` outputs are the plain cumulative sum (the window is
    not full yet); every later output is the sum of the most recent
    ``window`` elements.

    Args:
        xs: 1-D NumPy array of temperatures, in time order.
        base: Value subtracted from every element before summing.
        window: Number of trailing elements in each sum.

    Returns:
        1-D NumPy array with the same length as ``xs``.
    """
    running = np.cumsum(xs - base)
    # Difference of two cumulative sums `window` steps apart = windowed sum.
    return np.concatenate((running[:window], running[window:] - running[:-window]))


def calculate_and_add_cdh(dataframe):
    """Compute the CDH feature for every row of ``dataframe``.

    CDH is evaluated separately for each building over that building's rows
    in their current order.
    NOTE(review): assumes each building's rows are in chronological order;
    rows removed earlier leave gaps the window silently spans - confirm.

    Args:
        dataframe: Frame with 'building_number' and 'temperature' columns.

    Returns:
        1-D NumPy array aligned with the rows of ``dataframe``.
    """
    # Grouping by whatever building numbers are present (instead of looping
    # over a fixed 1..100) and letting transform() put each value back on its
    # own row keeps the result aligned even when the frame is not sorted by
    # building or does not contain exactly buildings 1..100.
    per_row = dataframe.groupby('building_number')['temperature'].transform(
        lambda temps: CDH(temps.to_numpy())
    )
    return per_row.to_numpy()
|
|
# Cooling-degree-hours feature for both tables.
train['CDH'] = calculate_and_add_cdh(train)
test['CDH'] = calculate_and_add_cdh(test)

for _frame in (train, test):
    _temp_c = _frame['temperature']
    _rel_humidity = _frame['humidity']
    _wind_pow = _frame['windspeed'] ** 0.16

    # Temperature-humidity (discomfort) index:
    #   THI = 9/5*T - 0.55*(1 - RH/100)*(9/5*T - 26) + 32
    # The second bracket is a function of temperature; it previously used the
    # humidity column there, which is not the index formula.
    _frame['THI'] = (
        9/5*_temp_c - 0.55*(1 - _rel_humidity/100)*(9/5*_temp_c - 26) + 32
    )

    # Wind-chill temperature:
    #   WCT = 13.12 + 0.6215*T - 11.37*V**0.16 + 0.3965*V**0.16*T
    # The temperature coefficient is 0.6215 (it was typed as 0.6125).
    # NOTE(review): the published formula takes V in km/h while the source
    # column header is in m/s - confirm whether a unit conversion is wanted.
    _frame['WCT'] = (
        13.12 + 0.6215*_temp_c - 11.37*_wind_pow + 0.3965*_wind_pow*_temp_c
    )
| |
def _power_stat(frame, keys, stat, column):
    """Return one row per ``keys`` group with ``stat`` of 'power_consumption'.

    ``stat`` is passed by name ('mean' / 'std'): handing NumPy callables such
    as ``np.mean`` / ``np.std`` to pandas aggregations is deprecated, and
    pandas warns that a future version will call them directly instead of
    mapping them to its own methods. The string names keep today's behaviour.
    """
    table = pd.pivot_table(
        frame, values='power_consumption', index=keys, aggfunc=stat
    ).reset_index()
    table.columns = [*keys, column]
    return table


_DAY_HOUR_KEYS = ['building_number', 'hour', 'day_of_week']
_HOUR_KEYS = ['building_number', 'hour']

# Historical consumption profile per building, computed from the train table.
# NOTE(review): these statistics use every training row, including the rows
# later held out for validation, so the cross-validation score sees
# target-derived information about its own validation rows.
power_mean = _power_stat(train, _DAY_HOUR_KEYS, 'mean', 'day_hour_mean')
power_std = _power_stat(train, _DAY_HOUR_KEYS, 'std', 'day_hour_std')
power_hour_mean = _power_stat(train, _HOUR_KEYS, 'mean', 'hour_mean')
power_hour_std = _power_stat(train, _HOUR_KEYS, 'std', 'hour_std')

# Attach each statistic to both tables; the merge order fixes the column order.
for _stats, _keys in (
    (power_mean, _DAY_HOUR_KEYS),
    (power_std, _DAY_HOUR_KEYS),
    (power_hour_mean, _HOUR_KEYS),
    (power_hour_std, _HOUR_KEYS),
):
    train = train.merge(_stats, on=_keys, how='left')
    test = test.merge(_stats, on=_keys, how='left')
|
|
# Give the training table a clean 0..n-1 index before slicing it.
train = train.reset_index(drop=True)

# Raw capacity fields and calendar parts removed from both feature tables.
_SHARED_DROP = ['solar_power_capacity', 'ess_capacity', 'pcs_capacity',
                'rainfall', 'hour', 'day', 'month', 'day_of_week', 'date_time']
# Columns removed from the training features only (the target and two
# weather columns).
# NOTE(review): presumably 'sunshine' and 'solar_radiation' are absent from
# test.csv, which would be why they are not dropped from the test table -
# confirm.
_TRAIN_ONLY_DROP = ['power_consumption', 'sunshine', 'solar_radiation']

X = train.drop(columns=_SHARED_DROP + _TRAIN_ONLY_DROP)

# Target, kept next to the building type so it can be filtered per type.
Y = train[['building_type', 'power_consumption']]

test_X = test.drop(columns=_SHARED_DROP)
# Cross-validation configuration.
# NOTE: this overrides the RANDOM_SEED = 2025 assigned near the imports; the
# value 42 is what the splitter and the models below actually use.
RANDOM_SEED = 42
KFOLD_SPLITS = 2

# Building types in order of first appearance. A hand-written loop used to
# build the same list from train.building_type, but its result was
# overwritten by this line before ever being read, so it has been removed.
type_list = X["building_type"].unique()

# Containers for the averaged test predictions and the out-of-fold predictions.
answer_df = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df = pd.DataFrame(index=X.index, columns=["pred"], dtype=float)

kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_SEED)
|
|
def log_smape(y_true, y_pred):
    """SMAPE (in percent) between labels and predictions, on the scale given.

    Uses the ``(y_true, y_pred) -> float`` signature that the XGBoost
    scikit-learn estimators expect for a callable ``eval_metric`` passed to
    the constructor. The labels supplied below are log-transformed, so the
    score is computed in log space.
    """
    return float(
        np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true))) * 100
    )


# Train one set of K-fold models per building type.
for btype in type_list:
    start_proj = time.time()
    x = X[X['building_type'] == btype].copy()
    y = Y[Y['building_type'] == btype]['power_consumption'].copy()
    xt = test_X[test_X['building_type'] == btype].copy()

    # One-hot encode the building id, then force the test columns to match the
    # training columns (missing ones are filled with 0, extras are discarded).
    x = pd.get_dummies(x, columns=["building_number"], drop_first=False)
    xt = pd.get_dummies(xt, columns=["building_number"], drop_first=False)
    xt = xt.reindex(columns=x.columns, fill_value=0)

    drop_cols = ["building_type"]
    x = x.drop(columns=drop_cols)
    xt = xt.drop(columns=drop_cols)

    preds_valid = pd.Series(index=y.index, dtype=float)
    preds_test = []

    # Build explicit float matrices: recent pandas emits bool dummy columns,
    # and mixing those with float columns makes `.values` an object array.
    x_values = x.to_numpy(dtype=float)
    xt_values = xt.to_numpy(dtype=float)
    y_values = y.values

    fold_scores = []
    for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values), 1):
        X_tr, X_va = x_values[tr_idx], x_values[va_idx]
        y_tr, y_va = y_values[tr_idx], y_values[va_idx]

        # The model is fitted on the log of the target.
        y_tr_log = np.log(y_tr)
        y_va_log = np.log(y_va)

        # `eval_metric` is given to the constructor, next to
        # `early_stopping_rounds`: the `fit(eval_metric=...)` argument was
        # deprecated in XGBoost 1.6 and removed in 2.0.
        model = XGBRegressor(
            learning_rate=0.0013292918943162175,
            n_estimators=4759,
            max_depth=23,
            subsample=0.7993292420985183,
            colsample_bytree=0.5780093202212182,
            min_child_weight=2,
            gamma=0.2904180608409973,
            reg_alpha=0.6245760287469893,
            reg_lambda=0.002570603566117598,
            random_state=RANDOM_SEED,
            n_jobs=-1,
            objective=weighted_mse(3),
            eval_metric=log_smape,
            early_stopping_rounds=100,
        )

        model.fit(
            X_tr, y_tr_log,
            eval_set=[(X_va, y_va_log)],
            verbose=False,
        )

        # Back-transform to the original scale and store out-of-fold values.
        va_pred = np.exp(model.predict(X_va))
        preds_valid.iloc[va_idx] = va_pred

        fold_smape = smape(y_va, va_pred)
        fold_scores.append(fold_smape)

        # Test prediction from this fold's model.
        preds_test.append(np.exp(model.predict(xt_values)))

        # Optional feature-importance plot (disabled); visualise once, on the
        # last fold only:
        # if fold == KFOLD_SPLITS:
        #     sorted_idx = model.feature_importances_.argsort()
        #     plt.figure(figsize=(8, 15))
        #     plt.barh(x.columns[sorted_idx], model.feature_importances_[sorted_idx])
        #     plt.xlabel(f"{btype} XGB Feature Importance")
        #     plt.show()

    # Out-of-fold predictions for this building type.
    pred_df.loc[preds_valid.index, "pred"] = preds_valid

    # Test predictions: mean over the fold models.
    answer_df.loc[xt.index, "answer"] = np.mean(preds_test, axis=0)
    end_proj = time.time()
    print(f"Building type = {btype} : XGB SMAPE = {np.mean(fold_scores):.4f} : {end_proj-start_proj:5f} sec")
|
|
# Overall out-of-fold SMAPE across every building type.
_actual = Y.sort_index()["power_consumption"].values
_out_of_fold = pred_df.sort_index()["pred"].values
total_smape = smape(_actual, _out_of_fold)
print(f"Total SMAPE = {total_smape:.4f}")

# Fill the sample submission with the averaged test predictions and save it.
_output_path = "private_재현_3.csv"
submit = pd.read_csv("sample_submission.csv")
submit["answer"] = answer_df.loc[submit.index, "answer"]
submit.to_csv(_output_path, index=False)
print(f"Saved → {_output_path}")