import math
import random as rn
import time
import warnings
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import xgboost
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

# Seed both NumPy's and the stdlib's global random generators.
RANDOM_SEED = 2025
np.random.seed(RANDOM_SEED)
rn.seed(RANDOM_SEED)

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)


def smape(gt, preds):
    """Return the symmetric mean absolute percentage error, in percent.

    Args:
        gt: Array-like of ground-truth values.
        preds: Array-like of predictions, broadcastable against ``gt``.

    Returns:
        ``mean(2 * |preds - gt| / (|preds| + |gt|)) * 100``. Positions where
        both values are zero contribute 0 (a perfect prediction) instead of
        the NaN that 0/0 would otherwise inject into the mean.
    """
    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)
    denominator = np.abs(preds) + np.abs(gt)
    ratio = np.divide(
        2 * np.abs(preds - gt),
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100


def weighted_mse(alpha=1):
    """Build an asymmetric squared-error objective.

    Args:
        alpha: Multiplier applied to the gradient and hessian wherever the
            label exceeds the prediction (i.e. under-predictions).

    Returns:
        A callable ``(label, pred) -> (grad, hess)``.
    """
    def weighted_mse_fixed(label, pred):
        residual = (label - pred).astype("float")
        under_predicted = residual > 0
        grad = np.where(under_predicted, -2 * alpha * residual, -2 * residual)
        hess = np.where(under_predicted, 2 * alpha, 2.0)
        return grad, hess

    return weighted_mse_fixed


def custom_smape(preds, dtrain):
    """Evaluation metric returning ``('custom_smape', value)``.

    The value is SMAPE (in percent) between ``preds`` and the labels read
    from ``dtrain.get_label()``, on whatever scale those labels are stored.
    """
    labels = dtrain.get_label()
    return 'custom_smape', np.mean(2 * abs(preds - labels) / (abs(preds) + abs(labels))) * 100


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
building_info = pd.read_csv('building_info.csv')

# Korean -> English column names shared by the train and test tables.
# rename() silently ignores keys that are absent from a frame.
_WEATHER_COLUMN_NAMES = {
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(°C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption',
}

train = train.rename(columns=_WEATHER_COLUMN_NAMES)
train.drop('num_date_time', axis=1, inplace=True)

test = test.rename(columns=_WEATHER_COLUMN_NAMES)
test.drop('num_date_time', axis=1, inplace=True)

building_info = building_info.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity',
})
# Korean -> English building-type labels.
translation_dict = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '학교': 'University',
    '백화점': 'Department Store',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    'IDC(전화국)': 'IDC',
    '호텔': 'Hotel',
}

building_info['building_type'] = building_info['building_type'].replace(translation_dict)

# Binary flags: 1 when the capacity column holds anything other than '-'.
building_info['solar_power_utility'] = np.where(building_info.solar_power_capacity != '-', 1, 0)
building_info['ess_utility'] = np.where(building_info.ess_capacity != '-', 1, 0)

train = pd.merge(train, building_info, on='building_number', how='left')
test = pd.merge(test, building_info, on='building_number', how='left')


def _add_datetime_features(frame):
    """Parse ``date_time`` in place and add hour/day/month/day_of_week columns."""
    frame['date_time'] = pd.to_datetime(frame['date_time'], format='%Y%m%d %H')
    frame['hour'] = frame['date_time'].dt.hour
    frame['day'] = frame['date_time'].dt.day
    frame['month'] = frame['date_time'].dt.month
    frame['day_of_week'] = frame['date_time'].dt.dayofweek  # Monday=0 ... Sunday=6


_add_datetime_features(train)
_add_datetime_features(test)


def calculate_day_values(dataframe, target_column, output_column, aggregation_func):
    """Add a per-building, per-calendar-day aggregate of a column, in place.

    Rows are grouped by ``(building_number, month, day)`` and every row
    receives its group's aggregate of ``target_column``.

    Args:
        dataframe: Frame with ``building_number``, ``month`` and ``day``
            columns; mutated in place.
        target_column: Name of the column to aggregate.
        output_column: Name of the column to create or overwrite.
        aggregation_func: Reduction accepted by ``GroupBy.transform``
            (e.g. ``'max'``, ``'mean'``, ``'min'``).

    Returns:
        None.
    """
    # transform() broadcasts each group's aggregate back onto its rows,
    # aligned by index, so no per-row Python lookup is needed.
    dataframe[output_column] = (
        dataframe.groupby(['building_number', 'month', 'day'])[target_column]
        .transform(aggregation_func)
    )


calculate_day_values(train, 'temperature', 'day_max_temperature', 'max')
calculate_day_values(train, 'temperature', 'day_mean_temperature', 'mean')
calculate_day_values(train, 'temperature', 'day_min_temperature', 'min')
train['day_temperature_range'] = train['day_max_temperature'] - train['day_min_temperature']

calculate_day_values(test, 'temperature', 'day_max_temperature', 'max')
calculate_day_values(test, 'temperature', 'day_mean_temperature', 'mean')
calculate_day_values(test, 'temperature', 'day_min_temperature', 'min')
test['day_temperature_range'] = test['day_max_temperature'] - test['day_min_temperature']

# 1) Collect the rows whose consumption is exactly zero.
outlier_idx = train.index[train['power_consumption'] == 0].tolist()
print("제거할 행 개수:", len(outlier_idx))
print("인덱스 예시:", outlier_idx[:10])

# 2) Drop them. This also keeps np.log() of the target finite further down.
train.drop(index=outlier_idx, inplace=True)

# 3) Report what is left.
print("남은 행 개수:", train.shape[0])

# Dates flagged as holidays in addition to weekends.
holi_weekday = ['2024-06-06', '2024-08-15']


def CDH(xs):
    """Return trailing cooling-degree sums for one temperature sequence.

    Each output element is the sum of ``xs - 26`` over the current and up to
    ten preceding positions (an 11-element trailing window; the first
    elements use however many values are available).
    """
    cumsum = np.cumsum(xs - 26)
    return np.concatenate((cumsum[:11], cumsum[11:] - cumsum[:-11]))


def calculate_and_add_cdh(dataframe):
    """Compute ``CDH`` per building and return it in the frame's row order.

    Args:
        dataframe: Frame with ``building_number`` and ``temperature``
            columns. Rows of one building are processed in the order they
            appear in the frame.

    Returns:
        A 1-D NumPy array with one value per row of ``dataframe``.
    """
    # Grouping on the actual building numbers (rather than assuming 1..100
    # and a frame sorted by building) keeps each value on its own row.
    cdh = dataframe.groupby('building_number', sort=False)['temperature'].transform(
        lambda temps: CDH(temps.to_numpy())
    )
    return cdh.to_numpy()


def _add_engineered_features(frame):
    """Add holiday, cyclic calendar and weather-index columns to ``frame`` in place."""
    is_weekend = frame['day_of_week'] >= 5
    is_listed_holiday = frame['date_time'].dt.strftime('%Y-%m-%d').isin(holi_weekday)
    frame['holiday'] = np.where(is_weekend | is_listed_holiday, 1, 0)

    # Hour of day. The period is 24 so that hours 0 and 23 stay distinct;
    # a period of 23 maps both of them onto the same (sin, cos) point.
    frame['sin_hour'] = np.sin(2 * np.pi * frame['hour'] / 24.0)
    frame['cos_hour'] = np.cos(2 * np.pi * frame['hour'] / 24.0)

    # Position within the year (month plus fractional day).
    frame['sin_date'] = -np.sin(2 * np.pi * (frame['month'] + frame['day'] / 31) / 12)
    frame['cos_date'] = -np.cos(2 * np.pi * (frame['month'] + frame['day'] / 31) / 12)

    # Month.
    frame['sin_month'] = -np.sin(2 * np.pi * frame['month'] / 12.0)
    frame['cos_month'] = -np.cos(2 * np.pi * frame['month'] / 12.0)

    # Day of week.
    frame['sin_dayofweek'] = -np.sin(2 * np.pi * (frame['day_of_week'] + 1) / 7.0)
    frame['cos_dayofweek'] = -np.cos(2 * np.pi * (frame['day_of_week'] + 1) / 7.0)

    frame['CDH'] = calculate_and_add_cdh(frame)

    # Temperature-humidity index. The bracketed term uses temperature, as in
    # the standard formula 1.8*T - 0.55*(1 - RH/100)*(1.8*T - 26) + 32.
    frame['THI'] = (
        9 / 5 * frame['temperature']
        - 0.55 * (1 - frame['humidity'] / 100) * (9 / 5 * frame['temperature'] - 26)
        + 32
    )

    # Wind-chill style index built from temperature and windspeed ** 0.16.
    frame['WCT'] = (
        13.12
        + 0.6125 * frame['temperature']
        - 11.37 * (frame['windspeed'] ** 0.16)
        + 0.3965 * (frame['windspeed'] ** 0.16) * frame['temperature']
    )


_add_engineered_features(train)
_add_engineered_features(test)

# Per-building consumption statistics, merged into both frames below.
# NOTE(review): these are computed from every training row, including rows
# that later serve as validation folds, so the CV SMAPE printed by the
# training loop is likely optimistic.

# Calculate 'day_hour_mean'
power_mean = pd.pivot_table(
    train, values='power_consumption',
    index=['building_number', 'hour', 'day_of_week'], aggfunc='mean',
).reset_index()
power_mean.columns = ['building_number', 'hour', 'day_of_week', 'day_hour_mean']

# Calculate 'day_hour_std'
power_std = pd.pivot_table(
    train, values='power_consumption',
    index=['building_number', 'hour', 'day_of_week'], aggfunc='std',
).reset_index()
power_std.columns = ['building_number', 'hour', 'day_of_week', 'day_hour_std']

# Calculate 'hour_mean'
power_hour_mean = pd.pivot_table(
    train, values='power_consumption',
    index=['building_number', 'hour'], aggfunc='mean',
).reset_index()
power_hour_mean.columns = ['building_number', 'hour', 'hour_mean']

# Calculate 'hour_std'
power_hour_std = pd.pivot_table(
    train, values='power_consumption',
    index=['building_number', 'hour'], aggfunc='std',
).reset_index()
power_hour_std.columns = ['building_number', 'hour', 'hour_std']

# Merge calculated features to 'train' and 'test' dataframes
train = train.merge(power_mean, on=['building_number', 'hour', 'day_of_week'], how='left')
test = test.merge(power_mean, on=['building_number', 'hour', 'day_of_week'], how='left')

train = train.merge(power_std, on=['building_number', 'hour', 'day_of_week'], how='left')
test = test.merge(power_std, on=['building_number', 'hour', 'day_of_week'], how='left')

train = train.merge(power_hour_mean, on=['building_number', 'hour'], how='left')
test = test.merge(power_hour_mean, on=['building_number', 'hour'], how='left')

train = train.merge(power_hour_std, on=['building_number', 'hour'], how='left')
test = test.merge(power_hour_std, on=['building_number', 'hour'], how='left')

train = train.reset_index(drop=True)

X = train.drop(
    ['solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'power_consumption',
     'rainfall', 'sunshine', 'solar_radiation',
     'hour', 'day', 'month', 'day_of_week', 'date_time'],
    axis=1,
)
Y = train[['building_type', 'power_consumption']]
test_X = test.drop(
    ['solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'rainfall',
     'hour', 'month', 'day_of_week', 'day', 'date_time'],
    axis=1,
)

# Seed for the K-fold split and the models. This rebinds the module-level
# RANDOM_SEED; the global RNGs were already seeded earlier in the script.
RANDOM_SEED = 42
KFOLD_SPLITS = 2

# One model family is trained per building type, in order of first appearance.
type_list = X["building_type"].unique()

# Frames that collect the results.
answer_df = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df = pd.DataFrame(index=X.index, columns=["pred"], dtype=float)

kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_SEED)

for btype in type_list:
    start_proj = time.time()

    x = X[X['building_type'] == btype].copy()
    y = Y[Y['building_type'] == btype]['power_consumption'].copy()
    xt = test_X[test_X['building_type'] == btype].copy()

    # One-hot encode the building id, then force the test matrix onto the
    # training columns (missing columns are filled with 0, extras dropped).
    x = pd.get_dummies(x, columns=["building_number"], drop_first=False)
    xt = pd.get_dummies(xt, columns=["building_number"], drop_first=False)
    xt = xt.reindex(columns=x.columns, fill_value=0)

    drop_cols = ["building_type"]
    x = x.drop(columns=drop_cols)
    xt = xt.drop(columns=drop_cols)

    preds_valid = pd.Series(index=y.index, dtype=float)
    preds_test = []

    x_values = x.values
    y_values = y.values

    fold_scores = []
    for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values), 1):
        X_tr, X_va = x_values[tr_idx], x_values[va_idx]
        y_tr, y_va = y_values[tr_idx], y_values[va_idx]

        # The model is fitted on log(target); predictions are mapped back
        # with exp() before scoring and saving.
        y_tr_log = np.log(y_tr)
        y_va_log = np.log(y_va)

        model = XGBRegressor(
            learning_rate=0.0013292918943162175,
            n_estimators=4759,
            max_depth=23,
            subsample=0.7993292420985183,
            colsample_bytree=0.5780093202212182,
            min_child_weight=2,
            gamma=0.2904180608409973,
            reg_alpha=0.6245760287469893,
            reg_lambda=0.002570603566117598,
            random_state=RANDOM_SEED,
            n_jobs=-1,
            objective=weighted_mse(3),
            # tree_method="gpu_hist",
            # gpu_id=0,
            early_stopping_rounds=100,
        )

        # NOTE(review): passing eval_metric to fit() looks deprecated in
        # recent xgboost releases, and custom_smape uses the
        # (preds, DMatrix) signature that goes with it - confirm against
        # the pinned xgboost version before upgrading.
        model.fit(
            X_tr, y_tr_log,
            eval_set=[(X_va, y_va_log)],
            eval_metric=custom_smape,
            verbose=False,
        )

        va_pred = np.exp(model.predict(X_va))
        preds_valid.iloc[va_idx] = va_pred

        fold_smape = smape(y_va, va_pred)
        fold_scores.append(fold_smape)

        # Test-set prediction from this fold's model.
        preds_test.append(np.exp(model.predict(xt.values)))

    # Store the out-of-fold validation predictions.
    pred_df.loc[preds_valid.index, "pred"] = preds_valid

    # Store the test prediction, averaged over the fold models.
    answer_df.loc[xt.index, "answer"] = np.mean(preds_test, axis=0)

    end_proj = time.time()
    print(f"Building type = {btype} : XGB SMAPE = {np.mean(fold_scores):.4f} : {end_proj-start_proj:5f} sec")

total_smape = smape(
    Y.sort_index()["power_consumption"].values,
    pred_df.sort_index()["pred"].values,
)
print(f"Total SMAPE = {total_smape:.4f}")

submit = pd.read_csv("sample_submission.csv")
submit["answer"] = answer_df.loc[submit.index, "answer"]
submit.to_csv("private_재현_3.csv", index=False)
print("Saved → private_재현_3.csv")