elec / for_py.py

Upload for_py.py

7ebbdcc verified 10 months ago

13.3 kB

	import pandas as pd
	import seaborn as sns
	import numpy as np
	import matplotlib.pyplot as plt
	import matplotlib
	import sklearn
	import xgboost
	from xgboost import XGBRegressor
	from sklearn.model_selection import KFold
	import math
	import time
	import random as rn
	RANDOM_SEED = 2025
	np.random.seed(RANDOM_SEED)
	rn.seed(RANDOM_SEED)
	from datetime import datetime
	import warnings

	warnings.filterwarnings('ignore')
	pd.set_option('display.max_columns', None)
	def smape(gt, preds):
	    """Return the symmetric mean absolute percentage error, in percent.

	    Each element contributes ``2 * |pred - gt| / (|pred| + |gt|)``; the
	    mean of those terms is multiplied by 100.

	    Args:
	        gt: Array-like of ground-truth values.
	        preds: Array-like of predictions, broadcastable against ``gt``.

	    Returns:
	        The SMAPE score as a float (0 means every prediction is exact).
	    """
	    gt = np.asarray(gt, dtype=float)
	    preds = np.asarray(preds, dtype=float)
	    numerator = 2 * np.abs(preds - gt)
	    denominator = np.abs(preds) + np.abs(gt)
	    # When truth and prediction are both zero the term is 0/0; that is an
	    # exact prediction, so count it as 0 instead of poisoning the mean
	    # with NaN.
	    terms = np.divide(
	        numerator,
	        denominator,
	        out=np.zeros_like(numerator),
	        where=denominator != 0,
	    )
	    return np.mean(terms) * 100

	def weighted_mse(alpha=1):
	    """Build an asymmetric squared-error objective for gradient boosting.

	    With ``residual = label - pred``, rows where the residual is positive
	    (the prediction is below the label) use the loss
	    ``alpha * residual**2``; all other rows use ``residual**2``.

	    Args:
	        alpha: Weight applied to rows with a positive residual.

	    Returns:
	        A callable ``(label, pred) -> (grad, hess)`` giving the first and
	        second derivatives of the loss with respect to ``pred``.
	    """
	    def weighted_mse_fixed(label, pred):
	        residual = (np.asarray(label) - np.asarray(pred)).astype("float")
	        under_predicted = residual > 0
	        grad = np.where(under_predicted, -2 * alpha * residual, -2 * residual)
	        hess = np.where(under_predicted, 2 * alpha, 2.0)
	        return grad, hess

	    return weighted_mse_fixed

	def custom_smape(preds, dtrain):
	    """Evaluation metric: SMAPE (in percent) between predictions and labels.

	    Args:
	        preds: Array of model predictions.
	        dtrain: Object exposing ``get_label()`` that returns the matching
	            labels (presumably an ``xgboost.DMatrix`` - confirm against the
	            installed XGBoost version).

	    Returns:
	        A ``(name, value)`` tuple where ``name`` is ``'custom_smape'``.
	    """
	    labels = dtrain.get_label()
	    terms = 2 * np.abs(preds - labels) / (np.abs(preds) + np.abs(labels))
	    return 'custom_smape', np.mean(terms) * 100
	# Load the three input tables.
	train = pd.read_csv('train.csv')
	test = pd.read_csv('test.csv')
	building_info = pd.read_csv('building_info.csv')

	# Korean CSV headers -> English column names, shared by train and test.
	# rename() ignores keys that are absent from a frame.
	_weather_columns = {
	    '건물번호': 'building_number',
	    '일시': 'date_time',
	    '기온(°C)': 'temperature',
	    '강수량(mm)': 'rainfall',
	    '풍속(m/s)': 'windspeed',
	    '습도(%)': 'humidity',
	    '일조(hr)': 'sunshine',
	    '일사(MJ/m2)': 'solar_radiation',
	    '전력소비량(kWh)': 'power_consumption',
	}
	train = train.rename(columns=_weather_columns).drop(columns='num_date_time')
	test = test.rename(columns=_weather_columns).drop(columns='num_date_time')

	_building_columns = {
	    '건물번호': 'building_number',
	    '건물유형': 'building_type',
	    '연면적(m2)': 'total_area',
	    '냉방면적(m2)': 'cooling_area',
	    '태양광용량(kW)': 'solar_power_capacity',
	    'ESS저장용량(kWh)': 'ess_capacity',
	    'PCS용량(kW)': 'pcs_capacity',
	}
	building_info = building_info.rename(columns=_building_columns)

	# Korean building-type labels -> English labels.
	translation_dict = {
	    '건물기타': 'Other Buildings',
	    '공공': 'Public',
	    '학교': 'University',
	    '백화점': 'Department Store',
	    '병원': 'Hospital',
	    '상용': 'Commercial',
	    '아파트': 'Apartment',
	    '연구소': 'Research Institute',
	    'IDC(전화국)': 'IDC',
	    '호텔': 'Hotel',
	}
	building_info['building_type'] = building_info['building_type'].replace(translation_dict)

	# Binary flags: 1 when the capacity field holds anything other than '-'.
	building_info['solar_power_utility'] = np.where(building_info['solar_power_capacity'] != '-', 1, 0)
	building_info['ess_utility'] = np.where(building_info['ess_capacity'] != '-', 1, 0)

	# Attach the per-building attributes to every hourly row.
	train = train.merge(building_info, on='building_number', how='left')
	test = test.merge(building_info, on='building_number', how='left')
	# Parse the timestamp and derive calendar features on both frames.
	for _frame in (train, test):
	    _frame['date_time'] = pd.to_datetime(_frame['date_time'], format='%Y%m%d %H')
	    _stamp = _frame['date_time'].dt
	    _frame['hour'] = _stamp.hour
	    _frame['day'] = _stamp.day
	    _frame['month'] = _stamp.month
	    _frame['day_of_week'] = _stamp.dayofweek  # Monday=0 ... Sunday=6
	def calculate_day_values(dataframe, target_column, output_column, aggregation_func):
	    """Add a per-building, per-day aggregate of a column to ``dataframe``.

	    Rows are grouped by ``(building_number, month, day)``; every row
	    receives the aggregate of ``target_column`` over its own group.

	    Args:
	        dataframe: Frame with ``building_number``, ``month``, ``day`` and
	            ``target_column`` columns. It is modified in place.
	        target_column: Name of the column to aggregate.
	        output_column: Name of the column that receives the result
	            (created, or overwritten if it already exists).
	        aggregation_func: Aggregation accepted by pandas, e.g. ``'max'``,
	            ``'mean'`` or ``'min'``.

	    Returns:
	        None.
	    """
	    daily_groups = dataframe.groupby(['building_number', 'month', 'day'])[target_column]
	    # transform() broadcasts each group's aggregate back onto its rows,
	    # replacing a nested-dict lookup driven by a Python-level row loop.
	    dataframe[output_column] = daily_groups.transform(aggregation_func)


	# Placeholder columns on train; both are overwritten by the calls below.
	train['day_max_temperature'] = 0.0
	train['day_mean_temperature'] = 0.0

	# Daily max / mean / min temperature per building, then the daily range.
	for _frame in (train, test):
	    for _stat in ('max', 'mean', 'min'):
	        calculate_day_values(_frame, 'temperature', f'day_{_stat}_temperature', _stat)
	    _frame['day_temperature_range'] = (
	        _frame['day_max_temperature'] - _frame['day_min_temperature']
	    )
	# 1) Collect the index labels of rows whose power consumption is exactly 0.
	_is_zero_power = train['power_consumption'] == 0
	outlier_idx = train.index[_is_zero_power].tolist()
	print("제거할 행 개수:", len(outlier_idx))
	print("인덱스 예시:", outlier_idx[:10])

	# 2) Drop those rows from the training frame.
	train.drop(index=outlier_idx, inplace=True)

	# 3) Report how many rows remain after the drop.
	print("남은 행 개수:", train.shape[0])
	# Extra dates flagged as holidays in addition to weekends
	# (presumably public holidays that fall on weekdays - confirm).
	holi_weekday = ['2024-06-06', '2024-08-15']

	# holiday = 1 on Saturday/Sunday (day_of_week >= 5) or on a listed date.
	for _frame in (train, test):
	    _is_weekend = _frame['day_of_week'] >= 5
	    _is_listed_date = _frame['date_time'].dt.strftime('%Y-%m-%d').isin(holi_weekday)
	    _frame['holiday'] = np.where(_is_weekend | _is_listed_date, 1, 0)
	# Sine/cosine encodings of the calendar features, on both frames.
	for _frame in (train, test):
	    # Hour of day.
	    # NOTE(review): the divisor is 23, so hour 0 and hour 23 map to the
	    # same angle; 24 may have been intended - confirm before changing,
	    # since it alters the model inputs.
	    _frame['sin_hour'] = np.sin(2 * np.pi * _frame['hour'] / 23.0)
	    _frame['cos_hour'] = np.cos(2 * np.pi * _frame['hour'] / 23.0)

	    # Date within the year (month plus fractional day).
	    _date_position = _frame['month'] + _frame['day'] / 31
	    _frame['sin_date'] = -np.sin(2 * np.pi * _date_position / 12)
	    _frame['cos_date'] = -np.cos(2 * np.pi * _date_position / 12)

	    # Month.
	    _frame['sin_month'] = -np.sin(2 * np.pi * _frame['month'] / 12.0)
	    _frame['cos_month'] = -np.cos(2 * np.pi * _frame['month'] / 12.0)

	    # Day of week (shifted to 1..7).
	    _weekday = _frame['day_of_week'] + 1
	    _frame['sin_dayofweek'] = -np.sin(2 * np.pi * _weekday / 7.0)
	    _frame['cos_dayofweek'] = -np.cos(2 * np.pi * _weekday / 7.0)
	def CDH(xs, base_temperature=26, window=11):
	    """Return the trailing-window sum of ``xs - base_temperature``.

	    Element ``i`` of the result is the sum of ``xs[j] - base_temperature``
	    over the last ``window`` positions ending at ``i``; the first
	    ``window`` elements use however many values are available so far.

	    Args:
	        xs: 1-D array-like of temperatures in time order.
	        base_temperature: Value subtracted from every element before
	            summing (default 26, as originally hard-coded).
	        window: Number of trailing elements in each sum (default 11, as
	            originally hard-coded). Must be at least 1.

	    Returns:
	        A NumPy array with the same length as ``xs``.

	    Raises:
	        ValueError: If ``window`` is smaller than 1.
	    """
	    if window < 1:
	        raise ValueError("window must be at least 1")
	    cumsum = np.cumsum(np.asarray(xs) - base_temperature)
	    # The difference of two cumulative sums `window` apart is the sum over
	    # the window; the first `window` entries have no earlier term to remove.
	    return np.concatenate((cumsum[:window], cumsum[window:] - cumsum[:-window]))

	def calculate_and_add_cdh(dataframe):
	    """Compute the CDH feature for every row, building by building.

	    ``CDH`` is applied to each building's ``temperature`` values in row
	    order, and each result is written back to that building's row
	    positions. The output therefore lines up with ``dataframe`` whatever
	    the building numbers are and whether or not rows are grouped by
	    building.

	    Args:
	        dataframe: Frame with ``building_number`` and ``temperature``
	            columns. It is not modified.

	    Returns:
	        A 1-D float NumPy array with one value per row of ``dataframe``.
	    """
	    building_numbers = dataframe['building_number'].to_numpy()
	    temperatures = dataframe['temperature'].to_numpy()
	    cdh_values = np.empty(len(dataframe), dtype=float)
	    for building in pd.unique(building_numbers):
	        rows = building_numbers == building
	        cdh_values[rows] = CDH(temperatures[rows])
	    return cdh_values

	# Trailing-window cooling feature, computed per building.
	train['CDH'] = calculate_and_add_cdh(train)
	test['CDH'] = calculate_and_add_cdh(test)

	for _frame in (train, test):
	    _temperature = _frame['temperature']
	    _humidity = _frame['humidity']
	    _wind_pow = _frame['windspeed'] ** 0.16

	    # THI (temperature-humidity index).
	    # NOTE(review): the last factor applies 9/5 to humidity, as written in
	    # the source; the usual THI formula uses temperature there - confirm
	    # before changing, since it alters the model inputs.
	    _frame['THI'] = 9/5*_temperature - 0.55*(1-_humidity/100)*(9/5*_humidity-26)+32

	    # WCT (wind-chill temperature); coefficients kept exactly as written.
	    _frame['WCT'] = 13.12 + 0.6125*_temperature - 11.37*_wind_pow + 0.3965*_wind_pow*_temperature
	# Per-building consumption statistics, computed over all rows of `train`
	# and attached to both frames as features.
	# The aggregations are named by string ('mean' / 'std') rather than passed
	# as NumPy callables: pandas currently maps np.mean / np.std to these
	# same built-ins ('std' is the sample standard deviation, ddof=1) but has
	# deprecated that mapping, so the strings pin the behaviour.
	_day_hour_keys = ['building_number', 'hour', 'day_of_week']
	_hour_keys = ['building_number', 'hour']

	# Mean / std per (building, hour, day of week).
	power_mean = pd.pivot_table(train, values='power_consumption', index=_day_hour_keys, aggfunc='mean').reset_index()
	power_mean.columns = _day_hour_keys + ['day_hour_mean']

	power_std = pd.pivot_table(train, values='power_consumption', index=_day_hour_keys, aggfunc='std').reset_index()
	power_std.columns = _day_hour_keys + ['day_hour_std']

	# Mean / std per (building, hour).
	power_hour_mean = pd.pivot_table(train, values='power_consumption', index=_hour_keys, aggfunc='mean').reset_index()
	power_hour_mean.columns = _hour_keys + ['hour_mean']

	power_hour_std = pd.pivot_table(train, values='power_consumption', index=_hour_keys, aggfunc='std').reset_index()
	power_hour_std.columns = _hour_keys + ['hour_std']

	# Left-merge each statistic onto train and test, in the original order.
	for _stats, _keys in (
	    (power_mean, _day_hour_keys),
	    (power_std, _day_hour_keys),
	    (power_hour_mean, _hour_keys),
	    (power_hour_std, _hour_keys),
	):
	    train = train.merge(_stats, on=_keys, how='left')
	    test = test.merge(_stats, on=_keys, how='left')

	# Renumber rows 0..n-1 so later positional and label-based indexing agree.
	train = train.reset_index(drop=True)

	# Training features: everything except raw capacities, the target,
	# unused weather columns and the raw calendar columns.
	_train_excluded = [
	    'solar_power_capacity', 'ess_capacity', 'pcs_capacity',
	    'power_consumption', 'rainfall', 'sunshine', 'solar_radiation',
	    'hour', 'day', 'month', 'day_of_week', 'date_time',
	]
	X = train.drop(columns=_train_excluded)

	# Target, kept together with building_type so it can be filtered per type.
	Y = train[['building_type', 'power_consumption']]

	# Test features; columns are aligned to the training columns later on.
	_test_excluded = [
	    'solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'rainfall',
	    'hour', 'month', 'day_of_week', 'day', 'date_time',
	]
	test_X = test.drop(columns=_test_excluded)
	# Seed for the fold split and the models; this reassigns RANDOM_SEED,
	# which was set to a different value near the top of the file.
	RANDOM_SEED = 42
	KFOLD_SPLITS = 2

	# Building types in order of first appearance; one model set per type.
	type_list = X["building_type"].unique()

	# Result holders: test predictions and out-of-fold training predictions.
	answer_df = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
	pred_df = pd.DataFrame(index=X.index, columns=["pred"], dtype=float)

	kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_SEED)

	# Train one K-fold XGBoost ensemble per building type. Models are fitted
	# on log(power_consumption); predictions are exponentiated back.
	for btype in type_list:
	    start_proj = time.time()

	    # Rows belonging to this building type.
	    x = X[X['building_type'] == btype].copy()
	    y = Y[Y['building_type'] == btype]['power_consumption'].copy()
	    xt = test_X[test_X['building_type'] == btype].copy()

	    # One-hot encode the building number, then give the test frame exactly
	    # the training columns (missing ones filled with 0).
	    x = pd.get_dummies(x, columns=["building_number"], drop_first=False)
	    xt = pd.get_dummies(xt, columns=["building_number"], drop_first=False)
	    xt = xt.reindex(columns=x.columns, fill_value=0)

	    drop_cols = ["building_type"]
	    x = x.drop(columns=drop_cols)
	    xt = xt.drop(columns=drop_cols)

	    preds_valid = pd.Series(index=y.index, dtype=float)
	    preds_test = []

	    # Force a float matrix: get_dummies may emit bool columns, and mixing
	    # them with float columns would otherwise produce an object array.
	    x_values = x.to_numpy(dtype=float)
	    y_values = y.values
	    xt_values = xt.to_numpy(dtype=float)

	    fold_scores = []
	    for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values), 1):
	        X_tr, X_va = x_values[tr_idx], x_values[va_idx]
	        y_tr, y_va = y_values[tr_idx], y_values[va_idx]

	        y_tr_log = np.log(y_tr)
	        y_va_log = np.log(y_va)

	        model = XGBRegressor(
	            learning_rate = 0.0013292918943162175,
	            n_estimators = 4759,
	            max_depth = 23,
	            subsample = 0.7993292420985183,
	            colsample_bytree = 0.5780093202212182,
	            min_child_weight = 2,
	            gamma = 0.2904180608409973,
	            reg_alpha = 0.6245760287469893,
	            reg_lambda = 0.002570603566117598,
	            random_state = RANDOM_SEED,
	            n_jobs = -1,
	            objective = weighted_mse(3),
	            #tree_method = "gpu_hist",
	            #gpu_id = 0,
	            early_stopping_rounds = 100,
	        )

	        # NOTE(review): newer XGBoost releases are believed to no longer
	        # accept eval_metric in fit() - confirm against the installed
	        # version before upgrading.
	        model.fit(
	            X_tr, y_tr_log,
	            eval_set=[(X_va, y_va_log)],
	            eval_metric=custom_smape,
	            verbose=False,
	        )

	        # Out-of-fold predictions, back on the original scale.
	        va_pred = np.exp(model.predict(X_va))
	        preds_valid.iloc[va_idx] = va_pred

	        fold_smape = smape(y_va, va_pred)
	        fold_scores.append(fold_smape)

	        # Test-set prediction from this fold's model.
	        preds_test.append(np.exp(model.predict(xt_values)))

	    # Store the out-of-fold (validation) predictions.
	    pred_df.loc[preds_valid.index, "pred"] = preds_valid

	    # Store the test predictions, averaged over the fold models.
	    answer_df.loc[xt.index, "answer"] = np.mean(preds_test, axis=0)
	    end_proj = time.time()
	    print(f"Building type = {btype} : XGB SMAPE = {np.mean(fold_scores):.4f} : {end_proj-start_proj:5f} sec")

	# Overall out-of-fold SMAPE across all building types, aligned by index.
	_actual = Y.sort_index()["power_consumption"].values
	_predicted = pred_df.sort_index()["pred"].values
	total_smape = smape(_actual, _predicted)
	print(f"Total SMAPE = {total_smape:.4f}")

	# Fill the sample submission with the averaged test predictions and save.
	submit = pd.read_csv("sample_submission.csv")
	submit["answer"] = answer_df.loc[submit.index, "answer"]
	submit.to_csv("private_재현_3.csv", index=False)
	print("Saved → private_재현_3.csv")