|
|
|
|
|
|
|
|
""" |
|
|
============================================================ |
|
|
train_core.py โ ํ์ต ํต์ฌ ๋ก์ง(์ฃผ์ ์์ฃผ ์์ธํ) |
|
|
------------------------------------------------------------ |
|
|
์ด ํ์ผ์ ๋ค์ ์ผ์ ํด์: |
|
|
1) ํ๊ฐ ์งํ ํจ์ ์ ์(RMSE/MAE/MAPE) |
|
|
2) ์ฌ์ฉํ ๋ชจ๋ธ ํ๋ณด๋ค์ ๋ชจ์์ฃผ๋ ํจ์(get_candidates) |
|
|
3) ์๊ณ์ด ๋ถํ (ํ์ต/๊ฒ์ฆ ๋๋๊ธฐ) |
|
|
4) ๊ฐ๋จํ ์์๋ธ(SimpleEnsemble) |
|
|
5) (์ต์
) Optuna ๋ก ํ์ดํผํ๋ผ๋ฏธํฐ ํ๋ |
|
|
6) train_and_score: ๋ชจ๋ธ๋ค ํ์ต โ ๊ฒ์ฆ ์ฑ๋ฅ ๋น๊ต โ ๋ฒ ์คํธ ์ ํ |
|
|
7) save_artifacts: ๋ฒ ์คํธ ๋ชจ๋ธ/๋ฆฌ๋๋ณด๋ ์ ์ฅ |
|
|
|
|
|
โป XGBoost/LightGBM/Optuna ๋ ์ค์น๋์ด ์์ง ์์ผ๋ฉด |
|
|
์๋์ผ๋ก ๊ฑด๋๋ฐ๋๋ก ๋ง๋ค์ด์ก์ต๋๋ค. |
|
|
============================================================ |
|
|
""" |
|
|
|
|
|
import os |
|
|
import pickle |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
from sklearn.metrics import mean_squared_error, mean_absolute_error |
|
|
|
|
|
|
|
|
from sklearn.linear_model import LinearRegression |
|
|
from sklearn.ensemble import RandomForestRegressor |
|
|
|
|
|
|
|
|
try: |
|
|
from xgboost import XGBRegressor |
|
|
except Exception: |
|
|
XGBRegressor = None |
|
|
|
|
|
try: |
|
|
from lightgbm import LGBMRegressor |
|
|
except Exception: |
|
|
LGBMRegressor = None |
|
|
|
|
|
|
|
|
try: |
|
|
import optuna |
|
|
except Exception: |
|
|
optuna = None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def rmse(a, b):
    """Return the root-mean-squared error between targets `a` and predictions `b`.

    Lower is better. Returns NaN for empty input.

    Computed directly with NumPy instead of round-tripping through
    sklearn.metrics for a one-line formula.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if a.size == 0:
        return float("nan")
    return float(np.sqrt(np.mean((a - b) ** 2)))
|
|
|
|
|
|
|
|
def mae(a, b):
    """Return the mean absolute error between targets `a` and predictions `b`.

    Easy intuition: "on average, how many units off was the prediction?"
    Returns NaN for empty input.

    Computed directly with NumPy instead of round-tripping through
    sklearn.metrics for a one-line formula.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if a.size == 0:
        return float("nan")
    return float(np.mean(np.abs(a - b)))
|
|
|
|
|
|
|
|
def mape(a, b):
    """Return the mean absolute percentage error (in %) between `a` and `b`.

    A value of 10 means "off by 10% on average". Zero entries in the actual
    values `a` are replaced by 1 in the denominator to avoid division by
    zero. Returns NaN for empty input.
    """
    actual = np.array(a)
    predicted = np.array(b)
    if len(actual) == 0:
        return float("nan")
    safe_denom = np.where(actual == 0, 1, actual)
    pct_errors = np.abs((actual - predicted) / safe_denom)
    return float(pct_errors.mean() * 100.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_candidates():
    """Return the list of candidate models as (name, estimator, fit_kwargs) tuples.

    Candidates:
    - LinearRegression: the simplest linear baseline
    - RandomForest: tree ensemble that also captures non-linear patterns
    - XGBoost / LightGBM: fast, strong boosting models — only added when
      their packages are importable (module-level import may have failed)
    """
    candidates = [
        ("LinearRegression", LinearRegression(), {}),
        (
            "RandomForest",
            RandomForestRegressor(
                n_estimators=300,
                max_depth=None,
                random_state=42,
                n_jobs=-1,
            ),
            {},
        ),
    ]

    if XGBRegressor is not None:
        xgb = XGBRegressor(
            n_estimators=400,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            random_state=42,
            tree_method="hist",
            n_jobs=-1,
        )
        candidates.append(("XGBoost", xgb, {"verbose": False}))

    if LGBMRegressor is not None:
        lgbm = LGBMRegressor(
            n_estimators=600,
            max_depth=-1,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            random_state=42,
            n_jobs=-1,
        )
        candidates.append(("LightGBM", lgbm, {}))

    return candidates
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def time_split(X, y, valid_ratio=0.2):
    """Split (X, y) chronologically: earliest rows train, latest rows validate.

    Time-ordered data should not be shuffled randomly, so the split is a
    simple positional cut. With valid_ratio=0.2 the last 20% of rows become
    the validation set (at least one row is always held out).

    Returns (X_train, y_train, X_valid, y_valid).
    """
    total = len(X)
    n_valid = max(1, int(total * valid_ratio))
    cut = total - n_valid
    return X[:cut], y[:cut], X[cut:], y[cut:]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SimpleEnsemble:
    """Weighted-average ensemble over already-fitted models.

    A larger weight means that model is trusted more; the caller typically
    passes inverse validation RMSEs (better model -> larger weight). The
    weights are normalized to sum to 1 at construction time.
    """

    def __init__(self, models, weights):
        self.models = models
        # Normalize the weights so they sum to 1; the 1e-9 floor guards
        # against a zero (or near-zero) total.
        raw = np.array(weights, dtype=float)
        self.weights = raw / max(np.sum(weights), 1e-9)

    def predict(self, X):
        """Return the weighted average of every member model's prediction."""
        stacked = np.array([mdl.predict(X) for mdl in self.models])
        # stacked has shape (n_models, n_samples); weight each column.
        return np.sum(stacked.T * self.weights, axis=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _tune_with_optuna(name, base_model, X_tr, y_tr, X_va, y_va, n_trials=20):
    """Search for good hyper-parameters for one model family with Optuna.

    Parameters
    ----------
    name : str
        Model family name ("RandomForest", "XGBoost" or "LightGBM").
    base_model : object
        The untuned model; kept for interface compatibility but not used
        (a fresh estimator is rebuilt from the sampled parameters).
    X_tr, y_tr : array-like
        Training split.
    X_va, y_va : array-like
        Validation split; trials are scored by validation RMSE.
    n_trials : int
        Number of Optuna trials (more = more thorough but slower).

    Returns
    -------
    A fitted estimator built from the best trial's parameters, or None when
    Optuna is unavailable or `name` is not a tunable/installed model family.
    """
    if optuna is None:
        return None

    # Bail out early for unsupported (or not-installed) model families.
    # The previous implementation still ran n_trials useless trials that
    # each returned a 1e9 sentinel before giving up.
    tunable = (
        name == "RandomForest"
        or (name == "XGBoost" and XGBRegressor is not None)
        or (name == "LightGBM" and LGBMRegressor is not None)
    )
    if not tunable:
        return None

    def _sample_params(trial):
        # Draw one hyper-parameter set from the family-specific search space.
        if name == "RandomForest":
            return {
                "n_estimators": trial.suggest_int("n_estimators", 200, 800, step=100),
                "max_depth": trial.suggest_int("max_depth", 6, 24, step=2),
            }
        if name == "XGBoost":
            return {
                "n_estimators": trial.suggest_int("n_estimators", 300, 900, step=100),
                "max_depth": trial.suggest_int("max_depth", 4, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.2, log=True),
                "subsample": trial.suggest_float("subsample", 0.7, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
                "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
            }
        # Only "LightGBM" remains after the tunable guard above.
        return {
            "n_estimators": trial.suggest_int("n_estimators", 400, 1400, step=200),
            "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.2, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 31, 255, step=16),
            "subsample": trial.suggest_float("subsample", 0.7, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
        }

    def _build_model(params):
        # Single place that constructs the estimator, shared by the trials
        # and the final rebuild (previously duplicated constructor calls).
        if name == "RandomForest":
            return RandomForestRegressor(random_state=42, n_jobs=-1, **params)
        if name == "XGBoost":
            return XGBRegressor(
                random_state=42, tree_method="hist", n_jobs=-1, **params
            )
        return LGBMRegressor(random_state=42, n_jobs=-1, **params)

    def objective(trial):
        # Score one candidate by its validation RMSE (lower is better).
        m = _build_model(_sample_params(trial))
        m.fit(X_tr, y_tr)
        return rmse(y_va, m.predict(X_va))

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    # Re-fit a fresh estimator with the best parameter set on the training
    # split only (the validation split stays held out for model comparison).
    best = _build_model(study.best_params)
    best.fit(X_tr, y_tr)
    return best
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def train_and_score(X, y, valid_ratio=0.2, use_optuna=False, optuna_trials=15, build_ensemble=True):
    """Train every candidate model, compare validation scores, return the best.

    The data is split chronologically (time_split): the earliest rows train,
    the latest rows validate. Each candidate from get_candidates() is fitted
    and scored with RMSE/MAE/MAPE on the validation split; the model with
    the lowest validation RMSE wins.

    Parameters
    ----------
    X, y : array-like
        Training data (anything positionally sliceable, e.g. numpy arrays).
    valid_ratio : float
        Fraction of rows held out for validation (0.2 = last 20%).
    use_optuna : bool
        If True, attempt Optuna hyper-parameter tuning per model
        (silently skipped when Optuna is not installed).
    optuna_trials : int
        Number of tuning trials per model.
    build_ensemble : bool
        If True and at least two models trained successfully, also score a
        weighted-average SimpleEnsemble as an extra candidate.

    Returns
    -------
    best_model
        The fitted model (single estimator or SimpleEnsemble) with the
        lowest validation RMSE; None if every candidate failed.
    lb : pandas.DataFrame
        Leaderboard sorted by ascending RMSE; failed models appear with
        NaN scores at the bottom.
    """
    X_tr, y_tr, X_va, y_va = time_split(X, y, valid_ratio=valid_ratio)

    rows = []                            # one leaderboard row per candidate
    best = (None, None, float("inf"))    # (name, model, rmse) of current best
    fitted = []                          # successfully fitted (name, model) pairs
    va_preds = []                        # their validation predictions (for the ensemble)

    for name, mdl, fit_params in get_candidates():
        try:
            # Optionally replace the default model with an Optuna-tuned one;
            # fall back to the default when tuning is unavailable.
            if use_optuna:
                tuned = _tune_with_optuna(name, mdl, X_tr, y_tr, X_va, y_va, n_trials=optuna_trials)
                if tuned is not None:
                    mdl = tuned

            mdl.fit(X_tr, y_tr, **fit_params)

            pred = mdl.predict(X_va)

            row = {
                "model": name,
                "rmse": rmse(y_va, pred),
                "mae": mae(y_va, pred),
                "mape": mape(y_va, pred)
            }
            rows.append(row)

            fitted.append((name, mdl))
            va_preds.append(pred)

            # Track the lowest validation RMSE seen so far.
            if row["rmse"] < best[2]:
                best = (name, mdl, row["rmse"])

        except Exception:
            # Best-effort: a model that fails to train/predict is recorded
            # with NaN scores instead of aborting the whole comparison.
            rows.append({"model": name, "rmse": np.nan, "mae": np.nan, "mape": np.nan})

    if build_ensemble and len(va_preds) >= 2:
        # Weight each model by the inverse of its validation RMSE (better
        # model -> larger weight); 1e-6 guards against division by zero.
        rmses = [rmse(y_va, p) for p in va_preds]
        weights = [1.0 / max(r, 1e-6) for r in rmses]

        ens = SimpleEnsemble([m for _, m in fitted], weights)
        ens_pred = ens.predict(X_va)

        row = {
            "model": "Ensemble",
            "rmse": rmse(y_va, ens_pred),
            "mae": mae(y_va, ens_pred),
            "mape": mape(y_va, ens_pred)
        }
        rows.append(row)

        # The ensemble competes against the single models on equal terms.
        if row["rmse"] < best[2]:
            best = ("Ensemble", ens, row["rmse"])

    # Leaderboard sorted ascending by RMSE; NaN (failed) rows go last.
    lb = pd.DataFrame(rows).sort_values("rmse", na_position="last").reset_index(drop=True)

    return best[1], lb
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_artifacts(out_dirs, best_model, feature_names, mapping, leaderboard_df):
    """Persist the training results to disk.

    Parameters
    ----------
    out_dirs : list[str]
        Directories to write into (e.g. ['artifacts', 'models']). Identical
        files are written to every directory (for easy recovery/sharing).
    best_model : object
        Best model (or ensemble) picked by train_and_score.
    feature_names : list[str]
        Input column names the model was trained on.
    mapping : dict
        Date/target/category mapping needed later for inference.
    leaderboard_df : pandas.DataFrame
        Performance table.

    Files written per directory:
    - best_model.pkl: pickle of {model, feature_names, mapping}
    - leaderboard.csv: UTF-8-SIG encoding (Excel-friendly)
    - leaderboard.parquet: only when a parquet engine is available
    """
    payload = {
        "model": best_model,
        "feature_names": feature_names,
        "mapping": mapping,
    }

    for d in out_dirs:
        os.makedirs(d, exist_ok=True)

        # Fix: the file was previously opened "wb" with an empty body
        # (a pointless truncate) and then opened a second time for the
        # actual dump; a single open/write is sufficient.
        with open(os.path.join(d, "best_model.pkl"), "wb") as f:
            pickle.dump(payload, f)

        leaderboard_df.to_csv(
            os.path.join(d, "leaderboard.csv"),
            index=False,
            encoding="utf-8-sig"
        )

        # Parquet output is best-effort: skip silently when no engine
        # (pyarrow / fastparquet) is installed.
        try:
            leaderboard_df.to_parquet(
                os.path.join(d, "leaderboard.parquet"),
                index=False
            )
        except Exception:
            pass
|
|
|