# cstore / train_core.py
# (Hugging Face upload page residue kept as comments so the file parses)
# leedami's picture — Upload 7 files
# 5841e58 verified
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
============================================================
train_core.py โ€” ํ•™์Šต ํ•ต์‹ฌ ๋กœ์ง(์ฃผ์„ ์•„์ฃผ ์ž์„ธํžˆ)
------------------------------------------------------------
์ด ํŒŒ์ผ์€ ๋‹ค์Œ ์ผ์„ ํ•ด์š”:
1) ํ‰๊ฐ€ ์ง€ํ‘œ ํ•จ์ˆ˜ ์ •์˜(RMSE/MAE/MAPE)
2) ์‚ฌ์šฉํ•  ๋ชจ๋ธ ํ›„๋ณด๋“ค์„ ๋ชจ์•„์ฃผ๋Š” ํ•จ์ˆ˜(get_candidates)
3) ์‹œ๊ณ„์—ด ๋ถ„ํ• (ํ•™์Šต/๊ฒ€์ฆ ๋‚˜๋ˆ„๊ธฐ)
4) ๊ฐ„๋‹จํ•œ ์•™์ƒ๋ธ”(SimpleEnsemble)
5) (์˜ต์…˜) Optuna ๋กœ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹
6) train_and_score: ๋ชจ๋ธ๋“ค ํ•™์Šต โ†’ ๊ฒ€์ฆ ์„ฑ๋Šฅ ๋น„๊ต โ†’ ๋ฒ ์ŠคํŠธ ์„ ํƒ
7) save_artifacts: ๋ฒ ์ŠคํŠธ ๋ชจ๋ธ/๋ฆฌ๋”๋ณด๋“œ ์ €์žฅ
โ€ป XGBoost/LightGBM/Optuna ๋Š” ์„ค์น˜๋˜์–ด ์žˆ์ง€ ์•Š์œผ๋ฉด
์ž๋™์œผ๋กœ ๊ฑด๋„ˆ๋›ฐ๋„๋ก ๋งŒ๋“ค์–ด์กŒ์Šต๋‹ˆ๋‹ค.
============================================================
"""
import os
import pickle
import numpy as np
import pandas as pd
# ํ‰๊ฐ€ ์ง€ํ‘œ ๊ณ„์‚ฐ์„ ์œ„ํ•ด scikit-learn ํ•จ์ˆ˜ ์‚ฌ์šฉ
from sklearn.metrics import mean_squared_error, mean_absolute_error
# ๊ธฐ๋ณธ ์„ ํ˜•ํšŒ๊ท€/๋žœ๋คํฌ๋ ˆ์ŠคํŠธ
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# XGBoost / LightGBM ์€ ์žˆ์„ ์ˆ˜๋„, ์—†์„ ์ˆ˜๋„ ์žˆ์–ด์š”. (try/except)
try:
from xgboost import XGBRegressor
except Exception:
XGBRegressor = None
try:
from lightgbm import LGBMRegressor
except Exception:
LGBMRegressor = None
# Optuna(ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ์ž๋™ ํƒ์ƒ‰๊ธฐ)๋„ ์„ ํƒ์‚ฌํ•ญ
try:
import optuna
except Exception:
optuna = None
# ------------------------------------------------------------
# 1) ํ‰๊ฐ€ ์ง€ํ‘œ: RMSE / MAE / MAPE
# ------------------------------------------------------------
def rmse(a, b):
    """
    Root Mean Squared Error.

    Square root of the mean squared difference between the actual
    values ``a`` and the predictions ``b``.  Lower is better.
    Returns NaN when the input is empty.
    """
    a, b = np.array(a), np.array(b)
    if not len(a):
        return float("nan")
    return float(np.sqrt(mean_squared_error(a, b)))
def mae(a, b):
    """
    Mean Absolute Error.

    Average of the absolute differences between actual and predicted
    values — intuitively, "by how many units are we off on average?"
    Returns NaN when the input is empty.
    """
    a, b = np.array(a), np.array(b)
    if not len(a):
        return float("nan")
    return float(mean_absolute_error(a, b))
def mape(a, b):
    """
    Mean Absolute Percentage Error.

    Percentage-based error: 10 means "off by 10% on average".
    Zeros in the actual values would break the division, so they are
    replaced by 1 in the denominator as a safety guard.
    Returns NaN when the input is empty.
    """
    actual = np.array(a)
    predicted = np.array(b)
    if len(actual) == 0:
        return float("nan")
    # Guard against division by zero: substitute 1 where actual == 0.
    safe_denom = np.where(actual == 0, 1, actual)
    pct_error = np.abs((actual - predicted) / safe_denom)
    return float(np.mean(pct_error) * 100.0)
# ------------------------------------------------------------
# 2) ๋ชจ๋ธ ํ›„๋ณด๋ฅผ ๋งŒ๋“ค์–ด ์ฃผ๋Š” ํ•จ์ˆ˜
# ------------------------------------------------------------
def get_candidates():
    """
    Build the list of usable candidate models.

    Each element is a tuple ``(name, estimator, fit_kwargs)`` where
    ``fit_kwargs`` holds extra keyword arguments passed to ``fit``.

    - LinearRegression: the most basic linear model
    - RandomForest: tree ensemble that also captures non-linear patterns
    - XGBoost / LightGBM: fast, strong boosting models (only when installed)
    """
    # Always-available baselines: linear regression (almost nothing to
    # configure) and a random forest (300 trees, unlimited depth — reduce
    # if it overfits — using every CPU core).
    candidates = [
        ("LinearRegression", LinearRegression(), {}),
        (
            "RandomForest",
            RandomForestRegressor(
                n_estimators=300,
                max_depth=None,
                random_state=42,
                n_jobs=-1,
            ),
            {},
        ),
    ]
    # Optional boosters — appended only when the import succeeded.
    if XGBRegressor is not None:
        xgb = XGBRegressor(
            n_estimators=400,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            random_state=42,
            tree_method="hist",  # fast histogram-based splitting
            n_jobs=-1,
        )
        # Example of an extra fit-time argument carried in the tuple.
        candidates.append(("XGBoost", xgb, {"verbose": False}))
    if LGBMRegressor is not None:
        lgbm = LGBMRegressor(
            n_estimators=600,
            max_depth=-1,  # automatic depth
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            random_state=42,
            n_jobs=-1,
        )
        candidates.append(("LightGBM", lgbm, {}))
    return candidates
# ------------------------------------------------------------
# 3) ์‹œ๊ณ„์—ด ๋ถ„ํ• : ์•ž๋ถ€๋ถ„(ํ•™์Šต) / ๋’ท๋ถ€๋ถ„(๊ฒ€์ฆ)
# ------------------------------------------------------------
def time_split(X, y, valid_ratio=0.2):
    """
    Chronological train/validation split.

    To respect temporal order (time series are usually NOT shuffled),
    the leading part of the data becomes the training set and the
    trailing ``valid_ratio`` fraction (at least one sample) becomes the
    validation set.

    Returns ``(X_train, y_train, X_valid, y_valid)``.
    """
    total = len(X)
    n_valid = max(1, int(total * valid_ratio))  # validation size, >= 1
    cut = total - n_valid                       # training size
    return X[:cut], y[:cut], X[cut:], y[cut:]
# ------------------------------------------------------------
# 4) ๊ฐ„๋‹จํ•œ ์•™์ƒ๋ธ”: ์—ฌ๋Ÿฌ ๋ชจ๋ธ ์˜ˆ์ธก์„ '๊ฐ€์ค‘ ํ‰๊ท '
# ------------------------------------------------------------
class SimpleEnsemble:
    """
    Weighted-average ensemble over several already-fitted models.

    ``weights``: larger values mean the corresponding model is trusted
    more.  The caller here typically uses the inverse of each model's
    validation RMSE (better model -> bigger weight).
    """

    def __init__(self, models, weights):
        self.models = models
        # Normalize so the weights sum to 1; the tiny floor on the
        # denominator guards against a zero weight sum.
        total = max(np.sum(weights), 1e-9)
        self.weights = np.array(weights, dtype=float) / total

    def predict(self, X):
        """Return the weighted average of every member model's prediction."""
        stacked = np.array([model.predict(X) for model in self.models])
        # stacked.T is (n_samples, n_models); broadcasting with the
        # (n_models,) weight vector and summing gives (n_samples,).
        return np.sum(stacked.T * self.weights, axis=1)
# ------------------------------------------------------------
# 5) Optuna ๋กœ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹(์„ ํƒ)
# ------------------------------------------------------------
def _tune_with_optuna(name, base_model, X_tr, y_tr, X_va, y_va, n_trials=20):
    """
    Search good hyperparameters for a specific model with Optuna.

    Parameters
    ----------
    name : str
        Model name ("RandomForest" / "XGBoost" / "LightGBM").
    base_model :
        The original model (mostly ignored; a fresh one is built).
    X_tr, y_tr :
        Training set.
    X_va, y_va :
        Validation set.
    n_trials : int
        Number of trials (more is more thorough but slower).

    Returns
    -------
    The best refitted model when tuning is possible, otherwise ``None``
    (Optuna missing, or the model name is not supported here).
    """
    if optuna is None:
        return None  # Optuna not installed -> skip tuning entirely
    # Search objective: minimize validation-set RMSE.
    def objective(trial):
        if name == "RandomForest":
            # Search ranges (roughly reasonable intervals).
            n_estimators = trial.suggest_int("n_estimators", 200, 800, step=100)
            max_depth = trial.suggest_int("max_depth", 6, 24, step=2)
            m = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42,
                n_jobs=-1
            )
        elif name == "XGBoost" and XGBRegressor is not None:
            n_estimators = trial.suggest_int("n_estimators", 300, 900, step=100)
            max_depth = trial.suggest_int("max_depth", 4, 10)
            lr = trial.suggest_float("learning_rate", 0.02, 0.2, log=True)
            subsample = trial.suggest_float("subsample", 0.7, 1.0)
            colsample = trial.suggest_float("colsample_bytree", 0.7, 1.0)
            lam = trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True)
            m = XGBRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=lr,
                subsample=subsample,
                colsample_bytree=colsample,
                reg_lambda=lam,
                random_state=42,
                tree_method="hist",
                n_jobs=-1
            )
        elif name == "LightGBM" and LGBMRegressor is not None:
            n_estimators = trial.suggest_int("n_estimators", 400, 1400, step=200)
            lr = trial.suggest_float("learning_rate", 0.02, 0.2, log=True)
            num_leaves = trial.suggest_int("num_leaves", 31, 255, step=16)
            subsample = trial.suggest_float("subsample", 0.7, 1.0)
            colsample = trial.suggest_float("colsample_bytree", 0.7, 1.0)
            m = LGBMRegressor(
                n_estimators=n_estimators,
                learning_rate=lr,
                num_leaves=num_leaves,
                subsample=subsample,
                colsample_bytree=colsample,
                random_state=42,
                n_jobs=-1
            )
        else:
            # Unsupported model name: return a huge number (a bad score).
            return 1e9
        # Fit, predict on the validation set, and report RMSE.
        m.fit(X_tr, y_tr)
        p = m.predict(X_va)
        return rmse(y_va, p)
    # Run Optuna (minimization).
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    # Rebuild the model from the best parameters and refit it.
    best_params = study.best_params
    if name == "RandomForest":
        m = RandomForestRegressor(
            n_estimators=best_params["n_estimators"],
            max_depth=best_params["max_depth"],
            random_state=42,
            n_jobs=-1
        )
    elif name == "XGBoost" and XGBRegressor is not None:
        m = XGBRegressor(
            n_estimators=best_params["n_estimators"],
            max_depth=best_params["max_depth"],
            learning_rate=best_params["learning_rate"],
            subsample=best_params["subsample"],
            colsample_bytree=best_params["colsample_bytree"],
            reg_lambda=best_params["reg_lambda"],
            random_state=42,
            tree_method="hist",
            n_jobs=-1
        )
    elif name == "LightGBM" and LGBMRegressor is not None:
        m = LGBMRegressor(
            n_estimators=best_params["n_estimators"],
            learning_rate=best_params["learning_rate"],
            num_leaves=best_params["num_leaves"],
            subsample=best_params["subsample"],
            colsample_bytree=best_params["colsample_bytree"],
            random_state=42,
            n_jobs=-1
        )
    else:
        return None
    # Return the best model fitted on the full training split.
    m.fit(X_tr, y_tr)
    return m
# ------------------------------------------------------------
# 6) ํ•™์Šต & ์„ฑ๋Šฅ ๋น„๊ต โ†’ ๋ฒ ์ŠคํŠธ ๋ชจ๋ธ ์„ ํƒ
# ------------------------------------------------------------
def train_and_score(X, y, valid_ratio=0.2, use_optuna=False, optuna_trials=15, build_ensemble=True):
    """
    Train several models, compare their validation scores
    (RMSE/MAE/MAPE), and return the best one.

    Parameters
    ----------
    X, y :
        Training data (arrays / numpy).
    valid_ratio : float
        Validation fraction (0.2 = 20%).
    use_optuna : bool
        When True, try per-model hyperparameter tuning first.
    optuna_trials : int
        Number of tuning trials.
    build_ensemble : bool
        When True, also add a simple weighted ensemble as a candidate.

    Returns
    -------
    best_model :
        The best-performing model (single model or SimpleEnsemble).
    lb : pandas.DataFrame
        Score leaderboard, sorted by ascending RMSE.
    """
    # Chronological split (front: train, back: validation).
    X_tr, y_tr, X_va, y_va = time_split(X, y, valid_ratio=valid_ratio)
    rows = []  # one score row per model (later turned into a DataFrame)
    best = (None, None, float("inf"))  # (name, model, lowest RMSE so far)
    fitted = []  # successfully fitted (name, model) pairs
    va_preds = []  # validation predictions (used to build the ensemble)
    # Fit and evaluate each candidate model in turn.
    # NOTE: rows/fitted/va_preds are only appended together on success,
    # which keeps fitted and va_preds index-aligned for the ensemble.
    for name, mdl, fit_params in get_candidates():
        try:
            # If tuning is enabled, try Optuna first.
            if use_optuna:
                tuned = _tune_with_optuna(name, mdl, X_tr, y_tr, X_va, y_va, n_trials=optuna_trials)
                if tuned is not None:
                    mdl = tuned  # swap in the tuned model on success
            # Fit the model.
            mdl.fit(X_tr, y_tr, **fit_params)
            # Predict on the validation set.
            pred = mdl.predict(X_va)
            # Write one row of the score table.
            row = {
                "model": name,
                "rmse": rmse(y_va, pred),
                "mae": mae(y_va, pred),
                "mape": mape(y_va, pred)
            }
            rows.append(row)
            # Keep for the ensemble candidate.
            fitted.append((name, mdl))
            va_preds.append(pred)
            # Update the current best if the RMSE improved.
            if row["rmse"] < best[2]:
                best = (name, mdl, row["rmse"])
        except Exception:
            # A single model failure must not stop the whole pipeline;
            # record NaN scores and move on (deliberate best-effort).
            rows.append({"model": name, "rmse": np.nan, "mae": np.nan, "mape": np.nan})
    # ---- Optional simple-ensemble candidate ----
    # Only attempted when at least two models succeeded.
    if build_ensemble and len(va_preds) >= 2:
        # Inverse RMSE as the weight (better model -> bigger weight).
        rmses = [rmse(y_va, p) for p in va_preds]
        weights = [1.0 / max(r, 1e-6) for r in rmses]  # avoid dividing by 0
        ens = SimpleEnsemble([m for _, m in fitted], weights)
        ens_pred = ens.predict(X_va)
        row = {
            "model": "Ensemble",
            "rmse": rmse(y_va, ens_pred),
            "mae": mae(y_va, ens_pred),
            "mape": mape(y_va, ens_pred)
        }
        rows.append(row)
        # Promote the ensemble to best if it wins.
        if row["rmse"] < best[2]:
            best = ("Ensemble", ens, row["rmse"])
    # Build the leaderboard table (ascending RMSE, NaN rows last).
    lb = pd.DataFrame(rows).sort_values("rmse", na_position="last").reset_index(drop=True)
    # best[1] is the best model object.
    return best[1], lb
# ------------------------------------------------------------
# 7) ์‚ฐ์ถœ๋ฌผ ์ €์žฅ(๋ฒ ์ŠคํŠธ ๋ชจ๋ธ/ํ”ผ์ฒ˜๋ช…/๋งคํ•‘/๋ฆฌ๋”๋ณด๋“œ)
# ------------------------------------------------------------
def save_artifacts(out_dirs, best_model, feature_names, mapping, leaderboard_df):
    """
    Persist training artifacts to disk.

    Parameters
    ----------
    out_dirs : list[str]
        Folders to write into (e.g. ['artifacts', 'models']).  The same
        files are written to every folder (for recovery/sharing).
    best_model :
        The best model (or ensemble) picked by ``train_and_score``.
    feature_names : list[str]
        Input column names the model expects.
    mapping : dict
        Date/target/category mapping needed to reproduce predictions.
    leaderboard_df : pandas.DataFrame
        Score table.

    Files created in each folder
    ----------------------------
    - best_model.pkl      : pickle of {model, feature_names, mapping}
    - leaderboard.csv     : scores (UTF-8-SIG, Excel-compatible)
    - leaderboard.parquet : scores (only if a parquet engine exists)
    """
    payload = {
        "model": best_model,
        "feature_names": feature_names,
        "mapping": mapping
    }
    for d in out_dirs:
        os.makedirs(d, exist_ok=True)
        # 1) Best-model bundle (pickle serializes the Python objects).
        #    Fix: the original opened the file once with an empty body
        #    (a pointless open+truncate) before reopening to write —
        #    that dead first open is removed; one open/write suffices.
        with open(os.path.join(d, "best_model.pkl"), "wb") as f:
            pickle.dump(payload, f)
        # 2) Leaderboard as CSV (utf-8-sig keeps Korean readable in Excel).
        leaderboard_df.to_csv(
            os.path.join(d, "leaderboard.csv"),
            index=False,
            encoding="utf-8-sig"
        )
        # 3) Leaderboard as Parquet — best effort: the engine
        #    (e.g. pyarrow) may be missing, so failures are ignored.
        try:
            leaderboard_df.to_parquet(
                os.path.join(d, "leaderboard.parquet"),
                index=False
            )
        except Exception:
            pass