Spaces:
Paused
Paused
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
============================================================
train_core.py — core training logic (heavily commented)
------------------------------------------------------------
This module does the following:
 1) defines evaluation metrics (RMSE / MAE / MAPE)
 2) get_candidates: assembles the candidate models to try
 3) chronological train/validation split
 4) a simple weighted-average ensemble (SimpleEnsemble)
 5) (optional) hyperparameter tuning with Optuna
 6) train_and_score: fit models -> compare validation scores -> pick best
 7) save_artifacts: persist the best model and the leaderboard
 * XGBoost / LightGBM / Optuna are optional dependencies:
   when they are not installed they are skipped automatically.
============================================================
"""
| import os | |
| import pickle | |
| import numpy as np | |
| import pandas as pd | |
| # ํ๊ฐ ์งํ ๊ณ์ฐ์ ์ํด scikit-learn ํจ์ ์ฌ์ฉ | |
| from sklearn.metrics import mean_squared_error, mean_absolute_error | |
| # ๊ธฐ๋ณธ ์ ํํ๊ท/๋๋คํฌ๋ ์คํธ | |
| from sklearn.linear_model import LinearRegression | |
| from sklearn.ensemble import RandomForestRegressor | |
| # XGBoost / LightGBM ์ ์์ ์๋, ์์ ์๋ ์์ด์. (try/except) | |
| try: | |
| from xgboost import XGBRegressor | |
| except Exception: | |
| XGBRegressor = None | |
| try: | |
| from lightgbm import LGBMRegressor | |
| except Exception: | |
| LGBMRegressor = None | |
| # Optuna(ํ์ดํผํ๋ผ๋ฏธํฐ ์๋ ํ์๊ธฐ)๋ ์ ํ์ฌํญ | |
| try: | |
| import optuna | |
| except Exception: | |
| optuna = None | |
| # ------------------------------------------------------------ | |
| # 1) ํ๊ฐ ์งํ: RMSE / MAE / MAPE | |
| # ------------------------------------------------------------ | |
def rmse(a, b):
    """
    RMSE (Root Mean Squared Error).

    Square root of the mean squared difference between actuals ``a`` and
    predictions ``b`` — lower is better.  Computed directly with NumPy
    (identical to the sqrt of scikit-learn's uniform-average
    ``mean_squared_error``), so no sklearn round-trip or extra copies.

    Returns ``nan`` for empty input instead of raising.
    """
    a = np.asarray(a, dtype=float)  # asarray: no copy when already an ndarray
    b = np.asarray(b, dtype=float)
    if len(a) == 0:
        return float("nan")  # empty validation set -> undefined score
    return float(np.sqrt(np.mean((a - b) ** 2)))
def mae(a, b):
    """
    MAE (Mean Absolute Error).

    Average absolute difference between actuals ``a`` and predictions ``b``;
    intuitively "how many units off on average".  Computed directly with
    NumPy (identical to scikit-learn's ``mean_absolute_error`` for the
    uniform-average case), avoiding the sklearn call and input copies.

    Returns ``nan`` for empty input instead of raising.
    """
    a = np.asarray(a, dtype=float)  # asarray: no copy when already an ndarray
    b = np.asarray(b, dtype=float)
    if len(a) == 0:
        return float("nan")  # empty validation set -> undefined score
    return float(np.mean(np.abs(a - b)))
def mape(a, b):
    """
    MAPE (Mean Absolute Percentage Error), expressed in percent.

    A result of 10.0 means "off by 10% on average".  Actual values of 0
    would make the division explode, so those denominators are replaced
    by 1 — a pragmatic safety valve, not a statistical correction.

    Returns ``nan`` for empty input.
    """
    actual = np.array(a)
    predicted = np.array(b)
    if len(actual) == 0:
        return float("nan")
    # Substitute 1 wherever the actual value is 0 (division-by-zero guard).
    safe_denom = np.where(actual == 0, 1, actual)
    rel_err = np.abs((actual - predicted) / safe_denom)
    return float(np.mean(rel_err) * 100.0)
| # ------------------------------------------------------------ | |
| # 2) ๋ชจ๋ธ ํ๋ณด๋ฅผ ๋ง๋ค์ด ์ฃผ๋ ํจ์ | |
| # ------------------------------------------------------------ | |
def get_candidates():
    """
    Build the list of candidate models.

    Each entry is a tuple ``(name, estimator, fit_kwargs)`` where
    ``fit_kwargs`` holds extra keyword arguments for ``fit``:
    - LinearRegression: the most basic linear baseline
    - RandomForest: tree ensemble that also captures non-linear patterns
    - XGBoost / LightGBM: fast, strong boosting models (only when installed)
    """
    # Always-available scikit-learn models.
    candidates = [
        # Linear regression: essentially nothing to configure.
        ("LinearRegression", LinearRegression(), {}),
        # Random forest: 300 trees, unlimited depth, all CPU cores.
        ("RandomForest",
         RandomForestRegressor(
             n_estimators=300,
             max_depth=None,
             random_state=42,
             n_jobs=-1,
         ),
         {}),
    ]
    # XGBoost: only when the package could be imported.
    if XGBRegressor is not None:
        xgb = XGBRegressor(
            n_estimators=400,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            random_state=42,
            tree_method="hist",  # fast histogram-based splits
            n_jobs=-1,
        )
        candidates.append(("XGBoost", xgb, {"verbose": False}))
    # LightGBM: only when the package could be imported.
    if LGBMRegressor is not None:
        lgbm = LGBMRegressor(
            n_estimators=600,
            max_depth=-1,  # -1 = automatic / unlimited depth
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            random_state=42,
            n_jobs=-1,
        )
        candidates.append(("LightGBM", lgbm, {}))
    return candidates
| # ------------------------------------------------------------ | |
| # 3) ์๊ณ์ด ๋ถํ : ์๋ถ๋ถ(ํ์ต) / ๋ท๋ถ๋ถ(๊ฒ์ฆ) | |
| # ------------------------------------------------------------ | |
def time_split(X, y, valid_ratio=0.2):
    """
    Chronological split: the first part trains, the last part validates.

    Random splits leak future information in time-series problems, so the
    original order is preserved.  ``valid_ratio=0.2`` reserves the final
    20% of the samples (always at least one) for validation.

    Returns ``(X_train, y_train, X_valid, y_valid)``.
    """
    total = len(X)
    n_valid = max(1, int(total * valid_ratio))  # at least one validation row
    cut = total - n_valid                       # number of training rows
    return X[:cut], y[:cut], X[cut:], y[cut:]
| # ------------------------------------------------------------ | |
| # 4) ๊ฐ๋จํ ์์๋ธ: ์ฌ๋ฌ ๋ชจ๋ธ ์์ธก์ '๊ฐ์ค ํ๊ท ' | |
| # ------------------------------------------------------------ | |
class SimpleEnsemble:
    """
    Weighted-average ensemble over several fitted models.

    - ``weights``: larger weight = more trust in that model.  Callers here
      use the inverse of each model's validation RMSE (better -> heavier).
    - Weights are normalized to sum to 1; a tiny floor protects against a
      zero (or near-zero) total.
    """
    def __init__(self, models, weights):
        self.models = models
        # Normalize so the weights sum to 1 (floor the denominator at 1e-9).
        total = max(np.sum(weights), 1e-9)
        self.weights = np.array(weights, dtype=float) / total

    def predict(self, X):
        # Stack per-model predictions into shape (n_models, n_samples) ...
        stacked = np.array([mdl.predict(X) for mdl in self.models])
        # ... then a (samples, models) @ (models,) dot product averages them.
        return np.sum(stacked.T * self.weights, axis=1)
| # ------------------------------------------------------------ | |
| # 5) Optuna ๋ก ํ์ดํผํ๋ผ๋ฏธํฐ ํ๋(์ ํ) | |
| # ------------------------------------------------------------ | |
def _tune_with_optuna(name, base_model, X_tr, y_tr, X_va, y_va, n_trials=20):
    """
    Run an Optuna hyperparameter search for one model family.

    Args:
        name: model family name ("RandomForest", "XGBoost" or "LightGBM").
        base_model: the default estimator (mostly ignored here — a fresh
            model is constructed for every trial and for the final refit).
        X_tr, y_tr: training split.
        X_va, y_va: validation split used to score each trial.
        n_trials: number of Optuna trials (more = more thorough but slower).

    Returns:
        The best estimator refit on (X_tr, y_tr) when tuning succeeded,
        otherwise None (Optuna missing, or unsupported model name).
    """
    if optuna is None:
        return None  # Optuna is not installed -> skip tuning entirely
    # Search objective: minimize RMSE on the validation split.
    def objective(trial):
        if name == "RandomForest":
            # Search ranges: roughly sensible intervals for each knob.
            n_estimators = trial.suggest_int("n_estimators", 200, 800, step=100)
            max_depth = trial.suggest_int("max_depth", 6, 24, step=2)
            m = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42,
                n_jobs=-1
            )
        elif name == "XGBoost" and XGBRegressor is not None:
            n_estimators = trial.suggest_int("n_estimators", 300, 900, step=100)
            max_depth = trial.suggest_int("max_depth", 4, 10)
            lr = trial.suggest_float("learning_rate", 0.02, 0.2, log=True)
            subsample = trial.suggest_float("subsample", 0.7, 1.0)
            colsample = trial.suggest_float("colsample_bytree", 0.7, 1.0)
            lam = trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True)
            m = XGBRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=lr,
                subsample=subsample,
                colsample_bytree=colsample,
                reg_lambda=lam,
                random_state=42,
                tree_method="hist",
                n_jobs=-1
            )
        elif name == "LightGBM" and LGBMRegressor is not None:
            n_estimators = trial.suggest_int("n_estimators", 400, 1400, step=200)
            lr = trial.suggest_float("learning_rate", 0.02, 0.2, log=True)
            num_leaves = trial.suggest_int("num_leaves", 31, 255, step=16)
            subsample = trial.suggest_float("subsample", 0.7, 1.0)
            colsample = trial.suggest_float("colsample_bytree", 0.7, 1.0)
            m = LGBMRegressor(
                n_estimators=n_estimators,
                learning_rate=lr,
                num_leaves=num_leaves,
                subsample=subsample,
                colsample_bytree=colsample,
                random_state=42,
                n_jobs=-1
            )
        else:
            # Unsupported family: return a huge score so this never "wins".
            return 1e9
        # Fit on the training split, then report validation RMSE.
        m.fit(X_tr, y_tr)
        p = m.predict(X_va)
        return rmse(y_va, p)
    # Run the study (direction="minimize": lower RMSE is better).
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    # Rebuild a *fresh* model from the best parameters found.
    best_params = study.best_params
    if name == "RandomForest":
        m = RandomForestRegressor(
            n_estimators=best_params["n_estimators"],
            max_depth=best_params["max_depth"],
            random_state=42,
            n_jobs=-1
        )
    elif name == "XGBoost" and XGBRegressor is not None:
        m = XGBRegressor(
            n_estimators=best_params["n_estimators"],
            max_depth=best_params["max_depth"],
            learning_rate=best_params["learning_rate"],
            subsample=best_params["subsample"],
            colsample_bytree=best_params["colsample_bytree"],
            reg_lambda=best_params["reg_lambda"],
            random_state=42,
            tree_method="hist",
            n_jobs=-1
        )
    elif name == "LightGBM" and LGBMRegressor is not None:
        m = LGBMRegressor(
            n_estimators=best_params["n_estimators"],
            learning_rate=best_params["learning_rate"],
            num_leaves=best_params["num_leaves"],
            subsample=best_params["subsample"],
            colsample_bytree=best_params["colsample_bytree"],
            random_state=42,
            n_jobs=-1
        )
    else:
        return None
    # NOTE(review): the final refit uses X_tr only — validation rows are NOT
    # folded back in before returning; confirm that is the intended behavior.
    m.fit(X_tr, y_tr)
    return m
| # ------------------------------------------------------------ | |
| # 6) ํ์ต & ์ฑ๋ฅ ๋น๊ต โ ๋ฒ ์คํธ ๋ชจ๋ธ ์ ํ | |
| # ------------------------------------------------------------ | |
def train_and_score(X, y, valid_ratio=0.2, use_optuna=False, optuna_trials=15, build_ensemble=True):
    """
    Fit every candidate model, score each on a chronological validation
    split (RMSE / MAE / MAPE), and return the best performer.

    Args:
        X, y: training data (arrays / numpy-compatible).
        valid_ratio: fraction reserved for validation (0.2 = 20%).
        use_optuna: when True, attempt per-model hyperparameter tuning.
        optuna_trials: number of tuning trials per model.
        build_ensemble: when True, also try a simple weighted ensemble.

    Returns:
        (best_model, leaderboard) — the winning model (single estimator or
        SimpleEnsemble) and a DataFrame of scores sorted by ascending RMSE.
    """
    # Order-preserving split: earlier rows train, later rows validate.
    X_tr, y_tr, X_va, y_va = time_split(X, y, valid_ratio=valid_ratio)

    score_rows = []    # one score dict per candidate -> DataFrame at the end
    best_name, best_model, best_rmse = None, None, float("inf")
    trained = []       # (name, fitted model) pairs for the ensemble
    valid_preds = []   # validation predictions, index-aligned with `trained`

    for name, model, fit_kwargs in get_candidates():
        try:
            # Optional tuning step; on success the tuned model replaces it.
            if use_optuna:
                tuned = _tune_with_optuna(name, model, X_tr, y_tr, X_va, y_va,
                                          n_trials=optuna_trials)
                if tuned is not None:
                    model = tuned
            # Fit, then score on the validation split.
            model.fit(X_tr, y_tr, **fit_kwargs)
            pred = model.predict(X_va)
            entry = {
                "model": name,
                "rmse": rmse(y_va, pred),
                "mae": mae(y_va, pred),
                "mape": mape(y_va, pred)
            }
            score_rows.append(entry)
            # Keep around for a possible ensemble candidate.
            trained.append((name, model))
            valid_preds.append(pred)
            # Track the running best by lowest RMSE.
            if entry["rmse"] < best_rmse:
                best_name, best_model, best_rmse = name, model, entry["rmse"]
        except Exception:
            # Best-effort: one failing model must not kill the pipeline;
            # record a NaN row so the failure stays visible in the table.
            score_rows.append({"model": name, "rmse": np.nan,
                               "mae": np.nan, "mape": np.nan})

    # ---- Optional ensemble candidate: needs >= 2 successful models ----
    if build_ensemble and len(valid_preds) >= 2:
        # Inverse validation RMSE as weight (better model -> larger weight),
        # floored at 1e-6 to avoid division by zero.
        inv_errors = [1.0 / max(rmse(y_va, p), 1e-6) for p in valid_preds]
        ensemble = SimpleEnsemble([mdl for _, mdl in trained], inv_errors)
        ens_pred = ensemble.predict(X_va)
        entry = {
            "model": "Ensemble",
            "rmse": rmse(y_va, ens_pred),
            "mae": mae(y_va, ens_pred),
            "mape": mape(y_va, ens_pred)
        }
        score_rows.append(entry)
        # The ensemble can itself become the overall best.
        if entry["rmse"] < best_rmse:
            best_name, best_model, best_rmse = "Ensemble", ensemble, entry["rmse"]

    # Leaderboard sorted by ascending RMSE, failures (NaN) pushed to the end.
    leaderboard = (pd.DataFrame(score_rows)
                   .sort_values("rmse", na_position="last")
                   .reset_index(drop=True))
    return best_model, leaderboard
| # ------------------------------------------------------------ | |
| # 7) ์ฐ์ถ๋ฌผ ์ ์ฅ(๋ฒ ์คํธ ๋ชจ๋ธ/ํผ์ฒ๋ช /๋งคํ/๋ฆฌ๋๋ณด๋) | |
| # ------------------------------------------------------------ | |
def save_artifacts(out_dirs, best_model, feature_names, mapping, leaderboard_df):
    """
    Persist the training results to disk.

    Args:
        out_dirs: list of target folders (e.g. ['artifacts', 'models']);
            identical files are written into each for recovery/sharing.
        best_model: the best model (or ensemble) from train_and_score.
        feature_names: input column names the model expects.
        mapping: date/target/category mapping dict needed at inference time.
        leaderboard_df: per-model score table (DataFrame).

    Files written per folder:
        - best_model.pkl: pickled {"model", "feature_names", "mapping"}
        - leaderboard.csv: score table (UTF-8-SIG so Excel renders it)
        - leaderboard.parquet: best effort, only if a parquet engine exists
    """
    payload = {
        "model": best_model,
        "feature_names": feature_names,
        "mapping": mapping
    }
    for d in out_dirs:
        os.makedirs(d, exist_ok=True)
        # 1) Pickle the best-model package in a single open/write.
        #    (Previously the file was opened twice: once as a no-op that
        #    truncated it, then again for the actual dump.)
        with open(os.path.join(d, "best_model.pkl"), "wb") as f:
            pickle.dump(payload, f)
        # 2) Leaderboard as CSV; utf-8-sig keeps Hangul intact in Excel.
        leaderboard_df.to_csv(
            os.path.join(d, "leaderboard.csv"),
            index=False,
            encoding="utf-8-sig"
        )
        # 3) Leaderboard as Parquet (optional extra).
        try:
            leaderboard_df.to_parquet(
                os.path.join(d, "leaderboard.parquet"),
                index=False
            )
        except Exception:
            # pyarrow/fastparquet may be missing — skip silently by design.
            pass