import numpy as np
import pandas as pd
import json
import os
from datetime import datetime

import mlflow
import mlflow.sklearn
import mlflow.xgboost

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from xgboost import XGBRegressor
import joblib


class TrainModel:

    def __init__(self, nombre, use_grid_search=False, config_path="config/model_config.json"):
        """
        Train a model with MLflow tracking.

        Args:
            nombre: Model identifier (e.g. "v3_production")
            use_grid_search: True = search hyperparameters, False = use the saved config
            config_path: Path to the configuration file holding the hyperparameters
        """

        mlflow.set_tracking_uri("file:./mlruns")
        mlflow.set_experiment("corners_prediction")

        self.nombre = nombre
        self.use_grid_search = use_grid_search
        self.config_path = config_path
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        with mlflow.start_run(run_name=f"{nombre}_{self.timestamp}") as run:
            self.run_id = run.info.run_id

            print(f"\n{'='*80}")
            print("🚀 Training started with MLflow")
            print(f"   Run ID: {self.run_id}")
            print(f"   Name: {nombre}")
            print(f"   GridSearch: {'YES' if use_grid_search else 'NO (using saved config)'}")
            print(f"{'='*80}\n")

            mlflow.set_tags({
                "model_name": nombre,
                "timestamp": self.timestamp,
                "grid_search_used": str(use_grid_search),
                "framework": "XGBoost",
                "task": "regression"
            })

            try:
                self.init_variables()
                self.load_dataset()
                self.split_train_test(0.15)
                self.define_model()

                if use_grid_search:
                    print("🔍 Running GridSearch (this may take a while)...")
                    self.train_grid_search()
                    self.save_best_params()
                else:
                    print("⚡ Using saved hyperparameters (fast)")
                    self.load_best_params()

                self.train_model()
                self.test_and_eval()
                self.top_features()
                self.save_models(nombre)

                mlflow.set_tag("status", "SUCCESS")
                print("\n✅ Training completed")
                print("📊 View it in the MLflow UI: mlflow ui")

            except Exception as e:
                mlflow.set_tag("status", "FAILED")
                print(f"\n❌ Error: {e}")
                raise

    def init_variables(self):
        """Define the search space for GridSearch."""

        self.param_grid = {
            'n_estimators': [200],
            'max_depth': [3, 4, 5],
            'learning_rate': [0.02, 0.03],
            'reg_alpha': [3.0, 5.0],
            'reg_lambda': [5.0, 8.0],
            'gamma': [0.5, 1.0],
            'subsample': [0.7],
            'colsample_bytree': [0.7],
            'colsample_bylevel': [0.6],
            'min_child_weight': [5, 7]
        }
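        # Note: this grid spans 3*2*2*2*2*2 = 96 parameter combinations,
        # so with 5-fold CV GridSearchCV fits 96 * 5 = 480 models per search.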

        if self.use_grid_search:
            for param, values in self.param_grid.items():
                mlflow.log_param(f"grid_{param}", str(values))

        print("✅ Variables initialized")

    def load_dataset(self):
        """Load and prepare the dataset."""

        self.df_data = pd.read_csv("dataset/processed/dataset_processed.csv")
        self.y = self.df_data["y"]
        self.df_data = self.df_data.drop(["y"], axis=1)
        self.y_array = np.array(self.y).flatten()

        # Keep only targets in a plausible corner-count range to drop outliers
        mask = (self.y_array >= 3) & (self.y_array <= 17)
        self.df_data = self.df_data[mask].copy()
        self.y_array = self.y_array[mask]

        if self.df_data.isnull().any().any():
            self.df_data = self.df_data.fillna(0)

        mlflow.log_params({
            "dataset_samples": len(self.df_data),
            "dataset_features": self.df_data.shape[1],
            "target_min": float(self.y_array.min()),
            "target_max": float(self.y_array.max()),
            "target_mean": float(self.y_array.mean()),
            "target_std": float(self.y_array.std())
        })

        print(f"✅ Dataset loaded: {self.df_data.shape}")

    def split_train_test(self, test_size_):
        """Split the data into train/val/test sets."""

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.df_data, self.y_array,
            test_size=test_size_,
            random_state=42,
            shuffle=True
        )

        # Fit the scaler on train only; apply the same transform to test
        self.scaler = StandardScaler()
        self.X_train = pd.DataFrame(
            self.scaler.fit_transform(self.X_train),
            columns=self.X_train.columns
        )
        self.X_test = pd.DataFrame(
            self.scaler.transform(self.X_test),
            columns=self.X_test.columns
        )

        # Carve a validation set out of the training split
        self.X_train_fit, self.X_val, self.y_train_fit, self.y_val = train_test_split(
            self.X_train, self.y_train,
            test_size=0.15,
            random_state=43
        )

        mlflow.log_params({
            "train_samples": len(self.X_train_fit),
            "val_samples": len(self.X_val),
            "test_samples": len(self.X_test),
            "test_size": test_size_
        })

        print(f"✅ Train: {len(self.X_train_fit)} | Val: {len(self.X_val)} | Test: {len(self.X_test)}")

    def define_model(self):
        """Define the base model and the GridSearch wrapper."""

        self.xgb_base = XGBRegressor(
            objective="reg:squarederror",
            tree_method="hist",
            random_state=42,
            n_jobs=-1,
            verbosity=0
        )

        if self.use_grid_search:
            self.kfold = KFold(n_splits=5, shuffle=True, random_state=42)
            self.mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

            self.grid_search = GridSearchCV(
                estimator=self.xgb_base,
                param_grid=self.param_grid,
                cv=self.kfold,
                scoring=self.mae_scorer,
                n_jobs=-1,
                verbose=2,
                return_train_score=True
            )

    def train_grid_search(self):
        """Run GridSearch and log the best params."""

        print("\n🔍 Searching for the best hyperparameters...")
        self.grid_search.fit(self.X_train_fit, self.y_train_fit)

        self.best_params = self.grid_search.best_params_

        for param, value in self.best_params.items():
            mlflow.log_param(f"best_{param}", value)

        # The scorer is negated MAE, so flip the sign back for logging
        mlflow.log_metric("cv_best_mae", -self.grid_search.best_score_)

        print("\n✅ Best hyperparameters found:")
        for param, value in self.best_params.items():
            print(f"   {param}: {value}")
        print(f"   CV MAE: {-self.grid_search.best_score_:.4f}")

    def save_best_params(self):
        """Persist the best hyperparameters to a JSON file."""

        os.makedirs("config", exist_ok=True)

        config = {
            "model_name": self.nombre,
            "timestamp": self.timestamp,
            "best_params": self.best_params,
            "cv_mae": float(-self.grid_search.best_score_),
            "run_id": self.run_id
        }

        with open(self.config_path, 'w') as f:
            json.dump(config, f, indent=4)

        mlflow.log_artifact(self.config_path)

        print(f"💾 Hyperparameters saved to: {self.config_path}")

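    # For reference, the config file written above (and read below) looks
    # roughly like this; values are illustrative, not real results:
    # {
    #     "model_name": "v3_production",
    #     "timestamp": "20240101_120000",
    #     "best_params": {"max_depth": 4, "learning_rate": 0.03, "...": "..."},
    #     "cv_mae": 1.85,
    #     "run_id": "<mlflow-run-id>"
    # }
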
    def load_best_params(self):
        """Load hyperparameters from the JSON config file."""

        if not os.path.exists(self.config_path):
            raise FileNotFoundError(
                f"{self.config_path} not found. "
                "Run once with use_grid_search=True first."
            )

        with open(self.config_path, 'r') as f:
            config = json.load(f)

        self.best_params = config["best_params"]

        for param, value in self.best_params.items():
            mlflow.log_param(f"loaded_{param}", value)

        mlflow.log_param("config_source", self.config_path)
        mlflow.log_param("previous_cv_mae", config.get("cv_mae", "N/A"))

        print(f"✅ Hyperparameters loaded from: {self.config_path}")
        print(f"   Source: {config.get('model_name', 'unknown')} ({config.get('timestamp', 'unknown')})")

    def train_model(self):
        """Train the final model with the best params."""

        self.xgb_model = XGBRegressor(
            **self.best_params,
            objective="reg:squarederror",
            tree_method="hist",
            random_state=42,
            n_jobs=-1,
            verbosity=0
        )

        # eval_set lets XGBoost track validation error during fitting
        self.xgb_model.fit(
            self.X_train_fit,
            self.y_train_fit,
            eval_set=[(self.X_val, self.y_val)],
            verbose=False
        )

        print("✅ Model trained")

    def test_and_eval(self):
        """Evaluate the model and log metrics."""

        y_train_pred = self.xgb_model.predict(self.X_train_fit)
        y_val_pred = self.xgb_model.predict(self.X_val)
        y_test_pred = self.xgb_model.predict(self.X_test)

        metrics = {
            'train': {
                'mae': mean_absolute_error(self.y_train_fit, y_train_pred),
                'rmse': np.sqrt(mean_squared_error(self.y_train_fit, y_train_pred)),
                'r2': r2_score(self.y_train_fit, y_train_pred)
            },
            'val': {
                'mae': mean_absolute_error(self.y_val, y_val_pred),
                'rmse': np.sqrt(mean_squared_error(self.y_val, y_val_pred)),
                'r2': r2_score(self.y_val, y_val_pred)
            },
            'test': {
                'mae': mean_absolute_error(self.y_test, y_test_pred),
                'rmse': np.sqrt(mean_squared_error(self.y_test, y_test_pred)),
                'r2': r2_score(self.y_test, y_test_pred)
            }
        }

        for set_name, set_metrics in metrics.items():
            for metric_name, value in set_metrics.items():
                mlflow.log_metric(f"{set_name}_{metric_name}", value)

        # cross_val_score clones the estimator, so the fitted model is untouched
        cv_mae = cross_val_score(
            self.xgb_model, self.X_train, self.y_train,
            cv=5, scoring='neg_mean_absolute_error'
        )
        cv_r2 = cross_val_score(
            self.xgb_model, self.X_train, self.y_train,
            cv=5, scoring='r2'
        )

        mlflow.log_metric("cv_mae_mean", -cv_mae.mean())
        mlflow.log_metric("cv_mae_std", cv_mae.std())
        mlflow.log_metric("cv_r2_mean", cv_r2.mean())
        mlflow.log_metric("cv_r2_std", cv_r2.std())

        # Error distribution on the test set
        test_errors = np.abs(self.y_test - y_test_pred)
        mlflow.log_metric("test_error_median", float(np.median(test_errors)))
        mlflow.log_metric("test_error_p90", float(np.percentile(test_errors, 90)))
        mlflow.log_metric("test_pct_error_lt_2", float((test_errors < 2.0).sum() / len(test_errors) * 100))

        # Train-test R² gap as a simple overfitting indicator
        gap = metrics['train']['r2'] - metrics['test']['r2']
        mlflow.log_metric("overfitting_gap", gap)

        print("\n📊 METRICS:")
        print(f"   Train MAE: {metrics['train']['mae']:.4f} | R²: {metrics['train']['r2']:.4f}")
        print(f"   Val   MAE: {metrics['val']['mae']:.4f} | R²: {metrics['val']['r2']:.4f}")
        print(f"   Test  MAE: {metrics['test']['mae']:.4f} | R²: {metrics['test']['r2']:.4f}")
        print(f"   CV MAE: {-cv_mae.mean():.4f} ± {cv_mae.std():.4f}")
        print(f"   Overfitting gap: {gap:.4f}")

    def top_features(self):
        """Save feature importances."""

        feature_importance = pd.DataFrame({
            'feature': self.df_data.columns,
            'importance': self.xgb_model.feature_importances_
        }).sort_values('importance', ascending=False)

        # Ensure the output directory exists before writing the CSV
        os.makedirs("models", exist_ok=True)
        feature_importance.to_csv(f"models/feature_importance_{self.nombre}.csv", index=False)
        mlflow.log_artifact(f"models/feature_importance_{self.nombre}.csv")

        for idx, row in feature_importance.head(10).iterrows():
            mlflow.log_metric(f"feat_imp_{row['feature']}", row['importance'])

        print("\n🔍 Top 5 features:")
        for idx, row in feature_importance.head(5).iterrows():
            print(f"   {row['feature']}: {row['importance']:.4f}")

    def save_models(self, nombre):
        """Save the model and scaler locally and in MLflow."""

        os.makedirs("models", exist_ok=True)

        model_path = f'models/xgboost_corners_{nombre}.pkl'
        scaler_path = f'models/scaler_corners_{nombre}.pkl'

        joblib.dump(self.xgb_model, model_path)
        joblib.dump(self.scaler, scaler_path)

        # Log the model to MLflow and register it in the Model Registry
        mlflow.xgboost.log_model(
            self.xgb_model,
            artifact_path="model",
            registered_model_name="corners_predictor"
        )
        mlflow.log_artifact(scaler_path, artifact_path="preprocessing")

        print("\n💾 Models saved:")
        print(f"   {model_path}")
        print(f"   {scaler_path}")
        print("   MLflow Model Registry ✓")

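
# A minimal inference sketch, assuming the artifacts written by save_models()
# above; the helper name and X_new are illustrative, not part of the pipeline.
def load_for_inference(nombre, models_dir="models"):
    """Reload a saved model and scaler using the naming scheme from save_models().

    Hypothetical usage:
        model, scaler = load_for_inference("v4_retrain")
        y_pred = model.predict(scaler.transform(X_new))
    """
    model = joblib.load(os.path.join(models_dir, f"xgboost_corners_{nombre}.pkl"))
    scaler = joblib.load(os.path.join(models_dir, f"scaler_corners_{nombre}.pkl"))
    return model, scaler
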

if __name__ == "__main__":

    # First run: search hyperparameters and save them to the config file
    model = TrainModel(
        nombre="v4_retrain",
        use_grid_search=True
    )
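
    # Subsequent runs can reuse the saved config (the fast path described
    # in the constructor docstring):
    # model = TrainModel(nombre="v4_retrain", use_grid_search=False)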