# NOTE(review): the lines "Spaces:" / "Running" / "Running" above this file's
# imports were UI paste residue (not Python); converted to a comment so the
# file remains syntactically valid.
| import numpy as np | |
| import pandas as pd | |
| import json | |
| import os | |
| from datetime import datetime | |
| # MLflow | |
| import mlflow | |
| import mlflow.sklearn | |
| import mlflow.xgboost | |
| from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer | |
| from xgboost import XGBRegressor | |
| import joblib | |
class TRAIN_MODEL():
    """End-to-end XGBoost regression pipeline with MLflow experiment tracking.

    Instantiating the class runs the entire pipeline inside a single MLflow
    run: load dataset -> split/scale -> (grid-search or load) hyperparameters
    -> train -> evaluate -> log feature importances -> persist model + scaler.
    """

    def __init__(self, nombre, use_grid_search=False, config_path="config/model_config.json"):
        """Train the model with MLflow tracking.

        Args:
            nombre: Model identifier (e.g. "v3_production").
            use_grid_search: True = run GridSearchCV to find hyperparameters,
                False = reuse the hyperparameters saved in ``config_path``.
            config_path: Path to the JSON file holding tuned hyperparameters.

        Raises:
            FileNotFoundError: when ``use_grid_search`` is False and
                ``config_path`` does not exist.
        """
        # ===========================
        # MLFLOW CONFIGURATION
        # ===========================
        mlflow.set_tracking_uri("file:./mlruns")
        mlflow.set_experiment("corners_prediction")
        self.nombre = nombre
        self.use_grid_search = use_grid_search
        self.config_path = config_path
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Start the MLflow run; every param/metric/artifact below attaches to it.
        with mlflow.start_run(run_name=f"{nombre}_{self.timestamp}") as run:
            self.run_id = run.info.run_id
            print(f"\n{'='*80}")
            print(f"🚀 Entrenamiento iniciado con MLflow")
            print(f" Run ID: {self.run_id}")
            print(f" Nombre: {nombre}")
            print(f" GridSearch: {'SÍ' if use_grid_search else 'NO (usando config)'}")
            print(f"{'='*80}\n")
            # Basic run tags
            mlflow.set_tags({
                "model_name": nombre,
                "timestamp": self.timestamp,
                "grid_search_used": str(use_grid_search),
                "framework": "XGBoost",
                "task": "regression"
            })
            # Training pipeline
            try:
                self.init_variables()
                self.load_dataset()
                self.split_train_test(0.15)
                self.define_model()
                if use_grid_search:
                    print("🔍 Ejecutando GridSearch (puede tardar)...")
                    self.train_grid_search()
                    self.save_best_params()  # Persist for future (fast) retrains
                else:
                    print("⚡ Usando hiperparámetros guardados (rápido)")
                    self.load_best_params()
                self.train_model()
                self.test_and_eval()
                self.top_features()
                self.save_models(nombre)
                mlflow.set_tag("status", "SUCCESS")
                print(f"\n✅ Entrenamiento completado")
                print(f"📊 Ver en MLflow UI: mlflow ui")
            except Exception as e:
                # Mark the run as failed, then re-raise so the caller sees it.
                mlflow.set_tag("status", "FAILED")
                print(f"\n❌ Error: {e}")
                raise

    def init_variables(self):
        """Define the hyperparameter search space for GridSearch."""
        # Deliberately small grid to keep the search tractable.
        self.param_grid = {
            'n_estimators': [200],            # 1 value (200 is usually optimal)
            'max_depth': [3, 4, 5],           # 3 values (most influential)
            'learning_rate': [0.02, 0.03],    # 2 values (0.01 is too slow)
            'reg_alpha': [3.0, 5.0],          # 2 values
            'reg_lambda': [5.0, 8.0],         # 2 values
            'gamma': [0.5, 1.0],              # 2 values
            'subsample': [0.7],               # 1 value (0.7 usually works)
            'colsample_bytree': [0.7],        # 1 value
            'colsample_bylevel': [0.6],       # 1 value
            'min_child_weight': [5, 7]        # 2 values
        }
        # Combinations: 1 × 3 × 2 × 2 × 2 × 2 × 1 × 1 × 1 × 2 = 96
        # (BUGFIX: previous comment claimed 192; × 5 CV folds = 480 fits)
        # Log the grid configuration
        if self.use_grid_search:
            for param, values in self.param_grid.items():
                mlflow.log_param(f"grid_{param}", str(values))
        print("✅ Variables inicializadas")

    def load_dataset(self):
        """Load and prepare the processed dataset.

        Reads ``dataset/processed/dataset_processed.csv``, separates target
        column ``y``, filters target outliers to the [3, 17] range, fills NaNs
        with 0 and logs dataset statistics to MLflow.
        """
        self.df_data = pd.read_csv("dataset/processed/dataset_processed.csv")
        self.y = self.df_data["y"]
        self.df_data = self.df_data.drop(["y"], axis=1)
        self.y_array = np.array(self.y).flatten()
        # Filter target outliers (keep 3-17 corners)
        mask = (self.y_array >= 3) & (self.y_array <= 17)
        self.df_data = self.df_data[mask].copy()
        self.y_array = self.y_array[mask]
        # Clean nulls (zero-fill; assumes 0 is a neutral value for all
        # features — TODO confirm against feature engineering)
        if self.df_data.isnull().any().any():
            self.df_data = self.df_data.fillna(0)
        # Log dataset info
        mlflow.log_params({
            "dataset_samples": len(self.df_data),
            "dataset_features": self.df_data.shape[1],
            "target_min": float(self.y_array.min()),
            "target_max": float(self.y_array.max()),
            "target_mean": float(self.y_array.mean()),
            "target_std": float(self.y_array.std())
        })
        print(f"✅ Dataset cargado: {self.df_data.shape}")

    def split_train_test(self, test_size_):
        """Split data into train/val/test and standard-scale features.

        Args:
            test_size_: Fraction of samples held out for the test set.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.df_data, self.y_array,
            test_size=test_size_,
            random_state=42,
            shuffle=True
        )
        # Scale (fit on train only; test uses the fitted scaler).
        # NOTE(review): the scaler is fitted on the FULL train split before
        # the validation split below, so validation metrics see slightly
        # leaked scaling statistics — acceptable here but worth knowing.
        self.scaler = StandardScaler()
        self.X_train = pd.DataFrame(
            self.scaler.fit_transform(self.X_train),
            columns=self.X_train.columns
        )
        self.X_test = pd.DataFrame(
            self.scaler.transform(self.X_test),
            columns=self.X_test.columns
        )
        # Validation split (15% of the train portion)
        self.X_train_fit, self.X_val, self.y_train_fit, self.y_val = train_test_split(
            self.X_train, self.y_train,
            test_size=0.15,
            random_state=43
        )
        # Log split sizes
        mlflow.log_params({
            "train_samples": len(self.X_train_fit),
            "val_samples": len(self.X_val),
            "test_samples": len(self.X_test),
            "test_size": test_size_
        })
        print(f"✅ Train: {len(self.X_train_fit)} | Val: {len(self.X_val)} | Test: {len(self.X_test)}")

    def define_model(self):
        """Define the base model and, if requested, the GridSearchCV wrapper."""
        self.xgb_base = XGBRegressor(
            objective="reg:squarederror",
            tree_method="hist",
            random_state=42,
            n_jobs=-1,
            verbosity=0
        )
        if self.use_grid_search:
            self.kfold = KFold(n_splits=5, shuffle=True, random_state=42)
            # greater_is_better=False makes scores negative MAE internally.
            self.mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
            self.grid_search = GridSearchCV(
                estimator=self.xgb_base,
                param_grid=self.param_grid,
                cv=self.kfold,
                scoring=self.mae_scorer,
                n_jobs=-1,
                verbose=2,
                return_train_score=True
            )

    def train_grid_search(self):
        """Run GridSearch and record the best hyperparameters."""
        print("\n🔍 Buscando mejores hiperparámetros...")
        self.grid_search.fit(self.X_train_fit, self.y_train_fit)
        # Best parameters
        self.best_params = self.grid_search.best_params_
        # Log to MLflow (negate score: scorer stores negative MAE)
        for param, value in self.best_params.items():
            mlflow.log_param(f"best_{param}", value)
        mlflow.log_metric("cv_best_mae", -self.grid_search.best_score_)
        print(f"\n✅ Mejores hiperparámetros encontrados:")
        for param, value in self.best_params.items():
            print(f" {param}: {value}")
        print(f" CV MAE: {-self.grid_search.best_score_:.4f}")

    def save_best_params(self):
        """Persist the best hyperparameters to the JSON config file."""
        # BUGFIX: create the directory of config_path itself, not a
        # hard-coded "config" folder (the two diverge for custom paths).
        config_dir = os.path.dirname(self.config_path)
        if config_dir:
            os.makedirs(config_dir, exist_ok=True)
        config = {
            "model_name": self.nombre,
            "timestamp": self.timestamp,
            "best_params": self.best_params,
            "cv_mae": float(-self.grid_search.best_score_),
            "run_id": self.run_id
        }
        with open(self.config_path, 'w') as f:
            json.dump(config, f, indent=4)
        # Log the file as an MLflow artifact
        mlflow.log_artifact(self.config_path)
        print(f"💾 Hiperparámetros guardados en: {self.config_path}")

    def load_best_params(self):
        """Load hyperparameters from the JSON config file.

        Raises:
            FileNotFoundError: if the config file does not exist yet.
        """
        if not os.path.exists(self.config_path):
            raise FileNotFoundError(
                f"No se encontró {self.config_path}. "
                "Ejecuta primero con use_grid_search=True"
            )
        with open(self.config_path, 'r') as f:
            config = json.load(f)
        self.best_params = config["best_params"]
        # Log loaded params to MLflow
        for param, value in self.best_params.items():
            mlflow.log_param(f"loaded_{param}", value)
        mlflow.log_param("config_source", self.config_path)
        mlflow.log_param("previous_cv_mae", config.get("cv_mae", "N/A"))
        print(f"✅ Hiperparámetros cargados desde: {self.config_path}")
        print(f" Origen: {config.get('model_name', 'unknown')} ({config.get('timestamp', 'unknown')})")

    def train_model(self):
        """Train the final model with the selected hyperparameters."""
        self.xgb_model = XGBRegressor(
            **self.best_params,
            objective="reg:squarederror",
            tree_method="hist",
            random_state=42,
            n_jobs=-1,
            verbosity=0
        )
        self.xgb_model.fit(
            self.X_train_fit,
            self.y_train_fit,
            eval_set=[(self.X_val, self.y_val)],
            verbose=False
        )
        print("✅ Modelo entrenado")

    def test_and_eval(self):
        """Evaluate on train/val/test, run CV and log all metrics to MLflow."""
        # Predictions
        y_train_pred = self.xgb_model.predict(self.X_train_fit)
        y_val_pred = self.xgb_model.predict(self.X_val)
        y_test_pred = self.xgb_model.predict(self.X_test)
        # Compute metrics
        metrics = {
            'train': {
                'mae': mean_absolute_error(self.y_train_fit, y_train_pred),
                'rmse': np.sqrt(mean_squared_error(self.y_train_fit, y_train_pred)),
                'r2': r2_score(self.y_train_fit, y_train_pred)
            },
            'val': {
                'mae': mean_absolute_error(self.y_val, y_val_pred),
                'rmse': np.sqrt(mean_squared_error(self.y_val, y_val_pred)),
                'r2': r2_score(self.y_val, y_val_pred)
            },
            'test': {
                'mae': mean_absolute_error(self.y_test, y_test_pred),
                'rmse': np.sqrt(mean_squared_error(self.y_test, y_test_pred)),
                'r2': r2_score(self.y_test, y_test_pred)
            }
        }
        # Log ALL metrics to MLflow
        for set_name, set_metrics in metrics.items():
            for metric_name, value in set_metrics.items():
                mlflow.log_metric(f"{set_name}_{metric_name}", value)
        # Cross-validation on the full (scaled) train split
        cv_mae = cross_val_score(
            self.xgb_model, self.X_train, self.y_train,
            cv=5, scoring='neg_mean_absolute_error'
        )
        cv_r2 = cross_val_score(
            self.xgb_model, self.X_train, self.y_train,
            cv=5, scoring='r2'
        )
        mlflow.log_metric("cv_mae_mean", -cv_mae.mean())
        mlflow.log_metric("cv_mae_std", cv_mae.std())
        mlflow.log_metric("cv_r2_mean", cv_r2.mean())
        mlflow.log_metric("cv_r2_std", cv_r2.std())
        # Error analysis on the test split
        test_errors = np.abs(self.y_test - y_test_pred)
        mlflow.log_metric("test_error_median", float(np.median(test_errors)))
        mlflow.log_metric("test_error_p90", float(np.percentile(test_errors, 90)))
        mlflow.log_metric("test_pct_error_lt_2", float((test_errors < 2.0).sum() / len(test_errors) * 100))
        # Overfitting gap (train R² minus test R²)
        gap = metrics['train']['r2'] - metrics['test']['r2']
        mlflow.log_metric("overfitting_gap", gap)
        print(f"\n📊 MÉTRICAS:")
        print(f" Train MAE: {metrics['train']['mae']:.4f} | R²: {metrics['train']['r2']:.4f}")
        print(f" Val MAE: {metrics['val']['mae']:.4f} | R²: {metrics['val']['r2']:.4f}")
        print(f" Test MAE: {metrics['test']['mae']:.4f} | R²: {metrics['test']['r2']:.4f}")
        print(f" CV MAE: {-cv_mae.mean():.4f} ± {cv_mae.std():.4f}")
        print(f" Overfitting Gap: {gap:.4f}")

    def top_features(self):
        """Save feature importances to CSV and log the top ones to MLflow."""
        feature_importance = pd.DataFrame({
            'feature': self.df_data.columns,
            'importance': self.xgb_model.feature_importances_
        }).sort_values('importance', ascending=False)
        # BUGFIX: ensure models/ exists — this method runs BEFORE save_models,
        # which was the only place that previously created the directory.
        os.makedirs("models", exist_ok=True)
        # Save CSV
        feature_importance.to_csv(f"models/feature_importance_{self.nombre}.csv", index=False)
        mlflow.log_artifact(f"models/feature_importance_{self.nombre}.csv")
        # Log top 10 (NOTE(review): feature names with characters MLflow
        # disallows in metric keys would fail here — verify feature naming)
        for idx, row in feature_importance.head(10).iterrows():
            mlflow.log_metric(f"feat_imp_{row['feature']}", row['importance'])
        print(f"\n🔍 Top 5 features:")
        for idx, row in feature_importance.head(5).iterrows():
            print(f" {row['feature']}: {row['importance']:.4f}")

    def save_models(self, nombre):
        """Persist the trained model and scaler locally and in MLflow.

        Args:
            nombre: Identifier embedded in the output file names.
        """
        os.makedirs("models", exist_ok=True)
        # Paths
        model_path = f'models/xgboost_corners_{nombre}.pkl'
        scaler_path = f'models/scaler_corners_{nombre}.pkl'
        # Save files
        joblib.dump(self.xgb_model, model_path)
        joblib.dump(self.scaler, scaler_path)
        # Log to MLflow Model Registry
        mlflow.xgboost.log_model(
            self.xgb_model,
            artifact_path="model",
            registered_model_name="corners_predictor"
        )
        mlflow.log_artifact(scaler_path, artifact_path="preprocessing")
        print(f"\n💾 Modelos guardados:")
        print(f" {model_path}")
        print(f" {scaler_path}")
        print(f" MLflow Model Registry ✓")
# ===========================
# USAGE
# ===========================
if __name__ == "__main__":
    # ========================================
    # OPTION 1: First run, or every 3-6 months
    # Run GridSearch (SLOW, 30-60 min)
    # ========================================
    # model = TRAIN_MODEL(
    #     nombre="v4_grid_search",
    #     use_grid_search=True  # Searches for the best hyperparameters
    # )
    # ========================================
    # OPTION 2: Regular retraining
    # Use saved hyperparameters (FAST, 2-5 min)
    # ========================================
    model = TRAIN_MODEL(
        nombre="v4_retrain",
        # BUGFIX: was use_grid_search=True, which contradicted this
        # "Option 2" branch and silently triggered the slow grid search.
        use_grid_search=False  # Reuses config/model_config.json
    )