# src/train.py
"""Train an Optuna-tuned logistic-regression model on the processed splits.

Pipeline: load train/valid CSVs -> Optuna hyperparameter search ->
cross-validation on the training set -> final fit -> validation metrics ->
MLflow logging -> persist the model for the API and baseline drift metrics.
"""
import json
from pathlib import Path

import joblib
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score
import mlflow
import mlflow.sklearn

from .config import (
    PROCESSED_DATA_DIR,
    MODELS_DIR,
    TARGET_COLUMN,
    RANDOM_STATE,
    LATEST_MODEL_PATH,
    MLFLOW_TRACKING_URI,
    MLFLOW_EXPERIMENT_NAME,
    CV_N_SPLITS,
    CV_THRESHOLD,
    OPTUNA_N_TRIALS,
    OPTUNA_TIMEOUT,
    OPTUNA_STUDY_NAME,
    OPTUNA_DIRECTION,
    OPTUNA_METRIC,
    METRICS_DIR,
    BASELINE_METRICS_PATH,
)
from .model_utils import split_features_target, build_model_pipeline
from .cross_validation import perform_cross_validation, get_cv_summary
from .hyperparameter_optimization import optimize_hyperparameters


def _model_params_from_best(best_params: dict) -> dict:
    """Translate Optuna's ``best_params`` into ``build_model_pipeline`` kwargs.

    Centralizes the mapping so the final model and the CV model are guaranteed
    to use identical hyperparameters (the original built this dict twice).
    ``l1_ratio`` is copied only when present, since Optuna suggests it only
    for the elasticnet penalty.
    """
    params = {
        "random_state": RANDOM_STATE,
        "C": best_params["C"],
        "penalty": best_params["penalty"],
        "solver": best_params["solver"],
        "max_iter": best_params["max_iter"],
        "class_weight": best_params["class_weight"],
    }
    if "l1_ratio" in best_params:
        params["l1_ratio"] = best_params["l1_ratio"]
    return params


def main() -> None:
    """Run the full training workflow and persist the resulting artifacts.

    Raises:
        FileNotFoundError: if the processed train/valid CSVs are missing
            (i.e. the data-prep stage has not been run).
    """
    train_path = PROCESSED_DATA_DIR / "train.csv"
    valid_path = PROCESSED_DATA_DIR / "valid.csv"
    if not train_path.exists() or not valid_path.exists():
        raise FileNotFoundError(
            "No se encontraron train.csv / valid.csv. "
            "Ejecuta primero: dvc repro (o python -m src.data_prep)."
        )

    # 1) Load data. application_date is parsed so downstream feature code
    #    receives real datetimes rather than strings.
    train_df = pd.read_csv(train_path, parse_dates=["application_date"])
    valid_df = pd.read_csv(valid_path, parse_dates=["application_date"])

    X_train, y_train = split_features_target(train_df, TARGET_COLUMN)
    X_valid, y_valid = split_features_target(valid_df, TARGET_COLUMN)

    # 2) Optimize hyperparameters using Optuna (search runs its own CV).
    optimization_results = optimize_hyperparameters(
        X=X_train,
        y=y_train,
        n_trials=OPTUNA_N_TRIALS,
        n_splits=CV_N_SPLITS,
        metric=OPTUNA_METRIC,
        timeout=OPTUNA_TIMEOUT,
        study_name=OPTUNA_STUDY_NAME,
        direction=OPTUNA_DIRECTION,
        show_progress_bar=True,
    )
    best_params = optimization_results["best_params"]
    best_value = optimization_results["best_value"]

    # 3) Build the final model pipeline with the optimized hyperparameters.
    model_params = _model_params_from_best(best_params)
    model = build_model_pipeline(**model_params)

    # 4) Configure MLflow (local tracking).
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

    with mlflow.start_run(run_name="logreg_optimized"):
        # 4.1) Log run parameters, including every tuned hyperparameter
        #      prefixed with "best_" for easy comparison across runs.
        mlflow.log_params(
            {
                "model_type": "LogisticRegression",
                "random_state": RANDOM_STATE,
                "train_rows": len(train_df),
                "valid_rows": len(valid_df),
                "cv_n_splits": CV_N_SPLITS,
                "cv_threshold": CV_THRESHOLD,
                "optuna_n_trials": OPTUNA_N_TRIALS,
                "optuna_metric": OPTUNA_METRIC,
                **{f"best_{k}": v for k, v in best_params.items()},
            }
        )
        # Log the best value reached by the Optuna study.
        mlflow.log_metric(f"optuna_best_{OPTUNA_METRIC}", best_value)

        # 5) Cross-validate a fresh pipeline (same tuned hyperparameters) on
        #    the training set; a separate instance avoids fitting `model` twice.
        cv_results = perform_cross_validation(
            model=build_model_pipeline(**_model_params_from_best(best_params)),
            X=X_train,
            y=y_train,
            n_splits=CV_N_SPLITS,
            random_state=RANDOM_STATE,
            threshold=CV_THRESHOLD,
        )

        # 5.1) Log CV means and per-metric standard deviations to MLflow.
        cv_metrics = get_cv_summary(cv_results)
        mlflow.log_metrics(cv_metrics)
        cv_std_metrics = {
            f"cv_{metric}_std": stats["std"] for metric, stats in cv_results.items()
        }
        mlflow.log_metrics(cv_std_metrics)

        # 6) Train the final model on the full training set.
        print("\nTraining final model on full training set with optimized hyperparameters...")
        model.fit(X_train, y_train)

        # 7) Evaluate on the validation set at the configured decision threshold.
        y_proba = model.predict_proba(X_valid)[:, 1]
        y_pred = (y_proba >= CV_THRESHOLD).astype(int)

        auc = roc_auc_score(y_valid, y_proba)
        f1 = f1_score(y_valid, y_pred)

        print("\nValidation Set Results:")
        print(f"AUC valid: {auc:.4f}")
        print(f"F1 valid: {f1:.4f}")

        # 7.1) Log validation metrics.
        mlflow.log_metrics(
            {
                "auc_valid": auc,
                "f1_valid": f1,
            }
        )

        # 8) Log the model artifact to MLflow.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            registered_model_name=None,  # set a name to use the Model Registry
        )

    # 9) Save the "official" model consumed by the API.
    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, LATEST_MODEL_PATH)
    print(f"Modelo guardado en: {LATEST_MODEL_PATH}")

    # 10) Save baseline metrics used later for drift detection.
    METRICS_DIR.mkdir(parents=True, exist_ok=True)
    baseline_metrics = {
        "auc_valid": float(auc),
        "f1_valid": float(f1),
    }
    with open(BASELINE_METRICS_PATH, "w", encoding="utf-8") as f:
        json.dump(baseline_metrics, f, indent=2)
    print(f"Métricas baseline guardadas en: {BASELINE_METRICS_PATH}")


if __name__ == "__main__":
    main()