# Synced by github-actions[bot] — "deploy: sync from GitHub main" (commit 811858e)
# src/train.py
import json
from pathlib import Path

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score

from .config import (
    PROCESSED_DATA_DIR,
    MODELS_DIR,
    TARGET_COLUMN,
    RANDOM_STATE,
    LATEST_MODEL_PATH,
    MLFLOW_TRACKING_URI,
    MLFLOW_EXPERIMENT_NAME,
    CV_N_SPLITS,
    CV_THRESHOLD,
    OPTUNA_N_TRIALS,
    OPTUNA_TIMEOUT,
    OPTUNA_STUDY_NAME,
    OPTUNA_DIRECTION,
    OPTUNA_METRIC,
    METRICS_DIR,
    BASELINE_METRICS_PATH,
)
from .model_utils import split_features_target, build_model_pipeline
from .cross_validation import perform_cross_validation, get_cv_summary
from .hyperparameter_optimization import optimize_hyperparameters
def _model_params_from_best(best_params: dict) -> dict:
    """Translate Optuna's ``best_params`` into ``build_model_pipeline`` kwargs.

    Always injects the project-wide ``RANDOM_STATE``; ``l1_ratio`` is
    forwarded only when present (Optuna samples it only for the
    elasticnet penalty).
    """
    params = {
        "random_state": RANDOM_STATE,
        "C": best_params["C"],
        "penalty": best_params["penalty"],
        "solver": best_params["solver"],
        "max_iter": best_params["max_iter"],
        "class_weight": best_params["class_weight"],
    }
    if "l1_ratio" in best_params:
        params["l1_ratio"] = best_params["l1_ratio"]
    return params


def main() -> None:
    """Run the full training workflow for the logistic-regression model.

    Loads the processed train/valid splits, tunes hyperparameters with
    Optuna, cross-validates the tuned pipeline, fits the final model,
    evaluates it on the validation split, logs params/metrics/model to
    MLflow, and persists the model plus baseline metrics for drift
    detection.

    Raises:
        FileNotFoundError: if the processed CSV splits do not exist yet.
    """
    train_path = PROCESSED_DATA_DIR / "train.csv"
    valid_path = PROCESSED_DATA_DIR / "valid.csv"
    if not train_path.exists() or not valid_path.exists():
        raise FileNotFoundError(
            "No se encontraron train.csv / valid.csv. "
            "Ejecuta primero: dvc repro (o python -m src.data_prep)."
        )

    # 1) Load data (application_date parsed as datetime for downstream use)
    train_df = pd.read_csv(train_path, parse_dates=["application_date"])
    valid_df = pd.read_csv(valid_path, parse_dates=["application_date"])
    X_train, y_train = split_features_target(train_df, TARGET_COLUMN)
    X_valid, y_valid = split_features_target(valid_df, TARGET_COLUMN)

    # 2) Optimize hyperparameters using Optuna
    optimization_results = optimize_hyperparameters(
        X=X_train,
        y=y_train,
        n_trials=OPTUNA_N_TRIALS,
        n_splits=CV_N_SPLITS,
        metric=OPTUNA_METRIC,
        timeout=OPTUNA_TIMEOUT,
        study_name=OPTUNA_STUDY_NAME,
        direction=OPTUNA_DIRECTION,
        show_progress_bar=True,
    )
    best_params = optimization_results["best_params"]
    best_value = optimization_results["best_value"]

    # 3) Build the final model pipeline from the tuned hyperparameters
    model = build_model_pipeline(**_model_params_from_best(best_params))

    # 4) Configure MLflow (local tracking)
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

    with mlflow.start_run(run_name="logreg_optimized"):
        # 4.1) Log run parameters (data sizes, CV/Optuna config, best params)
        mlflow.log_params(
            {
                "model_type": "LogisticRegression",
                "random_state": RANDOM_STATE,
                "train_rows": len(train_df),
                "valid_rows": len(valid_df),
                "cv_n_splits": CV_N_SPLITS,
                "cv_threshold": CV_THRESHOLD,
                "optuna_n_trials": OPTUNA_N_TRIALS,
                "optuna_metric": OPTUNA_METRIC,
                **{f"best_{k}": v for k, v in best_params.items()},
            }
        )
        # Log the best value reached during optimization
        mlflow.log_metric(f"optuna_best_{OPTUNA_METRIC}", best_value)

        # 5) Cross-validate the tuned pipeline on the training set.
        #    A fresh pipeline is built so the final `model` is fit exactly
        #    once, in step 6.
        cv_results = perform_cross_validation(
            model=build_model_pipeline(**_model_params_from_best(best_params)),
            X=X_train,
            y=y_train,
            n_splits=CV_N_SPLITS,
            random_state=RANDOM_STATE,
            threshold=CV_THRESHOLD,
        )

        # 5.1) Log mean CV metrics, then their per-metric std deviations
        mlflow.log_metrics(get_cv_summary(cv_results))
        mlflow.log_metrics(
            {f"cv_{metric}_std": stats["std"] for metric, stats in cv_results.items()}
        )

        # 6) Train the final model on all training data
        print("\nTraining final model on full training set with optimized hyperparameters...")
        model.fit(X_train, y_train)

        # 7) Evaluate on the validation set using the configured threshold
        y_proba = model.predict_proba(X_valid)[:, 1]
        y_pred = (y_proba >= CV_THRESHOLD).astype(int)
        auc = roc_auc_score(y_valid, y_proba)
        f1 = f1_score(y_valid, y_pred)
        print("\nValidation Set Results:")
        print(f"AUC valid: {auc:.4f}")
        print(f"F1 valid: {f1:.4f}")

        # 7.1) Log validation metrics
        mlflow.log_metrics(
            {
                "auc_valid": auc,
                "f1_valid": f1,
            }
        )

        # 8) Log the fitted model as an MLflow artifact
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            registered_model_name=None,  # a Model Registry name could go here
        )

        # 9) Save the "official" model consumed by the API
        MODELS_DIR.mkdir(parents=True, exist_ok=True)
        joblib.dump(model, LATEST_MODEL_PATH)
        print(f"Modelo guardado en: {LATEST_MODEL_PATH}")

        # 10) Save baseline metrics for drift detection
        METRICS_DIR.mkdir(parents=True, exist_ok=True)
        baseline_metrics = {
            "auc_valid": float(auc),
            "f1_valid": float(f1),
        }
        with open(BASELINE_METRICS_PATH, "w", encoding="utf-8") as f:
            json.dump(baseline_metrics, f, indent=2)
        print(f"Métricas baseline guardadas en: {BASELINE_METRICS_PATH}")
# Script entry point: allows running as `python -m src.train`.
if __name__ == "__main__":
    main()