import mlflow
import mlflow.sklearn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import randint, uniform
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from xgboost import XGBClassifier


def train_model(X, y):
    """
    Train and tune an XGBoost classifier with SMOTE oversampling.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Binary target (0/1). Must support element-wise ``y == k``
        comparison (numpy array / pandas Series) for the ratio below.

    Returns
    -------
    tuple
        ``(best_model, best_params, cv_f1)`` — the refit best pipeline,
        its hyperparameters, and the best cross-validated F1 score.

    Notes
    -----
    Design choices: RandomizedSearchCV (efficient over a large grid);
    SMOTE inside the pipeline (resampling happens per CV fold, so the
    validation folds stay untouched); F1 scoring (class imbalance).
    MLflow tracking is written to ``./mlruns`` and the best model is
    registered in the Model Registry.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Negative/positive class ratio; bounds the scale_pos_weight search below.
    ratio = sum(y == 0) / sum(y == 1)

    pipeline = ImbPipeline(
        [("sampler", SMOTE(random_state=42)), ("clf", XGBClassifier(random_state=42))]
    )
    param_dist = {
        "clf__max_depth": randint(3, 15),
        "clf__n_estimators": randint(100, 1000),
        "clf__learning_rate": uniform(0.001, 0.5),
        "clf__subsample": uniform(0.4, 0.6),
        "clf__reg_alpha": uniform(0, 3),
        "clf__gamma": uniform(0, 10),
        "clf__colsample_bytree": uniform(0.5, 0.5),
        "clf__min_child_weight": randint(1, 15),
        # uniform(loc, scale) samples from [1, 1 + ratio].
        "clf__scale_pos_weight": uniform(1, ratio),
        "clf__tree_method": ["auto", "hist"],  # CPU
    }
    # Renamed from `random` to avoid shadowing the stdlib module name.
    search = RandomizedSearchCV(
        pipeline,
        param_dist,
        n_iter=1000,
        cv=5,
        scoring="f1",
        n_jobs=-1,
        random_state=42,
    )

    # MLflow: wrap training for automatic tracking (./mlruns).
    with mlflow.start_run(run_name="XGBoost_Tuning"):
        search.fit(X_train, y_train)
        best_model = search.best_estimator_  # type: ignore[assignment]
        best_params = search.best_params_
        cv_f1 = search.best_score_
        # Log every tuned hyperparameter for reproducibility.
        mlflow.log_params(best_params)
        # Key metric: CV F1 (appropriate under class imbalance).
        mlflow.log_metric("cv_f1", cv_f1)

        y_pred = best_model.predict(X_test)  # type: ignore[attr-defined]
        # output_dict=True makes classification_report return a dict, not str.
        report = classification_report(y_test, y_pred, output_dict=True)  # type: ignore[arg-type]
        mlflow.log_metric("test_precision", float(report["1"]["precision"]))  # type: ignore[index]
        mlflow.log_metric("test_recall", float(report["1"]["recall"]))  # type: ignore[index]
        mlflow.log_metric("test_f1", float(report["1"]["f1-score"]))  # type: ignore[index]

        # Log the model artifact and keep its URI for registration.
        model_info = mlflow.sklearn.log_model(best_model, "model")  # type: ignore[attr-defined]
        # Register in the Model Registry so it appears on the "Models" page.
        mlflow.register_model(
            model_uri=model_info.model_uri, name="XGBoost_Employee_Turnover"
        )

    # Held-out test evaluation (pedagogical output).
    print("Meilleurs params:", best_params)
    print("Meilleur CV F1:", cv_f1)
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    return best_model, best_params, cv_f1