# oc_p5-dev / ml_model / train_model.py
# Uploaded via huggingface_hub by ASI-Engineer (revision aac75d5, verified).
import mlflow
import mlflow.sklearn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import randint, uniform
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from xgboost import XGBClassifier
def train_model(X, y):
    """
    Train and tune an XGBoost classifier with SMOTE oversampling.

    RandomizedSearchCV explores a wide hyperparameter grid efficiently;
    SMOTE is placed *inside* the pipeline so each CV fold is resampled
    independently (no leakage into validation folds); F1 scoring is used
    because the target classes are imbalanced. Hyperparameters, CV/test
    metrics and the fitted model are logged to MLflow (./mlruns) and the
    model is registered in the MLflow Model Registry.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Binary target; 1 is treated as the positive (minority) class.

    Returns
    -------
    tuple
        (best_model, best_params, cv_f1): the fitted imblearn pipeline,
        the winning hyperparameters, and the best cross-validated F1.
    """
    # Stratified split keeps the class ratio identical in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Majority/minority class ratio, used to bound scale_pos_weight below.
    ratio = sum(y == 0) / sum(y == 1)

    # SMOTE inside the pipeline: resampling is refit per CV fold.
    pipeline = ImbPipeline(
        [("sampler", SMOTE(random_state=42)), ("clf", XGBClassifier(random_state=42))]
    )

    # NOTE: scipy's uniform(loc, scale) samples from [loc, loc + scale].
    param_dist = {
        "clf__max_depth": randint(3, 15),
        "clf__n_estimators": randint(100, 1000),
        "clf__learning_rate": uniform(0.001, 0.5),
        "clf__subsample": uniform(0.4, 0.6),
        "clf__reg_alpha": uniform(0, 3),
        "clf__gamma": uniform(0, 10),
        "clf__colsample_bytree": uniform(0.5, 0.5),
        "clf__min_child_weight": randint(1, 15),
        "clf__scale_pos_weight": uniform(1, ratio),
        "clf__tree_method": ["auto", "hist"],  # CPU-friendly builders
    }

    # Named `search` (not `random`) to avoid shadowing the stdlib module.
    search = RandomizedSearchCV(
        pipeline,
        param_dist,
        n_iter=1000,
        cv=5,
        scoring="f1",  # F1 chosen over accuracy due to class imbalance
        n_jobs=-1,
        random_state=42,
    )

    # Wrap training in an MLflow run for automatic tracking (./mlruns).
    with mlflow.start_run(run_name="XGBoost_Tuning"):
        search.fit(X_train, y_train)
        best_model = search.best_estimator_  # type: ignore[assignment]
        best_params = search.best_params_
        cv_f1 = search.best_score_

        # Log every hyperparameter for reproducibility.
        mlflow.log_params(best_params)
        # Key metric: cross-validated F1 (robust under class imbalance).
        mlflow.log_metric("cv_f1", cv_f1)

        # Held-out evaluation on the test split.
        y_pred = best_model.predict(X_test)  # type: ignore[attr-defined]
        # output_dict=True makes classification_report return a dict, not a str.
        report = classification_report(y_test, y_pred, output_dict=True)  # type: ignore[arg-type]
        mlflow.log_metric("test_precision", float(report["1"]["precision"]))  # type: ignore[index]
        mlflow.log_metric("test_recall", float(report["1"]["recall"]))  # type: ignore[index]
        mlflow.log_metric("test_f1", float(report["1"]["f1-score"]))  # type: ignore[index]

        # Log the fitted model and grab its URI for registration.
        model_info = mlflow.sklearn.log_model(best_model, "model")  # type: ignore[attr-defined]
        # Register in the Model Registry so it appears on the "Models" page.
        mlflow.register_model(
            model_uri=model_info.model_uri, name="XGBoost_Employee_Turnover"
        )

    # Console summary of the final evaluation (pedagogical).
    print("Meilleurs params:", best_params)
    print("Meilleur CV F1:", cv_f1)
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    return best_model, best_params, cv_f1