oc_p5-dev / tests /test_mlflow_quick.py
ASI-Engineer's picture
Upload folder using huggingface_hub
aac75d5 verified
raw
history blame
4.79 kB
#!/usr/bin/env python3
"""
Test rapide MLflow avec n_iter=10 au lieu de 1000.
"""
import os
import mlflow
from sklearn.model_selection import RandomizedSearchCV
from ml_model.preprocess import preprocess_data
from ml_model.train_model import train_model
# Configure MLflow pour utiliser SQLite (nécessaire pour Model Registry)
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Patch temporaire pour test rapide
import ml_model.train_model as train_module
original_train = train_module.train_model
def quick_train(X, y):
"""Version rapide avec n_iter=10"""
import mlflow
import mlflow.sklearn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import randint, uniform
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from xgboost import XGBClassifier
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
ratio = sum(y == 0) / sum(y == 1)
pipeline = ImbPipeline(
[("sampler", SMOTE(random_state=42)), ("clf", XGBClassifier(random_state=42))]
)
param_dist = {
"clf__max_depth": randint(3, 15),
"clf__n_estimators": randint(100, 1000),
"clf__learning_rate": uniform(0.001, 0.5),
"clf__subsample": uniform(0.4, 0.6),
"clf__reg_alpha": uniform(0, 3),
"clf__gamma": uniform(0, 10),
"clf__colsample_bytree": uniform(0.5, 0.5),
"clf__min_child_weight": randint(1, 15),
"clf__scale_pos_weight": uniform(1, ratio),
"clf__tree_method": ["auto", "hist"],
}
random = RandomizedSearchCV(
pipeline,
param_dist,
n_iter=10, # 🚀 Test rapide !
cv=3, # Réduit aussi le CV
scoring="f1",
n_jobs=-1,
random_state=42,
)
with mlflow.start_run(run_name="XGBoost_Quick_Test"):
random.fit(X_train, y_train)
best_model = random.best_estimator_ # type: ignore[assignment]
best_params = random.best_params_
cv_f1 = random.best_score_
mlflow.log_params(best_params)
mlflow.log_metric("cv_f1", cv_f1)
y_pred = best_model.predict(X_test) # type: ignore[attr-defined]
report = classification_report(y_test, y_pred, output_dict=True) # type: ignore[arg-type]
mlflow.log_metric("test_precision", float(report["1"]["precision"])) # type: ignore[index]
mlflow.log_metric("test_recall", float(report["1"]["recall"])) # type: ignore[index]
mlflow.log_metric("test_f1", float(report["1"]["f1-score"])) # type: ignore[index]
# Log model et enregistre dans Model Registry
model_info = mlflow.sklearn.log_model(best_model, "model") # type: ignore[attr-defined]
mlflow.register_model(
model_uri=model_info.model_uri, name="XGBoost_Employee_Turnover"
)
print("Meilleurs params:", best_params)
print("Meilleur CV F1:", cv_f1)
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
return best_model, best_params, cv_f1
if __name__ == "__main__":
print("🔄 Chargement et prétraitement des données...")
data_paths = {
"sondage_path": "data/extrait_sondage.csv",
"eval_path": "data/extrait_eval.csv",
"sirh_path": "data/extrait_sirh.csv",
}
X, y, scaler, onehot, ordinal = preprocess_data(raw_data_paths=data_paths)
print(f"✅ Données prétraitées :")
print(f" X shape: {X.shape}, y shape: {y.shape}")
print(f" Distribution y: {y.value_counts().to_dict()}")
print("\n🚀 Lancement du test rapide (n_iter=10, cv=3)...\n")
best_model, best_params, cv_f1 = quick_train(X, y)
print(f"\n✅ Test terminé ! CV F1-score: {cv_f1:.4f}")
print("\n📁 Vérification des artifacts MLflow dans ./mlruns :")
if os.path.exists("./mlruns"):
for root, dirs, files in os.walk("./mlruns"):
level = root.replace("./mlruns", "").count(os.sep)
if level < 3: # Limite la profondeur
indent = " " * 2 * level
print(f"{indent}{os.path.basename(root)}/")
if level == 2: # Affiche fichiers dans les runs
subindent = " " * 2 * (level + 1)
for file in files[:3]:
print(f"{subindent}{file}")
if len(files) > 3:
print(f"{subindent}... (+{len(files) - 3} fichiers)")
print("\n💡 Pour visualiser les runs MLflow, exécutez :")
print(" mlflow ui")
print(" Puis ouvrez http://localhost:5000")