Spaces:

ASI-Engineer
/

oc_p5-dev

Sleeping

App Files Files Community

ASI-Engineer committed on Dec 25, 2025

Commit

aac75d5

verified ·

1 Parent(s): 7e1de5c

Upload folder using huggingface_hub

Browse files

Files changed (15) hide show

.flake8 +8 -2
.gitignore +9 -0
docs/mlflow_guide.md +412 -0
examples/01_find_best_model.py +106 -0
examples/02_compare_models.py +165 -0
examples/03_model_registry.py +205 -0
examples/README.md +111 -0
ml_model/preprocess.py +12 -14
ml_model/train_model.py +42 -15
pyproject.toml +1 -1
requirements.txt +9 -8
scripts/fix_lint.py +68 -0
tests/conftest.py +8 -0
tests/test_mlflow_local.py +54 -0
tests/test_mlflow_quick.py +130 -0

.flake8 CHANGED Viewed

@@ -10,6 +10,12 @@ exclude =
     .cache,
     .eggs,
     build,
-    dist
 # Max line pour compat Black (default 88 vs PEP8 79)
-max-line-length = 88

     .cache,
     .eggs,
     build,
+    dist,
+    mlruns
 # Max line pour compat Black (default 88 vs PEP8 79)
+max-line-length = 88
+# Ignorer certains warnings pour les scripts d'exemple (non-critique)
+per-file-ignores =
+    examples/*.py:F541,E722,F841
+    tests/test_mlflow_*.py:F401,E402,F811,F541

.gitignore CHANGED Viewed

@@ -35,3 +35,12 @@ Thumbs.db
 secrets.json
 data/raw/  # Pour datasets volumineux en data science (OC_P5)
 notebooks/*.ipynb_checkpoints/

 secrets.json
 data/raw/  # Pour datasets volumineux en data science (OC_P5)
 notebooks/*.ipynb_checkpoints/
+# MLflow
+mlflow.db
+mlflow.db-shm
+mlflow.db-wal
+mlruns/
+mlflow_ui.log
+mlflow_comparison.png
+nohup.out

docs/mlflow_guide.md ADDED Viewed

	@@ -0,0 +1,412 @@

+# 🚀 Guide MLflow - Projet Employee Turnover
+## 📋 Table des matières
+1. [Workflow complet MLflow](#1-workflow-complet-mlflow)
+2. [Comparer plusieurs runs](#2-comparer-plusieurs-runs)
+3. [Trouver le meilleur modèle](#3-trouver-le-meilleur-modèle)
+4. [Charger un modèle pour l'API](#4-api-integration---exemple-complet)
+5. [Best Practices](#5-best-practices)
+---
+## 1. Workflow complet MLflow
+### 🎯 Concept clé
+MLflow suit ce workflow :
+```
+Entraînement → Tracking → Registry → Déploiement → API
+```
+### Architecture actuelle du projet
+```
+train_model.py
+    ↓ (log params/metrics/model)
+mlflow.db (SQLite)
+    ↓ (query)
+MLflow UI (http://localhost:5000)
+    ↓ (select best model)
+Model Registry (XGBoost_Employee_Turnover)
+    ↓ (load)
+API FastAPI/Flask
+    ↓ (serve)
+Prédictions
+```
+---
+## 2. Comparer plusieurs runs
+### Scénario : Tester différents hyperparamètres
+**Exemple : Tester 3 configurations différentes**
+```python
+# tests/test_multiple_runs.py
+import mlflow
+from ml_model.preprocess import preprocess_data
+from ml_model.train_model import train_model
+mlflow.set_tracking_uri("sqlite:///mlflow.db")
+mlflow.set_experiment("Hyperparameter_Tuning")  # Créer une expérience dédiée
+# Chemins des données
+data_paths = {
+    "sondage_path": "data/extrait_sondage.csv",
+    "eval_path": "data/extrait_eval.csv",
+    "sirh_path": "data/extrait_sirh.csv",
+}
+# Préparer les données une seule fois
+X, y, scaler, encoders = preprocess_data(data_paths)
+# Tester 3 configurations
+configs = [
+    {"name": "Baseline", "n_iter": 100, "cv": 3},
+    {"name": "Intensive", "n_iter": 500, "cv": 5},
+    {"name": "Quick", "n_iter": 50, "cv": 3},
+]
+for config in configs:
+    with mlflow.start_run(run_name=config["name"]):
+        # Log la configuration testée
+        mlflow.log_param("config_name", config["name"])
+        mlflow.log_param("n_iter", config["n_iter"])
+        mlflow.log_param("cv", config["cv"])
+        # Entraîner (modifier train_model pour accepter n_iter/cv)
+        model, params, cv_f1 = train_model(X, y)
+        print(f"✅ {config['name']}: F1={cv_f1:.4f}")
+```
+**Résultat dans MLflow UI** :
+- Va sur **Experiments** → **Hyperparameter_Tuning**
+- Tu verras 3 runs avec leurs métriques côte à côte
+- Clique sur **"Compare"** pour voir un tableau comparatif
+---
+## 3. Trouver le meilleur modèle
+### Option A : Via l'API MLflow (recommandé pour l'API)
+```python
+# api/get_best_model.py
+import mlflow
+from mlflow.tracking import MlflowClient
+mlflow.set_tracking_uri("sqlite:///mlflow.db")
+client = MlflowClient()
+def get_best_model_from_experiment(experiment_name="Default", metric="cv_f1"):
+    """
+    Trouve le meilleur modèle d'une expérience basé sur une métrique.
+    Args:
+        experiment_name: Nom de l'expérience MLflow
+        metric: Métrique à optimiser (cv_f1, test_f1, etc.)
+    Returns:
+        run_id du meilleur modèle
+    """
+    # Récupérer l'expérience
+    experiment = client.get_experiment_by_name(experiment_name)
+    if not experiment:
+        raise ValueError(f"Expérience '{experiment_name}' introuvable")
+    # Rechercher tous les runs de l'expérience
+    runs = client.search_runs(
+        experiment_ids=[experiment.experiment_id],
+        order_by=[f"metrics.{metric} DESC"],  # Trier par métrique décroissante
+        max_results=1  # Prendre seulement le meilleur
+    )
+    if not runs:
+        raise ValueError(f"Aucun run trouvé dans l'expérience '{experiment_name}'")
+    best_run = runs[0]
+    print(f"🏆 Meilleur modèle trouvé:")
+    print(f"   Run ID: {best_run.info.run_id}")
+    print(f"   {metric}: {best_run.data.metrics.get(metric, 'N/A')}")
+    print(f"   Date: {best_run.info.start_time}")
+    return best_run.info.run_id
+# Exemple d'utilisation
+if __name__ == "__main__":
+    best_run_id = get_best_model_from_experiment("Default", "cv_f1")
+    # Charger le modèle
+    model_uri = f"runs:/{best_run_id}/model"
+    model = mlflow.sklearn.load_model(model_uri)
+    print(f"✅ Modèle chargé : {type(model)}")
+```
+### Option B : Via le Model Registry (pour production)
+```python
+# api/load_production_model.py
+import mlflow
+mlflow.set_tracking_uri("sqlite:///mlflow.db")
+def load_production_model(model_name="XGBoost_Employee_Turnover", stage="Production"):
+    """
+    Charge le modèle en production depuis le Model Registry.
+    Args:
+        model_name: Nom du modèle dans le Registry
+        stage: Stage du modèle ("Staging", "Production", "Archived")
+    Returns:
+        Modèle chargé
+    """
+    model_uri = f"models:/{model_name}/{stage}"
+    try:
+        model = mlflow.sklearn.load_model(model_uri)
+        print(f"✅ Modèle '{model_name}' ({stage}) chargé")
+        return model
+    except Exception as e:
+        print(f"⚠️ Erreur : {e}")
+        print(f"💡 Astuce : Promouvoir une version en '{stage}' dans MLflow UI")
+        # Fallback : Charger la dernière version
+        model_uri = f"models:/{model_name}/latest"
+        model = mlflow.sklearn.load_model(model_uri)
+        print(f"✅ Fallback : Dernière version chargée")
+        return model
+# Utilisation
+if __name__ == "__main__":
+    model = load_production_model()
+```
+---
+## 4. API Integration - Exemple complet
+### Créer une API Flask/FastAPI avec MLflow
+```python
+# api/app.py
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import mlflow
+import pandas as pd
+import numpy as np
+# Configuration
+mlflow.set_tracking_uri("sqlite:///mlflow.db")
+app = FastAPI(title="Employee Turnover Prediction API")
+# Charger le modèle au démarrage
+MODEL_NAME = "XGBoost_Employee_Turnover"
+model = None
+@app.on_event("startup")
+def load_model():
+    global model
+    try:
+        # Charger le dernier modèle du Registry
+        model_uri = f"models:/{MODEL_NAME}/latest"
+        model = mlflow.sklearn.load_model(model_uri)
+        print(f"✅ Modèle chargé : {MODEL_NAME}")
+    except Exception as e:
+        print(f"❌ Erreur chargement modèle : {e}")
+        raise
+# Schéma de requête
+class PredictionRequest(BaseModel):
+    features: list[float]  # Liste de 50 features (après prétraitement)
+    class Config:
+        json_schema_extra = {
+            "example": {
+                "features": [0.5, 1.2, -0.3, 0.8] + [0.0] * 46  # 50 features
+            }
+        }
+class PredictionResponse(BaseModel):
+    prediction: int  # 0 ou 1
+    probability: float  # Probabilité de départ (classe 1)
+    model_version: str
+# Endpoint de prédiction
+@app.post("/predict", response_model=PredictionResponse)
+def predict(request: PredictionRequest):
+    """
+    Prédit si un employé va quitter l'entreprise.
+    - **features**: Liste de 50 features numériques (après prétraitement)
+    - Retourne la prédiction (0=reste, 1=part) et la probabilité
+    """
+    if model is None:
+        raise HTTPException(status_code=503, detail="Modèle non chargé")
+    try:
+        # Convertir en DataFrame
+        X = pd.DataFrame([request.features])
+        # Prédiction
+        prediction = int(model.predict(X)[0])
+        probability = float(model.predict_proba(X)[0][1])
+        return PredictionResponse(
+            prediction=prediction,
+            probability=round(probability, 4),
+            model_version=MODEL_NAME
+        )
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Erreur prédiction : {str(e)}")
+# Endpoint de santé
+@app.get("/health")
+def health():
+    return {
+        "status": "ok",
+        "model_loaded": model is not None,
+        "model_name": MODEL_NAME
+    }
+# Endpoint pour lister les modèles disponibles
+@app.get("/models")
+def list_models():
+    from mlflow.tracking import MlflowClient
+    client = MlflowClient()
+    models = []
+    for rm in client.search_registered_models():
+        latest_versions = rm.latest_versions
+        models.append({
+            "name": rm.name,
+            "versions": len(latest_versions),
+            "latest_version": latest_versions[0].version if latest_versions else None
+        })
+    return {"models": models}
+# Lancer avec : uvicorn api.app:app --reload
+```
+**Tester l'API** :
+```bash
+# Installer FastAPI
+pip install fastapi uvicorn
+# Lancer le serveur
+uvicorn api.app:app --reload --port 8000
+# Tester
+curl -X POST http://localhost:8000/predict \
+  -H "Content-Type: application/json" \
+  -d "$(python -c 'import json; print(json.dumps({"features": [0.5, 1.2, -0.3] + [0.0] * 47}))')"
+```
+---
+## 5. Best Practices
+### ✅ Stratégie de versioning des modèles
+```python
+# Workflow recommandé
+# 1. Entraîner plusieurs modèles → Experiment "Development"
+# 2. Sélectionner le meilleur → Promouvoir en "Staging"
+# 3. Valider en staging → Promouvoir en "Production"
+# 4. API charge toujours "Production"
+from mlflow.tracking import MlflowClient
+client = MlflowClient()
+model_name = "XGBoost_Employee_Turnover"
+# Promouvoir version 2 en Production
+client.transition_model_version_stage(
+    name=model_name,
+    version=2,
+    stage="Production"
+)
+```
+### 📊 Logging avancé
+```python
+# Dans train_model.py, ajouter plus de contexte
+with mlflow.start_run():
+    # Log dataset info
+    mlflow.log_param("n_samples", len(X))
+    mlflow.log_param("n_features", X.shape[1])
+    mlflow.log_param("class_imbalance_ratio", sum(y==0)/sum(y==1))
+    # Log artifacts (graphiques, etc.)
+    import matplotlib.pyplot as plt
+    # Confusion matrix plot
+    plt.figure()
+    # ... plot code ...
+    plt.savefig("confusion_matrix.png")
+    mlflow.log_artifact("confusion_matrix.png")
+    # Log code version
+    import subprocess
+    git_commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip().decode()
+    mlflow.set_tag("git_commit", git_commit)
+```
+### 🔄 Retraining workflow
+```python
+# scripts/retrain_model.py
+import mlflow
+from datetime import datetime
+def retrain_and_compare():
+    """Entraîne un nouveau modèle et le compare à la production."""
+    # 1. Charger le modèle en production
+    prod_model = mlflow.sklearn.load_model("models:/XGBoost_Employee_Turnover/Production")
+    # 2. Entraîner nouveau modèle
+    X, y, _, _ = preprocess_data(data_paths)
+    new_model, params, new_f1 = train_model(X, y)
+    # 3. Comparer les performances
+    from sklearn.model_selection import cross_val_score
+    prod_f1 = cross_val_score(prod_model, X, y, cv=5, scoring='f1').mean()
+    print(f"Production F1: {prod_f1:.4f}")
+    print(f"New model F1: {new_f1:.4f}")
+    # 4. Si meilleur, promouvoir automatiquement
+    if new_f1 > prod_f1:
+        print("✅ Nouveau modèle meilleur ! Promotion en Staging...")
+        # Enregistrer dans Registry
+        # ... code de promotion ...
+    else:
+        print("⚠️ Nouveau modèle moins bon, conservation du modèle actuel")
+```
+---
+## 📚 Ressources
+- **MLflow Docs**: https://mlflow.org/docs/latest/index.html
+- **Model Registry**: https://mlflow.org/docs/latest/model-registry.html
+- **Python API**: https://mlflow.org/docs/latest/python_api/index.html
+---
+## 🎯 Prochaines étapes pour ton projet
+1. ✅ **MLflow configuré** - Tracking local avec SQLite
+2. ✅ **Modèle enregistré** - XGBoost_Employee_Turnover v1
+3. 🔄 **TODO: Créer l'API** - FastAPI avec chargement du modèle
+4. 🔄 **TODO: Tester comparaison** - Multiple runs avec hyperparams différents
+5. 🔄 **TODO: CI/CD** - Auto-retraining et déploiement
+**Commande pour démarrer l'API** :
+```bash
+# Créer api/app.py avec le code ci-dessus
+uvicorn api.app:app --reload --port 8000
+```

examples/01_find_best_model.py ADDED Viewed

	@@ -0,0 +1,106 @@

+#!/usr/bin/env python3
+"""
+Exemple 1 : Trouver le meilleur modèle dans MLflow
+Usage: python examples/01_find_best_model.py
+"""
+import mlflow
+from mlflow.tracking import MlflowClient
+# Configuration
+mlflow.set_tracking_uri("sqlite:///mlflow.db")
+client = MlflowClient()
+def find_best_model(experiment_name="Default", metric="cv_f1"):
+    """Trouve le meilleur modèle basé sur une métrique."""
+    print(f"🔍 Recherche du meilleur modèle dans '{experiment_name}'...")
+    print(f"📊 Métrique d'optimisation : {metric}\n")
+    # Récupérer l'expérience
+    experiment = client.get_experiment_by_name(experiment_name)
+    if not experiment:
+        print(f"❌ Expérience '{experiment_name}' introuvable")
+        return None
+    # Rechercher tous les runs
+    runs = client.search_runs(
+        experiment_ids=[experiment.experiment_id],
+        filter_string="",  # Pas de filtre
+        order_by=[f"metrics.{metric} DESC"],
+        max_results=5,  # Top 5
+    )
+    if not runs:
+        print(f"❌ Aucun run trouvé")
+        return None
+    print(f"📈 Top 5 des modèles :\n")
+    print(f"{'Rank':<6} {'Run ID':<35} {metric:<12} {'Date':<20}")
+    print("-" * 75)
+    for i, run in enumerate(runs, 1):
+        metric_value = run.data.metrics.get(metric, 0.0)
+        timestamp = run.info.start_time
+        from datetime import datetime
+        date_str = datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M")
+        print(f"{i:<6} {run.info.run_id:<35} {metric_value:<12.4f} {date_str:<20}")
+    # Meilleur modèle
+    best_run = runs[0]
+    best_metric = best_run.data.metrics.get(metric, 0.0)
+    print(f"\n🏆 Meilleur modèle :")
+    print(f"   Run ID    : {best_run.info.run_id}")
+    print(f"   {metric:<10}: {best_metric:.4f}")
+    print(f"   Status    : {best_run.info.status}")
+    # Afficher les hyperparamètres
+    print(f"\n⚙️  Hyperparamètres :")
+    for key, value in best_run.data.params.items():
+        print(f"   {key:<25} : {value}")
+    # Afficher toutes les métriques
+    print(f"\n📊 Métriques :")
+    for key, value in best_run.data.metrics.items():
+        print(f"   {key:<25} : {value:.4f}")
+    return best_run.info.run_id
+def load_best_model(run_id):
+    """Charge le modèle à partir d'un run_id."""
+    print(f"\n📦 Chargement du modèle...")
+    model_uri = f"runs:/{run_id}/model"
+    try:
+        model = mlflow.sklearn.load_model(model_uri)
+        print(f"✅ Modèle chargé avec succès")
+        print(f"   Type : {type(model).__name__}")
+        # Afficher la pipeline si c'est une Pipeline
+        if hasattr(model, "steps"):
+            print(f"   Pipeline steps :")
+            for name, step in model.steps:
+                print(f"      - {name}: {type(step).__name__}")
+        return model
+    except Exception as e:
+        print(f"❌ Erreur lors du chargement : {e}")
+        return None
+if __name__ == "__main__":
+    # Trouver le meilleur modèle
+    best_run_id = find_best_model("Default", "cv_f1")
+    if best_run_id:
+        # Charger le modèle
+        model = load_best_model(best_run_id)
+        if model:
+            print(f"\n💡 Pour utiliser ce modèle dans ton API :")
+            print(f"   model_uri = 'runs:/{best_run_id}/model'")
+            print(f"   model = mlflow.sklearn.load_model(model_uri)")

examples/02_compare_models.py ADDED Viewed

	@@ -0,0 +1,165 @@

+#!/usr/bin/env python3
+"""
+Exemple 2 : Comparer plusieurs modèles avec différents hyperparamètres
+Usage: python examples/02_compare_models.py
+"""
+import mlflow
+import pandas as pd
+from mlflow.tracking import MlflowClient
+# Configuration
+mlflow.set_tracking_uri("sqlite:///mlflow.db")
+client = MlflowClient()
+def compare_all_runs(experiment_name="Default"):
+    """Compare tous les runs d'une expérience."""
+    print(f"📊 Comparaison de tous les runs dans '{experiment_name}'\n")
+    # Récupérer l'expérience
+    experiment = client.get_experiment_by_name(experiment_name)
+    if not experiment:
+        print(f"❌ Expérience '{experiment_name}' introuvable")
+        return None
+    # Récupérer tous les runs
+    runs = client.search_runs(
+        experiment_ids=[experiment.experiment_id], order_by=["start_time DESC"]
+    )
+    if not runs:
+        print(f"❌ Aucun run trouvé")
+        return None
+    print(f"✅ {len(runs)} run(s) trouvé(s)\n")
+    # Créer un DataFrame pour comparaison
+    data = []
+    for run in runs:
+        from datetime import datetime
+        row = {
+            "run_id": run.info.run_id[:8],  # 8 premiers caractères
+            "status": run.info.status,
+            "start_time": datetime.fromtimestamp(run.info.start_time / 1000).strftime(
+                "%Y-%m-%d %H:%M"
+            ),
+        }
+        # Ajouter les métriques
+        for metric_name in ["cv_f1", "test_precision", "test_recall", "test_f1"]:
+            row[metric_name] = run.data.metrics.get(metric_name, None)
+        # Ajouter quelques hyperparamètres importants
+        for param_name in ["clf__n_estimators", "clf__max_depth", "clf__learning_rate"]:
+            param_value = run.data.params.get(param_name, None)
+            if param_value:
+                try:
+                    row[param_name] = (
+                        float(param_value)
+                        if "." in str(param_value)
+                        else int(param_value)
+                    )
+                except:
+                    row[param_name] = param_value
+        data.append(row)
+    # Créer DataFrame
+    df = pd.DataFrame(data)
+    # Afficher le tableau
+    print("📈 Comparaison des modèles :")
+    print("=" * 120)
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 120)
+    print(df.to_string(index=False))
+    print("=" * 120)
+    # Statistiques
+    print(f"\n📊 Statistiques :")
+    if "cv_f1" in df.columns:
+        print(f"   CV F1 moyen     : {df['cv_f1'].mean():.4f}")
+        print(f"   CV F1 max       : {df['cv_f1'].max():.4f}")
+        print(f"   CV F1 min       : {df['cv_f1'].min():.4f}")
+        print(f"   Écart-type      : {df['cv_f1'].std():.4f}")
+    # Meilleur run
+    if "cv_f1" in df.columns:
+        best_idx = df["cv_f1"].idxmax()
+        best_run = df.iloc[best_idx]
+        print(f"\n🏆 Meilleur run : {best_run['run_id']}")
+        print(f"   CV F1 : {best_run['cv_f1']:.4f}")
+    return df
+def plot_metrics_comparison(experiment_name="Default"):
+    """Génère un graphique de comparaison (nécessite matplotlib)."""
+    try:
+        import matplotlib.pyplot as plt
+        experiment = client.get_experiment_by_name(experiment_name)
+        if not experiment:
+            return
+        runs = client.search_runs(
+            experiment_ids=[experiment.experiment_id], order_by=["start_time ASC"]
+        )
+        # Extraire les données
+        run_names = [f"Run {i + 1}" for i in range(len(runs))]
+        cv_f1_scores = [run.data.metrics.get("cv_f1", 0) for run in runs]
+        test_f1_scores = [run.data.metrics.get("test_f1", 0) for run in runs]
+        # Créer le graphique
+        fig, ax = plt.subplots(figsize=(12, 6))
+        x = range(len(runs))
+        width = 0.35
+        ax.bar(
+            [i - width / 2 for i in x], cv_f1_scores, width, label="CV F1", alpha=0.8
+        )
+        ax.bar(
+            [i + width / 2 for i in x],
+            test_f1_scores,
+            width,
+            label="Test F1",
+            alpha=0.8,
+        )
+        ax.set_xlabel("Runs")
+        ax.set_ylabel("F1 Score")
+        ax.set_title(f"Comparaison des F1 scores - Expérience: {experiment_name}")
+        ax.set_xticks(x)
+        ax.set_xticklabels(run_names, rotation=45)
+        ax.legend()
+        ax.grid(axis="y", alpha=0.3)
+        plt.tight_layout()
+        plt.savefig("mlflow_comparison.png", dpi=300, bbox_inches="tight")
+        print(f"\n📊 Graphique sauvegardé : mlflow_comparison.png")
+    except ImportError:
+        print("\n⚠️  matplotlib non installé, graphique non généré")
+        print("   Installation : pip install matplotlib")
+if __name__ == "__main__":
+    # Comparer tous les runs
+    df = compare_all_runs("Default")
+    if df is not None:
+        # Générer un graphique
+        plot_metrics_comparison("Default")
+        print(f"\n💡 Conseils :")
+        print(f"   - Les runs avec CV F1 élevé sont de meilleurs candidats")
+        print(
+            f"   - Vérifier que test_f1 est cohérent avec cv_f1 (pas de surapprentissage)"
+        )
+        print(
+            f"   - Favoriser les modèles avec moins de paramètres si performances similaires"
+        )

examples/03_model_registry.py ADDED Viewed

	@@ -0,0 +1,205 @@

+#!/usr/bin/env python3
+"""
+Exemple 3 : Gérer le Model Registry (versions, stages, promotion)
+Usage: python examples/03_model_registry.py
+"""
+import mlflow
+from mlflow.tracking import MlflowClient
+# Configuration
+mlflow.set_tracking_uri("sqlite:///mlflow.db")
+client = MlflowClient()
+def list_registered_models():
+    """Liste tous les modèles enregistrés dans le Registry."""
+    print("📦 Modèles enregistrés dans le Model Registry :\n")
+    models = client.search_registered_models()
+    if not models:
+        print("❌ Aucun modèle enregistré")
+        return []
+    for rm in models:
+        print(f"🔹 {rm.name}")
+        print(f"   Description : {rm.description or 'N/A'}")
+        print(f"   Création    : {rm.creation_timestamp}")
+        print(f"   Versions    : {len(rm.latest_versions)}")
+        # Lister les versions
+        versions = client.search_model_versions(f"name='{rm.name}'")
+        for mv in versions:
+            stage = mv.current_stage
+            emoji = (
+                "🚀" if stage == "Production" else "🧪" if stage == "Staging" else "📝"
+            )
+            print(f"      {emoji} Version {mv.version} - {stage}")
+            print(f"         Run ID: {mv.run_id}")
+            print(f"         Source: {mv.source}")
+        print()
+    return models
+def get_model_details(model_name="XGBoost_Employee_Turnover"):
+    """Affiche les détails d'un modèle spécifique."""
+    print(f"🔍 Détails du modèle '{model_name}' :\n")
+    try:
+        # Récupérer les infos du modèle
+        rm = client.get_registered_model(model_name)
+        from datetime import datetime
+        print(f"📦 Informations générales :")
+        print(f"   Nom         : {rm.name}")
+        print(f"   Description : {rm.description or 'N/A'}")
+        print(
+            f"   Création    : {datetime.fromtimestamp(rm.creation_timestamp / 1000).strftime('%Y-%m-%d %H:%M')}"
+        )
+        print(
+            f"   Dernière MAJ: {datetime.fromtimestamp(rm.last_updated_timestamp / 1000).strftime('%Y-%m-%d %H:%M')}"
+        )
+        # Lister toutes les versions
+        versions = client.search_model_versions(f"name='{model_name}'")
+        print(f"\n📊 Versions ({len(versions)}) :")
+        print(f"{'Version':<10} {'Stage':<15} {'Run ID':<35} {'Date':<20}")
+        print("-" * 85)
+        for mv in sorted(versions, key=lambda v: int(v.version), reverse=True):
+            date_str = datetime.fromtimestamp(mv.creation_timestamp / 1000).strftime(
+                "%Y-%m-%d %H:%M"
+            )
+            print(
+                f"{mv.version:<10} {mv.current_stage:<15} {mv.run_id:<35} {date_str:<20}"
+            )
+        # Afficher la version en production
+        prod_versions = [v for v in versions if v.current_stage == "Production"]
+        if prod_versions:
+            print(f"\n🚀 Version en production : {prod_versions[0].version}")
+        else:
+            print(f"\n⚠️  Aucune version en production")
+        return rm
+    except Exception as e:
+        print(f"❌ Erreur : {e}")
+        return None
+def promote_model(model_name, version, stage="Staging"):
+    """
+    Promouvoir une version de modèle vers un stage.
+    Args:
+        model_name: Nom du modèle
+        version: Numéro de version
+        stage: "Staging", "Production", ou "Archived"
+    """
+    print(f"🔄 Promotion du modèle '{model_name}' v{version} → {stage}...")
+    try:
+        # Transition vers le nouveau stage
+        client.transition_model_version_stage(
+            name=model_name,
+            version=version,
+            stage=stage,
+            archive_existing_versions=True,  # Archive les anciennes versions du même stage
+        )
+        print(f"✅ Modèle promu avec succès !")
+        print(f"   {model_name} v{version} est maintenant en {stage}")
+        # Afficher l'état mis à jour
+        mv = client.get_model_version(model_name, version)
+        print(f"   Status : {mv.status}")
+    except Exception as e:
+        print(f"❌ Erreur lors de la promotion : {e}")
+def load_model_from_registry(
+    model_name="XGBoost_Employee_Turnover", stage="Production"
+):
+    """Charge un modèle depuis le Registry."""
+    print(f"📦 Chargement du modèle '{model_name}' ({stage})...\n")
+    model_uri = f"models:/{model_name}/{stage}"
+    try:
+        model = mlflow.sklearn.load_model(model_uri)
+        print(f"✅ Modèle chargé avec succès")
+        print(f"   URI  : {model_uri}")
+        print(f"   Type : {type(model).__name__}")
+        return model
+    except mlflow.exceptions.MlflowException as e:
+        print(f"⚠️  Aucun modèle en {stage}")
+        print(f"   Essai avec 'latest'...")
+        # Fallback sur latest
+        model_uri = f"models:/{model_name}/latest"
+        model = mlflow.sklearn.load_model(model_uri)
+        print(f"✅ Dernière version chargée")
+        return model
+def demo_workflow():
+    """Démo du workflow complet de gestion des modèles."""
+    print("=" * 80)
+    print("🎯 DEMO - Workflow Model Registry")
+    print("=" * 80 + "\n")
+    # 1. Lister les modèles
+    print("1️⃣  Liste des modèles\n")
+    models = list_registered_models()
+    if not models:
+        print("⚠️  Aucun modèle trouvé. Exécute d'abord un training avec MLflow.")
+        return
+    # 2. Détails du premier modèle
+    model_name = models[0].name
+    print("\n" + "=" * 80)
+    print(f"2️⃣  Détails du modèle '{model_name}'\n")
+    get_model_details(model_name)
+    # 3. Exemple de promotion (commenté pour ne pas modifier l'état)
+    print("\n" + "=" * 80)
+    print("3️⃣  Promotion d'un modèle\n")
+    print("💡 Pour promouvoir la version 1 en Production :")
+    print(f"   promote_model('{model_name}', version=1, stage='Production')")
+    print("   (Décommente dans le code pour exécuter)")
+    # Décommente cette ligne pour promouvoir réellement :
+    # promote_model(model_name, version=1, stage="Production")
+    # 4. Charger un modèle
+    print("\n" + "=" * 80)
+    print("4️⃣  Chargement d'un modèle\n")
+    # Essayer de charger depuis Production
+    try:
+        model = load_model_from_registry(model_name, "Production")
+    except:
+        print("⚠️  Aucun modèle en Production, chargement de 'latest'")
+        model = load_model_from_registry(model_name, "None")
+    print("\n" + "=" * 80)
+    print("✅ Demo terminée !")
+    print("=" * 80)
+if __name__ == "__main__":
+    demo_workflow()

examples/README.md ADDED Viewed

	@@ -0,0 +1,111 @@

+# 📚 Exemples MLflow
+Ce dossier contient des exemples pratiques pour utiliser MLflow dans le projet.
+## 🚀 Exemples disponibles
+### 1. Trouver le meilleur modèle
+```bash
+python examples/01_find_best_model.py
+```
+**Ce qu'il fait** :
+- Liste les 5 meilleurs runs selon une métrique (cv_f1)
+- Affiche les hyperparamètres et métriques du meilleur
+- Charge le modèle pour vérifier qu'il fonctionne
+**Utilisation** : Parfait pour identifier quel modèle utiliser dans ton API
+---
+### 2. Comparer plusieurs modèles
+```bash
+python examples/02_compare_models.py
+```
+**Ce qu'il fait** :
+- Compare tous les runs d'une expérience
+- Affiche un tableau avec métriques et hyperparamètres
+- Génère un graphique de comparaison (si matplotlib installé)
+- Calcule des statistiques (moyenne, max, min, écart-type)
+**Utilisation** : Pour analyser tes expériences d'hyperparameter tuning
+---
+### 3. Gérer le Model Registry
+```bash
+python examples/03_model_registry.py
+```
+**Ce qu'il fait** :
+- Liste tous les modèles enregistrés
+- Affiche les versions et leurs stages
+- Démontre comment promouvoir un modèle
+- Charge un modèle depuis le Registry
+**Utilisation** : Workflow de versioning pour la production
+---
+## 📖 Guide complet
+Consulte `docs/mlflow_guide.md` pour :
+- Architecture MLflow complète
+- Intégration API FastAPI/Flask
+- Best practices
+- Workflow de retraining
+## 🎯 Workflow recommandé
+```bash
+# 1. Entraîner plusieurs modèles
+MLFLOW_TRACKING_URI=sqlite:///mlflow.db python tests/test_mlflow_quick.py
+# 2. Trouver le meilleur
+python examples/01_find_best_model.py
+# 3. Comparer tous les runs
+python examples/02_compare_models.py
+# 4. Gérer le Registry
+python examples/03_model_registry.py
+```
+```python
+# 5. Promouvoir en production (dans le code Python)
+from mlflow.tracking import MlflowClient
+client = MlflowClient()
+client.transition_model_version_stage(
+    name="XGBoost_Employee_Turnover",
+    version=1,
+    stage="Production"
+)
+```
+## 🔗 Intégration API
+Une fois le meilleur modèle identifié :
+```python
+import mlflow
+# Option A : Charger par run_id
+model = mlflow.sklearn.load_model("runs:/RUN_ID/model")
+# Option B : Charger depuis le Registry
+model = mlflow.sklearn.load_model("models:/XGBoost_Employee_Turnover/Production")
+# Prédiction
+predictions = model.predict(X_new)
+```
+## 💡 Tips
+- **Métrique principale** : `cv_f1` (F1-score en cross-validation)
+- **Métriques secondaires** : `test_precision`, `test_recall`, `test_f1`
+- **Vérifier** : Que test_f1 ≈ cv_f1 (pas de surapprentissage)
+- **Favoriser** : Modèles simples si performances similaires
+## 🌐 MLflow UI
+Pour visualiser graphiquement :
+```bash
+mlflow ui --backend-store-uri sqlite:///mlflow.db --port 5000
+```
+Puis ouvre http://localhost:5000

ml_model/preprocess.py CHANGED Viewed

@@ -1,8 +1,8 @@
-import pandas as pd
 import numpy as np
-from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
-from scipy.stats.mstats import winsorize
 from scipy import stats
 def load_raw_data(
@@ -97,9 +97,16 @@ def preprocess_data(raw_data_paths=None):
     )
     # Assemblage
     df_engineered = pd.concat(
         [
             central_df[quantitative_cols],
             encoded_non_ord,
             encoded_ord,
             central_df["a_quitte_l_entreprise"],
@@ -107,17 +114,8 @@ def preprocess_data(raw_data_paths=None):
         axis=1,
     )  # Inclut cible
-    # Scaling (quantitatives + ordinal)
-    cols_to_scale = (
-        quantitative_cols.tolist()
-        + cat_ord
-        + [
-            "revenu_par_anciennete",
-            "experience_par_anciennete",
-            "satisfaction_moyenne",
-            "promo_par_anciennete",
-        ]
-    )
     scaler = StandardScaler()
     df_engineered[cols_to_scale] = scaler.fit_transform(df_engineered[cols_to_scale])

 import numpy as np
+import pandas as pd
 from scipy import stats
+from scipy.stats.mstats import winsorize
+from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
 def load_raw_data(
     )
     # Assemblage
+    engineered_cols = [
+        "revenu_par_anciennete",
+        "experience_par_anciennete",
+        "satisfaction_moyenne",
+        "promo_par_anciennete",
+    ]
     df_engineered = pd.concat(
         [
             central_df[quantitative_cols],
+            central_df[engineered_cols],
             encoded_non_ord,
             encoded_ord,
             central_df["a_quitte_l_entreprise"],
         axis=1,
     )  # Inclut cible
+    # Scaling (quantitatives + ordinal + engineered)
+    cols_to_scale = quantitative_cols.tolist() + engineered_cols + cat_ord
     scaler = StandardScaler()
     df_engineered[cols_to_scale] = scaler.fit_transform(df_engineered[cols_to_scale])

ml_model/train_model.py CHANGED Viewed

@@ -1,9 +1,11 @@
-from sklearn.model_selection import train_test_split, RandomizedSearchCV
-from sklearn.metrics import classification_report, confusion_matrix
 from imblearn.over_sampling import SMOTE
 from imblearn.pipeline import Pipeline as ImbPipeline
 from xgboost import XGBClassifier
-from scipy.stats import uniform, randint
 def train_model(X, y):
@@ -42,17 +44,42 @@ def train_model(X, y):
         n_jobs=-1,
         random_state=42,
     )
-    random.fit(X_train, y_train)
-    best_model = random.best_estimator_  # type: ignore[assignment]
-    best_params = random.best_params_
-    cv_f1 = random.best_score_
-    # Éval test (pédagogique)
-    y_pred = best_model.predict(X_test)  # type: ignore[attr-defined]
-    print("Meilleurs params:", best_params)
-    print("Meilleur CV F1:", cv_f1)
-    print(classification_report(y_test, y_pred))
-    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
     return best_model, best_params, cv_f1

+import mlflow
+import mlflow.sklearn
 from imblearn.over_sampling import SMOTE
 from imblearn.pipeline import Pipeline as ImbPipeline
+from scipy.stats import randint, uniform
+from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.model_selection import RandomizedSearchCV, train_test_split
 from xgboost import XGBClassifier
 def train_model(X, y):
         n_jobs=-1,
         random_state=42,
     )
+    # Ajout MLflow : Encapsule training pour tracking auto (./mlruns)
+    with mlflow.start_run(run_name="XGBoost_Tuning"):
+        random.fit(X_train, y_train)
+        best_model = random.best_estimator_  # type: ignore[assignment]
+        best_params = random.best_params_
+        cv_f1 = random.best_score_
+        mlflow.log_params(
+            best_params
+        )  # Choix : Log tous hyperparams pour reproductibilité.
+        mlflow.log_metric(
+            "cv_f1", cv_f1
+        )  # Choix : Métrique clé (F1 CV pour déséquilibre).
+        y_pred = best_model.predict(X_test)  # type: ignore[attr-defined]
+        report = classification_report(y_test, y_pred, output_dict=True)  # type: ignore[arg-type]
+        # Type ignore car classification_report avec output_dict=True retourne dict, pas str
+        mlflow.log_metric("test_precision", float(report["1"]["precision"]))  # type: ignore[index]
+        mlflow.log_metric("test_recall", float(report["1"]["recall"]))  # type: ignore[index]
+        mlflow.log_metric("test_f1", float(report["1"]["f1-score"]))  # type: ignore[index]
+        # Log model et récupère URI pour l'enregistrement
+        model_info = mlflow.sklearn.log_model(best_model, "model")  # type: ignore[attr-defined]
+        # Enregistre dans Model Registry pour apparaître dans la page "Models"
+        mlflow.register_model(
+            model_uri=model_info.model_uri, name="XGBoost_Employee_Turnover"
+        )
+        # Éval test (pédagogique)
+        print("Meilleurs params:", best_params)
+        print("Meilleur CV F1:", cv_f1)
+        print(classification_report(y_test, y_pred))
+        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
     return best_model, best_params, cv_f1

pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Projet OpenClassRoom mise en API d'un modèle ML"
 authors = ["chaton59 <v.trouillez@gmail.com>"]
 readme = "README.md"
-packages = [{include = "src"}]
 [tool.poetry.dependencies]
 python = "^3.12"

 description = "Projet OpenClassRoom mise en API d'un modèle ML"
 authors = ["chaton59 <v.trouillez@gmail.com>"]
 readme = "README.md"
+packages = [{include = "src"}, {include = "ml_model"}]
 [tool.poetry.dependencies]
 python = "^3.12"

requirements.txt CHANGED Viewed

@@ -36,11 +36,12 @@ uvicorn==0.38.0 ; python_version >= "3.12"
 uvloop==0.22.1 ; sys_platform != "win32" and sys_platform != "cygwin" and platform_python_implementation != "PyPy" and python_version >= "3.12"
 watchfiles==1.1.1 ; python_version >= "3.12"
 websockets==15.0.1 ; python_version >= "3.12"
-scikit-learn==1.3.2
-xgboost==2.0.3
-imbalanced-learn==0.11.0
-scipy==1.11.4
-numpy==1.24.3
-pandas==2.0.3
-joblib==1.3.2
-huggingface-hub==0.23.4

 uvloop==0.22.1 ; sys_platform != "win32" and sys_platform != "cygwin" and platform_python_implementation != "PyPy" and python_version >= "3.12"
 watchfiles==1.1.1 ; python_version >= "3.12"
 websockets==15.0.1 ; python_version >= "3.12"
+scikit-learn==1.6.1
+xgboost==2.1.4
+imbalanced-learn==0.13.0
+scipy==1.14.1
+numpy==2.0.2
+pandas==2.2.3
+joblib==1.4.2
+mlflow==3.8.0
+huggingface-hub==0.26.5

scripts/fix_lint.py ADDED Viewed

	@@ -0,0 +1,68 @@

+#!/usr/bin/env python3
+"""
+Script pour corriger automatiquement les problèmes de lint du projet.
+Usage: python scripts/fix_lint.py
+"""
+import subprocess
+import sys
+from pathlib import Path
+def run_command(cmd, description):
+    """Exécute une commande et affiche le résultat."""
+    print(f"\n{'=' * 60}")
+    print(f"🔧 {description}")
+    print(f"{'=' * 60}")
+    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+    if result.stdout:
+        print(result.stdout)
+    if result.stderr:
+        print(result.stderr, file=sys.stderr)
+    return result.returncode
+def main():
+    """Corrige tous les problèmes de lint."""
+    project_root = Path(__file__).parent.parent
+    print(f"📁 Projet : {project_root}")
+    # 1. Formater avec Black
+    returncode = run_command(
+        f"cd {project_root} && .venv/bin/black ml_model/ tests/ examples/ --line-length 88",
+        "Formatage avec Black",
+    )
+    # 2. Trier les imports avec isort
+    returncode += run_command(
+        f"cd {project_root} && .venv/bin/python -m isort ml_model/ tests/ examples/ --profile black",
+        "Tri des imports avec isort",
+    )
+    # 3. Vérifier avec Flake8
+    returncode += run_command(
+        f"cd {project_root} && .venv/bin/python -m flake8 ml_model/ tests/ examples/ --max-line-length=88 --extend-ignore=E203,W503",
+        "Vérification avec Flake8",
+    )
+    # 4. Lancer les tests
+    returncode += run_command(
+        f"cd {project_root} && .venv/bin/python -m pytest tests/test_basic.py -v",
+        "Exécution des tests",
+    )
+    print(f"\n{'=' * 60}")
+    if returncode == 0:
+        print("✅ Tous les checks passent !")
+    else:
+        print("⚠️  Certains problèmes subsistent. Vérifiez les logs ci-dessus.")
+    print(f"{'=' * 60}\n")
+    return returncode
+if __name__ == "__main__":
+    sys.exit(main())

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,8 @@

+"""Pytest configuration that adds the project root directory to ``sys.path``."""
+import sys
+from pathlib import Path
+# The project root is the parent of the directory containing this file.
+# It is inserted at the front of sys.path so it takes precedence during imports.
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))

tests/test_mlflow_local.py ADDED Viewed

	@@ -0,0 +1,54 @@

+#!/usr/bin/env python3
+"""
+Script de test local pour train_model.py avec MLflow.
+Utilise preprocess_data pour charger et préparer les données.
+"""
+import os
+import mlflow
+from ml_model.preprocess import preprocess_data
+from ml_model.train_model import train_model
+# Configure MLflow pour utiliser SQLite (nécessaire pour Model Registry)
+mlflow.set_tracking_uri("sqlite:///mlflow.db")
+if __name__ == "__main__":
+    print("🔄 Chargement et prétraitement des données...")
+    # Chemins des fichiers de données
+    data_paths = {
+        "sondage_path": "data/extrait_sondage.csv",
+        "eval_path": "data/extrait_eval.csv",
+        "sirh_path": "data/extrait_sirh.csv",
+    }
+    X, y, scaler, onehot, ordinal = preprocess_data(raw_data_paths=data_paths)
+    print(f"✅ Données prétraitées :")
+    print(f"   X shape: {X.shape}, y shape: {y.shape}")
+    print(f"   Distribution y: {y.value_counts().to_dict()}")
+    print("\n🚀 Lancement de l'entraînement avec MLflow tracking...")
+    print("   (Cela peut prendre quelques minutes avec n_iter=1000...)\n")
+    best_model, best_params, cv_f1 = train_model(X, y)
+    print(f"\n✅ Entraînement terminé !")
+    print(f"   CV F1-score: {cv_f1:.4f}")
+    print("\n📁 Vérification des artifacts MLflow dans ./mlruns :")
+    if os.path.exists("./mlruns"):
+        for root, dirs, files in os.walk("./mlruns"):
+            level = root.replace("./mlruns", "").count(os.sep)
+            indent = " " * 2 * level
+            print(f"{indent}{os.path.basename(root)}/")
+            subindent = " " * 2 * (level + 1)
+            for file in files[:5]:  # Limite à 5 fichiers par dossier
+                print(f"{subindent}{file}")
+            if len(files) > 5:
+                print(f"{subindent}... ({len(files) - 5} autres fichiers)")
+    else:
+        print("   ⚠️ Dossier ./mlruns non trouvé")
+    print("\n💡 Pour visualiser les runs MLflow, exécutez :")
+    print("   mlflow ui")
+    print("   Puis ouvrez http://localhost:5000 dans votre navigateur")

tests/test_mlflow_quick.py ADDED Viewed

	@@ -0,0 +1,130 @@

+#!/usr/bin/env python3
+"""
+Test rapide MLflow avec n_iter=10 au lieu de 1000.
+"""
+import os
+import mlflow
+from sklearn.model_selection import RandomizedSearchCV
+from ml_model.preprocess import preprocess_data
+from ml_model.train_model import train_model
+# Configure MLflow to use SQLite (required for the Model Registry)
+mlflow.set_tracking_uri("sqlite:///mlflow.db")
+# Leftover from a temporary monkeypatch for the quick test.
+# NOTE(review): original_train is never referenced again in this file and
+# train_module.train_model is never reassigned — confirm and consider removing.
+import ml_model.train_model as train_module
+original_train = train_module.train_model
+def quick_train(X, y):
+    """Quick variant of the training run: 10 search iterations and 3-fold CV.
+    Splits the data 80/20 (stratified on ``y``), tunes a SMOTE + XGBClassifier
+    pipeline with ``RandomizedSearchCV`` scored on F1, logs the best parameters
+    and metrics to an MLflow run named "XGBoost_Quick_Test", logs the model and
+    registers it as "XGBoost_Employee_Turnover", then prints an evaluation on
+    the held-out split.
+    Args:
+        X: Feature table. Presumably the DataFrame returned by preprocess_data
+            — confirm against the caller.
+        y: Binary target compared against 0 and 1 below.
+    Returns:
+        Tuple ``(best_model, best_params, cv_f1)`` taken from the fitted search:
+        ``best_estimator_``, ``best_params_`` and ``best_score_``.
+    """
+    import mlflow
+    import mlflow.sklearn
+    from imblearn.over_sampling import SMOTE
+    from imblearn.pipeline import Pipeline as ImbPipeline
+    from scipy.stats import randint, uniform
+    from sklearn.metrics import classification_report, confusion_matrix
+    from sklearn.model_selection import RandomizedSearchCV, train_test_split
+    from xgboost import XGBClassifier
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42, stratify=y
+    )
+    # Negative/positive count ratio, computed on the full y (not only the train
+    # split); it sets the width of the scale_pos_weight search range below.
+    ratio = sum(y == 0) / sum(y == 1)
+    pipeline = ImbPipeline(
+        [("sampler", SMOTE(random_state=42)), ("clf", XGBClassifier(random_state=42))]
+    )
+    # Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale],
+    # so e.g. scale_pos_weight is drawn from [1, 1 + ratio].
+    param_dist = {
+        "clf__max_depth": randint(3, 15),
+        "clf__n_estimators": randint(100, 1000),
+        "clf__learning_rate": uniform(0.001, 0.5),
+        "clf__subsample": uniform(0.4, 0.6),
+        "clf__reg_alpha": uniform(0, 3),
+        "clf__gamma": uniform(0, 10),
+        "clf__colsample_bytree": uniform(0.5, 0.5),
+        "clf__min_child_weight": randint(1, 15),
+        "clf__scale_pos_weight": uniform(1, ratio),
+        "clf__tree_method": ["auto", "hist"],
+    }
+    random = RandomizedSearchCV(
+        pipeline,
+        param_dist,
+        n_iter=10,  # 🚀 Quick test!
+        cv=3,  # Fewer CV folds as well
+        scoring="f1",
+        n_jobs=-1,
+        random_state=42,
+    )
+    with mlflow.start_run(run_name="XGBoost_Quick_Test"):
+        random.fit(X_train, y_train)
+        best_model = random.best_estimator_  # type: ignore[assignment]
+        best_params = random.best_params_
+        cv_f1 = random.best_score_
+        mlflow.log_params(best_params)
+        mlflow.log_metric("cv_f1", cv_f1)
+        y_pred = best_model.predict(X_test)  # type: ignore[attr-defined]
+        report = classification_report(y_test, y_pred, output_dict=True)  # type: ignore[arg-type]
+        # Metrics of the class keyed "1" in the report.
+        # NOTE(review): assumes the positive label is rendered as "1" — confirm.
+        mlflow.log_metric("test_precision", float(report["1"]["precision"]))  # type: ignore[index]
+        mlflow.log_metric("test_recall", float(report["1"]["recall"]))  # type: ignore[index]
+        mlflow.log_metric("test_f1", float(report["1"]["f1-score"]))  # type: ignore[index]
+        # Log the model, then register it in the Model Registry
+        model_info = mlflow.sklearn.log_model(best_model, "model")  # type: ignore[attr-defined]
+        mlflow.register_model(
+            model_uri=model_info.model_uri, name="XGBoost_Employee_Turnover"
+        )
+        # Console summary of the search result and the held-out evaluation
+        print("Meilleurs params:", best_params)
+        print("Meilleur CV F1:", cv_f1)
+        print(classification_report(y_test, y_pred))
+        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
+    return best_model, best_params, cv_f1
+if __name__ == "__main__":
+    print("🔄 Chargement et prétraitement des données...")
+    data_paths = {
+        "sondage_path": "data/extrait_sondage.csv",
+        "eval_path": "data/extrait_eval.csv",
+        "sirh_path": "data/extrait_sirh.csv",
+    }
+    X, y, scaler, onehot, ordinal = preprocess_data(raw_data_paths=data_paths)
+    print(f"✅ Données prétraitées :")
+    print(f"   X shape: {X.shape}, y shape: {y.shape}")
+    print(f"   Distribution y: {y.value_counts().to_dict()}")
+    print("\n🚀 Lancement du test rapide (n_iter=10, cv=3)...\n")
+    best_model, best_params, cv_f1 = quick_train(X, y)
+    print(f"\n✅ Test terminé ! CV F1-score: {cv_f1:.4f}")
+    print("\n📁 Vérification des artifacts MLflow dans ./mlruns :")
+    if os.path.exists("./mlruns"):
+        for root, dirs, files in os.walk("./mlruns"):
+            level = root.replace("./mlruns", "").count(os.sep)
+            if level < 3:  # Limite la profondeur
+                indent = " " * 2 * level
+                print(f"{indent}{os.path.basename(root)}/")
+                if level == 2:  # Affiche fichiers dans les runs
+                    subindent = " " * 2 * (level + 1)
+                    for file in files[:3]:
+                        print(f"{subindent}{file}")
+                    if len(files) > 3:
+                        print(f"{subindent}... (+{len(files) - 3} fichiers)")
+    print("\n💡 Pour visualiser les runs MLflow, exécutez :")
+    print("   mlflow ui")
+    print("   Puis ouvrez http://localhost:5000")