Spaces:

ASI-Engineer
/

oc_p5-dev

Sleeping

App Files Files Community

ASI-Engineer committed on Dec 25, 2025

Commit

4570c28

verified ·

1 Parent(s): aac75d5

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

docs/mlflow_guide.md +51 -185
pyproject.toml +2 -9
requirements.txt +7 -39
tests/test_basic.py +30 -2

docs/mlflow_guide.md CHANGED Viewed

@@ -4,7 +4,7 @@
 1. [Workflow complet MLflow](#workflow-complet)
 2. [Comparer plusieurs runs](#comparer-runs)
 3. [Trouver le meilleur modèle](#meilleur-modèle)
-4. [Charger un modèle pour l'API](#api-integration)
 5. [Best Practices](#best-practices)
 ---
@@ -14,7 +14,7 @@
 ### 🎯 Concept clé
 MLflow suit ce workflow :
 ```
-Entraînement → Tracking → Registry → Déploiement → API
 ```
 ### Architecture actuelle du projet
@@ -26,10 +26,8 @@ mlflow.db (SQLite)
 MLflow UI (http://localhost:5000)
     ↓ (select best model)
 Model Registry (XGBoost_Employee_Turnover)
-    ↓ (load)
-API FastAPI/Flask
-    ↓ (serve)
-Prédictions
 ```
 ---
@@ -88,10 +86,10 @@ for config in configs:
 ## 3. Trouver le meilleur modèle
-### Option A : Via l'API MLflow (recommandé pour l'API)
 ```python
-# api/get_best_model.py
 import mlflow
 from mlflow.tracking import MlflowClient
@@ -117,8 +115,8 @@ def get_best_model_from_experiment(experiment_name="Default", metric="cv_f1"):
     # Rechercher tous les runs de l'expérience
     runs = client.search_runs(
         experiment_ids=[experiment.experiment_id],
-        order_by=[f"metrics.{metric} DESC"],  # Trier par métrique décroissante
-        max_results=1  # Prendre seulement le meilleur
     )
     if not runs:
@@ -128,179 +126,43 @@ def get_best_model_from_experiment(experiment_name="Default", metric="cv_f1"):
     print(f"🏆 Meilleur modèle trouvé:")
     print(f"   Run ID: {best_run.info.run_id}")
     print(f"   {metric}: {best_run.data.metrics.get(metric, 'N/A')}")
-    print(f"   Date: {best_run.info.start_time}")
     return best_run.info.run_id
-# Exemple d'utilisation
-if __name__ == "__main__":
-    best_run_id = get_best_model_from_experiment("Default", "cv_f1")
-    # Charger le modèle
-    model_uri = f"runs:/{best_run_id}/model"
-    model = mlflow.sklearn.load_model(model_uri)
-    print(f"✅ Modèle chargé : {type(model)}")
-```
-### Option B : Via le Model Registry (pour production)
-```python
-# api/load_production_model.py
-import mlflow
-mlflow.set_tracking_uri("sqlite:///mlflow.db")
-def load_production_model(model_name="XGBoost_Employee_Turnover", stage="Production"):
-    """
-    Charge le modèle en production depuis le Model Registry.
-    Args:
-        model_name: Nom du modèle dans le Registry
-        stage: Stage du modèle ("Staging", "Production", "Archived")
-    Returns:
-        Modèle chargé
-    """
-    model_uri = f"models:/{model_name}/{stage}"
-    try:
-        model = mlflow.sklearn.load_model(model_uri)
-        print(f"✅ Modèle '{model_name}' ({stage}) chargé")
-        return model
-    except Exception as e:
-        print(f"⚠️ Erreur : {e}")
-        print(f"💡 Astuce : Promouvoir une version en '{stage}' dans MLflow UI")
-        # Fallback : Charger la dernière version
-        model_uri = f"models:/{model_name}/latest"
-        model = mlflow.sklearn.load_model(model_uri)
-        print(f"✅ Fallback : Dernière version chargée")
-        return model
-# Utilisation
-if __name__ == "__main__":
-    model = load_production_model()
 ```
 ---
-## 4. API Integration - Exemple complet
-### Créer une API Flask/FastAPI avec MLflow
 ```python
-# api/app.py
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-import mlflow
-import pandas as pd
-import numpy as np
-# Configuration
-mlflow.set_tracking_uri("sqlite:///mlflow.db")
-app = FastAPI(title="Employee Turnover Prediction API")
-# Charger le modèle au démarrage
-MODEL_NAME = "XGBoost_Employee_Turnover"
-model = None
-@app.on_event("startup")
-def load_model():
-    global model
-    try:
-        # Charger le dernier modèle du Registry
-        model_uri = f"models:/{MODEL_NAME}/latest"
-        model = mlflow.sklearn.load_model(model_uri)
-        print(f"✅ Modèle chargé : {MODEL_NAME}")
-    except Exception as e:
-        print(f"❌ Erreur chargement modèle : {e}")
-        raise
-# Schéma de requête
-class PredictionRequest(BaseModel):
-    features: list[float]  # Liste de 50 features (après prétraitement)
-    class Config:
-        json_schema_extra = {
-            "example": {
-                "features": [0.5, 1.2, -0.3, 0.8] + [0.0] * 46  # 50 features
-            }
-        }
-class PredictionResponse(BaseModel):
-    prediction: int  # 0 ou 1
-    probability: float  # Probabilité de départ (classe 1)
-    model_version: str
-# Endpoint de prédiction
-@app.post("/predict", response_model=PredictionResponse)
-def predict(request: PredictionRequest):
-    """
-    Prédit si un employé va quitter l'entreprise.
-    - **features**: Liste de 50 features numériques (après prétraitement)
-    - Retourne la prédiction (0=reste, 1=part) et la probabilité
-    """
-    if model is None:
-        raise HTTPException(status_code=503, detail="Modèle non chargé")
-    try:
-        # Convertir en DataFrame
-        X = pd.DataFrame([request.features])
-        # Prédiction
-        prediction = int(model.predict(X)[0])
-        probability = float(model.predict_proba(X)[0][1])
-        return PredictionResponse(
-            prediction=prediction,
-            probability=round(probability, 4),
-            model_version=MODEL_NAME
-        )
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Erreur prédiction : {str(e)}")
-# Endpoint de santé
-@app.get("/health")
-def health():
-    return {
-        "status": "ok",
-        "model_loaded": model is not None,
-        "model_name": MODEL_NAME
-    }
-# Endpoint pour lister les modèles disponibles
-@app.get("/models")
-def list_models():
-    from mlflow.tracking import MlflowClient
-    client = MlflowClient()
-    models = []
-    for rm in client.search_registered_models():
-        latest_versions = rm.latest_versions
-        models.append({
-            "name": rm.name,
-            "versions": len(latest_versions),
-            "latest_version": latest_versions[0].version if latest_versions else None
-        })
-    return {"models": models}
-# Lancer avec : uvicorn api.app:app --reload
-```
-**Tester l'API** :
-```bash
-# Installer FastAPI
-pip install fastapi uvicorn
-# Lancer le serveur
-uvicorn api.app:app --reload --port 8000
-# Tester
-curl -X POST http://localhost:8000/predict \
-  -H "Content-Type: application/json" \
-  -d '{"features": [0.5, 1.2, -0.3] + [0.0] * 47}'
 ```
 ---
@@ -314,7 +176,6 @@ curl -X POST http://localhost:8000/predict \
 # 1. Entraîner plusieurs modèles → Experiment "Development"
 # 2. Sélectionner le meilleur → Promouvoir en "Staging"
 # 3. Valider en staging → Promouvoir en "Production"
-# 4. API charge toujours "Production"
 from mlflow.tracking import MlflowClient
@@ -339,10 +200,8 @@ with mlflow.start_run():
     mlflow.log_param("n_features", X.shape[1])
     mlflow.log_param("class_imbalance_ratio", sum(y==0)/sum(y==1))
-    # Log artifacts (graphiques, etc.)
     import matplotlib.pyplot as plt
-    # Confusion matrix plot
     plt.figure()
     # ... plot code ...
     plt.savefig("confusion_matrix.png")
@@ -359,7 +218,6 @@ with mlflow.start_run():
 ```python
 # scripts/retrain_model.py
 import mlflow
-from datetime import datetime
 def retrain_and_compare():
     """Entraîne un nouveau modèle et le compare à la production."""
@@ -381,8 +239,6 @@ def retrain_and_compare():
     # 4. Si meilleur, promouvoir automatiquement
     if new_f1 > prod_f1:
         print("✅ Nouveau modèle meilleur ! Promotion en Staging...")
-        # Enregistrer dans Registry
-        # ... code de promotion ...
     else:
         print("⚠️ Nouveau modèle moins bon, conservation du modèle actuel")
 ```
@@ -397,16 +253,26 @@ def retrain_and_compare():
 ---
-## 🎯 Prochaines étapes pour ton projet
-1. ✅ **MLflow configuré** - Tracking local avec SQLite
-2. ✅ **Modèle enregistré** - XGBoost_Employee_Turnover v1
-3. 🔄 **TODO: Créer l'API** - FastAPI avec chargement du modèle
-4. 🔄 **TODO: Tester comparaison** - Multiple runs avec hyperparams différents
-5. 🔄 **TODO: CI/CD** - Auto-retraining et déploiement
-**Commande pour démarrer l'API** :
 ```bash
-# Créer api/app.py avec le code ci-dessus
-uvicorn api.app:app --reload --port 8000
 ```

 1. [Workflow complet MLflow](#workflow-complet)
 2. [Comparer plusieurs runs](#comparer-runs)
 3. [Trouver le meilleur modèle](#meilleur-modèle)
+4. [Model Registry](#model-registry)
 5. [Best Practices](#best-practices)
 ---
 ### 🎯 Concept clé
 MLflow suit ce workflow :
 ```
+Entraînement → Tracking → Registry → Sélection du meilleur modèle
 ```
 ### Architecture actuelle du projet
 MLflow UI (http://localhost:5000)
     ↓ (select best model)
 Model Registry (XGBoost_Employee_Turnover)
+    ↓ (versions & stages)
+Modèle prêt pour déploiement
 ```
 ---
 ## 3. Trouver le meilleur modèle
+### Via l'API MLflow
 ```python
+# examples/01_find_best_model.py (déjà créé dans le projet)
 import mlflow
 from mlflow.tracking import MlflowClient
     # Rechercher tous les runs de l'expérience
     runs = client.search_runs(
         experiment_ids=[experiment.experiment_id],
+        order_by=[f"metrics.{metric} DESC"],
+        max_results=1
     )
     if not runs:
     print(f"🏆 Meilleur modèle trouvé:")
     print(f"   Run ID: {best_run.info.run_id}")
     print(f"   {metric}: {best_run.data.metrics.get(metric, 'N/A')}")
     return best_run.info.run_id
+# Charger le modèle
+best_run_id = get_best_model_from_experiment("Default", "cv_f1")
+model_uri = f"runs:/{best_run_id}/model"
+model = mlflow.sklearn.load_model(model_uri)
 ```
 ---
+## 4. Model Registry
+### Gérer les versions de modèles
 ```python
+# examples/03_model_registry.py (déjà créé dans le projet)
+from mlflow.tracking import MlflowClient
+client = MlflowClient()
+model_name = "XGBoost_Employee_Turnover"
+# Lister les versions
+versions = client.search_model_versions(f"name='{model_name}'")
+for v in versions:
+    print(f"Version {v.version}: {v.current_stage}")
+# Promouvoir en Production
+client.transition_model_version_stage(
+    name=model_name,
+    version=1,
+    stage="Production",
+    archive_existing_versions=True
+)
+# Charger depuis le Registry
+model = mlflow.sklearn.load_model(f"models:/{model_name}/Production")
 ```
 ---
 # 1. Entraîner plusieurs modèles → Experiment "Development"
 # 2. Sélectionner le meilleur → Promouvoir en "Staging"
 # 3. Valider en staging → Promouvoir en "Production"
 from mlflow.tracking import MlflowClient
     mlflow.log_param("n_features", X.shape[1])
     mlflow.log_param("class_imbalance_ratio", sum(y==0)/sum(y==1))
+    # Log artifacts (graphiques)
     import matplotlib.pyplot as plt
     plt.figure()
     # ... plot code ...
     plt.savefig("confusion_matrix.png")
 ```python
 # scripts/retrain_model.py
 import mlflow
 def retrain_and_compare():
     """Entraîne un nouveau modèle et le compare à la production."""
     # 4. Si meilleur, promouvoir automatiquement
     if new_f1 > prod_f1:
         print("✅ Nouveau modèle meilleur ! Promotion en Staging...")
     else:
         print("⚠️ Nouveau modèle moins bon, conservation du modèle actuel")
 ```
 ---
+## 🎯 Utilisation du projet
+### Entraîner un modèle
+```bash
+python ml_model/train_model.py
+```
+### Lancer MLflow UI
 ```bash
+mlflow ui --backend-store-uri sqlite:///mlflow.db --port 5000
+```
+### Exemples disponibles
+```bash
+# Trouver le meilleur modèle
+python examples/01_find_best_model.py
+# Comparer tous les runs
+python examples/02_compare_models.py
+# Gérer le Model Registry
+python examples/03_model_registry.py
 ```

pyproject.toml CHANGED Viewed

@@ -1,19 +1,13 @@
 [tool.poetry]
 name = "oc-p5"
 version = "0.1.0"
-description = "Projet OpenClassRoom mise en API d'un modèle ML"
 authors = ["chaton59 <v.trouillez@gmail.com>"]
 readme = "README.md"
-packages = [{include = "src"}, {include = "ml_model"}]
 [tool.poetry.dependencies]
 python = "^3.12"
-fastapi = "^0.123.0"
-uvicorn = { extras = ["standard"], version = "^0.38.0" }
-sqlalchemy = "^2.0.0"
-pydantic = "^2.0.0"
-psycopg = "^3.2.0"
-black = "^25.12.0"
 mlflow = "^3.8.0"
 scikit-learn = "1.6.1"
 imbalanced-learn = "0.13.0"
@@ -22,7 +16,6 @@ scipy = "^1.14.0"
 numpy = "^2.0.0"
 pandas = "^2.2.0"
 joblib = "^1.4.0"
-huggingface-hub = "^0.26.0"
 [tool.poetry.group.dev.dependencies]
 pytest = "^9.0.0"

 [tool.poetry]
 name = "oc-p5"
 version = "0.1.0"
+description = "Projet OpenClassRoom - Modèle ML de prédiction du turnover avec MLflow"
 authors = ["chaton59 <v.trouillez@gmail.com>"]
 readme = "README.md"
+packages = [{include = "ml_model"}]
 [tool.poetry.dependencies]
 python = "^3.12"
 mlflow = "^3.8.0"
 scikit-learn = "1.6.1"
 imbalanced-learn = "0.13.0"
 numpy = "^2.0.0"
 pandas = "^2.2.0"
 joblib = "^1.4.0"
 [tool.poetry.group.dev.dependencies]
 pytest = "^9.0.0"

requirements.txt CHANGED Viewed

@@ -1,41 +1,10 @@
-annotated-doc==0.0.4 ; python_version >= "3.12"
-annotated-types==0.7.0 ; python_version >= "3.12"
-anyio==4.12.0 ; python_version >= "3.12"
-black==25.11.0 ; python_version >= "3.12"
-click==8.3.1 ; python_version >= "3.12"
-colorama==0.4.6 ; (platform_system == "Windows" or sys_platform == "win32") and python_version >= "3.12"
-coverage==7.12.0 ; python_version >= "3.12"
-fastapi==0.123.4 ; python_version >= "3.12"
-flake8==7.3.0 ; python_version >= "3.12"
-greenlet==3.2.4 ; python_version >= "3.12" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32")
-h11==0.16.0 ; python_version >= "3.12"
-httptools==0.7.1 ; python_version >= "3.12"
-idna==3.11 ; python_version >= "3.12"
-iniconfig==2.3.0 ; python_version >= "3.12"
-mccabe==0.7.0 ; python_version >= "3.12"
-mypy-extensions==1.1.0 ; python_version >= "3.12"
-packaging==25.0 ; python_version >= "3.12"
-pathspec==0.12.1 ; python_version >= "3.12"
-platformdirs==4.5.0 ; python_version >= "3.12"
-pluggy==1.6.0 ; python_version >= "3.12"
-pycodestyle==2.14.0 ; python_version >= "3.12"
-pydantic-core==2.41.5 ; python_version >= "3.12"
-pydantic==2.12.5 ; python_version >= "3.12"
-pyflakes==3.4.0 ; python_version >= "3.12"
-pygments==2.19.2 ; python_version >= "3.12"
-pytest-cov==7.0.0 ; python_version >= "3.12"
-pytest==9.0.1 ; python_version >= "3.12"
-python-dotenv==1.2.1 ; python_version >= "3.12"
-pytokens==0.3.0 ; python_version >= "3.12"
-pyyaml==6.0.3 ; python_version >= "3.12"
-sqlalchemy==2.0.44 ; python_version >= "3.12"
-starlette==0.50.0 ; python_version >= "3.12"
-typing-extensions==4.15.0 ; python_version >= "3.12"
-typing-inspection==0.4.2 ; python_version >= "3.12"
-uvicorn==0.38.0 ; python_version >= "3.12"
-uvloop==0.22.1 ; sys_platform != "win32" and sys_platform != "cygwin" and platform_python_implementation != "PyPy" and python_version >= "3.12"
-watchfiles==1.1.1 ; python_version >= "3.12"
-websockets==15.0.1 ; python_version >= "3.12"
 scikit-learn==1.6.1
 xgboost==2.1.4
 imbalanced-learn==0.13.0
@@ -44,4 +13,3 @@ numpy==2.0.2
 pandas==2.2.3
 joblib==1.4.2
 mlflow==3.8.0
-huggingface-hub==0.26.5

+# Dev dependencies (formatting, linting, tests)
+black==25.11.0
+flake8==7.3.0
+pytest==9.0.1
+pytest-cov==7.0.0
+# ML dependencies
 scikit-learn==1.6.1
 xgboost==2.1.4
 imbalanced-learn==0.13.0
 pandas==2.2.3
 joblib==1.4.2
 mlflow==3.8.0

tests/test_basic.py CHANGED Viewed

@@ -1,3 +1,31 @@
 def test_pipeline_placeholder():
-    """Test basique pour CI/CD (POC étape 2 ; évolutif étape 5)."""
-    assert True  # Succès simple ; ajoute cas ML (Pydantic/validation) plus tard

+"""Tests basiques pour le pipeline ML."""
+from pathlib import Path
 def test_pipeline_placeholder():
+    """Test basique pour CI/CD."""
+    assert True
+def test_data_files_exist():
+    """Vérifie que les fichiers de données existent."""
+    data_dir = Path("data")
+    assert (data_dir / "extrait_sondage.csv").exists()
+    assert (data_dir / "extrait_eval.csv").exists()
+    assert (data_dir / "extrait_sirh.csv").exists()
+def test_preprocess_imports():
+    """Vérifie que les imports ML fonctionnent."""
+    from ml_model.preprocess import preprocess_data, load_raw_data
+    assert preprocess_data is not None
+    assert load_raw_data is not None
+def test_train_imports():
+    """Vérifie que le module d'entraînement s'importe."""
+    from ml_model.train_model import train_model
+    assert train_model is not None