Committed by GitHub Actions

Commit 271ec19 · 1 parent: decf87a

Auto-deploy from GitHub Actions

This view is limited to 50 files because the commit contains too many changes.

Files changed (50):
  1. hf_space/.github/workflows/deploy-assets.yml +0 -2
  2. hf_space/.github/workflows/deploy.yml +0 -2
  3. hf_space/.gitignore +0 -3
  4. hf_space/Dockerfile +0 -2
  5. hf_space/README.md +7 -29
  6. hf_space/docs/performance/performance_report.md +7 -8
  7. hf_space/hf_space/app/main.py +155 -11
  8. hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +18 -0
  9. hf_space/hf_space/hf_space/data/xgb_final_model.pkl +3 -0
  10. hf_space/hf_space/hf_space/gradio_app.py +1 -121
  11. hf_space/hf_space/hf_space/hf_space/.gitignore +1 -1
  12. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +6 -2
  13. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +7 -5
  14. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +2 -3
  15. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml +0 -1
  16. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +127 -3
  17. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +8 -8
  18. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/HistGB_final_model.pkl +3 -0
  19. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.env.example +46 -0
  20. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +10 -11
  21. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml +2 -1
  22. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile +1 -0
  23. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +54 -17
  24. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +365 -60
  25. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/performance/performance_report.md +2 -1
  26. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +161 -27
  27. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +108 -19
  28. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py +1 -3
  29. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile +3 -2
  30. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +1 -19
  31. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py +3 -1
  32. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +11 -1
  33. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app_entry.py +19 -0
  34. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile +1 -1
  35. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +17 -8
  36. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitattributes +2 -33
  37. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +69 -0
  38. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml +4 -0
  39. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitignore +3 -1
  40. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +25 -0
  41. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +190 -13
  42. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +96 -0
  43. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +136 -18
  44. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/README.md +13 -0
  45. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/drift_report.html +140 -0
  46. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/logs_storage.png +0 -0
  47. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_ANNUITY.png +0 -0
  48. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_CREDIT.png +0 -0
  49. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_GOODS_PRICE.png +0 -0
  50. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/CODE_GENDER.png +0 -0
hf_space/.github/workflows/deploy-assets.yml CHANGED
@@ -22,8 +22,6 @@ jobs:
       steps:
         - name: Checkout
           uses: actions/checkout@v4
-          with:
-            lfs: true
 
         - name: Set up Python
           uses: actions/setup-python@v5
hf_space/.github/workflows/deploy.yml CHANGED
@@ -12,8 +12,6 @@ jobs:
       steps:
         - name: Checkout
          uses: actions/checkout@v4
-          with:
-            lfs: true
 
        - name: Set up Python
          uses: actions/setup-python@v5
hf_space/.gitignore CHANGED
@@ -5,10 +5,7 @@ __pycache__/
 logs/
 reports/
 data/*
-!data/*_final_model.pkl
-!data/data_final.parquet
 artifacts/*
-!artifacts/preprocessor.joblib
 .DS_Store
 .vscode/
 .idea/
hf_space/Dockerfile CHANGED
@@ -11,8 +11,6 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY app/ app/
 COPY app_entry.py app.py gradio_app.py ./
 COPY src/ src/
-COPY data/ data/
-COPY artifacts/ artifacts/
 
 EXPOSE 7860
 
hf_space/README.md CHANGED
@@ -381,7 +381,7 @@ python monitoring/drift_report.py \
   --logs logs/predictions.jsonl \
   --reference data/data_final.parquet \
   --output-dir reports \
-  --min-prod-samples 200 \
+  --min-prod-samples 50 \
   --fdr-alpha 0.05 \
   --prod-since "2024-01-01T00:00:00Z" \
   --prod-until "2024-01-31T23:59:59Z"
@@ -391,7 +391,7 @@ The HTML report is generated in `reports/drift_report.html` (with plots in
 `reports/plots/`). On Hugging Face the disk is ephemeral: download the logs
 before analyzing.
 
-Drift is computed only if `n_prod >= --min-prod-samples` (default 200).
+Drift is computed only if `n_prod >= --min-prod-samples` (default 50).
 Otherwise an "Insufficient sample" badge is displayed and alerts are disabled.
 
 Built-in robustness:
@@ -418,20 +418,16 @@ Captures (local snapshot of the reporting + storage):
 
 ## Profiling & Optimization (Step 4)
 
-Inference profiling and benchmark (cProfile + latency) :
+Inference profiling and benchmark (cProfile + latency):
 
-- Now done via the modeling notebook (section TODO 5).
-- The old script is archived in `dev_archive/profiling/profile_inference.py`.
-
-Outputs:
-
-- `docs/performance/benchmark_results.json`
-- `docs/performance/profile_summary.txt`
-- Detailed report: `docs/performance/performance_report.md`
+- Notebook: `notebooks/P6_MANET_Stephane_notebook_modélisation.ipynb` (section TODO 5).
+- Results: `docs/performance/benchmark_results.json`, `docs/performance/profile_summary.txt`, `docs/performance/performance_report.md`.
 
 Local Streamlit dashboard (monitoring + drift):
 
 ```shell
+streamlit run monitoring/streamlit_app.py
+# or
 python -m streamlit run monitoring/streamlit_app.py
 ```
 
@@ -452,21 +448,3 @@ python -m streamlit run monitoring/streamlit_app.py
 - **CI/CD**: tests with coverage (`pytest-cov`), Docker build and deploy to Hugging Face Spaces.
 
 ![Screenshot MLFlow](https://raw.githubusercontent.com/stephmnt/credit-scoring-mlops/main/screen-mlflow.png)
-
-### Priority gaps
-
-* Mission 2 Step 4 not covered: no post-deployment profiling/optimization or gains report; to be delivered with an optimized version.
-
-### Evidence / docs to complete
-
-* Explicit link to the public repo + version/branch strategy to add in README.md.
-* Keep proof of MLflow model registry/serving (registry UI capture or serving command) in addition to screen-mlflow.png.
-* Reference dataset not versioned (data_final.parquet is ignored); document how to obtain it to run drift_report.py.
-* The GitHub Actions badge points to OCR_Projet05 in README.md; fix the URL.
-* GDPR/PII: LOG_HASH_SK_ID is disabled by default in main.py; document how to enable it in prod in README.md.
-
-### Recommended improvements
-
-* Complete the API tests: /logs (auth OK/KO), batch predict, threshold param, missing SK_ID_CURR, outliers in test_api.py.
-* Simplify the ALLOW_MISSING_ARTIFACTS fallback and DummyModel if the artifacts are versioned (clean up main.py and conftest.py).
-* If the evaluator expects a branching strategy, create a feature branch and merge it as proof.
hf_space/docs/performance/performance_report.md CHANGED
@@ -6,11 +6,10 @@ Measure inference latency, identify bottlenecks, and propose optimizations.
 
 ## Setup
 
-- Script (archived): `dev_archive/profiling/profile_inference.py`
-- Current workflow: modeling notebook (section TODO 5)
+- Notebook: `notebooks/P6_MANET_Stephane_notebook_modélisation.ipynb` (section TODO 5)
 - Data: `data/data_final.parquet` (sample)
 - Parameters: `--sample-size 500 --batch-size 100 --runs 2`
-- Model: `HistGB_final_model.pkl`
+- Model: `data/*_final_model.pkl` (e.g. `data/xgb_final_model.pkl`)
 
 Results are saved to:
 
@@ -21,21 +20,21 @@ Results are saved to:
 
 | Scenario | Batch | Mean (ms) | P50 (ms) | P95 (ms) | Throughput (rows/s) |
 | --- | --- | ---:| ---:| ---:| ---:|
-| optimized_preprocess | 100 | 187.37 | 169.96 | 271.41 | 533.71 |
-| legacy_preprocess_alignment | 100 | 273.05 | 264.45 | 357.41 | 366.23 |
+| optimized_preprocess | 100 | 35.73 | 33.77 | 43.09 | 2798.44 |
+| legacy_preprocess_alignment | 100 | 47.57 | 47.19 | 51.23 | 2102.36 |
 
-Observed gain (mean): ~31% latency reduction per batch on the optimized path.
+Observed gain (mean): ~25% latency reduction per batch on the optimized path.
 
 ## Bottlenecks (cProfile)
 
 Excerpt from `docs/performance/profile_summary.txt`:
 
-- `app.main:preprocess_input` accounts for most of the cumulative time (~0.90s out of 1.05s).
+- `app.main:preprocess_input` accounts for most of the cumulative time (see `docs/performance/profile_summary.txt`).
 - Dominant pandas operations:
   - `DataFrame.__setitem__` / `insert`
   - `fillna`, `to_numeric`
   - `get_dummies`
-- `HistGradientBoostingClassifier.predict_proba` is present but not dominant (~0.15s).
+- `predict_proba` is present but not dominant.
 
 ## Applied optimization
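For context, mean/P50/P95 latency and throughput figures like those in the table can be produced with a few lines of timing code. A minimal sketch, assuming a `predict_batch` callable and a pandas sample `df_sample` (illustrative stand-ins, not the notebook's actual code):

```python
# Illustrative benchmark harness; predict_batch and df_sample are assumptions,
# standing in for the project's real model call and sampled data.
import time
import numpy as np

def benchmark(predict_batch, df_sample, batch_size=100, runs=2):
    latencies_ms = []
    for _ in range(runs):
        for start in range(0, len(df_sample), batch_size):
            batch = df_sample.iloc[start:start + batch_size]
            t0 = time.perf_counter()
            predict_batch(batch)  # preprocessing + predict_proba
            latencies_ms.append((time.perf_counter() - t0) * 1000.0)
    lat = np.array(latencies_ms)
    return {
        "mean_ms": float(lat.mean()),
        "p50_ms": float(np.percentile(lat, 50)),
        "p95_ms": float(np.percentile(lat, 95)),
        "throughput_rows_s": batch_size / (lat.mean() / 1000.0),
    }
```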
 
hf_space/hf_space/app/main.py CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import io
 import logging
 import os
 import pickle
@@ -8,6 +9,7 @@ from datetime import datetime, timezone
 import hashlib
 import json
 from pathlib import Path
+import threading
 import time
 from typing import Any
 import uuid
@@ -16,6 +18,7 @@ from collections import deque
 import numpy as np
 import pandas as pd
 from fastapi import FastAPI, Header, HTTPException, Query, Response
+from huggingface_hub import HfApi
 from pydantic import BaseModel
 from sklearn.preprocessing import MinMaxScaler
 import joblib
@@ -78,6 +81,19 @@ HF_CUSTOMER_REPO_ID = os.getenv("HF_CUSTOMER_REPO_ID")
 HF_CUSTOMER_REPO_TYPE = os.getenv("HF_CUSTOMER_REPO_TYPE", "dataset")
 HF_CUSTOMER_FILENAME = os.getenv("HF_CUSTOMER_FILENAME", CUSTOMER_DATA_PATH.name)
 
+HF_LOG_ENABLED = os.getenv("HF_LOG_ENABLED", "1") == "1"
+HF_LOG_DATASET_REPO = os.getenv("HF_LOG_DATASET_REPO")
+HF_LOG_PATH_PREFIX = os.getenv("HF_LOG_PATH_PREFIX", "prod_logs")
+
+HF_LOG_BUFFER_MAX = int(os.getenv("HF_LOG_BUFFER_MAX", "50"))
+HF_LOG_FLUSH_SECONDS = int(os.getenv("HF_LOG_FLUSH_SECONDS", "60"))
+
+_hf_api = HfApi(token=os.getenv("HF_TOKEN")) if os.getenv("HF_TOKEN") else None
+_hf_lock = threading.Lock()
+_hf_buffer: list[dict[str, Any]] = []
+_hf_last_flush = 0.0
+_hf_flusher_started = False
+
 IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
 ENGINEERED_FEATURES = [
     "DAYS_EMPLOYED_ANOM",
@@ -218,6 +234,87 @@ def _hash_value(value: Any) -> str:
     return hashlib.sha256(str(value).encode("utf-8")).hexdigest()
 
 
+def _utc_day() -> str:
+    return datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+
+def _utc_stamp() -> str:
+    return datetime.now(timezone.utc).strftime("%H%M%S")
+
+
+def _start_hf_flusher_if_needed() -> None:
+    global _hf_flusher_started
+    if _hf_flusher_started:
+        return
+    _hf_flusher_started = True
+
+    def _loop() -> None:
+        while True:
+            time.sleep(HF_LOG_FLUSH_SECONDS)
+            with _hf_lock:
+                _flush_hf_locked(force=True)
+
+    threading.Thread(target=_loop, daemon=True).start()
+
+
+def _upload_parquet_part(df: pd.DataFrame) -> None:
+    if not (HF_LOG_ENABLED and _hf_api and HF_LOG_DATASET_REPO):
+        return
+
+    part_path = (
+        f"{HF_LOG_PATH_PREFIX}/date={_utc_day()}/"
+        f"part-{_utc_stamp()}-{uuid.uuid4().hex}.parquet"
+    )
+
+    bio = io.BytesIO()
+    df.to_parquet(bio, index=False)
+
+    for attempt in range(3):
+        try:
+            bio.seek(0)
+            _hf_api.upload_file(
+                path_or_fileobj=bio,
+                path_in_repo=part_path,
+                repo_id=HF_LOG_DATASET_REPO,
+                repo_type="dataset",
+                commit_message=f"Add inference logs {_utc_day()}",
+            )
+            return
+        except Exception:
+            if attempt == 2:
+                raise
+            time.sleep(1.5 * (attempt + 1))
+
+
+def _flush_hf_locked(force: bool = False) -> None:
+    global _hf_buffer, _hf_last_flush
+    if not _hf_buffer:
+        return
+
+    now = time.time()
+    if not force:
+        if len(_hf_buffer) < HF_LOG_BUFFER_MAX and (now - _hf_last_flush) < HF_LOG_FLUSH_SECONDS:
+            return
+
+    df = pd.DataFrame(_hf_buffer)
+    _hf_buffer = []
+    _hf_last_flush = now
+
+    try:
+        _upload_parquet_part(df)
+    except Exception as exc:
+        logger.warning("HF log upload failed: %s", exc)
+
+
+def hf_log_rows(rows: list[dict[str, Any]]) -> None:
+    if not (HF_LOG_ENABLED and _hf_api and HF_LOG_DATASET_REPO):
+        return
+    _start_hf_flusher_if_needed()
+    with _hf_lock:
+        _hf_buffer.extend(rows)
+        _flush_hf_locked(force=False)
+
+
 def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
     if pd.isna(value):  # type: ignore
         return np.nan
@@ -234,7 +331,9 @@ def _ensure_hf_asset(
     repo_type: str,
 ) -> Path | None:
     if local_path.exists():
-        return local_path
+        if not _is_lfs_pointer(local_path):
+            return local_path
+        logger.warning("LFS pointer detected for %s; attempting remote download.", local_path)
     if not repo_id:
         return None
 
@@ -254,6 +353,16 @@
     )
 
 
+def _is_lfs_pointer(path: Path) -> bool:
+    try:
+        with path.open("rb") as handle:
+            head = handle.read(200)
+    except OSError:
+        return False
+    text = head.decode("utf-8", errors="ignore")
+    return text.startswith("version https://git-lfs.github.com/spec/v1")
+
+
 def _normalize_inputs(
     df_raw: pd.DataFrame,
@@ -470,11 +579,39 @@ _log_prediction_entries(
             "prediction": result.get("prediction"),
         }
     )
-    if error:
-        entry["error"] = error
-    entries.append(entry)
+    if error:
+        entry["error"] = error
+    entries.append(entry)
     _append_log_entries(entries)
 
+    flat_rows: list[dict[str, Any]] = []
+    for entry in entries:
+        row = {
+            "timestamp_utc": entry.get("timestamp"),
+            "request_id": entry.get("request_id"),
+            "endpoint": entry.get("endpoint"),
+            "source": entry.get("source"),
+            "status_code": entry.get("status_code"),
+            "latency_ms": entry.get("latency_ms"),
+            "model_version": entry.get("model_version"),
+            "threshold": entry.get("threshold"),
+            "sk_id_curr": entry.get("sk_id_curr"),
+            "probability": entry.get("probability"),
+            "prediction": entry.get("prediction"),
+            "error": entry.get("error"),
+        }
+        inputs = entry.get("inputs") or {}
+        for key, value in inputs.items():
+            row[f"input__{key}"] = value
+
+        dq = entry.get("data_quality") or {}
+        for key, value in dq.items():
+            row[f"dq__{key}"] = value
+
+        flat_rows.append(row)
+
+    hf_log_rows(flat_rows)
+
 
 def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
     df = pd.read_parquet(data_path)
@@ -853,7 +990,7 @@ def _get_customer_reference(preprocessor: PreprocessorArtifacts) -> pd.DataFrame
     if cached is not None:
         return cached
     data_path = CUSTOMER_DATA_PATH
-    if not data_path.exists():
+    if not data_path.exists() or _is_lfs_pointer(data_path):
         downloaded = _ensure_hf_asset(
             data_path,
             HF_CUSTOMER_REPO_ID,
@@ -1362,7 +1499,7 @@ def startup_event() -> None:
     if getattr(app.state, "model", None) is not None and getattr(app.state, "preprocessor", None) is not None:
         return
     model_path = MODEL_PATH
-    if not model_path.exists():
+    if not model_path.exists() or _is_lfs_pointer(model_path):
         downloaded = _ensure_hf_asset(
             model_path,
             HF_MODEL_REPO_ID,
@@ -1371,7 +1508,7 @@
     )
     if downloaded is not None:
        model_path = downloaded
-    if not model_path.exists():
+    if not model_path.exists() or _is_lfs_pointer(model_path):
        if ALLOW_MISSING_ARTIFACTS:
            logger.warning("Model file not found: %s. Using dummy model.", model_path)
            app.state.model = DummyModel()
@@ -1379,10 +1516,17 @@
             raise RuntimeError(f"Model file not found: {model_path}")
     else:
         logger.info("Loading model from %s", model_path)
-        app.state.model = load_model(model_path)
+        try:
+            app.state.model = load_model(model_path)
+        except Exception as exc:
+            if ALLOW_MISSING_ARTIFACTS:
+                logger.warning("Model load failed (%s). Using dummy model.", exc)
+                app.state.model = DummyModel()
+            else:
+                raise
 
     data_path = DATA_PATH
-    if not data_path.exists():
+    if not data_path.exists() or _is_lfs_pointer(data_path):
         downloaded = _ensure_hf_asset(
             data_path,
             HF_CUSTOMER_REPO_ID,
@@ -1393,7 +1537,7 @@
         data_path = downloaded
     try:
         artifacts_path = ARTIFACTS_PATH
-        if not artifacts_path.exists():
+        if not artifacts_path.exists() or _is_lfs_pointer(artifacts_path):
             downloaded = _ensure_hf_asset(
                 artifacts_path,
                 HF_PREPROCESSOR_REPO_ID or None,
@@ -1404,7 +1548,7 @@
                 artifacts_path = downloaded
         logger.info("Loading preprocessor artifacts from %s", artifacts_path)
         app.state.preprocessor = load_preprocessor(data_path, artifacts_path)
-    except RuntimeError as exc:
+    except Exception as exc:
         if ALLOW_MISSING_ARTIFACTS:
             logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
             app.state.preprocessor = build_fallback_preprocessor()
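Context for the `_is_lfs_pointer` change above: when a Space is synced without `lfs: true`, large files arrive as small Git LFS pointer stubs instead of real binaries, so the startup code now re-downloads in that case. A quick illustrative check (hypothetical temp path; the pointer content is copied from the LFS stub added in this commit):

```python
# Illustrative only: what a Git LFS pointer stub looks like, and how the
# startup check above would classify it (the /tmp path is hypothetical).
from pathlib import Path

stub = Path("/tmp/xgb_final_model.pkl")
stub.write_text(
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:0fe10d7c60f50f96a87bafd298f2919653aed37d90a091059017800450e6273b\n"
    "size 1370510\n"
)

head = stub.read_bytes()[:200].decode("utf-8", errors="ignore")
print(head.startswith("version https://git-lfs.github.com/spec/v1"))  # True -> re-download
```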
hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml CHANGED
@@ -59,6 +59,24 @@ jobs:
           model_path = candidates[0]
 
           api = HfApi()
+          existing = api.list_repo_files(
+              repo_id=repo_id,
+              repo_type=repo_type,
+              token=token,
+          )
+          to_delete = [
+              name
+              for name in existing
+              if name.endswith("_final_model.pkl") and name != model_path.name
+          ]
+          for name in to_delete:
+              api.delete_file(
+                  path_in_repo=name,
+                  repo_id=repo_id,
+                  repo_type=repo_type,
+                  token=token,
+                  commit_message=f"Remove {name}",
+              )
           for path in [model_path]:
               api.upload_file(
                   path_or_fileobj=str(path),
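The cleanup step added above keeps the Hub repo down to a single `*_final_model.pkl`: any stale variant is deleted before the current model is uploaded. A small read-only sketch to verify that invariant after a run, assuming the default asset repo used elsewhere in this commit:

```python
# Read-only check (illustrative): list which model pickles the repo holds.
# Assumes the default dataset repo from this commit; requires read access.
from huggingface_hub import HfApi

api = HfApi()
files = api.list_repo_files(
    repo_id="stephmnt/assets-credit-scoring-mlops",
    repo_type="dataset",
)
print([name for name in files if name.endswith("_final_model.pkl")])
# Expected after a successful run: exactly one entry.
```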
hf_space/hf_space/hf_space/data/xgb_final_model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fe10d7c60f50f96a87bafd298f2919653aed37d90a091059017800450e6273b
+size 1370510
hf_space/hf_space/hf_space/gradio_app.py CHANGED
@@ -21,20 +21,10 @@ from app.main import (
     _normalize_inputs,
 )
 
-import io
-import os
-import threading
-import time
-import uuid
-from datetime import datetime, timezone
-
-from huggingface_hub import HfApi
-
 
 def _ensure_startup() -> None:
     if not getattr(app.state, "preprocessor", None):
         startup_event()
-    _start_log_flusher_if_needed()
 
 
 def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
@@ -297,8 +287,7 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
         """
         <p>Renseignez l'identifiant client, le montant du crédit et la durée.</p>
         <p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée. Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction. Le snapshot client affiche quelques informations de référence sur le client.</p>
-        <p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée.</p>
-        <p> Le dataset est disponible sur <a href="https://huggingface.co/datasets/stephmnt/assets-credit-scoring-mlops" rel="noreferrer">Hugging Face</a>.</p>
+        <p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée. Le dataset est disponible sur <a href="https://huggingface.co/datasets/stephmnt/assets-credit-scoring-mlops" rel="noreferrer">Hugging Face</a>.</p>
         """
     )
@@ -328,115 +317,6 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
         outputs=[probability, prediction, shap_table, snapshot],
     )
 
-# =========================
-# HF Dataset logging (Parquet parts)
-# =========================
-
-LOG_ENABLED = os.getenv("LOG_ENABLED", "1") == "1"
-LOG_DATASET_REPO = os.getenv("LOG_DATASET_REPO", "stephmnt/assets-credit-scoring-mlops")
-LOG_PATH_PREFIX = os.getenv("LOG_PATH_PREFIX", "prod_logs")
-HF_TOKEN = os.getenv("HF_TOKEN")  # HF secret (write) on the inference Space
-
-LOG_BUFFER_MAX = int(os.getenv("LOG_BUFFER_MAX", "50"))  # flush as soon as 50 rows
-LOG_FLUSH_SECONDS = int(os.getenv("LOG_FLUSH_SECONDS", "60"))  # flush at least every 60s
-
-_hf_api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
-_log_lock = threading.Lock()
-_log_buffer: list[dict] = []
-_last_flush_ts = 0.0
-_flusher_started = False
-
-
-def _now_utc_iso() -> str:
-    return datetime.now(timezone.utc).isoformat()
-
-
-def _upload_parquet_part(df: pd.DataFrame) -> None:
-    if _hf_api is None:
-        return  # no token => no write
-    day = datetime.now(timezone.utc).strftime("%Y-%m-%d")
-    stamp = datetime.now(timezone.utc).strftime("%H%M%S")
-    part = f"{LOG_PATH_PREFIX}/date={day}/part-{stamp}-{uuid.uuid4().hex}.parquet"
-
-    bio = io.BytesIO()
-    df.to_parquet(bio, index=False)
-    bio.seek(0)
-
-    _hf_api.upload_file(
-        path_or_fileobj=bio,
-        path_in_repo=part,
-        repo_id=LOG_DATASET_REPO,
-        repo_type="dataset",
-        commit_message=f"Add inference logs {day}",
-    )
-
-
-def _flush_logs_locked(force: bool = False) -> None:
-    global _log_buffer, _last_flush_ts
-    if not _log_buffer:
-        return
-
-    now = time.time()
-    if not force:
-        if len(_log_buffer) < LOG_BUFFER_MAX and (now - _last_flush_ts) < LOG_FLUSH_SECONDS:
-            return
-
-    df = pd.DataFrame(_log_buffer)
-    _log_buffer = []
-    _last_flush_ts = now
-
-    try:
-        _upload_parquet_part(df)
-    except Exception:
-        # In prod you could log this to stderr / structlog etc.
-        # We avoid failing the inference call.
-        pass
-
-
-def _start_log_flusher_if_needed() -> None:
-    global _flusher_started
-    if _flusher_started:
-        return
-    _flusher_started = True
-
-    def _loop():
-        while True:
-            time.sleep(LOG_FLUSH_SECONDS)
-            with _log_lock:
-                _flush_logs_locked(force=True)
-
-    t = threading.Thread(target=_loop, daemon=True)
-    t.start()
-
-
-def log_inference_row(row: dict) -> None:
-    if not LOG_ENABLED or _hf_api is None:
-        return
-    with _log_lock:
-        _log_buffer.append(row)
-        _flush_logs_locked(force=False)
-
-# --- Logging (Evidently-friendly) ---
-row = {
-    "timestamp_utc": _now_utc_iso(),
-    "model_version": MODEL_VERSION,
-    "source": "gradio",
-    "sk_id_curr": int(sk_id_curr),
-    "amt_credit_requested": float(amt_credit),
-    "duration_months": int(duration_months),
-    "probability": float(probability),
-    "prediction": int(pred_value),
-}
-# Add a few "business" features useful for drift (cat + num)
-# (you can add more if you want)
-for k, v in snapshot.items():
-    if k == "SK_ID_CURR":
-        continue
-    row[f"cust__{k}"] = v
-
-log_inference_row(row)
-
 
 if __name__ == "__main__":
     _ensure_startup()
     demo.launch()
hf_space/hf_space/hf_space/hf_space/.gitignore CHANGED
@@ -5,7 +5,7 @@ __pycache__/
 logs/
 reports/
 data/*
-!data/HistGB_final_model.pkl
+!data/*_final_model.pkl
 !data/data_final.parquet
 artifacts/*
 !artifacts/preprocessor.joblib
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml CHANGED
@@ -11,6 +11,10 @@ on:
         description: "HF repo type (dataset or model)"
         required: true
         default: "dataset"
+  push:
+    branches: ["main"]
+    paths:
+      - "data/*_final_model.pkl"
 
 jobs:
   upload-assets:
@@ -34,8 +38,8 @@ jobs:
       - name: Upload assets to Hugging Face Hub
         env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          HF_REPO_ID: ${{ inputs.repo_id }}
-          HF_REPO_TYPE: ${{ inputs.repo_type }}
+          HF_REPO_ID: ${{ inputs.repo_id || 'stephmnt/assets-credit-scoring-mlops' }}
+          HF_REPO_TYPE: ${{ inputs.repo_type || 'dataset' }}
        run: |
          python - <<'PY'
          import os
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py CHANGED
@@ -237,22 +237,24 @@ def _ensure_hf_asset(
         return local_path
     if not repo_id:
         return None
-    try:
-        from huggingface_hub import hf_hub_download
-    except ImportError as exc:  # pragma: no cover - optional dependency
-        raise RuntimeError("huggingface_hub is required to download remote assets.") from exc
+
+    from huggingface_hub import hf_hub_download
+
+    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+
     local_path.parent.mkdir(parents=True, exist_ok=True)
     return Path(
         hf_hub_download(
             repo_id=repo_id,
             filename=filename,
             repo_type=repo_type,
+            token=token,  # essential for gated repos
             local_dir=str(local_path.parent),
-            local_dir_use_symlinks=False,
         )
     )
 
 
+
 def _normalize_inputs(
     df_raw: pd.DataFrame,
     preprocessor: PreprocessorArtifacts,
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py CHANGED
@@ -296,10 +296,9 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
     gr.HTML(
         """
         <p>Renseignez l'identifiant client, le montant du crédit et la durée.</p>
-        <p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée.</p>
-        <p>Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction.</p>
-        <p>Le snapshot client affiche quelques informations de référence sur le client.</p>
+        <p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée. Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction. Le snapshot client affiche quelques informations de référence sur le client.</p>
         <p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée.</p>
+        <p> Le dataset est disponible sur <a href="https://huggingface.co/datasets/stephmnt/assets-credit-scoring-mlops" rel="noreferrer">Hugging Face</a>.</p>
         """
     )
 
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml CHANGED
@@ -49,7 +49,6 @@ jobs:
             --exclude 'logs' \
             --exclude 'reports' \
             --exclude 'screen-mlflow.png' \
-            --exclude 'data/*_final_model.pkl' \
             --exclude 'artifacts/preprocessor.joblib' \
             --exclude 'data/*.csv' \
             --exclude 'data/*.parquet' \
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py CHANGED
@@ -21,10 +21,20 @@ from app.main import (
     _normalize_inputs,
 )
 
+import io
+import os
+import threading
+import time
+import uuid
+from datetime import datetime, timezone
+
+from huggingface_hub import HfApi
+
 
 def _ensure_startup() -> None:
     if not getattr(app.state, "preprocessor", None):
         startup_event()
+    _start_log_flusher_if_needed()
 
 
 def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
@@ -283,13 +293,19 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
         </a>
     </div>
     """)
-    gr.Markdown(
-        "Renseignez l'identifiant client, le montant du crédit et la durée. "
+    gr.HTML(
+        """
+        <p>Renseignez l'identifiant client, le montant du crédit et la durée.</p>
+        <p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée.</p>
+        <p>Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction.</p>
+        <p>Le snapshot client affiche quelques informations de référence sur le client.</p>
+        <p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée.</p>
+        """
     )
 
     with gr.Row():
         sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
-        amt_credit = gr.Number(label="Montant du crédit", value=200000)
+        amt_credit = gr.Number(label="Montant du crédit", value=2000000)
         duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
 
     run_btn = gr.Button("Scorer")
@@ -313,6 +329,114 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
         outputs=[probability, prediction, shap_table, snapshot],
     )
 
+# =========================
+# HF Dataset logging (Parquet parts)
+# =========================
+
+LOG_ENABLED = os.getenv("LOG_ENABLED", "1") == "1"
+LOG_DATASET_REPO = os.getenv("LOG_DATASET_REPO", "stephmnt/assets-credit-scoring-mlops")
+LOG_PATH_PREFIX = os.getenv("LOG_PATH_PREFIX", "prod_logs")
+HF_TOKEN = os.getenv("HF_TOKEN")  # HF secret (write) on the inference Space
+
+LOG_BUFFER_MAX = int(os.getenv("LOG_BUFFER_MAX", "50"))  # flush as soon as 50 rows
+LOG_FLUSH_SECONDS = int(os.getenv("LOG_FLUSH_SECONDS", "60"))  # flush at least every 60s
+
+_hf_api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
+_log_lock = threading.Lock()
+_log_buffer: list[dict] = []
+_last_flush_ts = 0.0
+_flusher_started = False
+
+
+def _now_utc_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _upload_parquet_part(df: pd.DataFrame) -> None:
+    if _hf_api is None:
+        return  # no token => no write
+    day = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+    stamp = datetime.now(timezone.utc).strftime("%H%M%S")
+    part = f"{LOG_PATH_PREFIX}/date={day}/part-{stamp}-{uuid.uuid4().hex}.parquet"
+
+    bio = io.BytesIO()
+    df.to_parquet(bio, index=False)
+    bio.seek(0)
+
+    _hf_api.upload_file(
+        path_or_fileobj=bio,
+        path_in_repo=part,
+        repo_id=LOG_DATASET_REPO,
+        repo_type="dataset",
+        commit_message=f"Add inference logs {day}",
+    )
+
+
+def _flush_logs_locked(force: bool = False) -> None:
+    global _log_buffer, _last_flush_ts
+    if not _log_buffer:
+        return
+
+    now = time.time()
+    if not force:
+        if len(_log_buffer) < LOG_BUFFER_MAX and (now - _last_flush_ts) < LOG_FLUSH_SECONDS:
+            return
+
+    df = pd.DataFrame(_log_buffer)
+    _log_buffer = []
+    _last_flush_ts = now
+
+    try:
+        _upload_parquet_part(df)
+    except Exception:
+        # In prod you could log this to stderr / structlog etc.
+        # We avoid failing the inference call.
+        pass
+
+
+def _start_log_flusher_if_needed() -> None:
+    global _flusher_started
+    if _flusher_started:
+        return
+    _flusher_started = True
+
+    def _loop():
+        while True:
+            time.sleep(LOG_FLUSH_SECONDS)
+            with _log_lock:
+                _flush_logs_locked(force=True)
+
+    t = threading.Thread(target=_loop, daemon=True)
+    t.start()
+
+
+def log_inference_row(row: dict) -> None:
+    if not LOG_ENABLED or _hf_api is None:
+        return
+    with _log_lock:
+        _log_buffer.append(row)
+        _flush_logs_locked(force=False)
+
+# --- Logging (Evidently-friendly) ---
+row = {
+    "timestamp_utc": _now_utc_iso(),
+    "model_version": MODEL_VERSION,
+    "source": "gradio",
+    "sk_id_curr": int(sk_id_curr),
+    "amt_credit_requested": float(amt_credit),
+    "duration_months": int(duration_months),
+    "probability": float(probability),
+    "prediction": int(pred_value),
+}
+# Add a few "business" features useful for drift (cat + num)
+# (you can add more if you want)
+for k, v in snapshot.items():
+    if k == "SK_ID_CURR":
+        continue
+    row[f"cust__{k}"] = v
+
+log_inference_row(row)
+
 
 if __name__ == "__main__":
     _ensure_startup()
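The block added above buffers log rows and uploads them as date-partitioned Parquet parts under `prod_logs/` in the dataset repo. To analyze them (e.g. for the drift report), the parts can be pulled back and concatenated; a minimal sketch, assuming read access to the dataset repo:

```python
# Minimal sketch: fetch the Parquet log parts and build one DataFrame.
# Assumes read access to the dataset repo; paths mirror LOG_PATH_PREFIX above.
import glob

import pandas as pd
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="stephmnt/assets-credit-scoring-mlops",
    repo_type="dataset",
    allow_patterns=["prod_logs/**"],
)
parts = glob.glob(f"{local_dir}/prod_logs/**/*.parquet", recursive=True)
logs_df = pd.concat((pd.read_parquet(p) for p in parts), ignore_index=True)
print(logs_df.shape)
```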
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py CHANGED
@@ -219,7 +219,7 @@ def _hash_value(value: Any) -> str:
 
 
 def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
-    if pd.isna(value):
+    if pd.isna(value):  # type: ignore
         return np.nan
     key = str(value).strip().upper()
     if not key:
@@ -265,12 +265,12 @@ def _normalize_inputs(
     unknown_masks: dict[str, pd.Series] = {}
     if "CODE_GENDER" in df.columns:
         raw = df["CODE_GENDER"]
-        normalized = raw.apply(lambda v: _normalize_category_value(v, CODE_GENDER_MAPPING))
+        normalized = raw.apply(lambda v: _normalize_category_value(v, CODE_GENDER_MAPPING))  # type: ignore
         unknown_masks["CODE_GENDER"] = normalized.eq("Unknown") & raw.notna()
         df["CODE_GENDER"] = normalized
     if "FLAG_OWN_CAR" in df.columns:
         raw = df["FLAG_OWN_CAR"]
-        normalized = raw.apply(lambda v: _normalize_category_value(v, FLAG_OWN_CAR_MAPPING))
+        normalized = raw.apply(lambda v: _normalize_category_value(v, FLAG_OWN_CAR_MAPPING))  # type: ignore
         unknown_masks["FLAG_OWN_CAR"] = normalized.eq("Unknown") & raw.notna()
         df["FLAG_OWN_CAR"] = normalized
 
@@ -404,7 +404,7 @@ def _build_minimal_record(
     )
     if "AMT_GOODS_PRICE" in record:
         record["AMT_GOODS_PRICE"] = float(payload.amt_credit)
-    return record
+    return record  # type: ignore
 
 
 def _append_log_entries(entries: list[dict[str, Any]]) -> None:
@@ -1576,7 +1576,7 @@ def _predict_records(
     latency_ms = (time.perf_counter() - start_time) * 1000.0
     _log_prediction_entries(
         request_id=request_id,
-        records=log_records,
+        records=log_records,  # type: ignore
         results=results,
         latency_ms=latency_ms,
         threshold=use_threshold,
@@ -1598,7 +1598,7 @@ def _predict_records(
     latency_ms = (time.perf_counter() - start_time) * 1000.0
     _log_prediction_entries(
         request_id=request_id,
-        records=log_records,
+        records=log_records,  # type: ignore
         results=results,
         latency_ms=latency_ms,
         threshold=None,
@@ -1613,7 +1613,7 @@ def _predict_records(
     detail = exc.detail if isinstance(exc.detail, dict) else {"message": str(exc.detail)}
     _log_prediction_entries(
         request_id=request_id,
-        records=log_records if "log_records" in locals() else records,
+        records=log_records if "log_records" in locals() else records,  # type: ignore
         results=None,
         latency_ms=latency_ms,
         threshold=threshold,
@@ -1628,7 +1628,7 @@ def _predict_records(
     latency_ms = (time.perf_counter() - start_time) * 1000.0
     _log_prediction_entries(
         request_id=request_id,
-        records=log_records if "log_records" in locals() else records,
+        records=log_records if "log_records" in locals() else records,  # type: ignore
         results=None,
         latency_ms=latency_ms,
         threshold=threshold,
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/HistGB_final_model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c7b31d6b2aa9d622717d03b6eaf79e6e21297869ff401f2f61a2d688cc55d6f
+size 411244
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.env.example ADDED
@@ -0,0 +1,46 @@
+# Core paths
+MODEL_PATH=data/HistGB_final_model.pkl
+DATA_PATH=data/data_final.parquet
+ARTIFACTS_PATH=artifacts/preprocessor.joblib
+
+# Prediction behavior
+PREDICTION_THRESHOLD=0.5
+CACHE_PREPROCESSOR=1
+USE_REDUCED_INPUTS=1
+ALLOW_MISSING_ARTIFACTS=0
+MISSING_INDICATOR_MIN_RATE=0.05
+
+# Feature selection (correlation)
+FEATURE_SELECTION_METHOD=correlation
+FEATURE_SELECTION_TOP_N=8
+FEATURE_SELECTION_MIN_CORR=0.02
+CORRELATION_THRESHOLD=0.85
+CORRELATION_SAMPLE_SIZE=50000
+
+# Logging
+LOG_PREDICTIONS=1
+LOG_DIR=logs
+LOG_FILE=predictions.jsonl
+LOG_INCLUDE_INPUTS=1
+LOG_HASH_SK_ID=0
+MODEL_VERSION=HistGB_final_model.pkl
+LOGS_ACCESS_TOKEN=
+
+# Customer reference lookup
+CUSTOMER_DATA_PATH=data/data_final.parquet
+CUSTOMER_LOOKUP_ENABLED=1
+CUSTOMER_LOOKUP_CACHE=1
+
+# Hugging Face assets (optional)
+HF_MODEL_REPO_ID=stephmnt/assets-credit-scoring-mlops
+HF_MODEL_REPO_TYPE=model
+HF_MODEL_FILENAME=HistGB_final_model.pkl
+HF_PREPROCESSOR_REPO_ID=stephmnt/assets-credit-scoring-mlops
+HF_PREPROCESSOR_REPO_TYPE=model
+HF_PREPROCESSOR_FILENAME=preprocessor.joblib
+HF_CUSTOMER_REPO_ID=stephmnt/assets-credit-scoring-mlops
+HF_CUSTOMER_REPO_TYPE=dataset
+HF_CUSTOMER_FILENAME=data_final.parquet
+
+# MLflow
+MLFLOW_TRACKING_URI=http://127.0.0.1:5000
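The app reads these settings with `os.getenv`, so `.env.example` only documents names and defaults. A minimal way to load a local copy, assuming the common `python-dotenv` package (an assumption; it does not appear in this diff):

```python
# Minimal sketch (assumes python-dotenv is installed; not part of this commit).
import os

from dotenv import load_dotenv

load_dotenv(".env")  # copy .env.example to .env first and fill in secrets
print(os.getenv("MODEL_PATH"))            # data/HistGB_final_model.pkl
print(os.getenv("PREDICTION_THRESHOLD"))  # 0.5
```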
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml CHANGED
@@ -46,24 +46,23 @@ jobs:
           repo_type = os.environ["HF_REPO_TYPE"]
           token = os.environ["HF_TOKEN"]
 
-          files = {
-              "data/HistGB_final_model.pkl": "HistGB_final_model.pkl",
-              "artifacts/preprocessor.joblib": "preprocessor.joblib",
-              "data/data_final.parquet": "data_final.parquet",
-          }
+          candidates = sorted(Path("data").glob("*_final_model.pkl"))
+          if not candidates:
+              raise SystemExit("Missing model file: data/*_final_model.pkl")
+          if len(candidates) > 1:
+              names = ", ".join(path.name for path in candidates)
+              raise SystemExit(f"Multiple *_final_model.pkl files found: {names}")
+          model_path = candidates[0]
 
           api = HfApi()
-          for local_path, remote_name in files.items():
-              path = Path(local_path)
-              if not path.exists():
-                  raise SystemExit(f"Missing file: {path}")
+          for path in [model_path]:
               api.upload_file(
                   path_or_fileobj=str(path),
-                  path_in_repo=remote_name,
+                  path_in_repo=path.name,
                   repo_id=repo_id,
                   repo_type=repo_type,
                   token=token,
-                  commit_message=f"Update {remote_name}",
+                  commit_message=f"Update {path.name}",
               )
           print("Assets uploaded.")
           PY
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml CHANGED
@@ -49,10 +49,11 @@ jobs:
             --exclude 'logs' \
             --exclude 'reports' \
             --exclude 'screen-mlflow.png' \
-            --exclude 'data/HistGB_final_model.pkl' \
+            --exclude 'data/*_final_model.pkl' \
             --exclude 'artifacts/preprocessor.joblib' \
             --exclude 'data/*.csv' \
             --exclude 'data/*.parquet' \
+            --exclude 'notebooks/mlflow.db' \
             ./ hf_space/
           cd hf_space
           git add .
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile CHANGED
@@ -10,6 +10,7 @@ RUN pip install --no-cache-dir -r requirements.txt
 
 COPY app/ app/
 COPY app_entry.py app.py gradio_app.py ./
+COPY src/ src/
 COPY data/ data/
 COPY artifacts/ artifacts/
 
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md CHANGED
@@ -14,6 +14,18 @@ pinned: false
 [![GitHub Release Date](https://img.shields.io/github/release-date/stephmnt/credit-scoring-mlops?display_date=published_at&style=flat-square)](https://github.com/stephmnt/credit-scoring-mlops/releases)
 [![project_license](https://img.shields.io/github/license/stephmnt/credit-scoring-mlops.svg)](https://github.com/stephmnt/credit-scoring-mlops/blob/main/LICENSE)

+## Quick structure
+
+- `app/` FastAPI API + inference preprocessing
+- `monitoring/` drift report + Streamlit
+- `notebooks/` exploration + modeling
+- `src/` ML utilities (feature engineering / pipeline)
+- `docs/` evidence & reports (monitoring, perf)
+- `tests/` unit/integration tests
+
+Feature engineering is factored out into `src/features.py` and reused by
+both the notebooks and the API to avoid training-serving skew.
+
 ## Running MLflow

 The notebook is configured to use a local MLflow server (`http://127.0.0.1:5000`).
@@ -75,6 +87,28 @@ pytest -q
 uvicorn app.main:app --reload --port 7860
 ```

+### DEV workflow (notebooks)
+
+Recommended order (dev only):
+
+1. `notebooks/P6_MANET_Stephane_notebook_exploration.ipynb` → generates `data/data_final.parquet` (overwritten).
+2. `notebooks/P6_MANET_Stephane_notebook_compare_tuning_mlflow.ipynb` → comparison + tuning, MLflow logging, writes `reports/best_model.json`.
+3. `notebooks/P6_MANET_Stephane_notebook_modélisation.ipynb` → rebuilds the preprocessor, trains the final model, exports `data/<model>_final_model.pkl`.
+4. Manually trigger the `deploy-assets.yml` workflow to push `data/*_final_model.pkl`.
+
+Note: these notebooks stay dev-only. Production code lives in `app/` and `monitoring/`.
+
+### Configuration (.env)
+
+Duplicate `.env.example` as `.env` if you want to override paths,
+thresholds, or Hugging Face sources.
+The `MISSING_INDICATOR_MIN_RATE` threshold restricts the `is_missing_*`
+columns to features with a NaN rate >= 5% (default).
+
+```shell
+cp .env.example .env
+```
+
 ### Poetry environment (deliverable)

 The deliverable ships `pyproject.toml`, aligned with `requirements.txt`. If needed:
@@ -85,9 +119,9 @@ poetry run pytest -q
 poetry run uvicorn app.main:app --reload --port 7860
 ```

-Important: the `HistGB_final_model.pkl` model must be regenerated with the
+Important: the `*_final_model.pkl` model must be regenerated with the
 scikit-learn version pinned in `requirements.txt` / `pyproject.toml`
-(re-run `P6_MANET_Stephane_notebook_modélisation.ipynb`, pickle-save cell).
+(re-run `notebooks/P6_MANET_Stephane_notebook_modélisation.ipynb`, pickle-save cell).

 ### Input example (schema + values)
@@ -158,10 +192,13 @@ Useful variables:
 ### Data contract (validation)

 - Strict numeric types (invalid values -> 422).
-- Numeric ranges (training min/max) enforced.
+- Numeric ranges (training min/max) enforced, except `SK_ID_CURR` (an ID).
 - Categoricals normalized: `CODE_GENDER` -> {`F`, `M`}, `FLAG_OWN_CAR` -> {`Y`, `N`}.
-- `DAYS_EMPLOYED=365243` sentinel replaced by NaN.
-- Logs enriched via `data_quality` to tell drift apart from data-quality issues.
+- `DAYS_EMPLOYED=365243` sentinel replaced by NaN + `DAYS_EMPLOYED_ANOM` flag.
+- Safe ratios (guarded division by zero) + `DENOM_ZERO_*` flags.
+- Outliers clipped (p1/p99) + `is_outlier_*` flags.
+- Missingness indicators `is_missing_*` for numeric features with a NaN rate >= 5%.
+- Logs enriched via `data_quality` and `source` to tell drift apart from data-quality issues.

 ### Gradio interface (scoring)
@@ -186,13 +223,13 @@ the following variables are defined:

 Example (a single dataset repo with 3 files):

-- `HF_MODEL_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_MODEL_REPO_ID=stephmnt/assets-credit-scoring-mlops`
 - `HF_MODEL_REPO_TYPE=dataset`
-- `HF_MODEL_FILENAME=HistGB_final_model.pkl`
-- `HF_PREPROCESSOR_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_MODEL_FILENAME=histgb_final_model.pkl` (or `lgbm_final_model.pkl` / `xgb_final_model.pkl`)
+- `HF_PREPROCESSOR_REPO_ID=stephmnt/assets-credit-scoring-mlops`
 - `HF_PREPROCESSOR_REPO_TYPE=dataset`
 - `HF_PREPROCESSOR_FILENAME=preprocessor.joblib`
-- `HF_CUSTOMER_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_CUSTOMER_REPO_ID=stephmnt/assets-credit-scoring-mlops`
 - `HF_CUSTOMER_REPO_TYPE=dataset`
 - `HF_CUSTOMER_FILENAME=data_final.parquet`
@@ -311,8 +348,11 @@ Useful variables:
 - `LOG_HASH_SK_ID=1` to anonymize `SK_ID_CURR`

 Logs include one `data_quality` block per request (missing fields,
-invalid types, out-of-range values, unknown categories, `DAYS_EMPLOYED`
-sentinel).
+invalid types, out-of-range values, outliers, unknown categories,
+`DAYS_EMPLOYED` sentinel) plus a `source` field (api/gradio/etc.).
+
+Tip: you can pass an `X-Client-Source` header to tag where requests
+come from (e.g. `gradio`, `test`, `batch`).

 Local example:
@@ -359,6 +399,7 @@ Built-in robustness:
 - Categoricals: PSI with smoothing (`--psi-eps`) + rare categories grouped as OTHER.
 - Numerics: KS corrected by FDR (Benjamini-Hochberg, `--fdr-alpha`).
 - `DAYS_EMPLOYED` sentinel: converted to NaN + rate tracked.
+- Outliers: p1/p99 clipping + rate tracked via `data_quality`.

 The report also includes the predicted-score distribution and the prediction rate
 (option `--score-bins` to adjust the number of bins), plus a section
@@ -379,12 +420,8 @@ Screenshots (local snapshot of the reporting + storage):

 Inference profiling and benchmark (cProfile + latency):

-```shell
-python profiling/profile_inference.py \
-  --sample-size 2000 \
-  --batch-size 128 \
-  --runs 3
-```
+- Now done from the modeling notebook (section TODO 5).
+- The old script is archived at `dev_archive/profiling/profile_inference.py`.

 Outputs:

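The asset-fetch and API-call paths documented in this README can be exercised directly; a minimal sketch (repo and file names as listed above; the JSON field names for `/predict-minimal` are assumed from the Pydantic model, and the API is assumed to run locally on port 7860):

```python
# Sketch: fetch a deployed asset, then score through the API with a tagged source.
import requests
from huggingface_hub import hf_hub_download

# 1) Resolve an asset from the single dataset repo documented above.
model_path = hf_hub_download(
    repo_id="stephmnt/assets-credit-scoring-mlops",
    filename="histgb_final_model.pkl",
    repo_type="dataset",
)
print("cached at:", model_path)  # usable as MODEL_PATH

# 2) Score a minimal record; X-Client-Source shows up as `source` in the logs.
resp = requests.post(
    "http://127.0.0.1:7860/predict-minimal",
    params={"threshold": 0.5},
    json={"sk_id_curr": 100001, "amt_credit": 200000, "duration_months": 60},
    headers={"X-Client-Source": "batch"},
    timeout=30,
)
print(resp.status_code, resp.json())  # 422 signals a data-contract violation
```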
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py CHANGED
@@ -20,9 +20,33 @@ from pydantic import BaseModel
 from sklearn.preprocessing import MinMaxScaler
 import joblib

+from src.features import (
+    add_missingness_indicators,
+    apply_outlier_clipping,
+    compute_outlier_bounds,
+    new_features_creation,
+    select_missing_indicator_columns,
+)
+
 logger = logging.getLogger("uvicorn.error")

-MODEL_PATH = Path(os.getenv("MODEL_PATH", "data/HistGB_final_model.pkl"))
+def _resolve_model_path() -> Path:
+    env_path = os.getenv("MODEL_PATH")
+    if env_path:
+        return Path(env_path)
+    candidates = sorted(Path("data").glob("*_final_model.pkl"))
+    if len(candidates) == 1:
+        return candidates[0]
+    if candidates:
+        logger.warning(
+            "Multiple *_final_model.pkl files found; set MODEL_PATH explicitly. Using %s",
+            candidates[0],
+        )
+        return candidates[0]
+    return Path("data/histgb_final_model.pkl")
+
+
+MODEL_PATH = _resolve_model_path()
 DATA_PATH = Path(os.getenv("DATA_PATH", "data/data_final.parquet"))
 ARTIFACTS_PATH = Path(os.getenv("ARTIFACTS_PATH", "artifacts/preprocessor.joblib"))
 DEFAULT_THRESHOLD = float(os.getenv("PREDICTION_THRESHOLD", "0.5"))
@@ -56,11 +80,17 @@ HF_CUSTOMER_FILENAME = os.getenv("HF_CUSTOMER_FILENAME", CUSTOMER_DATA_PATH.name

 IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
 ENGINEERED_FEATURES = [
+    "DAYS_EMPLOYED_ANOM",
     "DAYS_EMPLOYED_PERC",
     "INCOME_CREDIT_PERC",
     "INCOME_PER_PERSON",
     "ANNUITY_INCOME_PERC",
     "PAYMENT_RATE",
+    "DENOM_ZERO_DAYS_EMPLOYED_PERC",
+    "DENOM_ZERO_INCOME_CREDIT_PERC",
+    "DENOM_ZERO_INCOME_PER_PERSON",
+    "DENOM_ZERO_ANNUITY_INCOME_PERC",
+    "DENOM_ZERO_PAYMENT_RATE",
 ]
 ENGINEERED_SOURCES = [
     "DAYS_EMPLOYED",
@@ -98,6 +128,9 @@ OUTLIER_COLUMNS = [
     "AMT_REQ_CREDIT_BUREAU_YEAR",
     "AMT_REQ_CREDIT_BUREAU_QRT",
 ]
+OUTLIER_LOWER_Q = 0.01
+OUTLIER_UPPER_Q = 0.99
+MISSING_INDICATOR_MIN_RATE = float(os.getenv("MISSING_INDICATOR_MIN_RATE", "0.05"))

 CODE_GENDER_MAPPING = {
     "F": "F",
@@ -143,6 +176,8 @@ class PreprocessorArtifacts:
     numeric_medians: dict[str, float]
     categorical_columns: list[str]
     outlier_maxes: dict[str, float]
+    outlier_bounds: dict[str, tuple[float, float]]
+    missing_indicator_columns: list[str]
     numeric_ranges: dict[str, tuple[float, float]]
     features_to_scaled: list[str]
     scaler: MinMaxScaler
@@ -243,6 +278,7 @@ def _normalize_inputs(
     if "DAYS_EMPLOYED" in df.columns:
         values = pd.to_numeric(df["DAYS_EMPLOYED"], errors="coerce")
         sentinel_mask = values == DAYS_EMPLOYED_SENTINEL
+        df["DAYS_EMPLOYED_ANOM"] = sentinel_mask.astype(int)
         if sentinel_mask.any():
             df.loc[sentinel_mask, "DAYS_EMPLOYED"] = np.nan

@@ -267,6 +303,7 @@ def _build_data_quality_records(
     missing_mask = df_norm[required_cols].isna() if required_cols else pd.DataFrame(index=df_norm.index)
     invalid_masks: dict[str, pd.Series] = {}
     out_of_range_masks: dict[str, pd.Series] = {}
+    outlier_masks: dict[str, pd.Series] = {}

     for col in numeric_required:
         if col not in df_raw.columns:
@@ -283,6 +320,13 @@
         values = pd.to_numeric(df_norm[col], errors="coerce")
         out_of_range_masks[col] = (values < min_val) | (values > max_val)

+    for col, (low, high) in getattr(preprocessor, "outlier_bounds", {}).items():
+        if col not in df_norm.columns:
+            outlier_masks[col] = pd.Series(False, index=df_norm.index)
+            continue
+        values = pd.to_numeric(df_norm[col], errors="coerce")
+        outlier_masks[col] = (values < low) | (values > high)
+
     records: list[dict[str, Any]] = []
     for idx in df_norm.index:
         missing_cols = (
@@ -292,18 +336,26 @@
         )
         invalid_cols = [col for col, mask in invalid_masks.items() if mask.at[idx]]
         out_of_range_cols = [col for col, mask in out_of_range_masks.items() if mask.at[idx]]
+        outlier_cols = [col for col, mask in outlier_masks.items() if mask.at[idx]]
         unknown_cols = [col for col, mask in unknown_masks.items() if mask.at[idx]]
+        unknown_values = {
+            col: df_raw.at[idx, col]
+            for col in unknown_cols
+            if col in df_raw.columns
+        }
         nan_rate = float(missing_mask.loc[idx].mean()) if not missing_mask.empty else 0.0
-        records.append(
-            {
-                "missing_required_columns": missing_cols,
-                "invalid_numeric_columns": invalid_cols,
-                "out_of_range_columns": out_of_range_cols,
-                "unknown_categories": unknown_cols,
-                "days_employed_sentinel": bool(sentinel_mask.at[idx]) if not sentinel_mask.empty else False,
-                "nan_rate": nan_rate,
-            }
-        )
+        record = {
+            "missing_required_columns": missing_cols,
+            "invalid_numeric_columns": invalid_cols,
+            "out_of_range_columns": out_of_range_cols,
+            "outlier_columns": outlier_cols,
+            "unknown_categories": unknown_cols,
+            "days_employed_sentinel": bool(sentinel_mask.at[idx]) if not sentinel_mask.empty else False,
+            "nan_rate": nan_rate,
+        }
+        if unknown_values:
+            record["unknown_category_values"] = unknown_values
+        records.append(record)
     return records

@@ -376,6 +428,7 @@ def _log_prediction_entries(
     threshold: float | None,
     status_code: int,
     preprocessor: PreprocessorArtifacts,
+    source: str | None = None,
     data_quality: list[dict[str, Any]] | None = None,
     error: str | None = None,
 ) -> None:
@@ -400,6 +453,7 @@
             "status_code": status_code,
             "model_version": MODEL_VERSION,
             "threshold": threshold,
+            "source": source or "api",
             "inputs": inputs,
         }
         if data_quality and idx < len(data_quality):
@@ -420,25 +474,16 @@
     _append_log_entries(entries)


-def new_features_creation(df: pd.DataFrame) -> pd.DataFrame:
-    df_features = df.copy()
-    for col in ENGINEERED_SOURCES:
-        if col not in df_features.columns:
-            df_features[col] = np.nan
-    df_features["DAYS_EMPLOYED_PERC"] = df_features["DAYS_EMPLOYED"] / df_features["DAYS_BIRTH"]
-    df_features["INCOME_CREDIT_PERC"] = df_features["AMT_INCOME_TOTAL"] / df_features["AMT_CREDIT"]
-    df_features["INCOME_PER_PERSON"] = df_features["AMT_INCOME_TOTAL"] / df_features["CNT_FAM_MEMBERS"]
-    df_features["ANNUITY_INCOME_PERC"] = df_features["AMT_ANNUITY"] / df_features["AMT_INCOME_TOTAL"]
-    df_features["PAYMENT_RATE"] = df_features["AMT_ANNUITY"] / df_features["AMT_CREDIT"]
-    return df_features
-
-
 def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
     df = pd.read_parquet(data_path)
     raw_feature_columns = df.columns.tolist()
     input_feature_columns = [c for c in raw_feature_columns if c not in ["is_train", "is_test", "TARGET"]]

-    df = new_features_creation(df)
+    df = new_features_creation(
+        df,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
     df.replace([np.inf, -np.inf], np.nan, inplace=True)

     missing_rate = df.isna().mean()
@@ -448,6 +493,26 @@ def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
     df = df[columns_keep]
     df = df.dropna(subset=columns_must_not_missing)

+    if "CODE_GENDER" in df.columns:
+        df = df[df["CODE_GENDER"] != "XNA"]
+
+    missing_indicator_columns = select_missing_indicator_columns(
+        df,
+        exclude_cols=set(IGNORE_FEATURES),
+        min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+    )
+    df = add_missingness_indicators(df, missing_indicator_columns)
+
+    outlier_bounds = compute_outlier_bounds(
+        df,
+        OUTLIER_COLUMNS,
+        lower_q=OUTLIER_LOWER_Q,
+        upper_q=OUTLIER_UPPER_Q,
+    )
+    df = apply_outlier_clipping(df, outlier_bounds)
+
+    columns_keep = df.columns.tolist()
+
     numeric_cols = df.select_dtypes(include=["number"]).columns
     numeric_medians = df[numeric_cols].median().to_dict()
     df[numeric_cols] = df[numeric_cols].fillna(numeric_medians)
@@ -455,12 +520,7 @@
     categorical_columns = df.select_dtypes(include=["object"]).columns.tolist()
     df[categorical_columns] = df[categorical_columns].fillna("Unknown")

-    if "CODE_GENDER" in df.columns:
-        df = df[df["CODE_GENDER"] != "XNA"]
-
-    outlier_maxes = {col: df[col].max() for col in OUTLIER_COLUMNS if col in df.columns}
-    for col, max_val in outlier_maxes.items():
-        df = df[df[col] != max_val]
+    outlier_maxes = {col: bounds[1] for col, bounds in outlier_bounds.items()}

     reduced_input_columns, selection_scores, selection_method = _compute_reduced_inputs(
         df,
@@ -487,7 +547,11 @@
         required_input = _fallback_reduced_inputs(input_feature_columns)
     else:
         required_input = sorted(required_raw)
-    numeric_required = sorted(col for col in required_input if col in numeric_medians)
+    numeric_required = sorted(
+        col
+        for col in required_input
+        if col in numeric_medians and col != "SK_ID_CURR"
+    )
     correlated_imputation = _build_correlated_imputation(
         df,
         input_feature_columns=input_feature_columns,
@@ -501,6 +565,8 @@
         numeric_medians={k: float(v) for k, v in numeric_medians.items()},
         categorical_columns=categorical_columns,
         outlier_maxes={k: float(v) for k, v in outlier_maxes.items()},
+        outlier_bounds={k: (float(v[0]), float(v[1])) for k, v in outlier_bounds.items()},
+        missing_indicator_columns=missing_indicator_columns,
         numeric_ranges=numeric_ranges,
         features_to_scaled=features_to_scaled,
         scaler=scaler,
@@ -554,9 +620,28 @@ def build_fallback_preprocessor() -> PreprocessorArtifacts:
         ]
     )

-    df = new_features_creation(base)
+    df = new_features_creation(
+        base,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
     df.replace([np.inf, -np.inf], np.nan, inplace=True)

+    missing_indicator_columns = select_missing_indicator_columns(
+        df,
+        exclude_cols=set(IGNORE_FEATURES),
+        min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+    )
+    df = add_missingness_indicators(df, missing_indicator_columns)
+
+    outlier_bounds = compute_outlier_bounds(
+        df,
+        OUTLIER_COLUMNS,
+        lower_q=OUTLIER_LOWER_Q,
+        upper_q=OUTLIER_UPPER_Q,
+    )
+    df = apply_outlier_clipping(df, outlier_bounds)
+
     columns_keep = df.columns.tolist()
     columns_must_not_missing = [col for col in columns_keep if col not in IGNORE_FEATURES]

@@ -579,7 +664,9 @@
     required_raw.update(col for col in columns_must_not_missing if col in input_feature_columns)
     required_raw.add("SK_ID_CURR")
     required_input = _fallback_reduced_inputs(input_feature_columns)
-    numeric_required = sorted(col for col in required_input if col in numeric_medians)
+    numeric_required = sorted(
+        col for col in required_input if col in numeric_medians and col != "SK_ID_CURR"
+    )

     numeric_ranges = {col: (float(df[col].min()), float(df[col].max())) for col in numeric_cols}

@@ -588,7 +675,9 @@
         columns_must_not_missing=columns_must_not_missing,
         numeric_medians={k: float(v) for k, v in numeric_medians.items()},
         categorical_columns=categorical_columns,
-        outlier_maxes={},
+        outlier_maxes={k: float(v[1]) for k, v in outlier_bounds.items()},
+        outlier_bounds={k: (float(v[0]), float(v[1])) for k, v in outlier_bounds.items()},
+        missing_indicator_columns=missing_indicator_columns,
         numeric_ranges=numeric_ranges,
         features_to_scaled=features_to_scaled,
         scaler=scaler,
@@ -633,7 +722,9 @@ def load_preprocessor(data_path: Path, artifacts_path: Path) -> PreprocessorArti
         updated = True
     if not hasattr(preprocessor, "numeric_required_columns"):
         preprocessor.numeric_required_columns = sorted(
-            col for col in preprocessor.required_input_columns if col in preprocessor.numeric_medians
+            col
+            for col in preprocessor.required_input_columns
+            if col in preprocessor.numeric_medians and col != "SK_ID_CURR"
         )
         updated = True
     if not hasattr(preprocessor, "numeric_ranges"):
@@ -646,6 +737,56 @@
             raise RuntimeError(f"Data file not found to rebuild preprocessor: {data_path}")
         preprocessor = build_preprocessor(data_path)
         updated = True
+    needs_missing_indicators = (
+        not hasattr(preprocessor, "missing_indicator_columns")
+        or not preprocessor.missing_indicator_columns
+    )
+    needs_outlier_bounds = (
+        not hasattr(preprocessor, "outlier_bounds") or not preprocessor.outlier_bounds
+    )
+    prepared_df = None
+    if (needs_missing_indicators or needs_outlier_bounds) and data_path.exists():
+        prepared_df = pd.read_parquet(data_path)
+        prepared_df = new_features_creation(
+            prepared_df,
+            days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+            engineered_sources=ENGINEERED_SOURCES,
+        )
+        prepared_df.replace([np.inf, -np.inf], np.nan, inplace=True)
+        if preprocessor.columns_keep:
+            prepared_df = prepared_df[preprocessor.columns_keep]
+        if preprocessor.columns_must_not_missing:
+            prepared_df = prepared_df.dropna(subset=preprocessor.columns_must_not_missing)
+        if "CODE_GENDER" in prepared_df.columns:
+            prepared_df = prepared_df[prepared_df["CODE_GENDER"] != "XNA"]
+    if needs_missing_indicators:
+        if prepared_df is not None:
+            preprocessor.missing_indicator_columns = select_missing_indicator_columns(
+                prepared_df,
+                exclude_cols=set(IGNORE_FEATURES),
+                min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+            )
+        else:
+            preprocessor.missing_indicator_columns = []
+        updated = True
+    if needs_outlier_bounds:
+        if prepared_df is not None:
+            preprocessor.outlier_bounds = compute_outlier_bounds(
+                prepared_df,
+                OUTLIER_COLUMNS,
+                lower_q=OUTLIER_LOWER_Q,
+                upper_q=OUTLIER_UPPER_Q,
+            )
+        else:
+            preprocessor.outlier_bounds = {}
+            for col, max_val in getattr(preprocessor, "outlier_maxes", {}).items():
+                min_val = None
+                if hasattr(preprocessor, "numeric_ranges") and col in preprocessor.numeric_ranges:
+                    min_val = preprocessor.numeric_ranges[col][0]
+                if min_val is None:
+                    min_val = float("-inf")
+                preprocessor.outlier_bounds[col] = (float(min_val), float(max_val))
+        updated = True
     if USE_REDUCED_INPUTS:
         reduced = _reduce_input_columns(preprocessor)
         if preprocessor.required_input_columns != reduced:
@@ -658,7 +799,9 @@
             required_updated = True
             updated = True
         desired_numeric_required = sorted(
-            col for col in preprocessor.required_input_columns if col in preprocessor.numeric_medians
+            col
+            for col in preprocessor.required_input_columns
+            if col in preprocessor.numeric_medians and col != "SK_ID_CURR"
        )
         if getattr(preprocessor, "numeric_required_columns", None) != desired_numeric_required:
             preprocessor.numeric_required_columns = desired_numeric_required
@@ -890,7 +1033,11 @@ def _compute_reduced_inputs_from_data(
     if not data_path.exists():
         return _fallback_reduced_inputs(preprocessor.input_feature_columns), {}, "default"
     df = pd.read_parquet(data_path)
-    df = new_features_creation(df)
+    df = new_features_creation(
+        df,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
     df.replace([np.inf, -np.inf], np.nan, inplace=True)

     if preprocessor.columns_keep:
@@ -908,9 +1055,25 @@
     if "CODE_GENDER" in df.columns:
         df = df[df["CODE_GENDER"] != "XNA"]

-    for col, max_val in preprocessor.outlier_maxes.items():
-        if col in df.columns:
-            df = df[df[col] != max_val]
+    if getattr(preprocessor, "missing_indicator_columns", None):
+        df = add_missingness_indicators(df, preprocessor.missing_indicator_columns)
+    else:
+        df = add_missingness_indicators(
+            df,
+            select_missing_indicator_columns(
+                df,
+                exclude_cols=set(IGNORE_FEATURES),
+                min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+            ),
+        )
+
+    outlier_bounds = getattr(preprocessor, "outlier_bounds", {}) or compute_outlier_bounds(
+        df,
+        OUTLIER_COLUMNS,
+        lower_q=OUTLIER_LOWER_Q,
+        upper_q=OUTLIER_UPPER_Q,
+    )
+    df = apply_outlier_clipping(df, outlier_bounds)

     return _compute_reduced_inputs(df, input_feature_columns=preprocessor.input_feature_columns)

@@ -920,7 +1083,11 @@ def _compute_correlated_imputation(
     preprocessor: PreprocessorArtifacts,
 ) -> dict[str, dict[str, float | str]]:
     df = pd.read_parquet(data_path)
-    df = new_features_creation(df)
+    df = new_features_creation(
+        df,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
     df.replace([np.inf, -np.inf], np.nan, inplace=True)

     df = df[preprocessor.columns_keep]
@@ -936,9 +1103,25 @@
     if "CODE_GENDER" in df.columns:
         df = df[df["CODE_GENDER"] != "XNA"]

-    for col, max_val in preprocessor.outlier_maxes.items():
-        if col in df.columns:
-            df = df[df[col] != max_val]
+    if getattr(preprocessor, "missing_indicator_columns", None):
+        df = add_missingness_indicators(df, preprocessor.missing_indicator_columns)
+    else:
+        df = add_missingness_indicators(
+            df,
+            select_missing_indicator_columns(
+                df,
+                exclude_cols=set(IGNORE_FEATURES),
+                min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+            ),
+        )
+
+    outlier_bounds = getattr(preprocessor, "outlier_bounds", {}) or compute_outlier_bounds(
+        df,
+        OUTLIER_COLUMNS,
+        lower_q=OUTLIER_LOWER_Q,
+        upper_q=OUTLIER_UPPER_Q,
+    )
+    df = apply_outlier_clipping(df, outlier_bounds)

     return _build_correlated_imputation(
         df,
@@ -1048,11 +1231,30 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->
     if "TARGET" not in df.columns:
         df["TARGET"] = 0

-    df = new_features_creation(df)
+    df = new_features_creation(
+        df,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
     df.replace([np.inf, -np.inf], np.nan, inplace=True)

     df = df.reindex(columns=artifacts.columns_keep, fill_value=np.nan)

+    indicator_cols = getattr(artifacts, "missing_indicator_columns", None) or select_missing_indicator_columns(
+        df,
+        exclude_cols=set(IGNORE_FEATURES),
+        min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+    )
+    df = add_missingness_indicators(df, indicator_cols)
+
+    outlier_bounds = getattr(artifacts, "outlier_bounds", {}) or compute_outlier_bounds(
+        df,
+        OUTLIER_COLUMNS,
+        lower_q=OUTLIER_LOWER_Q,
+        upper_q=OUTLIER_UPPER_Q,
+    )
+    df = apply_outlier_clipping(df, outlier_bounds)
+
     _apply_correlated_imputation(df, artifacts)

     for col, median in artifacts.numeric_medians.items():
@@ -1072,16 +1274,6 @@
             detail={"message": "CODE_GENDER cannot be 'XNA' based on training rules."},
         )

-    for col, max_val in artifacts.outlier_maxes.items():
-        if col in df.columns and (df[col] >= max_val).any():
-            raise HTTPException(
-                status_code=422,
-                detail={
-                    "message": "Input contains outlier values removed during training.",
-                    "outlier_columns": [col],
-                },
-            )
-
     df_hot = pd.get_dummies(df, columns=artifacts.categorical_columns)
     df_hot = df_hot.reindex(columns=artifacts.features_to_scaled, fill_value=0)

@@ -1089,6 +1281,80 @@
     return pd.DataFrame(scaled, columns=artifacts.features_to_scaled, index=df.index)


+def _prepare_pipeline_input(
+    df_raw: pd.DataFrame,
+    artifacts: PreprocessorArtifacts,
+    model: Any,
+) -> pd.DataFrame:
+    df = df_raw.copy()
+
+    for col in artifacts.required_input_columns:
+        if col not in df.columns:
+            df[col] = np.nan
+
+    allow_missing = {"DAYS_EMPLOYED"}
+    _ensure_required_columns(df, artifacts.required_input_columns, allow_missing=allow_missing)
+    _validate_numeric_inputs(df, artifacts.numeric_required_columns)
+    _validate_numeric_ranges(
+        df,
+        {k: v for k, v in artifacts.numeric_ranges.items() if k in artifacts.numeric_required_columns},
+    )
+
+    df["is_train"] = 0
+    df["is_test"] = 1
+    if "TARGET" not in df.columns:
+        df["TARGET"] = 0
+
+    df = new_features_creation(
+        df,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
+    df.replace([np.inf, -np.inf], np.nan, inplace=True)
+
+    df = df.reindex(columns=artifacts.columns_keep, fill_value=np.nan)
+
+    indicator_cols = getattr(artifacts, "missing_indicator_columns", None) or select_missing_indicator_columns(
+        df,
+        exclude_cols=set(IGNORE_FEATURES),
+        min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+    )
+    df = add_missingness_indicators(df, indicator_cols)
+
+    outlier_bounds = getattr(artifacts, "outlier_bounds", {}) or compute_outlier_bounds(
+        df,
+        OUTLIER_COLUMNS,
+        lower_q=OUTLIER_LOWER_Q,
+        upper_q=OUTLIER_UPPER_Q,
+    )
+    df = apply_outlier_clipping(df, outlier_bounds)
+
+    if "CODE_GENDER" in df.columns and (df["CODE_GENDER"] == "XNA").any():
+        raise HTTPException(
+            status_code=422,
+            detail={"message": "CODE_GENDER cannot be 'XNA' based on training rules."},
+        )
+
+    expected_cols = None
+    if hasattr(model, "named_steps"):
+        preprocessor = model.named_steps.get("preprocessing")
+        expected_cols = getattr(preprocessor, "feature_names_in_", None)
+    if expected_cols is None:
+        expected_cols = [c for c in artifacts.input_feature_columns if c not in IGNORE_FEATURES]
+
+    return df.reindex(columns=expected_cols, fill_value=np.nan)
+
+
+def prepare_inference_features(
+    df_raw: pd.DataFrame,
+    artifacts: PreprocessorArtifacts,
+    model: Any,
+) -> pd.DataFrame:
+    if hasattr(model, "named_steps") and model.named_steps.get("preprocessing") is not None:
+        return _prepare_pipeline_input(df_raw, artifacts, model)
+    return preprocess_input(df_raw, artifacts)
+
+
 @app.on_event("startup")
 def startup_event() -> None:
     if getattr(app.state, "model", None) is not None and getattr(app.state, "preprocessor", None) is not None:
@@ -1183,9 +1449,19 @@ def features(include_all: bool = Query(default=False)) -> dict[str, Any]:
         for col in preprocessor.required_input_columns
         if col in scores
     }
+    missing_indicator_features = [
+        f"is_missing_{col}"
+        for col in getattr(preprocessor, "missing_indicator_columns", []) or []
+    ]
+    outlier_indicator_features = [
+        f"is_outlier_{col}"
+        for col in getattr(preprocessor, "outlier_bounds", {}) or {}
+    ]
     payload = {
         "required_input_features": preprocessor.required_input_columns,
         "engineered_features": ENGINEERED_FEATURES,
+        "missing_indicator_features_count": len(missing_indicator_features),
+        "outlier_indicator_features_count": len(outlier_indicator_features),
         "model_features_count": len(preprocessor.features_to_scaled),
         "feature_selection_method": preprocessor.feature_selection_method,
         "feature_selection_top_n": FEATURE_SELECTION_TOP_N,
@@ -1198,6 +1474,8 @@
     if include_all:
         payload["input_features"] = preprocessor.input_feature_columns
         payload["optional_input_features"] = optional_features
+        payload["missing_indicator_features"] = missing_indicator_features
+        payload["outlier_indicator_features"] = outlier_indicator_features
     else:
         payload["input_features"] = preprocessor.required_input_columns
         payload["optional_input_features"] = []
@@ -1235,8 +1513,28 @@ def logs(

     return Response(content="".join(lines), media_type="application/x-ndjson")

+def _align_features_to_model(features: pd.DataFrame, model: Any) -> pd.DataFrame:
+    expected = getattr(model, "feature_names_in_", None)
+    if expected is None:
+        return features
+    expected = list(expected)
+
+    extra = [c for c in features.columns if c not in expected]
+    missing = [c for c in expected if c not in features.columns]
+    if extra or missing:
+        logger.warning(
+            "Feature mismatch: extra=%s missing=%s",
+            extra[:15],
+            missing[:15],
+        )
+    return features.reindex(columns=expected, fill_value=0)

-def _predict_records(records: list[dict[str, Any]], threshold: float | None) -> dict[str, Any]:
+def _predict_records(
+    records: list[dict[str, Any]],
+    threshold: float | None,
+    *,
+    source: str | None = None,
+) -> dict[str, Any]:
     model = app.state.model
     preprocessor: PreprocessorArtifacts = app.state.preprocessor
     request_id = str(uuid.uuid4())
@@ -1260,7 +1558,8 @@
         raise HTTPException(status_code=422, detail={"message": "SK_ID_CURR is required."})

     sk_ids = df_norm["SK_ID_CURR"].tolist()
-    features = preprocess_input(df_norm, preprocessor)
+    features = prepare_inference_features(df_norm, preprocessor, model)
+    features = _align_features_to_model(features, model)

     if hasattr(model, "predict_proba"):
         proba = model.predict_proba(features)[:, 1]
@@ -1283,6 +1582,7 @@
             threshold=use_threshold,
             status_code=200,
             preprocessor=preprocessor,
+            source=source,
             data_quality=dq_records,
         )
         return {"predictions": results, "threshold": use_threshold}
@@ -1304,6 +1604,7 @@
             threshold=None,
             status_code=200,
             preprocessor=preprocessor,
+            source=source,
             data_quality=dq_records,
         )
         return {"predictions": results, "threshold": None}
@@ -1318,6 +1619,7 @@
             threshold=threshold,
             status_code=exc.status_code,
             preprocessor=preprocessor,
+            source=source,
             data_quality=dq_records if "dq_records" in locals() else None,
             error=json.dumps(detail, ensure_ascii=True),
         )
@@ -1332,6 +1634,7 @@
             threshold=threshold,
             status_code=500,
             preprocessor=preprocessor,
+            source=source,
             data_quality=dq_records if "dq_records" in locals() else None,
             error=str(exc),
         )
@@ -1342,16 +1645,18 @@
 def predict(
     payload: PredictionRequest,
     threshold: float | None = Query(default=None, ge=0.0, le=1.0),
+    x_client_source: str | None = Header(default=None, alias="X-Client-Source"),
 ) -> dict[str, Any]:
     records = payload.data if isinstance(payload.data, list) else [payload.data]
-    return _predict_records(records, threshold)
+    return _predict_records(records, threshold, source=x_client_source)


 @app.post("/predict-minimal")
 def predict_minimal(
     payload: MinimalPredictionRequest,
     threshold: float | None = Query(default=None, ge=0.0, le=1.0),
+    x_client_source: str | None = Header(default=None, alias="X-Client-Source"),
 ) -> dict[str, Any]:
     preprocessor: PreprocessorArtifacts = app.state.preprocessor
     record = _build_minimal_record(payload, preprocessor)
-    return _predict_records([record], threshold)
+    return _predict_records([record], threshold, source=x_client_source)
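`src/features.py` itself is not part of this diff; only its call shapes are visible above. A hypothetical sketch of what those helpers could look like, inferred from how `app/main.py` calls them (the names are real, the bodies are assumptions):

```python
# Hypothetical sketch of the src.features helpers as used by app/main.py.
# The real implementations live in src/features.py (not shown in this diff).
import pandas as pd

def select_missing_indicator_columns(df, *, exclude_cols, min_missing_rate=0.05):
    # Numeric columns whose NaN rate reaches the threshold get an is_missing_* flag.
    rates = df.select_dtypes(include=["number"]).isna().mean()
    return sorted(c for c, r in rates.items() if r >= min_missing_rate and c not in exclude_cols)

def add_missingness_indicators(df, columns):
    out = df.copy()
    for col in columns:
        out[f"is_missing_{col}"] = out[col].isna().astype(int) if col in out.columns else 0
    return out

def compute_outlier_bounds(df, columns, *, lower_q=0.01, upper_q=0.99):
    # Per-column (p1, p99) bounds, matching OUTLIER_LOWER_Q / OUTLIER_UPPER_Q above.
    return {c: (float(df[c].quantile(lower_q)), float(df[c].quantile(upper_q)))
            for c in columns if c in df.columns}

def apply_outlier_clipping(df, bounds):
    # Flag then clip, so extreme inputs are kept (clipped) instead of rejected with 422.
    out = df.copy()
    for col, (low, high) in bounds.items():
        if col in out.columns:
            out[f"is_outlier_{col}"] = ((out[col] < low) | (out[col] > high)).astype(int)
            out[col] = out[col].clip(lower=low, upper=high)
    return out
```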
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/performance/performance_report.md CHANGED
@@ -6,7 +6,8 @@ Measure inference latency, identify bottlenecks, and propose

 ## Setup

-- Script: `profiling/profile_inference.py`
+- Script (archived): `dev_archive/profiling/profile_inference.py`
+- Current workflow: the modeling notebook (section TODO 5)
 - Data: `data/data_final.parquet` (sample)
 - Parameters: `--sample-size 500 --batch-size 100 --runs 2`
 - Model: `HistGB_final_model.pkl`
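To reproduce numbers of this kind outside the notebook, a minimal cProfile + latency harness sketch (parameters mirror the setup above; paths are assumptions, and the model may expect preprocessed features — this is a shape sketch, not the notebook's exact cell):

```python
# Sketch: cProfile + latency benchmark over batched predict_proba calls.
import cProfile
import pickle
import pstats
import time
from pathlib import Path

import pandas as pd

model = pickle.loads(Path("data/histgb_final_model.pkl").read_bytes())  # assumed path
df = pd.read_parquet("data/data_final.parquet").head(500)               # --sample-size 500
X = df.drop(columns=["TARGET", "is_train", "is_test"], errors="ignore")

def run(batch_size: int = 100) -> None:                                 # --batch-size 100
    for start in range(0, len(X), batch_size):
        model.predict_proba(X.iloc[start:start + batch_size])

for _ in range(2):                                                      # --runs 2
    t0 = time.perf_counter()
    run()
    print(f"latency: {time.perf_counter() - t0:.3f}s")

cProfile.run("run()", "profile.out")
pstats.Stats("profile.out").sort_stats("cumulative").print_stats(10)
```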
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py CHANGED
@@ -8,13 +8,17 @@ import pandas as pd
 from fastapi import HTTPException

 from app.main import (
+    DAYS_EMPLOYED_SENTINEL,
+    ENGINEERED_SOURCES,
+    MODEL_VERSION,
     MinimalPredictionRequest,
     app,
+    new_features_creation,
+    prepare_inference_features,
     predict_minimal,
     startup_event,
     _build_minimal_record,
     _normalize_inputs,
-    preprocess_input,
 )

@@ -45,7 +49,7 @@ def _shap_error_table(message: str) -> pd.DataFrame:
         [
             {
                 "feature": message,
-                "value": np.nan,
+                "raw_value": np.nan,
                 "shap_value": np.nan,
             }
         ]
@@ -63,38 +67,171 @@ def _extract_shap_values(shap_values: Any) -> np.ndarray:
     return values


+def _clean_raw_value(value: Any) -> Any:
+    if value is None or pd.isna(value):
+        return None
+    if isinstance(value, (np.integer, np.floating)):
+        return value.item()
+    return value
+
+
+def _strip_feature_prefix(feature_name: str) -> str:
+    return feature_name.split("__", 1)[1] if "__" in feature_name else feature_name
+
+
+def _lookup_raw_value(feature_name: str, raw_df: pd.DataFrame, preprocessor) -> Any:
+    cleaned_name = _strip_feature_prefix(feature_name)
+    if cleaned_name in raw_df.columns:
+        return raw_df.at[0, cleaned_name]
+    for prefix in ("is_missing_", "is_outlier_"):
+        if cleaned_name.startswith(prefix):
+            base = cleaned_name[len(prefix):]
+            if base in raw_df.columns:
+                return raw_df.at[0, base]
+    for col in getattr(preprocessor, "categorical_columns", []):
+        if cleaned_name.startswith(f"{col}_") and col in raw_df.columns:
+            return raw_df.at[0, col]
+    return None
+
+
+def _align_features_to_model(X: Any, model: Any) -> Any:
+    expected = getattr(model, "feature_names_in_", None)
+    if expected is None:
+        return X
+    if isinstance(X, pd.DataFrame):
+        return X.reindex(columns=list(expected), fill_value=0)
+    return X
+
+
+def _model_family(model: Any) -> str:
+    name = type(model).__name__.lower()
+    if "xgb" in name:
+        return "xgb"
+    if "lgbm" in name or "lightgbm" in name:
+        return "lgbm"
+    if "histgradientboosting" in name:
+        return "histgb"
+    return "unknown"
+
+
+def _xgb_pred_contribs(estimator: Any, X: Any) -> np.ndarray:
+    import xgboost as xgb
+
+    if isinstance(X, pd.DataFrame):
+        dm = xgb.DMatrix(X, feature_names=list(X.columns))
+    else:
+        dm = xgb.DMatrix(np.asarray(X))
+
+    booster = estimator.get_booster() if hasattr(estimator, "get_booster") else estimator
+    contrib = booster.predict(dm, pred_contribs=True)
+    return np.asarray(contrib)[:, :-1]
+
+
+def _lgbm_pred_contribs(estimator: Any, X: Any) -> np.ndarray:
+    contrib = estimator.predict(X, pred_contrib=True)
+    return np.asarray(contrib)[:, :-1]
+
+
 def _compute_shap_top_features(record: dict[str, Any], top_k: int = 10) -> pd.DataFrame:
     preprocessor = app.state.preprocessor
+    model = app.state.model
     df_raw = pd.DataFrame.from_records([record])
     df_norm, _, _ = _normalize_inputs(df_raw, preprocessor)
-    features = preprocess_input(df_norm, preprocessor)
+    raw_reference = new_features_creation(
+        df_norm,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
+    features = prepare_inference_features(df_norm, preprocessor, model)
+    features = _align_features_to_model(features, model)
+
     try:
         import shap
     except ImportError:
         return _shap_error_table("SHAP not installed.")

-    explainer = getattr(app.state, "shap_explainer", None)
-    if explainer is None:
-        try:
-            explainer = shap.TreeExplainer(app.state.model)
-        except Exception:
-            explainer = shap.Explainer(app.state.model, features)
-        app.state.shap_explainer = explainer
-
-    try:
-        explanation = explainer(features)
-        values = _extract_shap_values(explanation.values)
-    except Exception:
-        values = _extract_shap_values(explainer.shap_values(features))
+    estimator = model
+    X_shap = features
+    if hasattr(model, "named_steps") and model.named_steps.get("preprocessing") is not None:
+        estimator = model.named_steps.get("estimator", model)
+        pipeline_preprocessor = model.named_steps["preprocessing"]
+        try:
+            X_shap = pipeline_preprocessor.transform(features)
+        except Exception as exc:
+            return _shap_error_table(f"SHAP preprocessing failed: {exc}")
+        try:
+            import scipy.sparse as sp
+            if sp.issparse(X_shap):
+                X_shap = X_shap.toarray()
+        except Exception:
+            pass
+        try:
+            feature_names = pipeline_preprocessor.get_feature_names_out()
+        except Exception:
+            feature_names = None
+        if feature_names is not None:
+            X_shap = pd.DataFrame(X_shap, columns=feature_names)
+
+    family = _model_family(estimator)
+
+    values: np.ndarray | None = None
+
+    # 1) Native contributions (best choice for XGB/LGBM)
+    try:
+        if family == "xgb":
+            values = _xgb_pred_contribs(estimator, X_shap)
+        elif family == "lgbm":
+            values = _lgbm_pred_contribs(estimator, X_shap)
+    except Exception:
+        values = None
+
+    # 2) SHAP fallback (mainly useful for HistGB / unknown models)
+    if values is None:
+        cache = getattr(app.state, "shap_explainer_cache", {})
+        key = f"{MODEL_VERSION}:{type(estimator).__name__}"
+        explainer = cache.get(key)
+
+        if explainer is None:
+            try:
+                import shap
+                predict_fn = (
+                    (lambda X: estimator.predict_proba(X)[:, 1])
+                    if hasattr(estimator, "predict_proba")
+                    else (lambda X: estimator.predict(X))
+                )
+
+                # Avoid a degenerate background (a single row)
+                if isinstance(X_shap, pd.DataFrame):
+                    bg = pd.concat([X_shap] * 50, ignore_index=True)
+                else:
+                    bg = np.repeat(np.asarray(X_shap), repeats=50, axis=0)
+
+                explainer = shap.Explainer(predict_fn, bg)
+            except Exception as exc:
+                return _shap_error_table(f"SHAP explainer init failed: {exc}")
+
+            cache[key] = explainer
+            app.state.shap_explainer_cache = cache
+
+        try:
+            import shap
+            explanation = explainer(X_shap)
+            values = _extract_shap_values(explanation.values)
+        except Exception as exc:
+            return _shap_error_table(f"SHAP failed: {exc}")

     shap_row = values[0]
-    feature_values = features.iloc[0].to_numpy()
+    if isinstance(X_shap, pd.DataFrame):
+        feature_values = X_shap.iloc[0].to_numpy()
+        feature_names = X_shap.columns
+    else:
+        feature_values = np.asarray(X_shap)[0]
+        feature_names = [f"feature_{idx}" for idx in range(len(feature_values))]
     top_idx = np.argsort(np.abs(shap_row))[::-1][:top_k]
     rows = [
         {
-            "feature": str(features.columns[idx]),
-            "value": float(feature_values[idx]),
-            "shap_value": float(shap_row[idx]),
+            "feature": str(feature_names[idx]),
+            "raw_value": _clean_raw_value(
+                _lookup_raw_value(str(feature_names[idx]), raw_reference, preprocessor)
+            ),
+            "shap_value": float(np.round(shap_row[idx], 6)),
         }
         for idx in top_idx
     ]
@@ -105,8 +242,7 @@ def score_minimal(
     sk_id_curr: float,
     amt_credit: float,
     duration_months: float,
-    threshold: float,
-) -> tuple[float | None, str, float | None, pd.DataFrame, dict[str, Any]]:
+) -> tuple[float | None, str, pd.DataFrame, dict[str, Any]]:
     _ensure_startup()
     try:
         payload = MinimalPredictionRequest(
@@ -115,7 +251,7 @@
             duration_months=int(duration_months),
         )
         record = _build_minimal_record(payload, app.state.preprocessor)
-        response = predict_minimal(payload, threshold=float(threshold))
+        response = predict_minimal(payload, threshold=None, x_client_source="gradio")
         result = response["predictions"][0]
         probability = float(result.get("probability", 0.0))
         pred_value = int(result.get("prediction", 0))
@@ -128,11 +264,11 @@
                 "DURATION_MONTHS": int(duration_months),
             }
         )
-        return probability, label, float(response.get("threshold", 0.0)), shap_table, snapshot
     except HTTPException as exc:
-        return None, f"Erreur: {exc.detail}", None, _shap_error_table("No SHAP available."), {"error": exc.detail}
     except Exception as exc:  # pragma: no cover - UI fallback
-        return None, f"Erreur: {exc}", None, _shap_error_table("No SHAP available."), {"error": str(exc)}


 with gr.Blocks(title="Credit scoring MLOps") as demo:
@@ -155,19 +291,17 @@
         sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
         amt_credit = gr.Number(label="Montant du crédit", value=200000)
         duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
-        threshold = gr.Slider(label="Seuil", minimum=0.0, maximum=1.0, value=0.5, step=0.01)

     run_btn = gr.Button("Scorer")

     with gr.Row():
         probability = gr.Number(label="Probabilité de défaut")
         prediction = gr.Textbox(label="Prédiction")
-        threshold_used = gr.Number(label="Seuil utilisé")

     shap_table = gr.Dataframe(
-        headers=["feature", "value", "shap_value"],
         label="Top 10 SHAP (local)",
-        datatype=["str", "number", "number"],
         interactive=False,
     )
@@ -175,8 +309,8 @@
     run_btn.click(
         score_minimal,
-        inputs=[sk_id_curr, amt_credit, duration_months, threshold],
-        outputs=[probability, prediction, threshold_used, shap_table, snapshot],
     )
265
  }
266
  )
267
+ return probability, label, shap_table, snapshot
268
  except HTTPException as exc:
269
+ return None, f"Erreur: {exc.detail}", _shap_error_table("No SHAP available."), {"error": exc.detail}
270
  except Exception as exc: # pragma: no cover - UI fallback
271
+ return None, f"Erreur: {exc}", _shap_error_table("No SHAP available."), {"error": str(exc)}
272
 
273
 
274
  with gr.Blocks(title="Credit scoring MLOps") as demo:
 
291
  sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
292
  amt_credit = gr.Number(label="Montant du crédit", value=200000)
293
  duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
 
294
 
295
  run_btn = gr.Button("Scorer")
296
 
297
  with gr.Row():
298
  probability = gr.Number(label="Probabilité de défaut")
299
  prediction = gr.Textbox(label="Prédiction")
 
300
 
301
  shap_table = gr.Dataframe(
302
+ headers=["feature", "raw_value", "shap_value"],
303
  label="Top 10 SHAP (local)",
304
+ datatype=["str", "str", "number"],
305
  interactive=False,
306
  )
307
 
 
309
 
310
  run_btn.click(
311
  score_minimal,
312
+ inputs=[sk_id_curr, amt_credit, duration_months],
313
+ outputs=[probability, prediction, shap_table, snapshot],
314
  )
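The `[:, :-1]` slice in `_xgb_pred_contribs` and `_lgbm_pred_contribs` drops the last column because native `pred_contribs` output carries one column per feature plus a trailing bias term; the per-row sum of all columns reproduces the model's raw margin. A minimal sketch of that invariant (standalone, not part of the app; the toy dataset and model are assumptions):

```python
# Toy check: XGBoost's pred_contribs=True returns (n_features + 1) columns,
# the last being the bias; row sums equal the raw margin (log-odds).
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
model = xgb.XGBClassifier(n_estimators=20, max_depth=3, eval_metric="logloss").fit(X, y)

dm = xgb.DMatrix(X[:3])
contrib = model.get_booster().predict(dm, pred_contribs=True)   # shape (3, 6)
margin = model.get_booster().predict(dm, output_margin=True)    # raw log-odds

assert contrib.shape[1] == X.shape[1] + 1                       # features + bias column
assert np.allclose(contrib.sum(axis=1), margin, atol=1e-4)      # additivity holds
```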
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py CHANGED
@@ -3,9 +3,19 @@ from __future__ import annotations
 from typing import Any

 import gradio as gr
+import numpy as np
+import pandas as pd
 from fastapi import HTTPException

-from app.main import MinimalPredictionRequest, app, predict_minimal, startup_event
+from app.main import (
+    MinimalPredictionRequest,
+    app,
+    predict_minimal,
+    startup_event,
+    _build_minimal_record,
+    _normalize_inputs,
+    preprocess_input,
+)


 def _ensure_startup() -> None:
@@ -30,12 +40,73 @@ def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
     return snapshot


+def _shap_error_table(message: str) -> pd.DataFrame:
+    return pd.DataFrame(
+        [
+            {
+                "feature": message,
+                "value": np.nan,
+                "shap_value": np.nan,
+            }
+        ]
+    )
+
+
+def _extract_shap_values(shap_values: Any) -> np.ndarray:
+    if isinstance(shap_values, list):
+        shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0]
+    values = np.asarray(shap_values)
+    if values.ndim == 3:
+        values = values[:, :, 1]
+    if values.ndim == 1:
+        values = values.reshape(1, -1)
+    return values
+
+
+def _compute_shap_top_features(record: dict[str, Any], top_k: int = 10) -> pd.DataFrame:
+    preprocessor = app.state.preprocessor
+    df_raw = pd.DataFrame.from_records([record])
+    df_norm, _, _ = _normalize_inputs(df_raw, preprocessor)
+    features = preprocess_input(df_norm, preprocessor)
+    try:
+        import shap
+    except ImportError:
+        return _shap_error_table("SHAP not installed.")
+
+    explainer = getattr(app.state, "shap_explainer", None)
+    if explainer is None:
+        try:
+            explainer = shap.TreeExplainer(app.state.model)
+        except Exception:
+            explainer = shap.Explainer(app.state.model, features)
+        app.state.shap_explainer = explainer
+
+    try:
+        explanation = explainer(features)
+        values = _extract_shap_values(explanation.values)
+    except Exception:
+        values = _extract_shap_values(explainer.shap_values(features))
+
+    shap_row = values[0]
+    feature_values = features.iloc[0].to_numpy()
+    top_idx = np.argsort(np.abs(shap_row))[::-1][:top_k]
+    rows = [
+        {
+            "feature": str(features.columns[idx]),
+            "value": float(feature_values[idx]),
+            "shap_value": float(shap_row[idx]),
+        }
+        for idx in top_idx
+    ]
+    return pd.DataFrame(rows)
+
+
 def score_minimal(
     sk_id_curr: float,
     amt_credit: float,
     duration_months: float,
     threshold: float,
-) -> tuple[float | None, str, float | None, dict[str, Any]]:
+) -> tuple[float | None, str, float | None, pd.DataFrame, dict[str, Any]]:
     _ensure_startup()
     try:
         payload = MinimalPredictionRequest(
@@ -43,11 +114,13 @@ def score_minimal(
             amt_credit=float(amt_credit),
             duration_months=int(duration_months),
         )
+        record = _build_minimal_record(payload, app.state.preprocessor)
         response = predict_minimal(payload, threshold=float(threshold))
         result = response["predictions"][0]
         probability = float(result.get("probability", 0.0))
         pred_value = int(result.get("prediction", 0))
         label = "Default (1)" if pred_value == 1 else "No default (0)"
+        shap_table = _compute_shap_top_features(record, top_k=10)
         snapshot = _customer_snapshot(int(sk_id_curr))
         snapshot.update(
             {
@@ -55,39 +128,55 @@ def score_minimal(
                 "DURATION_MONTHS": int(duration_months),
             }
         )
-        return probability, label, float(response.get("threshold", 0.0)), snapshot
+        return probability, label, float(response.get("threshold", 0.0)), shap_table, snapshot
     except HTTPException as exc:
-        return None, f"Erreur: {exc.detail}", None, {"error": exc.detail}
+        return None, f"Erreur: {exc.detail}", None, _shap_error_table("No SHAP available."), {"error": exc.detail}
     except Exception as exc:  # pragma: no cover - UI fallback
-        return None, f"Erreur: {exc}", None, {"error": str(exc)}
-
-
-with gr.Blocks(title="Credit Scoring - Minimal Inputs") as demo:
-    gr.Markdown("# Credit Scoring - Minimal Inputs")
+        return None, f"Erreur: {exc}", None, _shap_error_table("No SHAP available."), {"error": str(exc)}
+
+
+with gr.Blocks(title="Credit scoring MLOps") as demo:
+    gr.Markdown("# Credit scoring MLOps")
+    gr.HTML("""
+    <div style="display:flex; gap:0.5rem; flex-wrap:wrap;">
+      <a href="https://github.com/stephmnt/credit-scoring-mlops/releases" target="_blank" rel="noreferrer">
+        <img src="https://img.shields.io/github/v/release/stephmnt/credit-scoring-mlops" alt="GitHub Release" />
+      </a>
+      <a href="https://github.com/stephmnt/credit-scoring-mlops/actions/workflows/deploy.yml" target="_blank" rel="noreferrer">
+        <img src="https://img.shields.io/github/actions/workflow/status/stephmnt/credit-scoring-mlops/deploy.yml" alt="GitHub Actions Workflow Status" />
+      </a>
+    </div>
+    """)
     gr.Markdown(
-        "Renseignez l'identifiant client, le montant du credit et la duree. "
-        "Les autres features proviennent des donnees clients reference."
+        "Renseignez l'identifiant client, le montant du crédit et la durée. "
     )

     with gr.Row():
-        sk_id_curr = gr.Number(label="SK_ID_CURR", precision=0, value=100001)
-        amt_credit = gr.Number(label="AMT_CREDIT", value=200000)
-        duration_months = gr.Number(label="Duree (mois)", precision=0, value=60)
+        sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
+        amt_credit = gr.Number(label="Montant du crédit", value=200000)
+        duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
         threshold = gr.Slider(label="Seuil", minimum=0.0, maximum=1.0, value=0.5, step=0.01)

     run_btn = gr.Button("Scorer")

     with gr.Row():
-        probability = gr.Number(label="Probabilite de defaut")
-        prediction = gr.Textbox(label="Decision")
-        threshold_used = gr.Number(label="Seuil utilise")
+        probability = gr.Number(label="Probabilité de défaut")
+        prediction = gr.Textbox(label="Prédiction")
+        threshold_used = gr.Number(label="Seuil utilisé")
+
+    shap_table = gr.Dataframe(
+        headers=["feature", "value", "shap_value"],
+        label="Top 10 SHAP (local)",
+        datatype=["str", "number", "number"],
+        interactive=False,
+    )

-    snapshot = gr.JSON(label="Snapshot client (reference)")
+    snapshot = gr.JSON(label="Snapshot client (référence)")

     run_btn.click(
         score_minimal,
         inputs=[sk_id_curr, amt_credit, duration_months, threshold],
-        outputs=[probability, prediction, threshold_used, snapshot],
+        outputs=[probability, prediction, threshold_used, shap_table, snapshot],
     )
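The `_extract_shap_values` helper above exists because `shap` returns different layouts depending on the explainer and model: a per-class list from older `TreeExplainer` calls, a 3-D `(n_samples, n_features, n_classes)` array from newer `Explainer` objects, or a flat vector for a single explained row. A small shape-only sketch of what the normalization does (illustrative, not the app's code path):

```python
import numpy as np

def normalize(shap_values):
    # Mirrors _extract_shap_values: pick the positive class and force 2-D output.
    if isinstance(shap_values, list):
        shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0]
    values = np.asarray(shap_values)
    if values.ndim == 3:
        values = values[:, :, 1]
    if values.ndim == 1:
        values = values.reshape(1, -1)
    return values

row = np.arange(4.0)
assert normalize([row, row]).shape == (1, 4)                           # per-class list
assert normalize(row).shape == (1, 4)                                  # flat vector
assert normalize(np.stack([row, row], axis=-1)[None]).shape == (1, 4)  # 3-D array
```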
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py CHANGED
@@ -1,3 +1 @@
-"""Expose combined ASGI app for HF Spaces default loader."""
-
-from app_entry import app, demo  # re-export for uvicorn app:app
+"""Package marker for the FastAPI app package."""
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile CHANGED
@@ -9,8 +9,9 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt

 COPY app/ app/
-COPY data/HistGB_final_model.pkl data/
-COPY artifacts/preprocessor.joblib artifacts/
+COPY app_entry.py app.py gradio_app.py ./
+COPY data/ data/
+COPY artifacts/ artifacts/

 EXPOSE 7860
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py CHANGED
@@ -1,22 +1,4 @@
-from fastapi import FastAPI
-import gradio as gr
-
-from app.main import app as api_app
-from app.main import startup_event
-from gradio_app import demo
-
-
-root_app = FastAPI()
-root_app.mount("/api", api_app)
-root_app = gr.mount_gradio_app(root_app, demo, path="/")
-
-
-@root_app.on_event("startup")
-def _startup() -> None:
-    startup_event()
-
-
-app = root_app
+from app_entry import app, demo  # re-export for HF Spaces


 if __name__ == "__main__":
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py CHANGED
@@ -1 +1,3 @@
-# Package marker for app module.
+"""Expose combined ASGI app for HF Spaces default loader."""
+
+from app_entry import app, demo  # re-export for uvicorn app:app
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py CHANGED
@@ -1113,6 +1113,16 @@ def startup_event() -> None:
     logger.info("Loading model from %s", model_path)
     app.state.model = load_model(model_path)

+    data_path = DATA_PATH
+    if not data_path.exists():
+        downloaded = _ensure_hf_asset(
+            data_path,
+            HF_CUSTOMER_REPO_ID,
+            HF_CUSTOMER_FILENAME,
+            HF_CUSTOMER_REPO_TYPE,
+        )
+        if downloaded is not None:
+            data_path = downloaded
     try:
         artifacts_path = ARTIFACTS_PATH
         if not artifacts_path.exists():
@@ -1125,7 +1135,7 @@ def startup_event() -> None:
         if downloaded is not None:
             artifacts_path = downloaded
         logger.info("Loading preprocessor artifacts from %s", artifacts_path)
-        app.state.preprocessor = load_preprocessor(DATA_PATH, artifacts_path)
+        app.state.preprocessor = load_preprocessor(data_path, artifacts_path)
     except RuntimeError as exc:
         if ALLOW_MISSING_ARTIFACTS:
             logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app_entry.py ADDED
@@ -0,0 +1,19 @@
+from fastapi import FastAPI
+import gradio as gr
+
+from app.main import app as api_app
+from app.main import startup_event
+from gradio_app import demo
+
+
+root_app = FastAPI()
+root_app.mount("/api", api_app)
+root_app = gr.mount_gradio_app(root_app, demo, path="/")
+
+
+@root_app.on_event("startup")
+def _startup() -> None:
+    startup_event()
+
+
+app = root_app
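`app_entry.py` is the piece that makes a single ASGI process serve both surfaces: the FastAPI API mounted under `/api` and the Gradio UI at `/`. A quick in-process smoke test of that layout (a sketch; assumes the model and preprocessor artifacts resolve at startup, e.g. with `ALLOW_MISSING_ARTIFACTS=1`):

```python
from fastapi.testclient import TestClient

from app_entry import app

# Entering the context manager runs the startup hook that loads
# the model and preprocessor before any request is served.
with TestClient(app) as client:
    print(client.get("/api/health").json())  # API is mounted under /api
    print(client.get("/").status_code)       # Gradio UI is served at the root
```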
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile CHANGED
@@ -14,4 +14,4 @@ COPY artifacts/preprocessor.joblib artifacts/

 EXPOSE 7860

-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "app_entry:app", "--host", "0.0.0.0", "--port", "7860"]
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md CHANGED
@@ -198,29 +198,38 @@ Example (a single dataset repo with 3 files):

 ### Live demo (ready-to-run commands)

-Start the API:
+Start the API (no UI):

 ```shell
 uvicorn app.main:app --reload --port 7860
 ```

+Start the Gradio UI + API (under `/api`):
+
+```shell
+uvicorn app_entry:app --reload --port 7860
+```
+
 Check the service (HF):

 ```shell
 BASE_URL="https://stephmnt-credit-scoring-mlops.hf.space"
-curl -s "${BASE_URL}/health"
+API_BASE="${BASE_URL}/api"
+curl -s "${API_BASE}/health"
 ```

+Note: on HF Spaces, the Gradio UI is served at the root and the API under `/api`.
+
 List the expected features (HF):

 ```shell
-curl -s "${BASE_URL}/features"
+curl -s "${API_BASE}/features"
 ```

 Score a single client (HF):

 ```shell
-curl -s -X POST "${BASE_URL}/predict?threshold=0.5" \
+curl -s -X POST "${API_BASE}/predict?threshold=0.5" \
   -H "Content-Type: application/json" \
   -d '{
     "data": {
@@ -242,7 +251,7 @@ Score several clients (batch, HF):

 ```shell
-curl -s -X POST "${BASE_URL}/predict?threshold=0.45" \
+curl -s -X POST "${API_BASE}/predict?threshold=0.45" \
   -H "Content-Type: application/json" \
   -d '{
     "data": [
@@ -279,7 +288,7 @@ Example error (missing required field, HF):

 ```shell
-curl -s -X POST "${BASE_URL}/predict" \
+curl -s -X POST "${API_BASE}/predict" \
   -H "Content-Type: application/json" \
   -d '{
     "data": {
@@ -316,13 +325,13 @@ Fetch the logs (HF):
 Configure `LOGS_ACCESS_TOKEN` in the Space secrets, then:

 ```shell
-curl -s -H "X-Logs-Token: $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
+curl -s -H "X-Logs-Token: $LOGS_ACCESS_TOKEN" "${API_BASE}/logs?tail=200"
 ```

 Alternative:

 ```shell
-curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
+curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${API_BASE}/logs?tail=200"
 ```

 After a few requests, generate the drift report:
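The same health and prediction checks can be driven from Python instead of curl (a sketch; assumes the `requests` package is installed and the Space is awake):

```python
import requests

API_BASE = "https://stephmnt-credit-scoring-mlops.hf.space/api"

# Health probe against the mounted API.
print(requests.get(f"{API_BASE}/health", timeout=30).json())

# Minimal prediction: id + amount + duration, other features from reference data.
resp = requests.post(
    f"{API_BASE}/predict-minimal",
    params={"threshold": 0.5},
    json={"sk_id_curr": 100001, "amt_credit": 200000, "duration_months": 60},
    timeout=60,
)
print(resp.status_code, resp.json())
```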
 
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitattributes CHANGED
@@ -1,35 +1,4 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
+data/HistGB_final_model.pkl filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml ADDED
@@ -0,0 +1,69 @@
+name: deploy-assets
+
+on:
+  workflow_dispatch:
+    inputs:
+      repo_id:
+        description: "HF repo id (e.g. stephmnt/assets-credit-scoring-mlops)"
+        required: true
+        default: "stephmnt/assets-credit-scoring-mlops"
+      repo_type:
+        description: "HF repo type (dataset or model)"
+        required: true
+        default: "dataset"
+
+jobs:
+  upload-assets:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install huggingface_hub
+
+      - name: Upload assets to Hugging Face Hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_REPO_ID: ${{ inputs.repo_id }}
+          HF_REPO_TYPE: ${{ inputs.repo_type }}
+        run: |
+          python - <<'PY'
+          import os
+          from pathlib import Path
+          from huggingface_hub import HfApi
+
+          repo_id = os.environ["HF_REPO_ID"]
+          repo_type = os.environ["HF_REPO_TYPE"]
+          token = os.environ["HF_TOKEN"]
+
+          files = {
+              "data/HistGB_final_model.pkl": "HistGB_final_model.pkl",
+              "artifacts/preprocessor.joblib": "preprocessor.joblib",
+              "data/data_final.parquet": "data_final.parquet",
+          }
+
+          api = HfApi()
+          for local_path, remote_name in files.items():
+              path = Path(local_path)
+              if not path.exists():
+                  raise SystemExit(f"Missing file: {path}")
+              api.upload_file(
+                  path_or_fileobj=str(path),
+                  path_in_repo=remote_name,
+                  repo_id=repo_id,
+                  repo_type=repo_type,
+                  token=token,
+                  commit_message=f"Update {remote_name}",
+              )
+          print("Assets uploaded.")
+          PY
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml CHANGED
@@ -12,6 +12,8 @@ jobs:
     steps:
       - name: Checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Set up Python
        uses: actions/setup-python@v5
@@ -47,6 +49,8 @@ jobs:
          --exclude 'logs' \
          --exclude 'reports' \
          --exclude 'screen-mlflow.png' \
+          --exclude 'data/HistGB_final_model.pkl' \
+          --exclude 'artifacts/preprocessor.joblib' \
          --exclude 'data/*.csv' \
          --exclude 'data/*.parquet' \
          ./ hf_space/
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitignore CHANGED
@@ -6,6 +6,7 @@ logs/
 reports/
 data/*
 !data/HistGB_final_model.pkl
+!data/data_final.parquet
 artifacts/*
 !artifacts/preprocessor.joblib
 .DS_Store
@@ -18,7 +19,8 @@ mlruns/
 *.code-workspace
 presentation_projet08.pptx
 rapport_projet06.md
-
+rapport_template.md
+data_final.parquet
 ## https://github.com/github/gitignore/blob/e8554d85bf62e38d6db966a50d2064ac025fd82a/Python.gitignore

 # Byte-compiled / optimized / DLL files
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py CHANGED
@@ -0,0 +1,25 @@
+from fastapi import FastAPI
+import gradio as gr
+
+from app.main import app as api_app
+from app.main import startup_event
+from gradio_app import demo
+
+
+root_app = FastAPI()
+root_app.mount("/api", api_app)
+root_app = gr.mount_gradio_app(root_app, demo, path="/")
+
+
+@root_app.on_event("startup")
+def _startup() -> None:
+    startup_event()
+
+
+app = root_app
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=7860)
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py CHANGED
@@ -41,6 +41,18 @@ LOG_INCLUDE_INPUTS = os.getenv("LOG_INCLUDE_INPUTS", "1") == "1"
 LOG_HASH_SK_ID = os.getenv("LOG_HASH_SK_ID", "0") == "1"
 MODEL_VERSION = os.getenv("MODEL_VERSION", MODEL_PATH.name)
 LOGS_ACCESS_TOKEN = os.getenv("LOGS_ACCESS_TOKEN")
+CUSTOMER_DATA_PATH = Path(os.getenv("CUSTOMER_DATA_PATH", str(DATA_PATH)))
+CUSTOMER_LOOKUP_ENABLED = os.getenv("CUSTOMER_LOOKUP_ENABLED", "1") == "1"
+CUSTOMER_LOOKUP_CACHE = os.getenv("CUSTOMER_LOOKUP_CACHE", "1") == "1"
+HF_MODEL_REPO_ID = os.getenv("HF_MODEL_REPO_ID")
+HF_MODEL_REPO_TYPE = os.getenv("HF_MODEL_REPO_TYPE", "model")
+HF_MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", MODEL_PATH.name)
+HF_PREPROCESSOR_REPO_ID = os.getenv("HF_PREPROCESSOR_REPO_ID", HF_MODEL_REPO_ID or "")
+HF_PREPROCESSOR_REPO_TYPE = os.getenv("HF_PREPROCESSOR_REPO_TYPE", HF_MODEL_REPO_TYPE)
+HF_PREPROCESSOR_FILENAME = os.getenv("HF_PREPROCESSOR_FILENAME", ARTIFACTS_PATH.name)
+HF_CUSTOMER_REPO_ID = os.getenv("HF_CUSTOMER_REPO_ID")
+HF_CUSTOMER_REPO_TYPE = os.getenv("HF_CUSTOMER_REPO_TYPE", "dataset")
+HF_CUSTOMER_FILENAME = os.getenv("HF_CUSTOMER_FILENAME", CUSTOMER_DATA_PATH.name)

 IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
 ENGINEERED_FEATURES = [
@@ -117,6 +129,13 @@ class PredictionRequest(BaseModel):
     data: dict[str, Any] | list[dict[str, Any]]


+class MinimalPredictionRequest(BaseModel):
+    sk_id_curr: int
+    amt_credit: float
+    duration_months: int | None = None
+    amt_annuity: float | None = None
+
+
 @dataclass
 class PreprocessorArtifacts:
     columns_keep: list[str]
@@ -173,6 +192,32 @@ def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
     return mapping.get(key, "Unknown")


+def _ensure_hf_asset(
+    local_path: Path,
+    repo_id: str | None,
+    filename: str,
+    repo_type: str,
+) -> Path | None:
+    if local_path.exists():
+        return local_path
+    if not repo_id:
+        return None
+    try:
+        from huggingface_hub import hf_hub_download
+    except ImportError as exc:  # pragma: no cover - optional dependency
+        raise RuntimeError("huggingface_hub is required to download remote assets.") from exc
+    local_path.parent.mkdir(parents=True, exist_ok=True)
+    return Path(
+        hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            repo_type=repo_type,
+            local_dir=str(local_path.parent),
+            local_dir_use_symlinks=False,
+        )
+    )
+
+
 def _normalize_inputs(
     df_raw: pd.DataFrame,
     preprocessor: PreprocessorArtifacts,
@@ -262,6 +307,54 @@ def _build_data_quality_records(
     return records


+def _build_minimal_record(
+    payload: MinimalPredictionRequest,
+    preprocessor: PreprocessorArtifacts,
+) -> dict[str, Any]:
+    reference = _get_customer_reference(preprocessor)
+    if reference is None:
+        raise HTTPException(
+            status_code=503,
+            detail={"message": "Customer reference data is not available."},
+        )
+    sk_id = int(payload.sk_id_curr)
+    if sk_id not in reference.index:
+        raise HTTPException(
+            status_code=404,
+            detail={"message": f"Client {sk_id} not found in reference data."},
+        )
+    record = reference.loc[sk_id].to_dict()
+    record["SK_ID_CURR"] = sk_id
+    if payload.amt_credit <= 0:
+        raise HTTPException(
+            status_code=422,
+            detail={"message": "AMT_CREDIT must be positive."},
+        )
+    record["AMT_CREDIT"] = float(payload.amt_credit)
+    if payload.amt_annuity is not None:
+        if payload.amt_annuity <= 0:
+            raise HTTPException(
+                status_code=422,
+                detail={"message": "AMT_ANNUITY must be positive."},
+            )
+        record["AMT_ANNUITY"] = float(payload.amt_annuity)
+    elif payload.duration_months is not None:
+        if payload.duration_months <= 0:
+            raise HTTPException(
+                status_code=422,
+                detail={"message": "duration_months must be positive."},
+            )
+        record["AMT_ANNUITY"] = float(payload.amt_credit) / float(payload.duration_months)
+    else:
+        raise HTTPException(
+            status_code=422,
+            detail={"message": "Provide duration_months or amt_annuity."},
+        )
+    if "AMT_GOODS_PRICE" in record:
+        record["AMT_GOODS_PRICE"] = float(payload.amt_credit)
+    return record
+
+
 def _append_log_entries(entries: list[dict[str, Any]]) -> None:
     if not LOG_PREDICTIONS:
         return
@@ -596,6 +689,41 @@ def load_model(model_path: Path):
     return pickle.load(handle)


+def _load_customer_reference(
+    data_path: Path,
+    preprocessor: PreprocessorArtifacts,
+) -> pd.DataFrame:
+    columns = list(preprocessor.input_feature_columns)
+    if "SK_ID_CURR" not in columns:
+        columns.insert(0, "SK_ID_CURR")
+    df = pd.read_parquet(data_path, columns=columns)
+    df = df.drop_duplicates(subset=["SK_ID_CURR"], keep="last").set_index("SK_ID_CURR")
+    return df
+
+
+def _get_customer_reference(preprocessor: PreprocessorArtifacts) -> pd.DataFrame | None:
+    if not CUSTOMER_LOOKUP_ENABLED:
+        return None
+    cached = getattr(app.state, "customer_reference", None)
+    if cached is not None:
+        return cached
+    data_path = CUSTOMER_DATA_PATH
+    if not data_path.exists():
+        downloaded = _ensure_hf_asset(
+            data_path,
+            HF_CUSTOMER_REPO_ID,
+            HF_CUSTOMER_FILENAME,
+            HF_CUSTOMER_REPO_TYPE,
+        )
+        if downloaded is None:
+            return None
+        data_path = downloaded
+    ref = _load_customer_reference(data_path, preprocessor)
+    if CUSTOMER_LOOKUP_CACHE:
+        app.state.customer_reference = ref
+    return ref
+
+
 def _infer_numeric_ranges_from_scaler(preprocessor: PreprocessorArtifacts) -> dict[str, tuple[float, float]]:
     ranges = {}
     scaler = getattr(preprocessor, "scaler", None)
@@ -963,19 +1091,41 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->

 @app.on_event("startup")
 def startup_event() -> None:
-    if not MODEL_PATH.exists():
+    if getattr(app.state, "model", None) is not None and getattr(app.state, "preprocessor", None) is not None:
+        return
+    model_path = MODEL_PATH
+    if not model_path.exists():
+        downloaded = _ensure_hf_asset(
+            model_path,
+            HF_MODEL_REPO_ID,
+            HF_MODEL_FILENAME,
+            HF_MODEL_REPO_TYPE,
+        )
+        if downloaded is not None:
+            model_path = downloaded
+    if not model_path.exists():
         if ALLOW_MISSING_ARTIFACTS:
-            logger.warning("Model file not found: %s. Using dummy model.", MODEL_PATH)
+            logger.warning("Model file not found: %s. Using dummy model.", model_path)
             app.state.model = DummyModel()
         else:
-            raise RuntimeError(f"Model file not found: {MODEL_PATH}")
+            raise RuntimeError(f"Model file not found: {model_path}")
     else:
-        logger.info("Loading model from %s", MODEL_PATH)
-        app.state.model = load_model(MODEL_PATH)
+        logger.info("Loading model from %s", model_path)
+        app.state.model = load_model(model_path)

     try:
-        logger.info("Loading preprocessor artifacts from %s", ARTIFACTS_PATH)
-        app.state.preprocessor = load_preprocessor(DATA_PATH, ARTIFACTS_PATH)
+        artifacts_path = ARTIFACTS_PATH
+        if not artifacts_path.exists():
+            downloaded = _ensure_hf_asset(
+                artifacts_path,
+                HF_PREPROCESSOR_REPO_ID or None,
+                HF_PREPROCESSOR_FILENAME,
+                HF_PREPROCESSOR_REPO_TYPE,
+            )
+            if downloaded is not None:
+                artifacts_path = downloaded
+        logger.info("Loading preprocessor artifacts from %s", artifacts_path)
+        app.state.preprocessor = load_preprocessor(DATA_PATH, artifacts_path)
     except RuntimeError as exc:
         if ALLOW_MISSING_ARTIFACTS:
             logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
@@ -983,6 +1133,19 @@ def startup_event() -> None:
         else:
             raise

+    app.state.customer_reference = None
+    if CUSTOMER_LOOKUP_ENABLED and CUSTOMER_LOOKUP_CACHE:
+        try:
+            ref = _get_customer_reference(app.state.preprocessor)
+            if ref is not None:
+                logger.info("Loaded customer reference data (%s rows)", len(ref))
+            else:
+                logger.warning("Customer reference data not available.")
+        except Exception as exc:  # pragma: no cover - optional cache load
+            logger.warning("Failed to load customer reference data: %s", exc)
+    elif CUSTOMER_LOOKUP_ENABLED:
+        logger.info("Customer lookup enabled without cache (on-demand load).")
+

 @app.get("/health")
 def health() -> dict[str, str]:
@@ -1063,16 +1226,11 @@ def logs(
     return Response(content="".join(lines), media_type="application/x-ndjson")


-@app.post("/predict")
-def predict(
-    payload: PredictionRequest,
-    threshold: float | None = Query(default=None, ge=0.0, le=1.0),
-) -> dict[str, Any]:
+def _predict_records(records: list[dict[str, Any]], threshold: float | None) -> dict[str, Any]:
     model = app.state.model
     preprocessor: PreprocessorArtifacts = app.state.preprocessor
     request_id = str(uuid.uuid4())
     start_time = time.perf_counter()
-    records = payload.data if isinstance(payload.data, list) else [payload.data]

     if not records:
         raise HTTPException(status_code=422, detail={"message": "No input records provided."})
@@ -1168,3 +1326,22 @@ def predict(
         error=str(exc),
     )
     raise
+
+
+@app.post("/predict")
+def predict(
+    payload: PredictionRequest,
+    threshold: float | None = Query(default=None, ge=0.0, le=1.0),
+) -> dict[str, Any]:
+    records = payload.data if isinstance(payload.data, list) else [payload.data]
+    return _predict_records(records, threshold)
+
+
+@app.post("/predict-minimal")
+def predict_minimal(
+    payload: MinimalPredictionRequest,
+    threshold: float | None = Query(default=None, ge=0.0, le=1.0),
+) -> dict[str, Any]:
+    preprocessor: PreprocessorArtifacts = app.state.preprocessor
+    record = _build_minimal_record(payload, preprocessor)
+    return _predict_records([record], threshold)
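`predict_minimal` reuses `_predict_records`, so the minimal endpoint and the full `/predict` share one inference path. An in-process call (a sketch; assumes the customer reference contains client 100001, otherwise the endpoint answers 404/503 as coded above):

```python
from fastapi.testclient import TestClient

from app.main import app

with TestClient(app) as client:  # the context manager fires startup_event
    resp = client.post(
        "/predict-minimal",
        params={"threshold": 0.5},
        json={"sk_id_curr": 100001, "amt_credit": 200000, "duration_months": 60},
    )
    print(resp.status_code, resp.json())

# With duration_months=60, _build_minimal_record derives
# AMT_ANNUITY = 200000 / 60 ≈ 3333.33.
```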
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py ADDED
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+from typing import Any
+
+import gradio as gr
+from fastapi import HTTPException
+
+from app.main import MinimalPredictionRequest, app, predict_minimal, startup_event
+
+
+def _ensure_startup() -> None:
+    if not getattr(app.state, "preprocessor", None):
+        startup_event()
+
+
+def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
+    reference = getattr(app.state, "customer_reference", None)
+    if reference is None or sk_id_curr not in reference.index:
+        return {}
+    row = reference.loc[sk_id_curr]
+    snapshot: dict[str, Any] = {"SK_ID_CURR": int(sk_id_curr)}
+    if "CODE_GENDER" in row:
+        snapshot["CODE_GENDER"] = row["CODE_GENDER"]
+    if "FLAG_OWN_CAR" in row:
+        snapshot["FLAG_OWN_CAR"] = row["FLAG_OWN_CAR"]
+    if "AMT_INCOME_TOTAL" in row:
+        snapshot["AMT_INCOME_TOTAL"] = float(row["AMT_INCOME_TOTAL"])
+    if "DAYS_BIRTH" in row:
+        snapshot["AGE_YEARS"] = round(abs(float(row["DAYS_BIRTH"])) / 365.25, 1)
+    return snapshot
+
+
+def score_minimal(
+    sk_id_curr: float,
+    amt_credit: float,
+    duration_months: float,
+    threshold: float,
+) -> tuple[float | None, str, float | None, dict[str, Any]]:
+    _ensure_startup()
+    try:
+        payload = MinimalPredictionRequest(
+            sk_id_curr=int(sk_id_curr),
+            amt_credit=float(amt_credit),
+            duration_months=int(duration_months),
+        )
+        response = predict_minimal(payload, threshold=float(threshold))
+        result = response["predictions"][0]
+        probability = float(result.get("probability", 0.0))
+        pred_value = int(result.get("prediction", 0))
+        label = "Default (1)" if pred_value == 1 else "No default (0)"
+        snapshot = _customer_snapshot(int(sk_id_curr))
+        snapshot.update(
+            {
+                "AMT_CREDIT_REQUESTED": float(amt_credit),
+                "DURATION_MONTHS": int(duration_months),
+            }
+        )
+        return probability, label, float(response.get("threshold", 0.0)), snapshot
+    except HTTPException as exc:
+        return None, f"Erreur: {exc.detail}", None, {"error": exc.detail}
+    except Exception as exc:  # pragma: no cover - UI fallback
+        return None, f"Erreur: {exc}", None, {"error": str(exc)}
+
+
+with gr.Blocks(title="Credit Scoring - Minimal Inputs") as demo:
+    gr.Markdown("# Credit Scoring - Minimal Inputs")
+    gr.Markdown(
+        "Renseignez l'identifiant client, le montant du credit et la duree. "
+        "Les autres features proviennent des donnees clients reference."
+    )
+
+    with gr.Row():
+        sk_id_curr = gr.Number(label="SK_ID_CURR", precision=0, value=100001)
+        amt_credit = gr.Number(label="AMT_CREDIT", value=200000)
+        duration_months = gr.Number(label="Duree (mois)", precision=0, value=60)
+        threshold = gr.Slider(label="Seuil", minimum=0.0, maximum=1.0, value=0.5, step=0.01)
+
+    run_btn = gr.Button("Scorer")
+
+    with gr.Row():
+        probability = gr.Number(label="Probabilite de defaut")
+        prediction = gr.Textbox(label="Decision")
+        threshold_used = gr.Number(label="Seuil utilise")
+
+    snapshot = gr.JSON(label="Snapshot client (reference)")
+
+    run_btn.click(
+        score_minimal,
+        inputs=[sk_id_curr, amt_credit, duration_months, threshold],
+        outputs=[probability, prediction, threshold_used, snapshot],
+    )
+
+
+if __name__ == "__main__":
+    _ensure_startup()
+    demo.launch()
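The UI callback can also be driven without launching Gradio, which is handy for quick local checks (a sketch; same assumptions about reference data as the endpoint itself):

```python
from gradio_app import _ensure_startup, score_minimal

_ensure_startup()  # load model + preprocessor once, as the UI would
probability, label, threshold_used, snapshot = score_minimal(
    sk_id_curr=100001, amt_credit=200000, duration_months=60, threshold=0.5
)
print(label, probability, threshold_used, snapshot)
```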
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: OCR Projet 06
+title: Credit scoring MLOps
 emoji: 🤖
 colorFrom: indigo
 colorTo: green
@@ -8,7 +8,7 @@ app_port: 7860
 pinned: false
 ---

-# OCR Projet 06 – Crédit
+# Credit scoring MLOps

 [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/stephmnt/credit-scoring-mlops/deploy.yml)](https://github.com/stephmnt/credit-scoring-mlops/actions/workflows/deploy.yml)
 [![GitHub Release Date](https://img.shields.io/github/release-date/stephmnt/credit-scoring-mlops?display_date=published_at&style=flat-square)](https://github.com/stephmnt/credit-scoring-mlops/releases)
@@ -62,24 +62,33 @@ Useful parameters (feature selection):
 - `FEATURE_SELECTION_TOP_N` (default: `8`)
 - `FEATURE_SELECTION_MIN_CORR` (default: `0.02`)

-### Poetry environment (recommended)
+### pip environment (dev)

-The `pyproject.toml` file pins versions compatible with a recent stack
-(`numpy>=2`, `pyarrow>=15`, `scikit-learn>=1.6`). The environment targets
-Python 3.11.
+Local development uses pip and `requirements.txt` (pinned versions), with
+Python 3.11+.

 ```shell
-poetry env use 3.11
-poetry install
+python3 -m venv .venv
+source .venv/bin/activate
+python -m pip install -r requirements.txt
+pytest -q
+uvicorn app.main:app --reload --port 7860
+```
+
+### Poetry environment (deliverable)
+
+The deliverable ships `pyproject.toml`, aligned with `requirements.txt`. If needed:
+
+```shell
+poetry install --with dev
 poetry run pytest -q
 poetry run uvicorn app.main:app --reload --port 7860
 ```

 Important: the `HistGB_final_model.pkl` model must be regenerated with the
-new scikit-learn version (re-run
-`P6_MANET_Stephane_notebook_modélisation.ipynb`, pickle-save cell).
-
-Note: `requirements.txt` is aligned with `pyproject.toml` (same versions).
+scikit-learn version pinned in `requirements.txt` / `pyproject.toml`
+(re-run `P6_MANET_Stephane_notebook_modélisation.ipynb`, pickle-save cell).

 ### Example input (schema + values)

@@ -123,9 +132,70 @@ Example values:
 }
 ```

+### Minimal prediction (existing client)
+
+`POST /predict-minimal` endpoint: the user supplies a client id, a credit
+amount and a duration. The remaining features come from the customer
+reference data (`CUSTOMER_DATA_PATH`, default `data/data_final.parquet`).
+If the reference is missing, the API returns 503.
+
+```shell
+curl -s -X POST "${BASE_URL}/predict-minimal" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "sk_id_curr": 100001,
+    "amt_credit": 200000,
+    "duration_months": 60
+  }'
+```
+
+Useful variables:
+
+- `CUSTOMER_LOOKUP_ENABLED=1` enables the client lookup (default: 1)
+- `CUSTOMER_DATA_PATH=data/data_final.parquet`
+- `CUSTOMER_LOOKUP_CACHE=1` keeps the reference in memory
+
+### Data contract (validation)
+
+- Strict numeric types (invalid values -> 422).
+- Numeric ranges (training min/max) are checked.
+- Categoricals normalized: `CODE_GENDER` -> {`F`, `M`}, `FLAG_OWN_CAR` -> {`Y`, `N`}.
+- The `DAYS_EMPLOYED=365243` sentinel is replaced with NaN.
+- Logs are enriched with `data_quality` to tell drift from data-quality
+  issues (see the illustrative sketch after this diff).
+
+### Gradio interface (scoring)
+
+```shell
+python gradio_app.py
+```
+
+On Hugging Face Spaces, `app.py` launches the Gradio UI automatically.
+
 Note: the API strictly validates the required fields (`/features`). To list
 every possible column: `/features?include_all=true`.

+### Hugging Face (heavy assets)
+
+Binary files (model, preprocessor, data_final) are not pushed to the Space.
+They are downloaded at runtime from the Hugging Face Hub when the following
+variables are set:
+
+- `HF_MODEL_REPO_ID` + `HF_MODEL_FILENAME` + `HF_MODEL_REPO_TYPE`
+- `HF_PREPROCESSOR_REPO_ID` + `HF_PREPROCESSOR_FILENAME` + `HF_PREPROCESSOR_REPO_TYPE`
+- `HF_CUSTOMER_REPO_ID` + `HF_CUSTOMER_FILENAME` + `HF_CUSTOMER_REPO_TYPE`
+
+Example (a single dataset repo with 3 files):
+
+- `HF_MODEL_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_MODEL_REPO_TYPE=dataset`
+- `HF_MODEL_FILENAME=HistGB_final_model.pkl`
+- `HF_PREPROCESSOR_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_PREPROCESSOR_REPO_TYPE=dataset`
+- `HF_PREPROCESSOR_FILENAME=preprocessor.joblib`
+- `HF_CUSTOMER_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_CUSTOMER_REPO_TYPE=dataset`
+- `HF_CUSTOMER_FILENAME=data_final.parquet`
+
 ### Live demo (ready-to-run commands)

 Start the API:
@@ -231,6 +301,10 @@ Useful variables:
 - `LOGS_ACCESS_TOKEN` to protect the `/logs` endpoint
 - `LOG_HASH_SK_ID=1` to anonymize `SK_ID_CURR`

+Logs include one `data_quality` block per request (missing fields, invalid
+types, out-of-range values, unknown categories, `DAYS_EMPLOYED` sentinel).
+
 Local example:

@@ -251,27 +325,70 @@ Alternative:
 curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
 ```

-Apres quelques requêtes, gélérer le rapport de drift :
+Apres quelques requêtes, générer le rapport de drift :

 ```shell
 python monitoring/drift_report.py \
   --logs logs/predictions.jsonl \
   --reference data/data_final.parquet \
-  --output-dir reports
+  --output-dir reports \
+  --min-prod-samples 200 \
+  --fdr-alpha 0.05 \
+  --prod-since "2024-01-01T00:00:00Z" \
+  --prod-until "2024-01-31T23:59:59Z"
 ```

 The HTML report is generated in `reports/drift_report.html` (with plots in
 `reports/plots/`). On Hugging Face the disk is ephemeral: download the logs
 before analyzing.

+Drift is only computed when `n_prod >= --min-prod-samples` (default 200).
+Otherwise an "insufficient sample" badge is displayed and alerts are disabled.
+
+Built-in robustness (see the PSI sketch after this diff):
+
+- Categoricals: PSI with smoothing (`--psi-eps`) + rare categories grouped (OTHER).
+- Numerics: KS corrected by FDR (Benjamini-Hochberg, `--fdr-alpha`).
+- `DAYS_EMPLOYED` sentinel: converted to NaN + rate tracked.
+
 The report also includes the distribution of predicted scores and the prediction rate
-(option `--score-bins` to adjust the number of bins).
+(option `--score-bins` to adjust the number of bins), plus a Data Quality section
+when the logs contain `data_quality` (types, NaN, out-of-range, unknown categories).
+
+To simulate sliding windows, use `--prod-since` / `--prod-until` with the
+log timestamps.
+
+Drift runbook: `docs/monitoring/runbook.md`.

 Captures (local snapshot of reporting + storage):

 - Report: `docs/monitoring/drift_report.html` + `docs/monitoring/plots/`
 - Logs storage: `docs/monitoring/logs_storage.png`

+## Profiling & Optimization (Step 4)
+
+Inference profiling and benchmark (cProfile + latency):
+
+```shell
+python profiling/profile_inference.py \
+  --sample-size 2000 \
+  --batch-size 128 \
+  --runs 3
+```
+
+Outputs:
+
+- `docs/performance/benchmark_results.json`
+- `docs/performance/profile_summary.txt`
+- Detailed report: `docs/performance/performance_report.md`
+
+Local Streamlit dashboard (monitoring + drift):
+
+```shell
+python -m streamlit run monitoring/streamlit_app.py
+```
+
 ## Release contents
@@ -282,8 +399,10 @@ Captures (local snapshot of reporting + storage):
 - **Business score + optimal threshold**: `custom_score` is the main metric of the comparison tables and CV, with a computed `best_threshold`.
 - **Explainability**: feature importance, SHAP and LIME are included.
 - **Feature selection by correlation**: top-N numerics + a small categorical set, exposed via `/features`.
-- **Monitoring & drift**: HTML report with KS/PSI + predicted-score distribution and prediction rate
-  (snapshots in `docs/monitoring/`).
+- **Gradio interface**: minimal form (client id + amount + duration) based on the customer reference data.
+- **Monitoring & drift**: HTML report with volume gating, robust PSI, KS + FDR, data quality and
+  score distribution (snapshots in `docs/monitoring/`).
+- **Profiling & optimization**: inference benchmark + cProfile profile (`docs/performance/` folder).
 - **CI/CD**: tests with coverage (`pytest-cov`), Docker build and deploy to Hugging Face Spaces.

 ![Screenshot MLFlow](https://raw.githubusercontent.com/stephmnt/credit-scoring-mlops/main/screen-mlflow.png)
@@ -304,5 +423,4 @@

 * Complete the API tests: /logs (auth OK/KO), batch predict, threshold param, missing SK_ID_CURR, outliers in test_api.py.
 * Simplify the ALLOW_MISSING_ARTIFACTS fallback and DummyModel if the artifacts are versioned (clean up main.py and conftest.py).
-* Unify dependency management (Poetry vs requirements.txt) and align pyproject.toml / requirements.txt.
 * If the assessor expects a branching strategy, create a feature branch and merge it as evidence.
 
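A small illustrative sketch of the data-contract rules listed in the README hunk above (the helper name and exact behavior are assumptions; the real validation lives in `app/main.py`):

```python
import numpy as np
import pandas as pd

DAYS_EMPLOYED_SENTINEL = 365243  # "no employment info" sentinel in the source data

def normalize_record(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # Sentinel handling: 365243 means missing, so it becomes NaN.
    out["DAYS_EMPLOYED"] = out["DAYS_EMPLOYED"].replace(DAYS_EMPLOYED_SENTINEL, np.nan)
    # Categorical normalization: anything outside the known labels is bucketed.
    out["CODE_GENDER"] = out["CODE_GENDER"].where(out["CODE_GENDER"].isin(["F", "M"]), "Unknown")
    out["FLAG_OWN_CAR"] = out["FLAG_OWN_CAR"].where(out["FLAG_OWN_CAR"].isin(["Y", "N"]), "Unknown")
    return out

sample = pd.DataFrame([{"DAYS_EMPLOYED": 365243, "CODE_GENDER": "XNA", "FLAG_OWN_CAR": "Y"}])
print(normalize_record(sample))  # DAYS_EMPLOYED -> NaN, CODE_GENDER -> Unknown
```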
167
+
168
+ ```shell
169
+ python gradio_app.py
170
+ ```
171
+
172
+ Sur Hugging Face Spaces, `app.py` lance l'UI Gradio automatiquement.
173
+
174
  Note : l'API valide strictement les champs requis (`/features`). Pour afficher
175
  toutes les colonnes possibles : `/features?include_all=true`.
176
 
177
+ ### Hugging Face (assets lourds)
178
+
179
+ Les fichiers binaires (modele, preprocessor, data_final) ne sont pas pushes
180
+ dans le Space. Ils sont telecharges a l'execution via Hugging Face Hub si les
181
+ variables suivantes sont definies :
182
+
183
+ - `HF_MODEL_REPO_ID` + `HF_MODEL_FILENAME` + `HF_MODEL_REPO_TYPE`
184
+ - `HF_PREPROCESSOR_REPO_ID` + `HF_PREPROCESSOR_FILENAME` + `HF_PREPROCESSOR_REPO_TYPE`
185
+ - `HF_CUSTOMER_REPO_ID` + `HF_CUSTOMER_FILENAME` + `HF_CUSTOMER_REPO_TYPE`
186
+
187
+ Exemple (un seul repo dataset avec 3 fichiers) :
188
+
189
+ - `HF_MODEL_REPO_ID=stephmnt/credit-scoring-mlops-assets`
190
+ - `HF_MODEL_REPO_TYPE=dataset`
191
+ - `HF_MODEL_FILENAME=HistGB_final_model.pkl`
192
+ - `HF_PREPROCESSOR_REPO_ID=stephmnt/credit-scoring-mlops-assets`
193
+ - `HF_PREPROCESSOR_REPO_TYPE=dataset`
194
+ - `HF_PREPROCESSOR_FILENAME=preprocessor.joblib`
195
+ - `HF_CUSTOMER_REPO_ID=stephmnt/credit-scoring-mlops-assets`
196
+ - `HF_CUSTOMER_REPO_TYPE=dataset`
197
+ - `HF_CUSTOMER_FILENAME=data_final.parquet`
198
+
199
  ### Demo live (commandes cles en main)
200
 
201
  Lancer l'API :
 
301
  - `LOGS_ACCESS_TOKEN` pour proteger l'endpoint `/logs`
302
  - `LOG_HASH_SK_ID=1` pour anonymiser `SK_ID_CURR`
303
 
304
+ Les logs incluent un bloc `data_quality` par requete (champs manquants,
305
+ types invalides, out-of-range, categories inconnues, sentinelle
306
+ `DAYS_EMPLOYED`).
307
+
308
  Exemple local :
309
 
310
  ```shell
 
325
  curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
326
  ```
327
 
328
+ Apres quelques requêtes, générer le rapport de drift :
329
 
330
  ```shell
331
  python monitoring/drift_report.py \
332
  --logs logs/predictions.jsonl \
333
  --reference data/data_final.parquet \
334
+ --output-dir reports \
335
+ --min-prod-samples 200 \
336
+ --fdr-alpha 0.05 \
337
+ --prod-since "2024-01-01T00:00:00Z" \
338
+ --prod-until "2024-01-31T23:59:59Z"
339
  ```
340
 
341
  Le rapport HTML est généré dans `reports/drift_report.html` (avec des plots dans
342
  `reports/plots/`). Sur Hugging Face, le disque est éphemère : télécharger les logs
343
  avant d'analyser.
344
 
345
+ Le drift est calcule uniquement si `n_prod >= --min-prod-samples` (defaut 200).
346
+ Sinon, un badge "Sample insuffisant" est affiche et les alertes sont desactivees.
347
+
348
+ Robustesse integree:
349
+
350
+ - Categoriels: PSI avec lissage (`--psi-eps`) + categories rares regroupees (OTHER).
351
+ - Numeriques: KS corrige par FDR (Benjamini-Hochberg, `--fdr-alpha`).
352
+ - Sentinel `DAYS_EMPLOYED`: converti en NaN + taux suivi.
353
+
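A PSI-with-smoothing sketch matching the robustness notes above (illustrative; `monitoring/drift_report.py` holds the real implementation and may differ in details):

```python
import numpy as np

def psi(ref_counts: np.ndarray, prod_counts: np.ndarray, eps: float = 1e-4) -> float:
    # Convert counts to proportions and add epsilon so empty bins stay finite,
    # then renormalize; PSI = sum((prod - ref) * ln(prod / ref)).
    ref = ref_counts / ref_counts.sum() + eps
    prod = prod_counts / prod_counts.sum() + eps
    ref, prod = ref / ref.sum(), prod / prod.sum()
    return float(np.sum((prod - ref) * np.log(prod / ref)))

# Without smoothing, the last bin (empty in the reference) would make PSI infinite.
print(psi(np.array([80.0, 15.0, 5.0, 0.0]), np.array([70.0, 20.0, 5.0, 5.0])))
```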
 The report also includes the predicted-score distribution and the prediction rate
+ (use `--score-bins` to adjust the number of bins), plus a Data Quality section
+ when the logs contain `data_quality` (types, NaN, out-of-range values, unknown
+ categories).
+
+ To simulate sliding windows, pass `--prod-since` / `--prod-until` matching the
+ log timestamps.
+
+ Drift runbook: `docs/monitoring/runbook.md`.

 Screenshots (local snapshot of the reporting + storage):

 - Report: `docs/monitoring/drift_report.html` + `docs/monitoring/plots/`
 - Log storage: `docs/monitoring/logs_storage.png`

+ ## Profiling & Optimization (Step 4)
+
+ Inference profiling and benchmark (cProfile + latency):
+
+ ```shell
+ python profiling/profile_inference.py \
+   --sample-size 2000 \
+   --batch-size 128 \
+   --runs 3
+ ```
+
+ Outputs (a sketch of the harness follows):
+
+ - `docs/performance/benchmark_results.json`
+ - `docs/performance/profile_summary.txt`
+ - Detailed report: `docs/performance/performance_report.md`
+
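+ A minimal sketch of what such a harness measures, assuming a hypothetical
+ `predict` callable; it is not the exact `profile_inference.py` code:
+
+ ```python
+ # Hypothetical harness: batched latency benchmark + cProfile capture.
+ import cProfile
+ import io
+ import pstats
+ import time
+
+ import numpy as np
+
+ def benchmark(predict, X, batch_size=128, runs=3):
+     """Time predict() over batches; return p50/p95 latency in milliseconds."""
+     latencies_ms = []
+     for _ in range(runs):
+         for start in range(0, len(X), batch_size):
+             t0 = time.perf_counter()
+             predict(X[start:start + batch_size])
+             latencies_ms.append((time.perf_counter() - t0) * 1000.0)
+     lat = np.asarray(latencies_ms)
+     return {"p50_ms": float(np.percentile(lat, 50)),
+             "p95_ms": float(np.percentile(lat, 95))}
+
+ def profile_once(predict, X, top=15):
+     """Run predict() under cProfile and return the top cumulative-time lines."""
+     profiler = cProfile.Profile()
+     profiler.enable()
+     predict(X)
+     profiler.disable()
+     buffer = io.StringIO()
+     pstats.Stats(profiler, stream=buffer).sort_stats("cumulative").print_stats(top)
+     return buffer.getvalue()
+ ```
+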
+ Local Streamlit dashboard (monitoring + drift):
+
+ ```shell
+ python -m streamlit run monitoring/streamlit_app.py
+ ```
+
 ## Release contents

 - **Preparation + pipeline**: cleaning/preparation, encoding, imputation, and the training pipeline are included.
…
 - **Business score + optimal threshold**: the `custom_score` is the main metric in the comparison tables and cross-validation, with a computed `best_threshold`.
 - **Explainability**: feature importance, SHAP, and LIME are included.
 - **Feature selection by correlation**: top-N numerics + a small categorical set, exposed via `/features`.
+ - **Gradio interface**: minimal form (client id + amount + duration) built on the customer reference table.
+ - **Monitoring & drift**: HTML report with volume gating, robust PSI, KS + FDR, data quality, and
+   predicted-score distribution (snapshots in `docs/monitoring/`).
+ - **Profiling & optimization**: inference benchmark + cProfile profile (see `docs/performance/`).
 - **CI/CD**: tests with coverage (`pytest-cov`), Docker build, and deploy to Hugging Face Spaces.

  ![Screenshot MLFlow](https://raw.githubusercontent.com/stephmnt/credit-scoring-mlops/main/screen-mlflow.png)
 
…

 * Complete the API tests: /logs (auth OK/KO), batch predict, the threshold parameter, missing SK_ID_CURR, and outliers in test_api.py.
 * Simplify the ALLOW_MISSING_ARTIFACTS fallback and DummyModel if the artifacts are versioned (clean up main.py and conftest.py).

 * If the evaluator expects a branching strategy, create a feature branch and merge it as evidence.
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/README.md ADDED
@@ -0,0 +1,13 @@
+ # Monitoring Captures
+
+ These files are snapshot artifacts for the monitoring deliverable.
+
+ - drift_report.html: report generated by monitoring/drift_report.py (sample-size 5000).
+ - runbook.md: triage steps and actions when a drift alert appears.
+ - plots/: feature drift plots + score distribution + prediction rate.
+ - predictions_sample.jsonl: sanitized example of production logs.
+ - logs_storage.png: snapshot of the logging storage format.
+
+ Notes:
+ - Drift alerts are gated by minimum production volume (see report badge).
+ - Data quality metrics appear when logs include `data_quality`.
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/drift_report.html ADDED
@@ -0,0 +1,140 @@
+ <!doctype html>
+ <html>
+ <head>
+ <meta charset="utf-8" />
+ <title>Drift Report</title>
+ <style>
+ body { font-family: Arial, sans-serif; margin: 24px; }
+ table { border-collapse: collapse; width: 100%; }
+ th, td { border: 1px solid #ddd; padding: 8px; }
+ th { background: #f3f3f3; }
+ img { max-width: 720px; }
+ </style>
+ </head>
+ <body>
+ <h2>Production Monitoring Summary</h2>
+ <ul>
+ <li>Total calls: 1</li>
+ <li>Error rate: 0.00%</li>
+ <li>Latency p50: 82.04 ms</li>
+ <li>Latency p95: 82.04 ms</li>
+ </ul>
+ <h2>Score Monitoring</h2>
+ <ul>
+ <li>Score mean: 0.3755</li>
+ <li>Score p50: 0.3755</li>
+ <li>Score p95: 0.3755</li>
+ <li>Score min: 0.3755</li>
+ <li>Score max: 0.3755</li>
+ <li>Predicted default rate: 0.00%</li>
+ </ul>
+ <img src='plots/score_distribution.png' />
+ <img src='plots/prediction_rate.png' />
+ <h2>Data Drift Summary</h2>
+ <table border="1" class="dataframe">
+ <thead>
+ <tr style="text-align: right;">
+ <th>feature</th>
+ <th>type</th>
+ <th>ks_stat</th>
+ <th>p_value</th>
+ <th>drift_detected</th>
+ <th>psi</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>EXT_SOURCE_2</td>
+ <td>numeric</td>
+ <td>0.5905</td>
+ <td>0.819238</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>EXT_SOURCE_3</td>
+ <td>numeric</td>
+ <td>0.9047</td>
+ <td>0.191111</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>AMT_ANNUITY</td>
+ <td>numeric</td>
+ <td>0.5184</td>
+ <td>0.963407</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>EXT_SOURCE_1</td>
+ <td>numeric</td>
+ <td>0.5822</td>
+ <td>0.836199</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>CODE_GENDER</td>
+ <td>categorical</td>
+ <td>NaN</td>
+ <td>NaN</td>
+ <td>True</td>
+ <td>9.6538</td>
+ </tr>
+ <tr>
+ <td>DAYS_EMPLOYED</td>
+ <td>numeric</td>
+ <td>0.6508</td>
+ <td>0.698660</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>AMT_CREDIT</td>
+ <td>numeric</td>
+ <td>0.5996</td>
+ <td>0.801040</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>AMT_GOODS_PRICE</td>
+ <td>numeric</td>
+ <td>0.6115</td>
+ <td>0.777177</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>DAYS_BIRTH</td>
+ <td>numeric</td>
+ <td>0.9474</td>
+ <td>0.105579</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>FLAG_OWN_CAR</td>
+ <td>categorical</td>
+ <td>NaN</td>
+ <td>NaN</td>
+ <td>True</td>
+ <td>4.3985</td>
+ </tr>
+ </tbody>
+ </table>
+ <h2>Feature Distributions</h2>
+ <h4>EXT_SOURCE_2</h4><img src='plots/EXT_SOURCE_2.png' />
+ <h4>EXT_SOURCE_3</h4><img src='plots/EXT_SOURCE_3.png' />
+ <h4>AMT_ANNUITY</h4><img src='plots/AMT_ANNUITY.png' />
+ <h4>EXT_SOURCE_1</h4><img src='plots/EXT_SOURCE_1.png' />
+ <h4>CODE_GENDER</h4><img src='plots/CODE_GENDER.png' />
+ <h4>DAYS_EMPLOYED</h4><img src='plots/DAYS_EMPLOYED.png' />
+ <h4>AMT_CREDIT</h4><img src='plots/AMT_CREDIT.png' />
+ <h4>AMT_GOODS_PRICE</h4><img src='plots/AMT_GOODS_PRICE.png' />
+ <h4>DAYS_BIRTH</h4><img src='plots/DAYS_BIRTH.png' />
+ <h4>FLAG_OWN_CAR</h4><img src='plots/FLAG_OWN_CAR.png' />
+ </body>
+ </html>
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/logs_storage.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_ANNUITY.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_CREDIT.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_GOODS_PRICE.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/CODE_GENDER.png ADDED