Commit fda0b63 · Parent: 45235e6
Committed by GitHub Actions

Auto-deploy from GitHub Actions

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. app/__init__.py +1 -3
  2. hf_space/Dockerfile +3 -2
  3. hf_space/hf_space/app.py +1 -19
  4. hf_space/hf_space/app/__init__.py +3 -1
  5. hf_space/hf_space/app/main.py +11 -1
  6. hf_space/hf_space/app_entry.py +19 -0
  7. hf_space/hf_space/hf_space/Dockerfile +1 -1
  8. hf_space/hf_space/hf_space/README.md +17 -8
  9. hf_space/hf_space/hf_space/hf_space/.gitattributes +2 -33
  10. hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +69 -0
  11. hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml +4 -0
  12. hf_space/hf_space/hf_space/hf_space/.gitignore +3 -1
  13. hf_space/hf_space/hf_space/hf_space/app.py +25 -0
  14. hf_space/hf_space/hf_space/hf_space/app/main.py +190 -13
  15. hf_space/hf_space/hf_space/hf_space/gradio_app.py +96 -0
  16. hf_space/hf_space/hf_space/hf_space/hf_space/README.md +136 -18
  17. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/README.md +13 -0
  18. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/drift_report.html +140 -0
  19. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/logs_storage.png +0 -0
  20. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_ANNUITY.png +0 -0
  21. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_CREDIT.png +0 -0
  22. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_GOODS_PRICE.png +0 -0
  23. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/CODE_GENDER.png +0 -0
  24. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/DAYS_BIRTH.png +0 -0
  25. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/DAYS_EMPLOYED.png +0 -0
  26. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/EXT_SOURCE_1.png +0 -0
  27. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/EXT_SOURCE_2.png +0 -0
  28. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/EXT_SOURCE_3.png +0 -0
  29. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/FLAG_OWN_CAR.png +0 -0
  30. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/prediction_rate.png +0 -0
  31. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/score_distribution.png +0 -0
  32. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/predictions_sample.jsonl +2 -0
  33. hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/runbook.md +28 -0
  34. hf_space/hf_space/hf_space/hf_space/hf_space/docs/performance/benchmark_results.json +20 -0
  35. hf_space/hf_space/hf_space/hf_space/hf_space/docs/performance/performance_report.md +50 -0
  36. hf_space/hf_space/hf_space/hf_space/hf_space/docs/performance/profile_summary.txt +38 -0
  37. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml +7 -5
  38. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitignore +2 -0
  39. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +374 -32
  40. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +141 -25
  41. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.dockerignore +6 -0
  42. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml +54 -0
  43. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitignore +194 -0
  44. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/LICENSE +8 -0
  45. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py +1 -0
  46. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +828 -0
  47. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitattributes +35 -0
  48. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile +17 -0
  49. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +192 -0
  50. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +0 -0
app/__init__.py CHANGED
@@ -1,3 +1 @@
-"""Expose combined ASGI app for HF Spaces default loader."""
-
-from app_entry import app, demo  # re-export for uvicorn app:app
+"""Package marker for the FastAPI app package."""
 
hf_space/Dockerfile CHANGED
@@ -9,8 +9,9 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
 COPY app/ app/
-COPY data/HistGB_final_model.pkl data/
-COPY artifacts/preprocessor.joblib artifacts/
+COPY app_entry.py app.py gradio_app.py ./
+COPY data/ data/
+COPY artifacts/ artifacts/
 
 EXPOSE 7860
 
hf_space/hf_space/app.py CHANGED
@@ -1,22 +1,4 @@
-from fastapi import FastAPI
-import gradio as gr
-
-from app.main import app as api_app
-from app.main import startup_event
-from gradio_app import demo
-
-
-root_app = FastAPI()
-root_app.mount("/api", api_app)
-root_app = gr.mount_gradio_app(root_app, demo, path="/")
-
-
-@root_app.on_event("startup")
-def _startup() -> None:
-    startup_event()
-
-
-app = root_app
+from app_entry import app, demo  # re-export for HF Spaces
 
 
 if __name__ == "__main__":
hf_space/hf_space/app/__init__.py CHANGED
@@ -1 +1,3 @@
-# Package marker for app module.
+"""Expose combined ASGI app for HF Spaces default loader."""
+
+from app_entry import app, demo  # re-export for uvicorn app:app
hf_space/hf_space/app/main.py CHANGED
@@ -1113,6 +1113,16 @@ def startup_event() -> None:
     logger.info("Loading model from %s", model_path)
     app.state.model = load_model(model_path)
 
+    data_path = DATA_PATH
+    if not data_path.exists():
+        downloaded = _ensure_hf_asset(
+            data_path,
+            HF_CUSTOMER_REPO_ID,
+            HF_CUSTOMER_FILENAME,
+            HF_CUSTOMER_REPO_TYPE,
+        )
+        if downloaded is not None:
+            data_path = downloaded
     try:
         artifacts_path = ARTIFACTS_PATH
         if not artifacts_path.exists():
@@ -1125,7 +1135,7 @@ def startup_event() -> None:
         if downloaded is not None:
             artifacts_path = downloaded
         logger.info("Loading preprocessor artifacts from %s", artifacts_path)
-        app.state.preprocessor = load_preprocessor(DATA_PATH, artifacts_path)
+        app.state.preprocessor = load_preprocessor(data_path, artifacts_path)
     except RuntimeError as exc:
         if ALLOW_MISSING_ARTIFACTS:
             logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
hf_space/hf_space/app_entry.py ADDED
@@ -0,0 +1,19 @@
+from fastapi import FastAPI
+import gradio as gr
+
+from app.main import app as api_app
+from app.main import startup_event
+from gradio_app import demo
+
+
+root_app = FastAPI()
+root_app.mount("/api", api_app)
+root_app = gr.mount_gradio_app(root_app, demo, path="/")
+
+
+@root_app.on_event("startup")
+def _startup() -> None:
+    startup_event()
+
+
+app = root_app
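The new `app_entry.py` is what lets one container serve both surfaces: the Gradio UI at the root and the FastAPI API under `/api`. Here is a minimal, self-contained sketch of the same mount pattern, assuming only `fastapi` and `httpx` are installed; the endpoint names are illustrative, not the project's:

```python
# Sketch of the app_entry.py mount pattern: a sub-app under /api, root left
# free for the UI. Assumes fastapi + httpx (for TestClient).
from fastapi import FastAPI
from fastapi.testclient import TestClient

api_app = FastAPI()


@api_app.get("/health")
def health() -> dict[str, str]:
    return {"status": "ok"}


root_app = FastAPI()
root_app.mount("/api", api_app)  # every API route now lives under /api

client = TestClient(root_app)
assert client.get("/api/health").json() == {"status": "ok"}
assert client.get("/health").status_code == 404  # moved off the root
```

This relocation is why the README diffs below switch the curl examples from `${BASE_URL}` to `${API_BASE}`.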
hf_space/hf_space/hf_space/Dockerfile CHANGED
@@ -14,4 +14,4 @@ COPY artifacts/preprocessor.joblib artifacts/
 
 EXPOSE 7860
 
-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "app_entry:app", "--host", "0.0.0.0", "--port", "7860"]
hf_space/hf_space/hf_space/README.md CHANGED
@@ -198,29 +198,38 @@ Example (a single dataset repo with 3 files):
 
 ### Live demo (ready-to-run commands)
 
-Start the API:
+Start the API (no UI):
 
 ```shell
 uvicorn app.main:app --reload --port 7860
 ```
 
+Start the Gradio UI + API (under the `/api` path):
+
+```shell
+uvicorn app_entry:app --reload --port 7860
+```
+
 Check the service (HF):
 
 ```shell
 BASE_URL="https://stephmnt-credit-scoring-mlops.hf.space"
-curl -s "${BASE_URL}/health"
+API_BASE="${BASE_URL}/api"
+curl -s "${API_BASE}/health"
 ```
 
+Note: on HF Spaces, the Gradio UI sits at the root and the API lives under `/api`.
+
 List the expected features (HF):
 
 ```shell
-curl -s "${BASE_URL}/features"
+curl -s "${API_BASE}/features"
 ```
 
 Score a single client (HF):
 
 ```shell
-curl -s -X POST "${BASE_URL}/predict?threshold=0.5" \
+curl -s -X POST "${API_BASE}/predict?threshold=0.5" \
   -H "Content-Type: application/json" \
   -d '{
     "data": {
@@ -242,7 +251,7 @@ curl -s -X POST "${BASE_URL}/predict?threshold=0.5" \
 Score several clients (batch, HF):
 
 ```shell
-curl -s -X POST "${BASE_URL}/predict?threshold=0.45" \
+curl -s -X POST "${API_BASE}/predict?threshold=0.45" \
   -H "Content-Type: application/json" \
   -d '{
     "data": [
@@ -279,7 +288,7 @@ curl -s -X POST "${BASE_URL}/predict?threshold=0.45" \
 Error example (missing required field, HF):
 
 ```shell
-curl -s -X POST "${BASE_URL}/predict" \
+curl -s -X POST "${API_BASE}/predict" \
   -H "Content-Type: application/json" \
   -d '{
     "data": {
@@ -316,13 +325,13 @@ Fetch the logs (HF):
 Set `LOGS_ACCESS_TOKEN` in the Space secrets, then:
 
 ```shell
-curl -s -H "X-Logs-Token: $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
+curl -s -H "X-Logs-Token: $LOGS_ACCESS_TOKEN" "${API_BASE}/logs?tail=200"
 ```
 
 Alternative:
 
 ```shell
-curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
+curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${API_BASE}/logs?tail=200"
 ```
 
 After a few requests, generate the drift report:
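For readers who prefer Python over curl, a hedged equivalent of the calls above (assuming `requests` is installed; the payload is deliberately truncated, so a real call must include every field listed by `/features`):

```python
# Python equivalent of the curl examples above; assumes `requests` installed.
import requests

BASE_URL = "https://stephmnt-credit-scoring-mlops.hf.space"
API_BASE = f"{BASE_URL}/api"

print(requests.get(f"{API_BASE}/health", timeout=30).json())

# Truncated payload: the API validates required fields strictly, so expect
# a 422 error response unless every feature from GET /features is supplied.
resp = requests.post(
    f"{API_BASE}/predict",
    params={"threshold": 0.5},
    json={"data": {"SK_ID_CURR": 100002, "AMT_CREDIT": 406597.5}},
    timeout=30,
)
print(resp.status_code, resp.json())
```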
hf_space/hf_space/hf_space/hf_space/.gitattributes CHANGED
@@ -1,35 +1,4 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
+data/HistGB_final_model.pkl filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml ADDED
@@ -0,0 +1,69 @@
+name: deploy-assets
+
+on:
+  workflow_dispatch:
+    inputs:
+      repo_id:
+        description: "HF repo id (e.g. stephmnt/assets-credit-scoring-mlops)"
+        required: true
+        default: "stephmnt/assets-credit-scoring-mlops"
+      repo_type:
+        description: "HF repo type (dataset or model)"
+        required: true
+        default: "dataset"
+
+jobs:
+  upload-assets:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install huggingface_hub
+
+      - name: Upload assets to Hugging Face Hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_REPO_ID: ${{ inputs.repo_id }}
+          HF_REPO_TYPE: ${{ inputs.repo_type }}
+        run: |
+          python - <<'PY'
+          import os
+          from pathlib import Path
+          from huggingface_hub import HfApi
+
+          repo_id = os.environ["HF_REPO_ID"]
+          repo_type = os.environ["HF_REPO_TYPE"]
+          token = os.environ["HF_TOKEN"]
+
+          files = {
+              "data/HistGB_final_model.pkl": "HistGB_final_model.pkl",
+              "artifacts/preprocessor.joblib": "preprocessor.joblib",
+              "data/data_final.parquet": "data_final.parquet",
+          }
+
+          api = HfApi()
+          for local_path, remote_name in files.items():
+              path = Path(local_path)
+              if not path.exists():
+                  raise SystemExit(f"Missing file: {path}")
+              api.upload_file(
+                  path_or_fileobj=str(path),
+                  path_in_repo=remote_name,
+                  repo_id=repo_id,
+                  repo_type=repo_type,
+                  token=token,
+                  commit_message=f"Update {remote_name}",
+              )
+          print("Assets uploaded.")
+          PY
hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml CHANGED
@@ -12,6 +12,8 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+        with:
+          lfs: true
 
       - name: Set up Python
         uses: actions/setup-python@v5
@@ -47,6 +49,8 @@ jobs:
             --exclude 'logs' \
             --exclude 'reports' \
             --exclude 'screen-mlflow.png' \
+            --exclude 'data/HistGB_final_model.pkl' \
+            --exclude 'artifacts/preprocessor.joblib' \
             --exclude 'data/*.csv' \
             --exclude 'data/*.parquet' \
             ./ hf_space/
hf_space/hf_space/hf_space/hf_space/.gitignore CHANGED
@@ -6,6 +6,7 @@ logs/
 reports/
 data/*
 !data/HistGB_final_model.pkl
+!data/data_final.parquet
 artifacts/*
 !artifacts/preprocessor.joblib
 .DS_Store
@@ -18,7 +19,8 @@ mlruns/
 *.code-workspace
 presentation_projet08.pptx
 rapport_projet06.md
-
+rapport_template.md
+data_final.parquet
 ## https://github.com/github/gitignore/blob/e8554d85bf62e38d6db966a50d2064ac025fd82a/Python.gitignore
 
 # Byte-compiled / optimized / DLL files
hf_space/hf_space/hf_space/hf_space/app.py CHANGED
@@ -0,0 +1,25 @@
+from fastapi import FastAPI
+import gradio as gr
+
+from app.main import app as api_app
+from app.main import startup_event
+from gradio_app import demo
+
+
+root_app = FastAPI()
+root_app.mount("/api", api_app)
+root_app = gr.mount_gradio_app(root_app, demo, path="/")
+
+
+@root_app.on_event("startup")
+def _startup() -> None:
+    startup_event()
+
+
+app = root_app
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=7860)
hf_space/hf_space/hf_space/hf_space/app/main.py CHANGED
@@ -41,6 +41,18 @@ LOG_INCLUDE_INPUTS = os.getenv("LOG_INCLUDE_INPUTS", "1") == "1"
41
  LOG_HASH_SK_ID = os.getenv("LOG_HASH_SK_ID", "0") == "1"
42
  MODEL_VERSION = os.getenv("MODEL_VERSION", MODEL_PATH.name)
43
  LOGS_ACCESS_TOKEN = os.getenv("LOGS_ACCESS_TOKEN")
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
46
  ENGINEERED_FEATURES = [
@@ -117,6 +129,13 @@ class PredictionRequest(BaseModel):
117
  data: dict[str, Any] | list[dict[str, Any]]
118
 
119
 
 
 
 
 
 
 
 
120
  @dataclass
121
  class PreprocessorArtifacts:
122
  columns_keep: list[str]
@@ -173,6 +192,32 @@ def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
173
  return mapping.get(key, "Unknown")
174
 
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  def _normalize_inputs(
177
  df_raw: pd.DataFrame,
178
  preprocessor: PreprocessorArtifacts,
@@ -262,6 +307,54 @@ def _build_data_quality_records(
262
  return records
263
 
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  def _append_log_entries(entries: list[dict[str, Any]]) -> None:
266
  if not LOG_PREDICTIONS:
267
  return
@@ -596,6 +689,41 @@ def load_model(model_path: Path):
596
  return pickle.load(handle)
597
 
598
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
  def _infer_numeric_ranges_from_scaler(preprocessor: PreprocessorArtifacts) -> dict[str, tuple[float, float]]:
600
  ranges = {}
601
  scaler = getattr(preprocessor, "scaler", None)
@@ -963,19 +1091,41 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->
963
 
964
  @app.on_event("startup")
965
  def startup_event() -> None:
966
- if not MODEL_PATH.exists():
 
 
 
 
 
 
 
 
 
 
 
 
967
  if ALLOW_MISSING_ARTIFACTS:
968
- logger.warning("Model file not found: %s. Using dummy model.", MODEL_PATH)
969
  app.state.model = DummyModel()
970
  else:
971
- raise RuntimeError(f"Model file not found: {MODEL_PATH}")
972
  else:
973
- logger.info("Loading model from %s", MODEL_PATH)
974
- app.state.model = load_model(MODEL_PATH)
975
 
976
  try:
977
- logger.info("Loading preprocessor artifacts from %s", ARTIFACTS_PATH)
978
- app.state.preprocessor = load_preprocessor(DATA_PATH, ARTIFACTS_PATH)
 
 
 
 
 
 
 
 
 
 
979
  except RuntimeError as exc:
980
  if ALLOW_MISSING_ARTIFACTS:
981
  logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
@@ -983,6 +1133,19 @@ def startup_event() -> None:
983
  else:
984
  raise
985
 
 
 
 
 
 
 
 
 
 
 
 
 
 
986
 
987
  @app.get("/health")
988
  def health() -> dict[str, str]:
@@ -1063,16 +1226,11 @@ def logs(
1063
  return Response(content="".join(lines), media_type="application/x-ndjson")
1064
 
1065
 
1066
- @app.post("/predict")
1067
- def predict(
1068
- payload: PredictionRequest,
1069
- threshold: float | None = Query(default=None, ge=0.0, le=1.0),
1070
- ) -> dict[str, Any]:
1071
  model = app.state.model
1072
  preprocessor: PreprocessorArtifacts = app.state.preprocessor
1073
  request_id = str(uuid.uuid4())
1074
  start_time = time.perf_counter()
1075
- records = payload.data if isinstance(payload.data, list) else [payload.data]
1076
 
1077
  if not records:
1078
  raise HTTPException(status_code=422, detail={"message": "No input records provided."})
@@ -1168,3 +1326,22 @@ def predict(
1168
  error=str(exc),
1169
  )
1170
  raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  LOG_HASH_SK_ID = os.getenv("LOG_HASH_SK_ID", "0") == "1"
42
  MODEL_VERSION = os.getenv("MODEL_VERSION", MODEL_PATH.name)
43
  LOGS_ACCESS_TOKEN = os.getenv("LOGS_ACCESS_TOKEN")
44
+ CUSTOMER_DATA_PATH = Path(os.getenv("CUSTOMER_DATA_PATH", str(DATA_PATH)))
45
+ CUSTOMER_LOOKUP_ENABLED = os.getenv("CUSTOMER_LOOKUP_ENABLED", "1") == "1"
46
+ CUSTOMER_LOOKUP_CACHE = os.getenv("CUSTOMER_LOOKUP_CACHE", "1") == "1"
47
+ HF_MODEL_REPO_ID = os.getenv("HF_MODEL_REPO_ID")
48
+ HF_MODEL_REPO_TYPE = os.getenv("HF_MODEL_REPO_TYPE", "model")
49
+ HF_MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", MODEL_PATH.name)
50
+ HF_PREPROCESSOR_REPO_ID = os.getenv("HF_PREPROCESSOR_REPO_ID", HF_MODEL_REPO_ID or "")
51
+ HF_PREPROCESSOR_REPO_TYPE = os.getenv("HF_PREPROCESSOR_REPO_TYPE", HF_MODEL_REPO_TYPE)
52
+ HF_PREPROCESSOR_FILENAME = os.getenv("HF_PREPROCESSOR_FILENAME", ARTIFACTS_PATH.name)
53
+ HF_CUSTOMER_REPO_ID = os.getenv("HF_CUSTOMER_REPO_ID")
54
+ HF_CUSTOMER_REPO_TYPE = os.getenv("HF_CUSTOMER_REPO_TYPE", "dataset")
55
+ HF_CUSTOMER_FILENAME = os.getenv("HF_CUSTOMER_FILENAME", CUSTOMER_DATA_PATH.name)
56
 
57
  IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
58
  ENGINEERED_FEATURES = [
 
129
  data: dict[str, Any] | list[dict[str, Any]]
130
 
131
 
132
+ class MinimalPredictionRequest(BaseModel):
133
+ sk_id_curr: int
134
+ amt_credit: float
135
+ duration_months: int | None = None
136
+ amt_annuity: float | None = None
137
+
138
+
139
  @dataclass
140
  class PreprocessorArtifacts:
141
  columns_keep: list[str]
 
192
  return mapping.get(key, "Unknown")
193
 
194
 
195
+ def _ensure_hf_asset(
196
+ local_path: Path,
197
+ repo_id: str | None,
198
+ filename: str,
199
+ repo_type: str,
200
+ ) -> Path | None:
201
+ if local_path.exists():
202
+ return local_path
203
+ if not repo_id:
204
+ return None
205
+ try:
206
+ from huggingface_hub import hf_hub_download
207
+ except ImportError as exc: # pragma: no cover - optional dependency
208
+ raise RuntimeError("huggingface_hub is required to download remote assets.") from exc
209
+ local_path.parent.mkdir(parents=True, exist_ok=True)
210
+ return Path(
211
+ hf_hub_download(
212
+ repo_id=repo_id,
213
+ filename=filename,
214
+ repo_type=repo_type,
215
+ local_dir=str(local_path.parent),
216
+ local_dir_use_symlinks=False,
217
+ )
218
+ )
219
+
220
+
221
  def _normalize_inputs(
222
  df_raw: pd.DataFrame,
223
  preprocessor: PreprocessorArtifacts,
 
307
  return records
308
 
309
 
310
+ def _build_minimal_record(
311
+ payload: MinimalPredictionRequest,
312
+ preprocessor: PreprocessorArtifacts,
313
+ ) -> dict[str, Any]:
314
+ reference = _get_customer_reference(preprocessor)
315
+ if reference is None:
316
+ raise HTTPException(
317
+ status_code=503,
318
+ detail={"message": "Customer reference data is not available."},
319
+ )
320
+ sk_id = int(payload.sk_id_curr)
321
+ if sk_id not in reference.index:
322
+ raise HTTPException(
323
+ status_code=404,
324
+ detail={"message": f"Client {sk_id} not found in reference data."},
325
+ )
326
+ record = reference.loc[sk_id].to_dict()
327
+ record["SK_ID_CURR"] = sk_id
328
+ if payload.amt_credit <= 0:
329
+ raise HTTPException(
330
+ status_code=422,
331
+ detail={"message": "AMT_CREDIT must be positive."},
332
+ )
333
+ record["AMT_CREDIT"] = float(payload.amt_credit)
334
+ if payload.amt_annuity is not None:
335
+ if payload.amt_annuity <= 0:
336
+ raise HTTPException(
337
+ status_code=422,
338
+ detail={"message": "AMT_ANNUITY must be positive."},
339
+ )
340
+ record["AMT_ANNUITY"] = float(payload.amt_annuity)
341
+ elif payload.duration_months is not None:
342
+ if payload.duration_months <= 0:
343
+ raise HTTPException(
344
+ status_code=422,
345
+ detail={"message": "duration_months must be positive."},
346
+ )
347
+ record["AMT_ANNUITY"] = float(payload.amt_credit) / float(payload.duration_months)
348
+ else:
349
+ raise HTTPException(
350
+ status_code=422,
351
+ detail={"message": "Provide duration_months or amt_annuity."},
352
+ )
353
+ if "AMT_GOODS_PRICE" in record:
354
+ record["AMT_GOODS_PRICE"] = float(payload.amt_credit)
355
+ return record
356
+
357
+
358
  def _append_log_entries(entries: list[dict[str, Any]]) -> None:
359
  if not LOG_PREDICTIONS:
360
  return
 
689
  return pickle.load(handle)
690
 
691
 
692
+ def _load_customer_reference(
693
+ data_path: Path,
694
+ preprocessor: PreprocessorArtifacts,
695
+ ) -> pd.DataFrame:
696
+ columns = list(preprocessor.input_feature_columns)
697
+ if "SK_ID_CURR" not in columns:
698
+ columns.insert(0, "SK_ID_CURR")
699
+ df = pd.read_parquet(data_path, columns=columns)
700
+ df = df.drop_duplicates(subset=["SK_ID_CURR"], keep="last").set_index("SK_ID_CURR")
701
+ return df
702
+
703
+
704
+ def _get_customer_reference(preprocessor: PreprocessorArtifacts) -> pd.DataFrame | None:
705
+ if not CUSTOMER_LOOKUP_ENABLED:
706
+ return None
707
+ cached = getattr(app.state, "customer_reference", None)
708
+ if cached is not None:
709
+ return cached
710
+ data_path = CUSTOMER_DATA_PATH
711
+ if not data_path.exists():
712
+ downloaded = _ensure_hf_asset(
713
+ data_path,
714
+ HF_CUSTOMER_REPO_ID,
715
+ HF_CUSTOMER_FILENAME,
716
+ HF_CUSTOMER_REPO_TYPE,
717
+ )
718
+ if downloaded is None:
719
+ return None
720
+ data_path = downloaded
721
+ ref = _load_customer_reference(data_path, preprocessor)
722
+ if CUSTOMER_LOOKUP_CACHE:
723
+ app.state.customer_reference = ref
724
+ return ref
725
+
726
+
727
  def _infer_numeric_ranges_from_scaler(preprocessor: PreprocessorArtifacts) -> dict[str, tuple[float, float]]:
728
  ranges = {}
729
  scaler = getattr(preprocessor, "scaler", None)
 
1091
 
1092
  @app.on_event("startup")
1093
  def startup_event() -> None:
1094
+ if getattr(app.state, "model", None) is not None and getattr(app.state, "preprocessor", None) is not None:
1095
+ return
1096
+ model_path = MODEL_PATH
1097
+ if not model_path.exists():
1098
+ downloaded = _ensure_hf_asset(
1099
+ model_path,
1100
+ HF_MODEL_REPO_ID,
1101
+ HF_MODEL_FILENAME,
1102
+ HF_MODEL_REPO_TYPE,
1103
+ )
1104
+ if downloaded is not None:
1105
+ model_path = downloaded
1106
+ if not model_path.exists():
1107
  if ALLOW_MISSING_ARTIFACTS:
1108
+ logger.warning("Model file not found: %s. Using dummy model.", model_path)
1109
  app.state.model = DummyModel()
1110
  else:
1111
+ raise RuntimeError(f"Model file not found: {model_path}")
1112
  else:
1113
+ logger.info("Loading model from %s", model_path)
1114
+ app.state.model = load_model(model_path)
1115
 
1116
  try:
1117
+ artifacts_path = ARTIFACTS_PATH
1118
+ if not artifacts_path.exists():
1119
+ downloaded = _ensure_hf_asset(
1120
+ artifacts_path,
1121
+ HF_PREPROCESSOR_REPO_ID or None,
1122
+ HF_PREPROCESSOR_FILENAME,
1123
+ HF_PREPROCESSOR_REPO_TYPE,
1124
+ )
1125
+ if downloaded is not None:
1126
+ artifacts_path = downloaded
1127
+ logger.info("Loading preprocessor artifacts from %s", artifacts_path)
1128
+ app.state.preprocessor = load_preprocessor(DATA_PATH, artifacts_path)
1129
  except RuntimeError as exc:
1130
  if ALLOW_MISSING_ARTIFACTS:
1131
  logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
 
1133
  else:
1134
  raise
1135
 
1136
+ app.state.customer_reference = None
1137
+ if CUSTOMER_LOOKUP_ENABLED and CUSTOMER_LOOKUP_CACHE:
1138
+ try:
1139
+ ref = _get_customer_reference(app.state.preprocessor)
1140
+ if ref is not None:
1141
+ logger.info("Loaded customer reference data (%s rows)", len(ref))
1142
+ else:
1143
+ logger.warning("Customer reference data not available.")
1144
+ except Exception as exc: # pragma: no cover - optional cache load
1145
+ logger.warning("Failed to load customer reference data: %s", exc)
1146
+ elif CUSTOMER_LOOKUP_ENABLED:
1147
+ logger.info("Customer lookup enabled without cache (on-demand load).")
1148
+
1149
 
1150
  @app.get("/health")
1151
  def health() -> dict[str, str]:
 
1226
  return Response(content="".join(lines), media_type="application/x-ndjson")
1227
 
1228
 
1229
+ def _predict_records(records: list[dict[str, Any]], threshold: float | None) -> dict[str, Any]:
 
 
 
 
1230
  model = app.state.model
1231
  preprocessor: PreprocessorArtifacts = app.state.preprocessor
1232
  request_id = str(uuid.uuid4())
1233
  start_time = time.perf_counter()
 
1234
 
1235
  if not records:
1236
  raise HTTPException(status_code=422, detail={"message": "No input records provided."})
 
1326
  error=str(exc),
1327
  )
1328
  raise
1329
+
1330
+
1331
+ @app.post("/predict")
1332
+ def predict(
1333
+ payload: PredictionRequest,
1334
+ threshold: float | None = Query(default=None, ge=0.0, le=1.0),
1335
+ ) -> dict[str, Any]:
1336
+ records = payload.data if isinstance(payload.data, list) else [payload.data]
1337
+ return _predict_records(records, threshold)
1338
+
1339
+
1340
+ @app.post("/predict-minimal")
1341
+ def predict_minimal(
1342
+ payload: MinimalPredictionRequest,
1343
+ threshold: float | None = Query(default=None, ge=0.0, le=1.0),
1344
+ ) -> dict[str, Any]:
1345
+ preprocessor: PreprocessorArtifacts = app.state.preprocessor
1346
+ record = _build_minimal_record(payload, preprocessor)
1347
+ return _predict_records([record], threshold)
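The core of `/predict-minimal` is record completion: the client's stored features plus two user-supplied overrides, with the annuity derived when absent. A toy illustration (pandas only, with a made-up one-row reference frame, not the repo's real data):

```python
# Toy illustration of how /predict-minimal completes a record: look the
# client up in the reference frame, override AMT_CREDIT, then derive
# AMT_ANNUITY = amt_credit / duration_months when no annuity is supplied.
import pandas as pd

reference = pd.DataFrame(
    {
        "SK_ID_CURR": [100001],
        "CODE_GENDER": ["F"],
        "AMT_CREDIT": [100000.0],
        "AMT_ANNUITY": [5000.0],
    }
).set_index("SK_ID_CURR")

record = reference.loc[100001].to_dict()
record["SK_ID_CURR"] = 100001
record["AMT_CREDIT"] = 200000.0        # user-supplied amount
record["AMT_ANNUITY"] = 200000.0 / 60  # duration_months=60 -> 3333.33
print(record)
```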
hf_space/hf_space/hf_space/hf_space/gradio_app.py ADDED
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+from typing import Any
+
+import gradio as gr
+from fastapi import HTTPException
+
+from app.main import MinimalPredictionRequest, app, predict_minimal, startup_event
+
+
+def _ensure_startup() -> None:
+    if not getattr(app.state, "preprocessor", None):
+        startup_event()
+
+
+def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
+    reference = getattr(app.state, "customer_reference", None)
+    if reference is None or sk_id_curr not in reference.index:
+        return {}
+    row = reference.loc[sk_id_curr]
+    snapshot: dict[str, Any] = {"SK_ID_CURR": int(sk_id_curr)}
+    if "CODE_GENDER" in row:
+        snapshot["CODE_GENDER"] = row["CODE_GENDER"]
+    if "FLAG_OWN_CAR" in row:
+        snapshot["FLAG_OWN_CAR"] = row["FLAG_OWN_CAR"]
+    if "AMT_INCOME_TOTAL" in row:
+        snapshot["AMT_INCOME_TOTAL"] = float(row["AMT_INCOME_TOTAL"])
+    if "DAYS_BIRTH" in row:
+        snapshot["AGE_YEARS"] = round(abs(float(row["DAYS_BIRTH"])) / 365.25, 1)
+    return snapshot
+
+
+def score_minimal(
+    sk_id_curr: float,
+    amt_credit: float,
+    duration_months: float,
+    threshold: float,
+) -> tuple[float | None, str, float | None, dict[str, Any]]:
+    _ensure_startup()
+    try:
+        payload = MinimalPredictionRequest(
+            sk_id_curr=int(sk_id_curr),
+            amt_credit=float(amt_credit),
+            duration_months=int(duration_months),
+        )
+        response = predict_minimal(payload, threshold=float(threshold))
+        result = response["predictions"][0]
+        probability = float(result.get("probability", 0.0))
+        pred_value = int(result.get("prediction", 0))
+        label = "Default (1)" if pred_value == 1 else "No default (0)"
+        snapshot = _customer_snapshot(int(sk_id_curr))
+        snapshot.update(
+            {
+                "AMT_CREDIT_REQUESTED": float(amt_credit),
+                "DURATION_MONTHS": int(duration_months),
+            }
+        )
+        return probability, label, float(response.get("threshold", 0.0)), snapshot
+    except HTTPException as exc:
+        return None, f"Erreur: {exc.detail}", None, {"error": exc.detail}
+    except Exception as exc:  # pragma: no cover - UI fallback
+        return None, f"Erreur: {exc}", None, {"error": str(exc)}
+
+
+with gr.Blocks(title="Credit Scoring - Minimal Inputs") as demo:
+    gr.Markdown("# Credit Scoring - Minimal Inputs")
+    gr.Markdown(
+        "Renseignez l'identifiant client, le montant du credit et la duree. "
+        "Les autres features proviennent des donnees clients reference."
+    )
+
+    with gr.Row():
+        sk_id_curr = gr.Number(label="SK_ID_CURR", precision=0, value=100001)
+        amt_credit = gr.Number(label="AMT_CREDIT", value=200000)
+        duration_months = gr.Number(label="Duree (mois)", precision=0, value=60)
+        threshold = gr.Slider(label="Seuil", minimum=0.0, maximum=1.0, value=0.5, step=0.01)
+
+    run_btn = gr.Button("Scorer")
+
+    with gr.Row():
+        probability = gr.Number(label="Probabilite de defaut")
+        prediction = gr.Textbox(label="Decision")
+        threshold_used = gr.Number(label="Seuil utilise")
+
+    snapshot = gr.JSON(label="Snapshot client (reference)")
+
+    run_btn.click(
+        score_minimal,
+        inputs=[sk_id_curr, amt_credit, duration_months, threshold],
+        outputs=[probability, prediction, threshold_used, snapshot],
+    )
+
+
+if __name__ == "__main__":
+    _ensure_startup()
+    demo.launch()
hf_space/hf_space/hf_space/hf_space/hf_space/README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: OCR Projet 06
+title: Credit scoring MLOps
 emoji: 🤖
 colorFrom: indigo
 colorTo: green
@@ -8,7 +8,7 @@ app_port: 7860
 pinned: false
 ---
 
-# OCR Projet 06 – Crédit
+# Credit scoring MLOps
 
 [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/stephmnt/credit-scoring-mlops/deploy.yml)](https://github.com/stephmnt/credit-scoring-mlops/actions/workflows/deploy.yml)
 [![GitHub Release Date](https://img.shields.io/github/release-date/stephmnt/credit-scoring-mlops?display_date=published_at&style=flat-square)](https://github.com/stephmnt/credit-scoring-mlops/releases)
@@ -62,24 +62,33 @@ Useful parameters (feature selection):
 - `FEATURE_SELECTION_TOP_N` (default: `8`)
 - `FEATURE_SELECTION_MIN_CORR` (default: `0.02`)
 
-### Poetry environment (recommended)
+### pip environment (dev)
 
-The `pyproject.toml` file pins versions compatible with a recent stack
-(`numpy>=2`, `pyarrow>=15`, `scikit-learn>=1.6`). The environment targets Python
-3.11.
+Local development uses pip and `requirements.txt` (pinned versions),
+with Python 3.11+.
 
 ```shell
-poetry env use 3.11
-poetry install
+python3 -m venv .venv
+source .venv/bin/activate
+python -m pip install -r requirements.txt
+pytest -q
+uvicorn app.main:app --reload --port 7860
+```
+
+### Poetry environment (deliverable)
+
+The deliverable includes `pyproject.toml`, aligned with `requirements.txt`. If needed:
+
+```shell
+poetry install --with dev
 poetry run pytest -q
 poetry run uvicorn app.main:app --reload --port 7860
 ```
 
 Important: the `HistGB_final_model.pkl` model must be regenerated with the
-new scikit-learn version (re-run
-`P6_MANET_Stephane_notebook_modélisation.ipynb`, pickle save cell).
-
-Note: `requirements.txt` is aligned with `pyproject.toml` (same versions).
+scikit-learn version pinned in `requirements.txt` / `pyproject.toml`
+(re-run `P6_MANET_Stephane_notebook_modélisation.ipynb`, pickle save
+cell).
 
 ### Example input (schema + values)
 
@@ -123,9 +132,70 @@ Example values:
 }
 ```
 
+### Minimal prediction (existing client)
+
+Endpoint `POST /predict-minimal`: the user provides a client id, a credit
+amount and a duration. The remaining features are taken from the customer
+reference data (`CUSTOMER_DATA_PATH`, default `data/data_final.parquet`).
+If the reference is missing, the API returns 503.
+
+```shell
+curl -s -X POST "${BASE_URL}/predict-minimal" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "sk_id_curr": 100001,
+    "amt_credit": 200000,
+    "duration_months": 60
+  }'
+```
+
+Useful variables:
+
+- `CUSTOMER_LOOKUP_ENABLED=1` enables the client lookup (default: 1)
+- `CUSTOMER_DATA_PATH=data/data_final.parquet`
+- `CUSTOMER_LOOKUP_CACHE=1` keeps the reference in memory
+
+### Data contract (validation)
+
+- Strict numeric types (invalid values -> 422).
+- Numeric ranges (training min/max) are enforced.
+- Categoricals normalized: `CODE_GENDER` -> {`F`, `M`}, `FLAG_OWN_CAR` -> {`Y`, `N`}.
+- Sentinel `DAYS_EMPLOYED=365243` replaced with NaN.
+- Logs enriched with `data_quality` to distinguish drift from data-quality issues.
+
+### Gradio interface (scoring)
+
+```shell
+python gradio_app.py
+```
+
+On Hugging Face Spaces, `app.py` launches the Gradio UI automatically.
+
 Note: the API strictly validates the required fields (`/features`). To list
 all possible columns: `/features?include_all=true`.
 
+### Hugging Face (heavy assets)
+
+The binary files (model, preprocessor, data_final) are not pushed to the
+Space. They are downloaded at runtime from the Hugging Face Hub when the
+following variables are set:
+
+- `HF_MODEL_REPO_ID` + `HF_MODEL_FILENAME` + `HF_MODEL_REPO_TYPE`
+- `HF_PREPROCESSOR_REPO_ID` + `HF_PREPROCESSOR_FILENAME` + `HF_PREPROCESSOR_REPO_TYPE`
+- `HF_CUSTOMER_REPO_ID` + `HF_CUSTOMER_FILENAME` + `HF_CUSTOMER_REPO_TYPE`
+
+Example (a single dataset repo with 3 files):
+
+- `HF_MODEL_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_MODEL_REPO_TYPE=dataset`
+- `HF_MODEL_FILENAME=HistGB_final_model.pkl`
+- `HF_PREPROCESSOR_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_PREPROCESSOR_REPO_TYPE=dataset`
+- `HF_PREPROCESSOR_FILENAME=preprocessor.joblib`
+- `HF_CUSTOMER_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_CUSTOMER_REPO_TYPE=dataset`
+- `HF_CUSTOMER_FILENAME=data_final.parquet`
+
 ### Live demo (ready-to-run commands)
 
 Start the API:
@@ -231,6 +301,10 @@ Useful variables:
 - `LOGS_ACCESS_TOKEN` to protect the `/logs` endpoint
 - `LOG_HASH_SK_ID=1` to anonymize `SK_ID_CURR`
 
+Logs include a `data_quality` block per request (missing fields, invalid
+types, out-of-range values, unknown categories, `DAYS_EMPLOYED`
+sentinel).
+
 Local example:
 
 ```shell
@@ -251,27 +325,70 @@ Alternative:
 curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
 ```
 
 After a few requests, generate the drift report:
 
 ```shell
 python monitoring/drift_report.py \
   --logs logs/predictions.jsonl \
   --reference data/data_final.parquet \
-  --output-dir reports
+  --output-dir reports \
+  --min-prod-samples 200 \
+  --fdr-alpha 0.05 \
+  --prod-since "2024-01-01T00:00:00Z" \
+  --prod-until "2024-01-31T23:59:59Z"
 ```
 
 The HTML report is generated in `reports/drift_report.html` (with plots in
 `reports/plots/`). On Hugging Face, the disk is ephemeral: download the logs
 before analyzing.
 
+Drift is computed only when `n_prod >= --min-prod-samples` (default 200).
+Otherwise an "Insufficient sample" badge is shown and alerts are disabled.
+
+Built-in robustness:
+
+- Categoricals: PSI with smoothing (`--psi-eps`) + rare categories grouped as OTHER.
+- Numerics: KS corrected by FDR (Benjamini-Hochberg, `--fdr-alpha`).
+- `DAYS_EMPLOYED` sentinel: converted to NaN + rate tracked.
+
 The report also includes the distribution of predicted scores and the prediction rate
-(option `--score-bins` to adjust the number of bins).
+(option `--score-bins` to adjust the number of bins), as well as a
+Data Quality section when the logs contain `data_quality` (types, NaN,
+out-of-range values, unknown categories).
+
+To simulate sliding windows, use `--prod-since` / `--prod-until`
+with the log timestamps.
+
+Drift runbook: `docs/monitoring/runbook.md`.
 
 Snapshots (local snapshot of the reporting + storage):
 
 - Report: `docs/monitoring/drift_report.html` + `docs/monitoring/plots/`
 - Logs storage: `docs/monitoring/logs_storage.png`
 
+## Profiling & Optimization (Step 4)
+
+Inference profiling and benchmark (cProfile + latency):
+
+```shell
+python profiling/profile_inference.py \
+  --sample-size 2000 \
+  --batch-size 128 \
+  --runs 3
+```
+
+Outputs:
+
+- `docs/performance/benchmark_results.json`
+- `docs/performance/profile_summary.txt`
+- Detailed report: `docs/performance/performance_report.md`
+
+Local Streamlit dashboard (monitoring + drift):
+
+```shell
+python -m streamlit run monitoring/streamlit_app.py
+```
+
 ## Release contents
 
@@ -282,8 +399,10 @@ Snapshots (local snapshot of the reporting + storage):
 - **Business score + optimal threshold**: the `custom_score` is the main metric in the comparison tables and the CV, with a computed `best_threshold`.
 - **Explainability**: feature importance, SHAP and LIME are included.
 - **Feature selection by correlation**: top-N numerics + a small categorical set, exposed via `/features`.
-- **Monitoring & drift**: HTML report with KS/PSI + distribution of predicted scores and prediction rate
-  (snapshots in `docs/monitoring/`).
+- **Gradio interface**: minimal form (client id + amount + duration) based on the customer reference data.
+- **Monitoring & drift**: HTML report with volume gating, robust PSI, KS + FDR, data quality and
+  score distribution (snapshots in `docs/monitoring/`).
+- **Profiling & optimization**: inference benchmark + cProfile profile (`docs/performance/` folder).
 - **CI/CD**: tests with coverage (`pytest-cov`), Docker build and deploy to Hugging Face Spaces.
 
 ![Screenshot MLFlow](https://raw.githubusercontent.com/stephmnt/credit-scoring-mlops/main/screen-mlflow.png)
@@ -304,5 +423,4 @@
 
 * Complete the API tests: /logs (auth OK/KO), batch predict, threshold param, missing SK_ID_CURR, outliers in test_api.py.
 * Simplify the ALLOW_MISSING_ARTIFACTS fallback and DummyModel if the artifacts are versioned (clean up main.py and conftest.py).
-* Unify dependency management (Poetry vs requirements.txt) and align pyproject.toml / requirements.txt.
 * If the evaluator expects a branching strategy, create a feature branch and merge it as proof.
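The monitoring section above names two robustness measures without showing them. A hedged sketch of both, PSI with epsilon smoothing for categoricals and Benjamini-Hochberg FDR control over per-feature KS p-values, using numpy only; this is illustrative, not the repo's actual `monitoring/drift_report.py` code:

```python
# Hedged sketch of the robustness measures named above; not the repo's
# drift_report.py implementation.
import numpy as np


def psi(expected_counts: np.ndarray, actual_counts: np.ndarray, eps: float = 1e-4) -> float:
    """PSI over matching category bins, with eps smoothing for empty bins."""
    e = expected_counts / expected_counts.sum() + eps
    a = actual_counts / actual_counts.sum() + eps
    return float(np.sum((a - e) * np.log(a / e)))


def bh_reject(p_values: list[float], alpha: float = 0.05) -> list[bool]:
    """Benjamini-Hochberg: reject the k smallest p-values under the BH line."""
    p = np.asarray(p_values)
    order = np.argsort(p)
    m = len(p)
    below = p[order] <= alpha * (np.arange(1, m + 1) / m)
    k = int(np.max(np.nonzero(below)[0])) + 1 if below.any() else 0
    reject = np.zeros(m, dtype=bool)
    reject[order[:k]] = True
    return reject.tolist()


print(psi(np.array([900, 100]), np.array([700, 300])))  # clear category shift
print(bh_reject([0.001, 0.04, 0.20, 0.80]))             # FDR-corrected KS flags
```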
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/README.md ADDED
@@ -0,0 +1,13 @@
+# Monitoring Captures
+
+These files are snapshot artifacts for the monitoring deliverable.
+
+- drift_report.html: report generated by monitoring/drift_report.py (sample-size 5000).
+- runbook.md: triage and actions when a drift alert appears.
+- plots/: feature drift plots + score distribution + prediction rate.
+- predictions_sample.jsonl: sanitized example of production logs.
+- logs_storage.png: snapshot of the logging storage format.
+
+Notes:
+- Drift alerts are gated by a minimum production volume (see report badge).
+- Data quality metrics appear when logs include `data_quality`.
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/drift_report.html ADDED
@@ -0,0 +1,140 @@
+<!doctype html>
+<html>
+<head>
+<meta charset="utf-8" />
+<title>Drift Report</title>
+<style>
+body { font-family: Arial, sans-serif; margin: 24px; }
+table { border-collapse: collapse; width: 100%; }
+th, td { border: 1px solid #ddd; padding: 8px; }
+th { background: #f3f3f3; }
+img { max-width: 720px; }
+</style>
+</head>
+<body>
+<h2>Production Monitoring Summary</h2>
+<ul>
+<li>Total calls: 1</li>
+<li>Error rate: 0.00%</li>
+<li>Latency p50: 82.04 ms</li>
+<li>Latency p95: 82.04 ms</li>
+</ul>
+<h2>Score Monitoring</h2>
+<ul>
+<li>Score mean: 0.3755</li>
+<li>Score p50: 0.3755</li>
+<li>Score p95: 0.3755</li>
+<li>Score min: 0.3755</li>
+<li>Score max: 0.3755</li>
+<li>Predicted default rate: 0.00%</li>
+</ul>
+<img src='plots/score_distribution.png' />
+<img src='plots/prediction_rate.png' />
+<h2>Data Drift Summary</h2>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th>feature</th>
+      <th>type</th>
+      <th>ks_stat</th>
+      <th>p_value</th>
+      <th>drift_detected</th>
+      <th>psi</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>EXT_SOURCE_2</td>
+      <td>numeric</td>
+      <td>0.5905</td>
+      <td>0.819238</td>
+      <td>False</td>
+      <td>NaN</td>
+    </tr>
+    <tr>
+      <td>EXT_SOURCE_3</td>
+      <td>numeric</td>
+      <td>0.9047</td>
+      <td>0.191111</td>
+      <td>False</td>
+      <td>NaN</td>
+    </tr>
+    <tr>
+      <td>AMT_ANNUITY</td>
+      <td>numeric</td>
+      <td>0.5184</td>
+      <td>0.963407</td>
+      <td>False</td>
+      <td>NaN</td>
+    </tr>
+    <tr>
+      <td>EXT_SOURCE_1</td>
+      <td>numeric</td>
+      <td>0.5822</td>
+      <td>0.836199</td>
+      <td>False</td>
+      <td>NaN</td>
+    </tr>
+    <tr>
+      <td>CODE_GENDER</td>
+      <td>categorical</td>
+      <td>NaN</td>
+      <td>NaN</td>
+      <td>True</td>
+      <td>9.6538</td>
+    </tr>
+    <tr>
+      <td>DAYS_EMPLOYED</td>
+      <td>numeric</td>
+      <td>0.6508</td>
+      <td>0.698660</td>
+      <td>False</td>
+      <td>NaN</td>
+    </tr>
+    <tr>
+      <td>AMT_CREDIT</td>
+      <td>numeric</td>
+      <td>0.5996</td>
+      <td>0.801040</td>
+      <td>False</td>
+      <td>NaN</td>
+    </tr>
+    <tr>
+      <td>AMT_GOODS_PRICE</td>
+      <td>numeric</td>
+      <td>0.6115</td>
+      <td>0.777177</td>
+      <td>False</td>
+      <td>NaN</td>
+    </tr>
+    <tr>
+      <td>DAYS_BIRTH</td>
+      <td>numeric</td>
+      <td>0.9474</td>
+      <td>0.105579</td>
+      <td>False</td>
+      <td>NaN</td>
+    </tr>
+    <tr>
+      <td>FLAG_OWN_CAR</td>
+      <td>categorical</td>
+      <td>NaN</td>
+      <td>NaN</td>
+      <td>True</td>
+      <td>4.3985</td>
+    </tr>
+  </tbody>
+</table>
+<h2>Feature Distributions</h2>
+<h4>EXT_SOURCE_2</h4><img src='plots/EXT_SOURCE_2.png' />
+<h4>EXT_SOURCE_3</h4><img src='plots/EXT_SOURCE_3.png' />
+<h4>AMT_ANNUITY</h4><img src='plots/AMT_ANNUITY.png' />
+<h4>EXT_SOURCE_1</h4><img src='plots/EXT_SOURCE_1.png' />
+<h4>CODE_GENDER</h4><img src='plots/CODE_GENDER.png' />
+<h4>DAYS_EMPLOYED</h4><img src='plots/DAYS_EMPLOYED.png' />
+<h4>AMT_CREDIT</h4><img src='plots/AMT_CREDIT.png' />
+<h4>AMT_GOODS_PRICE</h4><img src='plots/AMT_GOODS_PRICE.png' />
+<h4>DAYS_BIRTH</h4><img src='plots/DAYS_BIRTH.png' />
+<h4>FLAG_OWN_CAR</h4><img src='plots/FLAG_OWN_CAR.png' />
+</body>
+</html>
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/logs_storage.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_ANNUITY.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_CREDIT.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_GOODS_PRICE.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/CODE_GENDER.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/DAYS_BIRTH.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/DAYS_EMPLOYED.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/EXT_SOURCE_1.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/EXT_SOURCE_2.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/EXT_SOURCE_3.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/FLAG_OWN_CAR.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/prediction_rate.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/score_distribution.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/predictions_sample.jsonl ADDED
@@ -0,0 +1,2 @@
+{"timestamp": "2025-01-01T00:00:00+00:00", "request_id": "00000000-0000-0000-0000-000000000001", "endpoint": "/predict", "latency_ms": 42.5, "status_code": 200, "model_version": "HistGB_final_model.pkl", "threshold": 0.5, "inputs": {"AMT_ANNUITY": 24700.5, "AMT_CREDIT": 406597.5, "AMT_GOODS_PRICE": 351000.0, "CODE_GENDER": "M", "DAYS_BIRTH": -9461, "DAYS_EMPLOYED": -637, "EXT_SOURCE_1": 0.45, "EXT_SOURCE_2": 0.61, "EXT_SOURCE_3": 0.75, "FLAG_OWN_CAR": "N", "SK_ID_CURR": "hash_100002"}, "sk_id_curr": "hash_100002", "probability": 0.3754, "prediction": 0}
+{"timestamp": "2025-01-01T00:00:03+00:00", "request_id": "00000000-0000-0000-0000-000000000002", "endpoint": "/predict", "latency_ms": 51.2, "status_code": 200, "model_version": "HistGB_final_model.pkl", "threshold": 0.5, "inputs": {"AMT_ANNUITY": 19000.0, "AMT_CREDIT": 320000.0, "AMT_GOODS_PRICE": 280000.0, "CODE_GENDER": "F", "DAYS_BIRTH": -12000, "DAYS_EMPLOYED": -1200, "EXT_SOURCE_1": 0.33, "EXT_SOURCE_2": 0.52, "EXT_SOURCE_3": 0.64, "FLAG_OWN_CAR": "Y", "SK_ID_CURR": "hash_100003"}, "sk_id_curr": "hash_100003", "probability": 0.6123, "prediction": 1}
hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/runbook.md ADDED
@@ -0,0 +1,28 @@
+# Drift Runbook (MLOps)
+
+## A. Data quality (first priority)
+- check for unknown categories (CODE_GENDER, FLAG_OWN_CAR)
+- check for a rise in NaN / missing fields
+- check for numeric out-of-range values
+- check the DAYS_EMPLOYED sentinel rate
+- check for a pipeline change (mapping, imputation, schema)
+
+## B. Prediction drift
+- check the score distribution
+- check the positive-class rate
+- check whether the business threshold changed
+
+## C. Performance (if labels are available)
+- AUC / logloss / Brier
+- calibration (Platt/Isotonic)
+- per-segment analysis (region, channel, product if available)
+
+## Actions
+- artificial drift / data bug: fix the mapping or schema, redeploy
+- prior drift: recalibrate or adjust the threshold with business validation
+- concept drift: retrain on recent data + temporal validation + champion/challenger + rollback plan
+
+## Triggers
+- Warning: data drift without score or performance drift
+- Critical: data drift + score drift (and/or degraded performance)
+- Retrain: persistent drift across several windows + score/performance impact
hf_space/hf_space/hf_space/hf_space/hf_space/docs/performance/benchmark_results.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "name": "optimized_preprocess",
4
+ "batches": 10,
5
+ "batch_size": 100,
6
+ "mean_ms": 35.73424170026556,
7
+ "p50_ms": 33.76843745354563,
8
+ "p95_ms": 43.09078284422866,
9
+ "throughput_rows_per_sec": 2798.4363244304373
10
+ },
11
+ {
12
+ "name": "legacy_preprocess_alignment",
13
+ "batches": 10,
14
+ "batch_size": 100,
15
+ "mean_ms": 47.56558339577168,
16
+ "p50_ms": 47.193103993777186,
17
+ "p95_ms": 51.22594404965639,
18
+ "throughput_rows_per_sec": 2102.360422407632
19
+ }
20
+ ]
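
As a sanity check, the throughput figures follow directly from the batch size and the mean latency recorded above (a quick verification sketch using values copied from this file):

```python
mean_ms = 35.73424170026556  # optimized_preprocess, from the JSON above
batch_size = 100
print(round(batch_size / (mean_ms / 1000.0), 1))  # ~2798.4 rows/s, matching throughput_rows_per_sec
```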
hf_space/hf_space/hf_space/hf_space/hf_space/docs/performance/performance_report.md ADDED
@@ -0,0 +1,50 @@
1
+ # Profiling & Optimization (Step 4)
2
+
3
+ ## Goal
4
+
5
+ Measure inference latency, identify the bottlenecks and propose a software optimization with no functional regression.
6
+
7
+ ## Setup
8
+
9
+ - Script: `profiling/profile_inference.py`
10
+ - Data: `data/data_final.parquet` (sample)
11
+ - Parameters: `--sample-size 500 --batch-size 100 --runs 2`
12
+ - Model: `HistGB_final_model.pkl`
13
+
14
+ Results are saved to:
15
+
16
+ - `docs/performance/benchmark_results.json`
17
+ - `docs/performance/profile_summary.txt`
18
+
19
+ ## Results
20
+
21
+ | Scenario | Batch | Mean (ms) | P50 (ms) | P95 (ms) | Throughput (rows/s) |
22
+ | --- | --- | ---:| ---:| ---:| ---:|
23
+ | optimized_preprocess | 100 | 187.37 | 169.96 | 271.41 | 533.71 |
24
+ | legacy_preprocess_alignment | 100 | 273.05 | 264.45 | 357.41 | 366.23 |
25
+
26
+ Observed gain (mean): roughly a 31% reduction in per-batch latency on the optimized path.
27
+
28
+ ## Bottlenecks (cProfile)
29
+
30
+ Excerpt from `docs/performance/profile_summary.txt`:
31
+
32
+ - `app.main:preprocess_input` accounts for most of the cumulative time (~0.90s out of 1.05s).
33
+ - Dominant pandas operations:
34
+ - `DataFrame.__setitem__` / `insert`
35
+ - `fillna`, `to_numeric`
36
+ - `get_dummies`
37
+ - `HistGradientBoostingClassifier.predict_proba` is present but not dominant (~0.15s).
38
+
39
+ ## Optimization applied
40
+
41
+ - Optimized one-hot alignment: the column-appending loop was replaced with a `reindex` using `fill_value=0` (illustrated in the sketch after this report).
42
+ - Input column alignment: column-by-column insertion was replaced with a `reindex` over `columns_keep`.
43
+ - Result: mean latency per batch is reduced vs the legacy path (measured above).
44
+
45
+ ## Future work
46
+
47
+ - Precompute a full scikit-learn pipeline (OneHotEncoder + scaler) to avoid running `get_dummies` on every request.
48
+ - Export to ONNX and run inference with ONNX Runtime to speed up prediction.
49
+ - Tune the batch size to maximize throughput.
50
+ - Optionally relax some checks in a "fast" mode when the context allows it (security vs latency trade-off).
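
A minimal illustration of the alignment change described above (the column names are illustrative; the actual change is the `reindex` calls in `app/main.py`):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1.0], "C": [3.0]})
columns_keep = ["A", "B", "C"]  # stands in for artifacts.columns_keep

# Legacy path: append each missing column, then reorder (many block insertions).
legacy = df.copy()
for col in columns_keep:
    if col not in legacy.columns:
        legacy[col] = np.nan
legacy = legacy[columns_keep]

# Optimized path: a single reindex adds the missing columns and reorders in one pass.
optimized = df.reindex(columns=columns_keep, fill_value=np.nan)

assert legacy.equals(optimized)
```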
hf_space/hf_space/hf_space/hf_space/hf_space/docs/performance/profile_summary.txt ADDED
@@ -0,0 +1,38 @@
1
+ 157685 function calls (154232 primitive calls) in 0.071 seconds
2
+
3
+ Ordered by: cumulative time
4
+ List reduced from 783 to 30 due to restriction <30>
5
+
6
+ ncalls tottime percall cumtime percall filename:lineno(function)
7
+ 1 0.001 0.001 0.060 0.060 /Users/steph/Code/Python/Jupyter/OCR_projet06/app/main.py:772(preprocess_input)
8
+ 310 0.001 0.000 0.015 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/frame.py:4282(__setitem__)
9
+ 310 0.000 0.000 0.014 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/frame.py:4525(_set_item)
10
+ 310 0.000 0.000 0.011 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/frame.py:4492(_set_item_mgr)
11
+ 1 0.000 0.000 0.010 0.010 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py:2263(predict_proba)
12
+ 1 0.000 0.000 0.010 0.010 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py:1293(_raw_predict)
13
+ 288 0.001 0.000 0.009 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/series.py:392(__init__)
14
+ 158 0.001 0.000 0.009 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/generic.py:7164(fillna)
15
+ 1 0.000 0.000 0.009 0.009 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/reshape/encoding.py:44(get_dummies)
16
+ 201 0.001 0.000 0.009 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/internals/managers.py:317(apply)
17
+ 297 0.000 0.000 0.008 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/frame.py:4481(_iset_item_mgr)
18
+ 363 0.001 0.000 0.008 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/frame.py:4073(__getitem__)
19
+ 1 0.001 0.001 0.008 0.008 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py:1333(_predict_iterations)
20
+ 299 0.002 0.000 0.008 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/internals/managers.py:1085(iset)
21
+ 133 0.007 0.000 0.007 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/sklearn/ensemble/_hist_gradient_boosting/predictor.py:49(predict)
22
+ 158 0.000 0.000 0.007 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/internals/base.py:180(fillna)
23
+ 160 0.001 0.000 0.007 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/tools/numeric.py:47(to_numeric)
24
+ 377 0.001 0.000 0.006 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/frame.py:4637(_get_item_cache)
25
+ 158 0.001 0.000 0.006 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/internals/blocks.py:1709(fillna)
26
+ 34692/34379 0.004 0.000 0.006 0.000 {built-in method builtins.isinstance}
27
+ 2 0.000 0.000 0.005 0.003 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/sklearn/utils/validation.py:2793(validate_data)
28
+ 353 0.000 0.000 0.005 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/frame.py:3994(_ixs)
29
+ 2 0.000 0.000 0.005 0.002 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/sklearn/utils/validation.py:725(check_array)
30
+ 15 0.000 0.000 0.004 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/reshape/encoding.py:239(_get_dummies_1d)
31
+ 156/143 0.001 0.000 0.004 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/internals/blocks.py:1590(where)
32
+ 348 0.001 0.000 0.003 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/construction.py:517(sanitize_array)
33
+ 50 0.000 0.000 0.003 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/ops/common.py:62(new_method)
34
+ 441 0.000 0.000 0.003 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/dtypes/missing.py:101(isna)
35
+ 353 0.000 0.000 0.003 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/frame.py:4619(_box_col_values)
36
+ 441 0.000 0.000 0.003 0.000 /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pandas/core/dtypes/missing.py:184(_isna)
37
+
38
+
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml CHANGED
@@ -21,11 +21,13 @@ jobs:
21
  - name: Install dependencies
22
  run: |
23
  python -m pip install --upgrade pip
24
- pip install poetry
25
- poetry install --no-interaction --no-ansi
26
 
27
  - name: Run tests
28
- run: poetry run pytest -q
 
 
 
29
 
30
  - name: Deploy to Hugging Face Space
31
  if: github.ref == 'refs/heads/main'
@@ -34,7 +36,7 @@ jobs:
34
  run: |
35
  git config --global user.email "actions@github.com"
36
  git config --global user.name "GitHub Actions"
37
- git clone https://huggingface.co/spaces/stephmnt/ocr_projet06 hf_space
38
  rsync -av \
39
  --exclude '.git' \
40
  --exclude '.venv' \
@@ -51,4 +53,4 @@ jobs:
51
  cd hf_space
52
  git add .
53
  git commit -m "Auto-deploy from GitHub Actions" || echo "No changes to commit"
54
- git push https://stephmnt:${HF_TOKEN}@huggingface.co/spaces/stephmnt/ocr_projet06 main
 
21
  - name: Install dependencies
22
  run: |
23
  python -m pip install --upgrade pip
24
+ pip install -r requirements.txt
 
25
 
26
  - name: Run tests
27
+ run: pytest --cov=app --cov=monitoring --cov-report=term-missing -q
28
+
29
+ - name: Build Docker image
30
+ run: docker build -t ocr-projet06:ci .
31
 
32
  - name: Deploy to Hugging Face Space
33
  if: github.ref == 'refs/heads/main'
 
36
  run: |
37
  git config --global user.email "actions@github.com"
38
  git config --global user.name "GitHub Actions"
39
+ git clone https://huggingface.co/spaces/stephmnt/credit-scoring-mlops hf_space
40
  rsync -av \
41
  --exclude '.git' \
42
  --exclude '.venv' \
 
53
  cd hf_space
54
  git add .
55
  git commit -m "Auto-deploy from GitHub Actions" || echo "No changes to commit"
56
+ git push https://stephmnt:${HF_TOKEN}@huggingface.co/spaces/stephmnt/credit-scoring-mlops main
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitignore CHANGED
@@ -16,6 +16,8 @@ artifacts/*
16
  mlruns/
17
  .DS_Store
18
  *.code-workspace
 
 
19
 
20
  ## https://github.com/github/gitignore/blob/e8554d85bf62e38d6db966a50d2064ac025fd82a/Python.gitignore
21
 
 
16
  mlruns/
17
  .DS_Store
18
  *.code-workspace
19
+ presentation_projet08.pptx
20
+ rapport_projet06.md
21
 
22
  ## https://github.com/github/gitignore/blob/e8554d85bf62e38d6db966a50d2064ac025fd82a/Python.gitignore
23
 
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
3
  import logging
4
  import os
5
  import pickle
6
- from dataclasses import dataclass
7
  from datetime import datetime, timezone
8
  import hashlib
9
  import json
@@ -11,10 +11,11 @@ from pathlib import Path
11
  import time
12
  from typing import Any
13
  import uuid
 
14
 
15
  import numpy as np
16
  import pandas as pd
17
- from fastapi import FastAPI, HTTPException, Query, Response
18
  from pydantic import BaseModel
19
  from sklearn.preprocessing import MinMaxScaler
20
  import joblib
@@ -27,6 +28,9 @@ ARTIFACTS_PATH = Path(os.getenv("ARTIFACTS_PATH", "artifacts/preprocessor.joblib
27
  DEFAULT_THRESHOLD = float(os.getenv("PREDICTION_THRESHOLD", "0.5"))
28
  CACHE_PREPROCESSOR = os.getenv("CACHE_PREPROCESSOR", "1") != "0"
29
  USE_REDUCED_INPUTS = os.getenv("USE_REDUCED_INPUTS", "1") != "0"
 
 
 
30
  CORRELATION_THRESHOLD = float(os.getenv("CORRELATION_THRESHOLD", "0.85"))
31
  CORRELATION_SAMPLE_SIZE = int(os.getenv("CORRELATION_SAMPLE_SIZE", "50000"))
32
  ALLOW_MISSING_ARTIFACTS = os.getenv("ALLOW_MISSING_ARTIFACTS", "0") == "1"
@@ -36,6 +40,7 @@ LOG_FILE = os.getenv("LOG_FILE", "predictions.jsonl")
36
  LOG_INCLUDE_INPUTS = os.getenv("LOG_INCLUDE_INPUTS", "1") == "1"
37
  LOG_HASH_SK_ID = os.getenv("LOG_HASH_SK_ID", "0") == "1"
38
  MODEL_VERSION = os.getenv("MODEL_VERSION", MODEL_PATH.name)
 
39
 
40
  IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
41
  ENGINEERED_FEATURES = [
@@ -53,8 +58,9 @@ ENGINEERED_SOURCES = [
53
  "CNT_FAM_MEMBERS",
54
  "AMT_ANNUITY",
55
  ]
56
- # Top inputs derived from SHAP importance (modeling notebook), limited to application features.
57
- REDUCED_INPUT_FEATURES = [
 
58
  "SK_ID_CURR",
59
  "EXT_SOURCE_2",
60
  "EXT_SOURCE_3",
@@ -81,6 +87,31 @@ OUTLIER_COLUMNS = [
81
  "AMT_REQ_CREDIT_BUREAU_QRT",
82
  ]
83
 
84
 
85
  class PredictionRequest(BaseModel):
86
  data: dict[str, Any] | list[dict[str, Any]]
@@ -102,6 +133,9 @@ class PreprocessorArtifacts:
102
  required_input_columns: list[str]
103
  numeric_required_columns: list[str]
104
  correlated_imputation: dict[str, dict[str, float | str]]
 
 
 
105
 
106
 
107
  app = FastAPI(title="Credit Scoring API", version="0.1.0")
@@ -130,6 +164,104 @@ def _hash_value(value: Any) -> str:
130
  return hashlib.sha256(str(value).encode("utf-8")).hexdigest()
131
 
132
 
133
  def _append_log_entries(entries: list[dict[str, Any]]) -> None:
134
  if not LOG_PREDICTIONS:
135
  return
@@ -151,6 +283,7 @@ def _log_prediction_entries(
151
  threshold: float | None,
152
  status_code: int,
153
  preprocessor: PreprocessorArtifacts,
 
154
  error: str | None = None,
155
  ) -> None:
156
  if not LOG_PREDICTIONS:
@@ -176,6 +309,8 @@ def _log_prediction_entries(
176
  "threshold": threshold,
177
  "inputs": inputs,
178
  }
 
 
179
  if results and idx < len(results):
180
  result = results[idx]
181
  sk_id = result.get("sk_id_curr")
@@ -234,6 +369,11 @@ def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
234
  for col, max_val in outlier_maxes.items():
235
  df = df[df[col] != max_val]
236
 
 
 
 
 
 
237
  numeric_ranges = {}
238
  for col in numeric_cols:
239
  if col in df.columns:
@@ -249,7 +389,9 @@ def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
249
  required_raw.update(col for col in columns_must_not_missing if col in input_feature_columns)
250
  required_raw.add("SK_ID_CURR")
251
  if USE_REDUCED_INPUTS:
252
- required_input = sorted({col for col in REDUCED_INPUT_FEATURES if col in input_feature_columns})
 
 
253
  else:
254
  required_input = sorted(required_raw)
255
  numeric_required = sorted(col for col in required_input if col in numeric_medians)
@@ -275,6 +417,9 @@ def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
275
  required_input_columns=required_input,
276
  numeric_required_columns=numeric_required,
277
  correlated_imputation=correlated_imputation,
 
 
 
278
  )
279
 
280
 
@@ -340,7 +485,7 @@ def build_fallback_preprocessor() -> PreprocessorArtifacts:
340
  required_raw = set(ENGINEERED_SOURCES)
341
  required_raw.update(col for col in columns_must_not_missing if col in input_feature_columns)
342
  required_raw.add("SK_ID_CURR")
343
- required_input = sorted({col for col in REDUCED_INPUT_FEATURES if col in input_feature_columns})
344
  numeric_required = sorted(col for col in required_input if col in numeric_medians)
345
 
346
  numeric_ranges = {col: (float(df[col].min()), float(df[col].max())) for col in numeric_cols}
@@ -360,6 +505,9 @@ def build_fallback_preprocessor() -> PreprocessorArtifacts:
360
  required_input_columns=required_input,
361
  numeric_required_columns=numeric_required,
362
  correlated_imputation={},
 
 
 
363
  )
364
 
365
 
@@ -368,6 +516,20 @@ def load_preprocessor(data_path: Path, artifacts_path: Path) -> PreprocessorArti
368
  preprocessor = joblib.load(artifacts_path)
369
  updated = False
370
  required_updated = False
371
  if not hasattr(preprocessor, "required_input_columns"):
372
  if USE_REDUCED_INPUTS:
373
  required_input = _reduce_input_columns(preprocessor)
@@ -445,6 +607,90 @@ def _infer_numeric_ranges_from_scaler(preprocessor: PreprocessorArtifacts) -> di
445
  return ranges
446
 
447
 
448
  def _build_correlated_imputation(
449
  df: pd.DataFrame,
450
  *,
@@ -496,10 +742,49 @@ def _build_correlated_imputation(
496
 
497
 
498
  def _reduce_input_columns(preprocessor: PreprocessorArtifacts) -> list[str]:
499
- cols = [col for col in REDUCED_INPUT_FEATURES if col in preprocessor.input_feature_columns or col == "SK_ID_CURR"]
 
 
 
 
 
 
 
500
  if "SK_ID_CURR" not in cols:
501
- cols.append("SK_ID_CURR")
502
- return sorted(set(cols))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503
 
504
 
505
  def _compute_correlated_imputation(
@@ -535,8 +820,17 @@ def _compute_correlated_imputation(
535
  )
536
 
537
 
538
- def _ensure_required_columns(df: pd.DataFrame, required_cols: list[str]) -> None:
539
- missing = [col for col in required_cols if col not in df.columns or df[col].isna().any()]
 
 
 
 
 
 
 
 
 
540
  if missing:
541
  raise HTTPException(
542
  status_code=422,
@@ -552,7 +846,7 @@ def _validate_numeric_inputs(df: pd.DataFrame, numeric_cols: list[str]) -> None:
552
  invalid = []
553
  for col in numeric_cols:
554
  coerced = pd.to_numeric(df[col], errors="coerce")
555
- if coerced.isna().any():
556
  invalid.append(col)
557
  if invalid:
558
  raise HTTPException(
@@ -573,9 +867,8 @@ def _validate_numeric_ranges(df: pd.DataFrame, numeric_ranges: dict[str, tuple[f
573
  if col not in df.columns:
574
  continue
575
  values = pd.to_numeric(df[col], errors="coerce")
576
- if values.isna().any():
577
- continue
578
- if ((values < min_val) | (values > max_val)).any():
579
  out_of_range.append(col)
580
  if out_of_range:
581
  raise HTTPException(
@@ -617,7 +910,8 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->
617
  if col not in df.columns:
618
  df[col] = np.nan
619
 
620
- _ensure_required_columns(df, artifacts.required_input_columns)
 
621
  _validate_numeric_inputs(df, artifacts.numeric_required_columns)
622
  _validate_numeric_ranges(df, {k: v for k, v in artifacts.numeric_ranges.items() if k in artifacts.numeric_required_columns})
623
 
@@ -629,10 +923,7 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->
629
  df = new_features_creation(df)
630
  df.replace([np.inf, -np.inf], np.nan, inplace=True)
631
 
632
- for col in artifacts.columns_keep:
633
- if col not in df.columns:
634
- df[col] = np.nan
635
- df = df[artifacts.columns_keep]
636
 
637
  _apply_correlated_imputation(df, artifacts)
638
 
@@ -645,7 +936,7 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->
645
  if col in df.columns:
646
  df[col] = df[col].fillna("Unknown")
647
 
648
- _ensure_required_columns(df, artifacts.required_input_columns)
649
 
650
  if "CODE_GENDER" in df.columns and (df["CODE_GENDER"] == "XNA").any():
651
  raise HTTPException(
@@ -664,10 +955,7 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->
664
  )
665
 
666
  df_hot = pd.get_dummies(df, columns=artifacts.categorical_columns)
667
- for col in artifacts.features_to_scaled:
668
- if col not in df_hot.columns:
669
- df_hot[col] = 0
670
- df_hot = df_hot[artifacts.features_to_scaled]
671
 
672
  scaled = artifacts.scaler.transform(df_hot)
673
  return pd.DataFrame(scaled, columns=artifacts.features_to_scaled, index=df.index)
@@ -716,10 +1004,20 @@ def features(include_all: bool = Query(default=False)) -> dict[str, Any]:
716
  preprocessor: PreprocessorArtifacts = app.state.preprocessor
717
  optional_features = [col for col in preprocessor.input_feature_columns if col not in preprocessor.required_input_columns]
718
  correlated = sorted(getattr(preprocessor, "correlated_imputation", {}) or {})
 
 
 
 
 
 
719
  payload = {
720
  "required_input_features": preprocessor.required_input_columns,
721
  "engineered_features": ENGINEERED_FEATURES,
722
  "model_features_count": len(preprocessor.features_to_scaled),
 
 
 
 
723
  "correlation_threshold": CORRELATION_THRESHOLD,
724
  "correlated_imputation_count": len(correlated),
725
  "correlated_imputation_features": correlated[:50],
@@ -734,6 +1032,37 @@ def features(include_all: bool = Query(default=False)) -> dict[str, Any]:
734
  return payload
735
 
736
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737
  @app.post("/predict")
738
  def predict(
739
  payload: PredictionRequest,
@@ -750,11 +1079,20 @@ def predict(
750
 
751
  try:
752
  df_raw = pd.DataFrame.from_records(records)
753
- if "SK_ID_CURR" not in df_raw.columns:
 
 
 
 
 
 
 
 
 
754
  raise HTTPException(status_code=422, detail={"message": "SK_ID_CURR is required."})
755
 
756
- sk_ids = df_raw["SK_ID_CURR"].tolist()
757
- features = preprocess_input(df_raw, preprocessor)
758
 
759
  if hasattr(model, "predict_proba"):
760
  proba = model.predict_proba(features)[:, 1]
@@ -771,12 +1109,13 @@ def predict(
771
  latency_ms = (time.perf_counter() - start_time) * 1000.0
772
  _log_prediction_entries(
773
  request_id=request_id,
774
- records=records,
775
  results=results,
776
  latency_ms=latency_ms,
777
  threshold=use_threshold,
778
  status_code=200,
779
  preprocessor=preprocessor,
 
780
  )
781
  return {"predictions": results, "threshold": use_threshold}
782
 
@@ -791,12 +1130,13 @@ def predict(
791
  latency_ms = (time.perf_counter() - start_time) * 1000.0
792
  _log_prediction_entries(
793
  request_id=request_id,
794
- records=records,
795
  results=results,
796
  latency_ms=latency_ms,
797
  threshold=None,
798
  status_code=200,
799
  preprocessor=preprocessor,
 
800
  )
801
  return {"predictions": results, "threshold": None}
802
  except HTTPException as exc:
@@ -804,12 +1144,13 @@ def predict(
804
  detail = exc.detail if isinstance(exc.detail, dict) else {"message": str(exc.detail)}
805
  _log_prediction_entries(
806
  request_id=request_id,
807
- records=records,
808
  results=None,
809
  latency_ms=latency_ms,
810
  threshold=threshold,
811
  status_code=exc.status_code,
812
  preprocessor=preprocessor,
 
813
  error=json.dumps(detail, ensure_ascii=True),
814
  )
815
  raise
@@ -817,12 +1158,13 @@ def predict(
817
  latency_ms = (time.perf_counter() - start_time) * 1000.0
818
  _log_prediction_entries(
819
  request_id=request_id,
820
- records=records,
821
  results=None,
822
  latency_ms=latency_ms,
823
  threshold=threshold,
824
  status_code=500,
825
  preprocessor=preprocessor,
 
826
  error=str(exc),
827
  )
828
  raise
 
3
  import logging
4
  import os
5
  import pickle
6
+ from dataclasses import dataclass, field
7
  from datetime import datetime, timezone
8
  import hashlib
9
  import json
 
11
  import time
12
  from typing import Any
13
  import uuid
14
+ from collections import deque
15
 
16
  import numpy as np
17
  import pandas as pd
18
+ from fastapi import FastAPI, Header, HTTPException, Query, Response
19
  from pydantic import BaseModel
20
  from sklearn.preprocessing import MinMaxScaler
21
  import joblib
 
28
  DEFAULT_THRESHOLD = float(os.getenv("PREDICTION_THRESHOLD", "0.5"))
29
  CACHE_PREPROCESSOR = os.getenv("CACHE_PREPROCESSOR", "1") != "0"
30
  USE_REDUCED_INPUTS = os.getenv("USE_REDUCED_INPUTS", "1") != "0"
31
+ FEATURE_SELECTION_METHOD = os.getenv("FEATURE_SELECTION_METHOD", "correlation")
32
+ FEATURE_SELECTION_TOP_N = int(os.getenv("FEATURE_SELECTION_TOP_N", "8"))
33
+ FEATURE_SELECTION_MIN_CORR = float(os.getenv("FEATURE_SELECTION_MIN_CORR", "0.02"))
34
  CORRELATION_THRESHOLD = float(os.getenv("CORRELATION_THRESHOLD", "0.85"))
35
  CORRELATION_SAMPLE_SIZE = int(os.getenv("CORRELATION_SAMPLE_SIZE", "50000"))
36
  ALLOW_MISSING_ARTIFACTS = os.getenv("ALLOW_MISSING_ARTIFACTS", "0") == "1"
 
40
  LOG_INCLUDE_INPUTS = os.getenv("LOG_INCLUDE_INPUTS", "1") == "1"
41
  LOG_HASH_SK_ID = os.getenv("LOG_HASH_SK_ID", "0") == "1"
42
  MODEL_VERSION = os.getenv("MODEL_VERSION", MODEL_PATH.name)
43
+ LOGS_ACCESS_TOKEN = os.getenv("LOGS_ACCESS_TOKEN")
44
 
45
  IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
46
  ENGINEERED_FEATURES = [
 
58
  "CNT_FAM_MEMBERS",
59
  "AMT_ANNUITY",
60
  ]
61
+ FEATURE_SELECTION_CATEGORICAL_INPUTS = ["CODE_GENDER", "FLAG_OWN_CAR"]
62
+ # Default reduced inputs (fallback when correlation-based selection is unavailable).
63
+ DEFAULT_REDUCED_INPUT_FEATURES = [
64
  "SK_ID_CURR",
65
  "EXT_SOURCE_2",
66
  "EXT_SOURCE_3",
 
87
  "AMT_REQ_CREDIT_BUREAU_QRT",
88
  ]
89
 
90
+ CODE_GENDER_MAPPING = {
91
+ "F": "F",
92
+ "FEMALE": "F",
93
+ "0": "F",
94
+ "W": "F",
95
+ "WOMAN": "F",
96
+ "M": "M",
97
+ "MALE": "M",
98
+ "1": "M",
99
+ "MAN": "M",
100
+ }
101
+ FLAG_OWN_CAR_MAPPING = {
102
+ "Y": "Y",
103
+ "YES": "Y",
104
+ "TRUE": "Y",
105
+ "1": "Y",
106
+ "T": "Y",
107
+ "N": "N",
108
+ "NO": "N",
109
+ "FALSE": "N",
110
+ "0": "N",
111
+ "F": "N",
112
+ }
113
+ DAYS_EMPLOYED_SENTINEL = 365243
114
+
115
 
116
  class PredictionRequest(BaseModel):
117
  data: dict[str, Any] | list[dict[str, Any]]
 
133
  required_input_columns: list[str]
134
  numeric_required_columns: list[str]
135
  correlated_imputation: dict[str, dict[str, float | str]]
136
+ reduced_input_columns: list[str] = field(default_factory=list)
137
+ feature_selection_method: str = "default"
138
+ feature_selection_scores: dict[str, float] = field(default_factory=dict)
139
 
140
 
141
  app = FastAPI(title="Credit Scoring API", version="0.1.0")
 
164
  return hashlib.sha256(str(value).encode("utf-8")).hexdigest()
165
 
166
 
167
+ def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
168
+ if pd.isna(value):
169
+ return np.nan
170
+ key = str(value).strip().upper()
171
+ if not key:
172
+ return np.nan
173
+ return mapping.get(key, "Unknown")
174
+
175
+
176
+ def _normalize_inputs(
177
+ df_raw: pd.DataFrame,
178
+ preprocessor: PreprocessorArtifacts,
179
+ ) -> tuple[pd.DataFrame, dict[str, pd.Series], pd.Series]:
180
+ df = df_raw.copy()
181
+ for col in preprocessor.required_input_columns:
182
+ if col not in df.columns:
183
+ df[col] = np.nan
184
+
185
+ unknown_masks: dict[str, pd.Series] = {}
186
+ if "CODE_GENDER" in df.columns:
187
+ raw = df["CODE_GENDER"]
188
+ normalized = raw.apply(lambda v: _normalize_category_value(v, CODE_GENDER_MAPPING))
189
+ unknown_masks["CODE_GENDER"] = normalized.eq("Unknown") & raw.notna()
190
+ df["CODE_GENDER"] = normalized
191
+ if "FLAG_OWN_CAR" in df.columns:
192
+ raw = df["FLAG_OWN_CAR"]
193
+ normalized = raw.apply(lambda v: _normalize_category_value(v, FLAG_OWN_CAR_MAPPING))
194
+ unknown_masks["FLAG_OWN_CAR"] = normalized.eq("Unknown") & raw.notna()
195
+ df["FLAG_OWN_CAR"] = normalized
196
+
197
+ sentinel_mask = pd.Series(False, index=df.index)
198
+ if "DAYS_EMPLOYED" in df.columns:
199
+ values = pd.to_numeric(df["DAYS_EMPLOYED"], errors="coerce")
200
+ sentinel_mask = values == DAYS_EMPLOYED_SENTINEL
201
+ if sentinel_mask.any():
202
+ df.loc[sentinel_mask, "DAYS_EMPLOYED"] = np.nan
203
+
204
+ return df, unknown_masks, sentinel_mask
205
+
206
+
207
+ def _build_data_quality_records(
208
+ df_raw: pd.DataFrame,
209
+ df_norm: pd.DataFrame,
210
+ unknown_masks: dict[str, pd.Series],
211
+ sentinel_mask: pd.Series,
212
+ preprocessor: PreprocessorArtifacts,
213
+ ) -> list[dict[str, Any]]:
214
+ required_cols = preprocessor.required_input_columns
215
+ numeric_required = preprocessor.numeric_required_columns
216
+ numeric_ranges = {
217
+ col: bounds
218
+ for col, bounds in preprocessor.numeric_ranges.items()
219
+ if col in numeric_required
220
+ }
221
+
222
+ missing_mask = df_norm[required_cols].isna() if required_cols else pd.DataFrame(index=df_norm.index)
223
+ invalid_masks: dict[str, pd.Series] = {}
224
+ out_of_range_masks: dict[str, pd.Series] = {}
225
+
226
+ for col in numeric_required:
227
+ if col not in df_raw.columns:
228
+ invalid_masks[col] = pd.Series(False, index=df_norm.index)
229
+ continue
230
+ raw = df_raw[col]
231
+ coerced = pd.to_numeric(raw, errors="coerce")
232
+ invalid_masks[col] = coerced.isna() & raw.notna()
233
+
234
+ for col, (min_val, max_val) in numeric_ranges.items():
235
+ if col not in df_norm.columns:
236
+ out_of_range_masks[col] = pd.Series(False, index=df_norm.index)
237
+ continue
238
+ values = pd.to_numeric(df_norm[col], errors="coerce")
239
+ out_of_range_masks[col] = (values < min_val) | (values > max_val)
240
+
241
+ records: list[dict[str, Any]] = []
242
+ for idx in df_norm.index:
243
+ missing_cols = (
244
+ [col for col in required_cols if missing_mask.at[idx, col]]
245
+ if required_cols
246
+ else []
247
+ )
248
+ invalid_cols = [col for col, mask in invalid_masks.items() if mask.at[idx]]
249
+ out_of_range_cols = [col for col, mask in out_of_range_masks.items() if mask.at[idx]]
250
+ unknown_cols = [col for col, mask in unknown_masks.items() if mask.at[idx]]
251
+ nan_rate = float(missing_mask.loc[idx].mean()) if not missing_mask.empty else 0.0
252
+ records.append(
253
+ {
254
+ "missing_required_columns": missing_cols,
255
+ "invalid_numeric_columns": invalid_cols,
256
+ "out_of_range_columns": out_of_range_cols,
257
+ "unknown_categories": unknown_cols,
258
+ "days_employed_sentinel": bool(sentinel_mask.at[idx]) if not sentinel_mask.empty else False,
259
+ "nan_rate": nan_rate,
260
+ }
261
+ )
262
+ return records
263
+
264
+
265
  def _append_log_entries(entries: list[dict[str, Any]]) -> None:
266
  if not LOG_PREDICTIONS:
267
  return
 
283
  threshold: float | None,
284
  status_code: int,
285
  preprocessor: PreprocessorArtifacts,
286
+ data_quality: list[dict[str, Any]] | None = None,
287
  error: str | None = None,
288
  ) -> None:
289
  if not LOG_PREDICTIONS:
 
309
  "threshold": threshold,
310
  "inputs": inputs,
311
  }
312
+ if data_quality and idx < len(data_quality):
313
+ entry["data_quality"] = data_quality[idx]
314
  if results and idx < len(results):
315
  result = results[idx]
316
  sk_id = result.get("sk_id_curr")
 
369
  for col, max_val in outlier_maxes.items():
370
  df = df[df[col] != max_val]
371
 
372
+ reduced_input_columns, selection_scores, selection_method = _compute_reduced_inputs(
373
+ df,
374
+ input_feature_columns=input_feature_columns,
375
+ )
376
+
377
  numeric_ranges = {}
378
  for col in numeric_cols:
379
  if col in df.columns:
 
389
  required_raw.update(col for col in columns_must_not_missing if col in input_feature_columns)
390
  required_raw.add("SK_ID_CURR")
391
  if USE_REDUCED_INPUTS:
392
+ required_input = reduced_input_columns
393
+ if not required_input:
394
+ required_input = _fallback_reduced_inputs(input_feature_columns)
395
  else:
396
  required_input = sorted(required_raw)
397
  numeric_required = sorted(col for col in required_input if col in numeric_medians)
 
417
  required_input_columns=required_input,
418
  numeric_required_columns=numeric_required,
419
  correlated_imputation=correlated_imputation,
420
+ reduced_input_columns=reduced_input_columns,
421
+ feature_selection_method=selection_method,
422
+ feature_selection_scores=selection_scores,
423
  )
424
 
425
 
 
485
  required_raw = set(ENGINEERED_SOURCES)
486
  required_raw.update(col for col in columns_must_not_missing if col in input_feature_columns)
487
  required_raw.add("SK_ID_CURR")
488
+ required_input = _fallback_reduced_inputs(input_feature_columns)
489
  numeric_required = sorted(col for col in required_input if col in numeric_medians)
490
 
491
  numeric_ranges = {col: (float(df[col].min()), float(df[col].max())) for col in numeric_cols}
 
505
  required_input_columns=required_input,
506
  numeric_required_columns=numeric_required,
507
  correlated_imputation={},
508
+ reduced_input_columns=required_input,
509
+ feature_selection_method="fallback",
510
+ feature_selection_scores={},
511
  )
512
 
513
 
 
516
  preprocessor = joblib.load(artifacts_path)
517
  updated = False
518
  required_updated = False
519
+ if not hasattr(preprocessor, "reduced_input_columns") or not preprocessor.reduced_input_columns:
520
+ reduced_cols, selection_scores, selection_method = _compute_reduced_inputs_from_data(
521
+ data_path, preprocessor
522
+ )
523
+ preprocessor.reduced_input_columns = reduced_cols
524
+ preprocessor.feature_selection_method = selection_method
525
+ preprocessor.feature_selection_scores = selection_scores
526
+ updated = True
527
+ if not hasattr(preprocessor, "feature_selection_method"):
528
+ preprocessor.feature_selection_method = "default"
529
+ updated = True
530
+ if not hasattr(preprocessor, "feature_selection_scores"):
531
+ preprocessor.feature_selection_scores = {}
532
+ updated = True
533
  if not hasattr(preprocessor, "required_input_columns"):
534
  if USE_REDUCED_INPUTS:
535
  required_input = _reduce_input_columns(preprocessor)
 
607
  return ranges
608
 
609
 
610
+ def _dedupe_preserve_order(values: list[str]) -> list[str]:
611
+ seen: set[str] = set()
612
+ output: list[str] = []
613
+ for value in values:
614
+ if value in seen:
615
+ continue
616
+ seen.add(value)
617
+ output.append(value)
618
+ return output
619
+
620
+
621
+ def _fallback_reduced_inputs(input_feature_columns: list[str]) -> list[str]:
622
+ cols = [
623
+ col
624
+ for col in DEFAULT_REDUCED_INPUT_FEATURES
625
+ if col in input_feature_columns or col == "SK_ID_CURR"
626
+ ]
627
+ if "SK_ID_CURR" not in cols:
628
+ cols.insert(0, "SK_ID_CURR")
629
+ return _dedupe_preserve_order(cols)
630
+
631
+
632
+ def _select_reduced_inputs_by_correlation(
633
+ df: pd.DataFrame,
634
+ *,
635
+ input_feature_columns: list[str],
636
+ top_n: int,
637
+ min_corr: float,
638
+ ) -> tuple[list[str], dict[str, float]]:
639
+ if "TARGET" not in df.columns:
640
+ return [], {}
641
+ df_corr = df
642
+ if CORRELATION_SAMPLE_SIZE > 0 and len(df_corr) > CORRELATION_SAMPLE_SIZE:
643
+ df_corr = df_corr.sample(CORRELATION_SAMPLE_SIZE, random_state=42)
644
+ numeric_cols = [
645
+ col
646
+ for col in df_corr.select_dtypes(include=["number"]).columns
647
+ if col in input_feature_columns
648
+ and col not in {"TARGET", "SK_ID_CURR", "is_train", "is_test"}
649
+ ]
650
+ if not numeric_cols:
651
+ return [], {}
652
+ corr = df_corr[numeric_cols + ["TARGET"]].corr()["TARGET"].drop("TARGET")
653
+ corr = corr.dropna()
654
+ if corr.empty:
655
+ return [], {}
656
+ corr = corr.reindex(corr.abs().sort_values(ascending=False).index)
657
+ if min_corr > 0:
658
+ corr = corr[corr.abs() >= min_corr]
659
+ selected_numeric = list(corr.index[:top_n])
660
+ scores = {col: float(abs(corr.loc[col])) for col in selected_numeric}
661
+ selected = ["SK_ID_CURR"]
662
+ selected.extend(selected_numeric)
663
+ selected.extend(
664
+ col
665
+ for col in FEATURE_SELECTION_CATEGORICAL_INPUTS
666
+ if col in input_feature_columns
667
+ )
668
+ selected = [
669
+ col for col in selected if col in input_feature_columns or col == "SK_ID_CURR"
670
+ ]
671
+ return _dedupe_preserve_order(selected), scores
672
+
673
+
674
+ def _compute_reduced_inputs(
675
+ df: pd.DataFrame | None,
676
+ *,
677
+ input_feature_columns: list[str],
678
+ ) -> tuple[list[str], dict[str, float], str]:
679
+ if FEATURE_SELECTION_METHOD != "correlation":
680
+ return _fallback_reduced_inputs(input_feature_columns), {}, "default"
681
+ if df is None or "TARGET" not in df.columns:
682
+ return _fallback_reduced_inputs(input_feature_columns), {}, "default"
683
+ reduced_cols, scores = _select_reduced_inputs_by_correlation(
684
+ df,
685
+ input_feature_columns=input_feature_columns,
686
+ top_n=FEATURE_SELECTION_TOP_N,
687
+ min_corr=FEATURE_SELECTION_MIN_CORR,
688
+ )
689
+ if not reduced_cols:
690
+ return _fallback_reduced_inputs(input_feature_columns), {}, "default"
691
+ return reduced_cols, scores, "correlation"
692
+
693
+
694
  def _build_correlated_imputation(
695
  df: pd.DataFrame,
696
  *,
 
742
 
743
 
744
  def _reduce_input_columns(preprocessor: PreprocessorArtifacts) -> list[str]:
745
+ cols = getattr(preprocessor, "reduced_input_columns", None) or []
746
+ if not cols:
747
+ cols = _fallback_reduced_inputs(preprocessor.input_feature_columns)
748
+ cols = [
749
+ col
750
+ for col in cols
751
+ if col in preprocessor.input_feature_columns or col == "SK_ID_CURR"
752
+ ]
753
  if "SK_ID_CURR" not in cols:
754
+ cols.insert(0, "SK_ID_CURR")
755
+ return _dedupe_preserve_order(cols)
756
+
757
+
758
+ def _compute_reduced_inputs_from_data(
759
+ data_path: Path,
760
+ preprocessor: PreprocessorArtifacts,
761
+ ) -> tuple[list[str], dict[str, float], str]:
762
+ if not data_path.exists():
763
+ return _fallback_reduced_inputs(preprocessor.input_feature_columns), {}, "default"
764
+ df = pd.read_parquet(data_path)
765
+ df = new_features_creation(df)
766
+ df.replace([np.inf, -np.inf], np.nan, inplace=True)
767
+
768
+ if preprocessor.columns_keep:
769
+ df = df[preprocessor.columns_keep]
770
+ if preprocessor.columns_must_not_missing:
771
+ df = df.dropna(subset=preprocessor.columns_must_not_missing)
772
+
773
+ numeric_cols = df.select_dtypes(include=["number"]).columns
774
+ df[numeric_cols] = df[numeric_cols].fillna(pd.Series(preprocessor.numeric_medians))
775
+
776
+ for col in preprocessor.categorical_columns:
777
+ if col in df.columns:
778
+ df[col] = df[col].fillna("Unknown")
779
+
780
+ if "CODE_GENDER" in df.columns:
781
+ df = df[df["CODE_GENDER"] != "XNA"]
782
+
783
+ for col, max_val in preprocessor.outlier_maxes.items():
784
+ if col in df.columns:
785
+ df = df[df[col] != max_val]
786
+
787
+ return _compute_reduced_inputs(df, input_feature_columns=preprocessor.input_feature_columns)
788
 
789
 
790
  def _compute_correlated_imputation(
 
820
  )
821
 
822
 
823
+ def _ensure_required_columns(
824
+ df: pd.DataFrame,
825
+ required_cols: list[str],
826
+ allow_missing: set[str] | None = None,
827
+ ) -> None:
828
+ allow_missing = allow_missing or set()
829
+ missing = [
830
+ col
831
+ for col in required_cols
832
+ if col not in df.columns or (col not in allow_missing and df[col].isna().any())
833
+ ]
834
  if missing:
835
  raise HTTPException(
836
  status_code=422,
 
846
  invalid = []
847
  for col in numeric_cols:
848
  coerced = pd.to_numeric(df[col], errors="coerce")
849
+ if (coerced.isna() & df[col].notna()).any():
850
  invalid.append(col)
851
  if invalid:
852
  raise HTTPException(
 
867
  if col not in df.columns:
868
  continue
869
  values = pd.to_numeric(df[col], errors="coerce")
870
+ mask = values.notna()
871
+ if mask.any() and ((values[mask] < min_val) | (values[mask] > max_val)).any():
 
872
  out_of_range.append(col)
873
  if out_of_range:
874
  raise HTTPException(
 
910
  if col not in df.columns:
911
  df[col] = np.nan
912
 
913
+ allow_missing = {"DAYS_EMPLOYED"}
914
+ _ensure_required_columns(df, artifacts.required_input_columns, allow_missing=allow_missing)
915
  _validate_numeric_inputs(df, artifacts.numeric_required_columns)
916
  _validate_numeric_ranges(df, {k: v for k, v in artifacts.numeric_ranges.items() if k in artifacts.numeric_required_columns})
917
 
 
923
  df = new_features_creation(df)
924
  df.replace([np.inf, -np.inf], np.nan, inplace=True)
925
 
926
+ df = df.reindex(columns=artifacts.columns_keep, fill_value=np.nan)
 
 
 
927
 
928
  _apply_correlated_imputation(df, artifacts)
929
 
 
936
  if col in df.columns:
937
  df[col] = df[col].fillna("Unknown")
938
 
939
+ _ensure_required_columns(df, artifacts.required_input_columns, allow_missing=allow_missing)
940
 
941
  if "CODE_GENDER" in df.columns and (df["CODE_GENDER"] == "XNA").any():
942
  raise HTTPException(
 
955
  )
956
 
957
  df_hot = pd.get_dummies(df, columns=artifacts.categorical_columns)
958
+ df_hot = df_hot.reindex(columns=artifacts.features_to_scaled, fill_value=0)
 
 
 
959
 
960
  scaled = artifacts.scaler.transform(df_hot)
961
  return pd.DataFrame(scaled, columns=artifacts.features_to_scaled, index=df.index)
 
1004
  preprocessor: PreprocessorArtifacts = app.state.preprocessor
1005
  optional_features = [col for col in preprocessor.input_feature_columns if col not in preprocessor.required_input_columns]
1006
  correlated = sorted(getattr(preprocessor, "correlated_imputation", {}) or {})
1007
+ scores = getattr(preprocessor, "feature_selection_scores", {}) or {}
1008
+ selection_scores = {
1009
+ col: round(scores[col], 4)
1010
+ for col in preprocessor.required_input_columns
1011
+ if col in scores
1012
+ }
1013
  payload = {
1014
  "required_input_features": preprocessor.required_input_columns,
1015
  "engineered_features": ENGINEERED_FEATURES,
1016
  "model_features_count": len(preprocessor.features_to_scaled),
1017
+ "feature_selection_method": preprocessor.feature_selection_method,
1018
+ "feature_selection_top_n": FEATURE_SELECTION_TOP_N,
1019
+ "feature_selection_min_corr": FEATURE_SELECTION_MIN_CORR,
1020
+ "feature_selection_scores": selection_scores,
1021
  "correlation_threshold": CORRELATION_THRESHOLD,
1022
  "correlated_imputation_count": len(correlated),
1023
  "correlated_imputation_features": correlated[:50],
 
1032
  return payload
1033
 
1034
 
1035
+ @app.get("/logs")
1036
+ def logs(
1037
+ tail: int = Query(default=200, ge=1, le=2000),
1038
+ x_logs_token: str | None = Header(default=None, alias="X-Logs-Token"),
1039
+ authorization: str | None = Header(default=None),
1040
+ ) -> Response:
1041
+ if not LOGS_ACCESS_TOKEN:
1042
+ raise HTTPException(status_code=503, detail={"message": "Logs access token not configured."})
1043
+
1044
+ token = x_logs_token
1045
+ if token is None and authorization:
1046
+ prefix = "bearer "
1047
+ if authorization.lower().startswith(prefix):
1048
+ token = authorization[len(prefix):].strip() or None
1049
+
1050
+ if token != LOGS_ACCESS_TOKEN:
1051
+ raise HTTPException(status_code=403, detail={"message": "Invalid logs access token."})
1052
+
1053
+ if not LOG_PREDICTIONS:
1054
+ raise HTTPException(status_code=404, detail={"message": "Prediction logging is disabled."})
1055
+
1056
+ log_path = LOG_DIR / LOG_FILE
1057
+ if not log_path.exists():
1058
+ raise HTTPException(status_code=404, detail={"message": "Log file not found."})
1059
+
1060
+ with log_path.open("r", encoding="utf-8") as handle:
1061
+ lines = deque(handle, maxlen=tail)
1062
+
1063
+ return Response(content="".join(lines), media_type="application/x-ndjson")
1064
+
1065
+
1066
  @app.post("/predict")
1067
  def predict(
1068
  payload: PredictionRequest,
 
1079
 
1080
  try:
1081
  df_raw = pd.DataFrame.from_records(records)
1082
+ df_norm, unknown_masks, sentinel_mask = _normalize_inputs(df_raw, preprocessor)
1083
+ log_records = df_norm.to_dict(orient="records")
1084
+ dq_records = _build_data_quality_records(
1085
+ df_raw,
1086
+ df_norm,
1087
+ unknown_masks,
1088
+ sentinel_mask,
1089
+ preprocessor,
1090
+ )
1091
+ if "SK_ID_CURR" not in df_norm.columns:
1092
  raise HTTPException(status_code=422, detail={"message": "SK_ID_CURR is required."})
1093
 
1094
+ sk_ids = df_norm["SK_ID_CURR"].tolist()
1095
+ features = preprocess_input(df_norm, preprocessor)
1096
 
1097
  if hasattr(model, "predict_proba"):
1098
  proba = model.predict_proba(features)[:, 1]
 
1109
  latency_ms = (time.perf_counter() - start_time) * 1000.0
1110
  _log_prediction_entries(
1111
  request_id=request_id,
1112
+ records=log_records,
1113
  results=results,
1114
  latency_ms=latency_ms,
1115
  threshold=use_threshold,
1116
  status_code=200,
1117
  preprocessor=preprocessor,
1118
+ data_quality=dq_records,
1119
  )
1120
  return {"predictions": results, "threshold": use_threshold}
1121
 
 
1130
  latency_ms = (time.perf_counter() - start_time) * 1000.0
1131
  _log_prediction_entries(
1132
  request_id=request_id,
1133
+ records=log_records,
1134
  results=results,
1135
  latency_ms=latency_ms,
1136
  threshold=None,
1137
  status_code=200,
1138
  preprocessor=preprocessor,
1139
+ data_quality=dq_records,
1140
  )
1141
  return {"predictions": results, "threshold": None}
1142
  except HTTPException as exc:
 
1144
  detail = exc.detail if isinstance(exc.detail, dict) else {"message": str(exc.detail)}
1145
  _log_prediction_entries(
1146
  request_id=request_id,
1147
+ records=log_records if "log_records" in locals() else records,
1148
  results=None,
1149
  latency_ms=latency_ms,
1150
  threshold=threshold,
1151
  status_code=exc.status_code,
1152
  preprocessor=preprocessor,
1153
+ data_quality=dq_records if "dq_records" in locals() else None,
1154
  error=json.dumps(detail, ensure_ascii=True),
1155
  )
1156
  raise
 
1158
  latency_ms = (time.perf_counter() - start_time) * 1000.0
1159
  _log_prediction_entries(
1160
  request_id=request_id,
1161
+ records=log_records if "log_records" in locals() else records,
1162
  results=None,
1163
  latency_ms=latency_ms,
1164
  threshold=threshold,
1165
  status_code=500,
1166
  preprocessor=preprocessor,
1167
+ data_quality=dq_records if "dq_records" in locals() else None,
1168
  error=str(exc),
1169
  )
1170
  raise
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md CHANGED
@@ -10,9 +10,9 @@ pinned: false
10
 
11
  # OCR Projet 06 – Crédit
12
 
13
- [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/stephmnt/OCR_Projet06/deploy.yml)](https://github.com/stephmnt/OCR_Projet05/actions/workflows/deploy.yml)
14
- [![GitHub Release Date](https://img.shields.io/github/release-date/stephmnt/OCR_Projet06?display_date=published_at&style=flat-square)](https://github.com/stephmnt/OCR_Projet06/releases)
15
- [![project_license](https://img.shields.io/github/license/stephmnt/OCR_projet06.svg)](https://github.com/stephmnt/OCR_Projet06/blob/main/LICENSE)
16
 
17
  ## Launch MLFlow
18
 
@@ -41,12 +41,9 @@ mlflow models serve -m "models:/credit_scoring_model/Staging" -p 5001 --no-conda
41
 
42
  ## API FastAPI
43
 
44
- The API expects a JSON payload with a `data` key. The value can be a single
45
- object (one client) or a list of objects (several clients). The list of
46
- required features (reduced set) is available via the `/features` endpoint. The
47
- other fields are optional and are filled with default values.
48

49
- Minimum inputs (10 + `SK_ID_CURR`):
50
 
51
  - `EXT_SOURCE_2`
52
  - `EXT_SOURCE_3`
@@ -59,6 +56,12 @@ Inputs minimums (10 + `SK_ID_CURR`) :
59
  - `DAYS_BIRTH`
60
  - `FLAG_OWN_CAR`
61
 
62
  ### Poetry environment (recommended)
63
 
64
  The `pyproject.toml` file pins compatible versions for a recent stack
@@ -131,22 +134,23 @@ Lancer l'API :
131
  uvicorn app.main:app --reload --port 7860
132
  ```
133
 
134
- Check the service:
135
 
136
  ```shell
137
- curl -s http://127.0.0.1:7860/health
 
138
  ```
139
 
140
- Show the expected features:
141
 
142
  ```shell
143
- curl -s http://127.0.0.1:7860/features
144
  ```
145
 
146
- Score a single client:
147
 
148
  ```shell
149
- curl -s -X POST "http://127.0.0.1:7860/predict?threshold=0.5" \
150
  -H "Content-Type: application/json" \
151
  -d '{
152
  "data": {
@@ -165,6 +169,109 @@ curl -s -X POST "http://127.0.0.1:7860/predict?threshold=0.5" \
165
  }'
166
  ```
167
 
168
  ## Release contents
169
 
170
  - **Preparation + pipeline**: cleaning / preparation, encoding, imputation and the training pipeline are present.
@@ -174,19 +281,28 @@ curl -s -X POST "http://127.0.0.1:7860/predict?threshold=0.5" \
174
  - **Cross-validation + tuning**: `StratifiedKFold`, `GridSearchCV` and Hyperopt are used.
175
  - **Business score + optimal threshold**: `custom_score` is the main metric in the comparison tables and in CV, with a computed `best_threshold`.
176
  - **Explainability**: feature importance, SHAP and LIME are included.
177
- - **MLOps (MLflow)**: tracking of params / metrics (including `custom_score` and `best_threshold`), tags,
178
- registry and promotion to "Staging".
 
 
 
 
 
 
179
 
180
- ![Screenshot MLFlow](https://raw.githubusercontent.com/stephmnt/OCR_Projet06/main/screen-mlflow.png)
181
 
182
- ## Feature reduction
183

184
- Feature reduction: the API uses a SHAP top-10, whereas the brief insists on a reduction based on a correlation matrix. The correlation is well documented in the exploration notebook, but the list used by the API is not explicitly derived from that matrix. This should be clarified in the docs, or the selection aligned with the correlation.
 
 
 
 
185
 
186
- ## Quick glossary
187

188
- - **custom_score**: business metric that penalizes false negatives more heavily than false positives.
189
- - **Optimal threshold**: the probability cut-off used to turn a score into a 0/1 class.
190
- - **Cross-validation (CV)**: evaluation on several sub-samples to avoid a "lucky" result.
191
- - **MLflow tracking**: history of runs, parameters and metrics.
192
- - **Registry**: MLflow area for versioning and promoting a model (e.g. "Staging").
 
10
 
11
  # OCR Projet 06 – Crédit
12
 
13
+ [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/stephmnt/credit-scoring-mlops/deploy.yml)](https://github.com/stephmnt/credit-scoring-mlops/actions/workflows/deploy.yml)
14
+ [![GitHub Release Date](https://img.shields.io/github/release-date/stephmnt/credit-scoring-mlops?display_date=published_at&style=flat-square)](https://github.com/stephmnt/credit-scoring-mlops/releases)
15
+ [![project_license](https://img.shields.io/github/license/stephmnt/credit-scoring-mlops.svg)](https://github.com/stephmnt/credit-scoring-mlops/blob/main/LICENSE)
16
 
17
  ## Launch MLFlow
18
 
 
41
 
42
  ## API FastAPI
43
 
44
+ The API expects a JSON payload with a `data` key. The value can be a single object (one client) or a list of objects (several clients). The list of required features (reduced set) is available via the `/features` endpoint. The other fields are optional and are filled with default values.
 
 
 
45
 
46
+ Minimum inputs (10 + `SK_ID_CURR`), derived from a correlation-based selection (see `/features`):
47
 
48
  - `EXT_SOURCE_2`
49
  - `EXT_SOURCE_3`
 
56
  - `DAYS_BIRTH`
57
  - `FLAG_OWN_CAR`
58
 
59
+ Useful parameters (feature selection); see the launch example below:
60
+
61
+ - `FEATURE_SELECTION_METHOD` (default: `correlation`)
62
+ - `FEATURE_SELECTION_TOP_N` (default: `8`)
63
+ - `FEATURE_SELECTION_MIN_CORR` (default: `0.02`)
64
+
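These variables are read from the environment at startup, so the selection can be tuned when launching the API (a sketch; the values shown are arbitrary):

```shell
FEATURE_SELECTION_METHOD=correlation FEATURE_SELECTION_TOP_N=12 \
  uvicorn app.main:app --reload --port 7860
```
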
65
  ### Poetry environment (recommended)
66
 
67
  The `pyproject.toml` file pins compatible versions for a recent stack
 
134
  uvicorn app.main:app --reload --port 7860
135
  ```
136
 
137
+ Check the service (HF):
138
 
139
  ```shell
140
+ BASE_URL="https://stephmnt-credit-scoring-mlops.hf.space"
141
+ curl -s "${BASE_URL}/health"
142
  ```
143
 
144
+ Show the expected features (HF):
145
 
146
  ```shell
147
+ curl -s "${BASE_URL}/features"
148
  ```
149
 
150
+ Score a single client (HF):
151
 
152
  ```shell
153
+ curl -s -X POST "${BASE_URL}/predict?threshold=0.5" \
154
  -H "Content-Type: application/json" \
155
  -d '{
156
  "data": {
 
169
  }'
170
  ```
171
 
172
+ Score several clients (batch, HF):
173
+
174
+ ```shell
175
+ curl -s -X POST "${BASE_URL}/predict?threshold=0.45" \
176
+ -H "Content-Type: application/json" \
177
+ -d '{
178
+ "data": [
179
+ {
180
+ "SK_ID_CURR": 100002,
181
+ "EXT_SOURCE_2": 0.61,
182
+ "EXT_SOURCE_3": 0.75,
183
+ "AMT_ANNUITY": 24700.5,
184
+ "EXT_SOURCE_1": 0.45,
185
+ "CODE_GENDER": "M",
186
+ "DAYS_EMPLOYED": -637,
187
+ "AMT_CREDIT": 406597.5,
188
+ "AMT_GOODS_PRICE": 351000.0,
189
+ "DAYS_BIRTH": -9461,
190
+ "FLAG_OWN_CAR": "N"
191
+ },
192
+ {
193
+ "SK_ID_CURR": 100003,
194
+ "EXT_SOURCE_2": 0.52,
195
+ "EXT_SOURCE_3": 0.64,
196
+ "AMT_ANNUITY": 19000.0,
197
+ "EXT_SOURCE_1": 0.33,
198
+ "CODE_GENDER": "F",
199
+ "DAYS_EMPLOYED": -1200,
200
+ "AMT_CREDIT": 320000.0,
201
+ "AMT_GOODS_PRICE": 280000.0,
202
+ "DAYS_BIRTH": -12000,
203
+ "FLAG_OWN_CAR": "Y"
204
+ }
205
+ ]
206
+ }'
207
+ ```
208
+
209
+ Error example (missing required field, HF):
210
+
211
+ ```shell
212
+ curl -s -X POST "${BASE_URL}/predict" \
213
+ -H "Content-Type: application/json" \
214
+ -d '{
215
+ "data": {
216
+ "EXT_SOURCE_2": 0.61
217
+ }
218
+ }'
219
+ ```
220
+
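The same call from Python, for reference (a minimal sketch; it assumes the `requests` package is installed and reuses the `BASE_URL` from the examples above):

```python
import requests

BASE_URL = "https://stephmnt-credit-scoring-mlops.hf.space"

client = {
    "SK_ID_CURR": 100002,
    "EXT_SOURCE_2": 0.61,
    "EXT_SOURCE_3": 0.75,
    "AMT_ANNUITY": 24700.5,
    "EXT_SOURCE_1": 0.45,
    "CODE_GENDER": "M",
    "DAYS_EMPLOYED": -637,
    "AMT_CREDIT": 406597.5,
    "AMT_GOODS_PRICE": 351000.0,
    "DAYS_BIRTH": -9461,
    "FLAG_OWN_CAR": "N",
}

# Pass a list of such dicts under "data" for a batch, as in the curl examples.
resp = requests.post(
    f"{BASE_URL}/predict",
    params={"threshold": 0.5},
    json={"data": client},
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # {"predictions": [...], "threshold": 0.5}
```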
221
+ ## Monitoring & Data Drift (Step 3)
222
+
223
+ The API records `/predict` calls as JSONL (inputs, outputs, latency).
224
+ By default, the logs are stored in `logs/predictions.jsonl`.
225
+
226
+ Useful variables:
227
+
228
+ - `LOG_PREDICTIONS=1` enables log writing (default: 1)
229
+ - `LOG_DIR=logs`
230
+ - `LOG_FILE=predictions.jsonl`
231
+ - `LOGS_ACCESS_TOKEN` to protect the `/logs` endpoint
232
+ - `LOG_HASH_SK_ID=1` to anonymize `SK_ID_CURR`
233
+
234
+ Local example:
235
+
236
+ ```shell
237
+ LOG_PREDICTIONS=1 LOG_DIR=logs uvicorn app.main:app --reload --port 7860
238
+ ```
239
+
240
+ Retrieve the logs (HF):
241
+
242
+ Set `LOGS_ACCESS_TOKEN` in the Space secrets, then:
243
+
244
+ ```shell
245
+ curl -s -H "X-Logs-Token: $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
246
+ ```
247
+
248
+ Alternative:
249
+
250
+ ```shell
251
+ curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
252
+ ```
253
+
254
+ After a few requests, generate the drift report:
255
+
256
+ ```shell
257
+ python monitoring/drift_report.py \
258
+ --logs logs/predictions.jsonl \
259
+ --reference data/data_final.parquet \
260
+ --output-dir reports
261
+ ```
262
+
263
+ The HTML report is generated at `reports/drift_report.html` (with plots in
264
+ `reports/plots/`). On Hugging Face, the disk is ephemeral: download the logs
265
+ before analyzing.
266
+
267
+ The report also includes the distribution of predicted scores and the prediction rate
268
+ (use the `--score-bins` option to adjust the number of bins); a small example of both statistics follows below.
269
+
270
+ Captures (local snapshot of the reporting + storage):
271
+
272
+ - Report: `docs/monitoring/drift_report.html` + `docs/monitoring/plots/`
273
+ - Logs storage: `docs/monitoring/logs_storage.png`
274
+
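A minimal sketch of those two statistics computed directly from a downloaded log file (the path is illustrative; one JSON object per line, as written by the prediction logger):

```python
import pandas as pd

logs = pd.read_json("logs/predictions.jsonl", lines=True)

print(logs["probability"].describe())  # distribution of predicted scores
print(f"positive rate = {logs['prediction'].mean():.3f}")  # share of class-1 predictions
```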
275
  ## Release contents
276

277
  - **Preparation + pipeline**: cleaning / preparation, encoding, imputation and the training pipeline are present.

281
  - **Cross-validation + tuning**: `StratifiedKFold`, `GridSearchCV` and Hyperopt are used.
282
  - **Business score + optimal threshold**: `custom_score` is the main metric in the comparison tables and in CV, with a computed `best_threshold`.
283
  - **Explainability**: feature importance, SHAP and LIME are included.
284
+ - **Correlation-based feature selection**: top-N numeric features + a small categorical set, exposed via `/features`.
285
+ - **Monitoring & drift**: HTML report with KS/PSI + distribution of predicted scores and prediction rate
286
+ (snapshots in `docs/monitoring/`).
287
+ - **CI/CD**: tests with coverage (`pytest-cov`), Docker build and deployment to Hugging Face Spaces.
288
+
289
+ ![Screenshot MLFlow](https://raw.githubusercontent.com/stephmnt/credit-scoring-mlops/main/screen-mlflow.png)
290
+
291
+ ### Priority gaps
292

293
+ * Mission 2 Step 4 not covered: no post-deployment profiling/optimization and no gains report; to be delivered with an optimized version.
294

295
+ ### Evidence / docs to complete
296

297
+ * Add an explicit link to the public repository plus the version/branch strategy to README.md.
298
+ * Keep proof of the MLflow model registry/serving (registry UI screenshot or serving command) in addition to screen-mlflow.png.
299
+ * The reference dataset is not versioned (data_final.parquet is gitignored); document how to obtain it so drift_report.py can be run.
300
+ * The GitHub Actions badge points to OCR_Projet05 in README.md; fix the URL.
301
+ * GDPR/PII: LOG_HASH_SK_ID is disabled by default in main.py; state in README.md that it should be enabled in production.
302

303
+ ### Recommended improvements
304

305
+ * Complete the API tests: /logs (auth OK/KO), batch predict, the threshold parameter, missing SK_ID_CURR, and outliers in test_api.py.
306
+ * Simplify the ALLOW_MISSING_ARTIFACTS fallback and DummyModel if the artifacts are versioned (clean up main.py and conftest.py).
307
+ * Unify dependency management (Poetry vs requirements.txt) and align pyproject.toml / requirements.txt.
308
+ * If the evaluator expects a branch strategy, create a feature branch and merge it as evidence.
 
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.dockerignore ADDED
@@ -0,0 +1,6 @@
1
+ mlruns/
2
+ *.ipynb
3
+ data/*.csv
4
+ data/*.parquet
5
+ !data/data_final.parquet
6
+ !data/HistGB_final_model.pkl
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml ADDED
@@ -0,0 +1,54 @@
1
+ name: ci-cd
2
+
3
+ on:
4
+ push:
5
+ branches: ["main"]
6
+ pull_request:
7
+ branches: ["main"]
8
+
9
+ jobs:
10
+ test-build:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - name: Checkout
14
+ uses: actions/checkout@v4
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.11"
20
+
21
+ - name: Install dependencies
22
+ run: |
23
+ python -m pip install --upgrade pip
24
+ pip install poetry
25
+ poetry install --no-interaction --no-ansi
26
+
27
+ - name: Run tests
28
+ run: poetry run pytest -q
29
+
30
+ - name: Deploy to Hugging Face Space
31
+ if: github.ref == 'refs/heads/main'
32
+ env:
33
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
34
+ run: |
35
+ git config --global user.email "actions@github.com"
36
+ git config --global user.name "GitHub Actions"
37
+ git clone https://huggingface.co/spaces/stephmnt/ocr_projet06 hf_space
38
+ rsync -av \
39
+ --exclude '.git' \
40
+ --exclude '.venv' \
41
+ --exclude '.pytest_cache' \
42
+ --exclude '__pycache__' \
43
+ --exclude 'mlruns' \
44
+ --exclude '*.ipynb' \
45
+ --exclude 'logs' \
46
+           --exclude 'reports' \
+           --exclude 'hf_space' \
47
+ --exclude 'screen-mlflow.png' \
48
+ --exclude 'data/*.csv' \
49
+ --exclude 'data/*.parquet' \
50
+ ./ hf_space/
51
+ cd hf_space
52
+ git add .
53
+ git commit -m "Auto-deploy from GitHub Actions" || echo "No changes to commit"
54
+ git push https://stephmnt:${HF_TOKEN}@huggingface.co/spaces/stephmnt/ocr_projet06 main
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitignore ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ressources/
2
+ .venv/
3
+ __pycache__/
4
+ *.pyc
5
+ logs/
6
+ reports/
7
+ data/*
8
+ !data/HistGB_final_model.pkl
9
+ artifacts/*
10
+ !artifacts/preprocessor.joblib
11
+ .DS_Store
12
+ .vscode/
13
+ .idea/
14
+ .env
15
+ .ipynb_checkpoints/
16
+ mlruns/
17
+ .DS_Store
18
+ *.code-workspace
19
+
20
+ ## https://github.com/github/gitignore/blob/e8554d85bf62e38d6db966a50d2064ac025fd82a/Python.gitignore
21
+
22
+ # Byte-compiled / optimized / DLL files
23
+ __pycache__/
24
+ *.py[cod]
25
+ *$py.class
26
+
27
+ # C extensions
28
+ *.so
29
+
30
+ # Distribution / packaging
31
+ .Python
32
+ build/
33
+ develop-eggs/
34
+ dist/
35
+ downloads/
36
+ eggs/
37
+ .eggs/
38
+ lib/
39
+ lib64/
40
+ parts/
41
+ sdist/
42
+ var/
43
+ wheels/
44
+ share/python-wheels/
45
+ *.egg-info/
46
+ .installed.cfg
47
+ *.egg
48
+ MANIFEST
49
+
50
+ # PyInstaller
51
+ # Usually these files are written by a python script from a template
52
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
53
+ *.manifest
54
+ *.spec
55
+
56
+ # Installer logs
57
+ pip-log.txt
58
+ pip-delete-this-directory.txt
59
+
60
+ # Unit test / coverage reports
61
+ htmlcov/
62
+ .tox/
63
+ .nox/
64
+ .coverage
65
+ .coverage.*
66
+ .cache
67
+ nosetests.xml
68
+ coverage.xml
69
+ *.cover
70
+ *.py,cover
71
+ .hypothesis/
72
+ .pytest_cache/
73
+ cover/
74
+
75
+ # Translations
76
+ *.mo
77
+ *.pot
78
+
79
+ # Django stuff:
80
+ *.log
81
+ local_settings.py
82
+ db.sqlite3
83
+ db.sqlite3-journal
84
+
85
+ # Flask stuff:
86
+ instance/
87
+ .webassets-cache
88
+
89
+ # Scrapy stuff:
90
+ .scrapy
91
+
92
+ # PyBuilder
93
+ .pybuilder/
94
+ target/
95
+
96
+ # Jupyter Notebook
97
+ .ipynb_checkpoints
98
+
99
+ # IPython
100
+ profile_default/
101
+ ipython_config.py
102
+
103
+ # pyenv
104
+ # For a library or package, you might want to ignore these files since the code is
105
+ # intended to run in multiple environments; otherwise, check them in:
106
+ # .python-version
107
+
108
+ # pipenv
109
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
110
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
111
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
112
+ # install all needed dependencies.
113
+ #Pipfile.lock
114
+
115
+ # UV
116
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
117
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
118
+ # commonly ignored for libraries.
119
+ #uv.lock
120
+
121
+ # poetry
122
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
123
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
124
+ # commonly ignored for libraries.
125
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
126
+ #poetry.lock
127
+
128
+ # pdm
129
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
130
+ #pdm.lock
131
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
132
+ # in version control.
133
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
134
+ .pdm.toml
135
+ .pdm-python
136
+ .pdm-build/
137
+
138
+ # pixi
139
+ # pixi.lock should be committed to version control for reproducibility
140
+ # .pixi/ contains the environments and should not be committed
141
+ .pixi/
142
+
143
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
144
+ __pypackages__/
145
+
146
+ # Celery stuff
147
+ celerybeat-schedule
148
+ celerybeat.pid
149
+
150
+ # SageMath parsed files
151
+ *.sage.py
152
+
153
+ # Environments
154
+ .env
155
+ .venv
156
+ env/
157
+ venv/
158
+ ENV/
159
+ env.bak/
160
+ venv.bak/
161
+
162
+ # Spyder project settings
163
+ .spyderproject
164
+ .spyproject
165
+
166
+ # Rope project settings
167
+ .ropeproject
168
+
169
+ # mypy
170
+ .mypy_cache/
171
+ .dmypy.json
172
+ dmypy.json
173
+
174
+ # Pyre type checker
175
+ .pyre/
176
+
177
+ # pytype static type analyzer
178
+ .pytype/
179
+
180
+ # Cython debug symbols
181
+ cython_debug/
182
+
183
+ # PyCharm
184
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
185
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
186
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
187
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
188
+ #.idea/
189
+
190
+ # Ruff stuff:
191
+ .ruff_cache/
192
+
193
+ # PyPI configuration file
194
+ .pypirc
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/LICENSE ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ The MIT License (MIT)
2
+ Copyright (c) 2025, Stéphane Manet
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5
+
6
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7
+
8
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Package marker for app module.
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py ADDED
@@ -0,0 +1,828 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ import pickle
6
+ from dataclasses import dataclass
7
+ from datetime import datetime, timezone
8
+ import hashlib
9
+ import json
10
+ from pathlib import Path
11
+ import time
12
+ from typing import Any
13
+ import uuid
14
+
15
+ import numpy as np
16
+ import pandas as pd
17
+ from fastapi import FastAPI, HTTPException, Query, Response
18
+ from pydantic import BaseModel
19
+ from sklearn.preprocessing import MinMaxScaler
20
+ import joblib
21
+
22
+ logger = logging.getLogger("uvicorn.error")
23
+
24
+ MODEL_PATH = Path(os.getenv("MODEL_PATH", "data/HistGB_final_model.pkl"))
25
+ DATA_PATH = Path(os.getenv("DATA_PATH", "data/data_final.parquet"))
26
+ ARTIFACTS_PATH = Path(os.getenv("ARTIFACTS_PATH", "artifacts/preprocessor.joblib"))
27
+ DEFAULT_THRESHOLD = float(os.getenv("PREDICTION_THRESHOLD", "0.5"))
28
+ CACHE_PREPROCESSOR = os.getenv("CACHE_PREPROCESSOR", "1") != "0"
29
+ USE_REDUCED_INPUTS = os.getenv("USE_REDUCED_INPUTS", "1") != "0"
30
+ CORRELATION_THRESHOLD = float(os.getenv("CORRELATION_THRESHOLD", "0.85"))
31
+ CORRELATION_SAMPLE_SIZE = int(os.getenv("CORRELATION_SAMPLE_SIZE", "50000"))
32
+ ALLOW_MISSING_ARTIFACTS = os.getenv("ALLOW_MISSING_ARTIFACTS", "0") == "1"
33
+ LOG_PREDICTIONS = os.getenv("LOG_PREDICTIONS", "1") == "1"
34
+ LOG_DIR = Path(os.getenv("LOG_DIR", "logs"))
35
+ LOG_FILE = os.getenv("LOG_FILE", "predictions.jsonl")
36
+ LOG_INCLUDE_INPUTS = os.getenv("LOG_INCLUDE_INPUTS", "1") == "1"
37
+ LOG_HASH_SK_ID = os.getenv("LOG_HASH_SK_ID", "0") == "1"
38
+ MODEL_VERSION = os.getenv("MODEL_VERSION", MODEL_PATH.name)
39
+
40
+ IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
41
+ ENGINEERED_FEATURES = [
42
+ "DAYS_EMPLOYED_PERC",
43
+ "INCOME_CREDIT_PERC",
44
+ "INCOME_PER_PERSON",
45
+ "ANNUITY_INCOME_PERC",
46
+ "PAYMENT_RATE",
47
+ ]
48
+ ENGINEERED_SOURCES = [
49
+ "DAYS_EMPLOYED",
50
+ "DAYS_BIRTH",
51
+ "AMT_INCOME_TOTAL",
52
+ "AMT_CREDIT",
53
+ "CNT_FAM_MEMBERS",
54
+ "AMT_ANNUITY",
55
+ ]
56
+ # Top inputs derived from SHAP importance (modeling notebook), limited to application features.
57
+ REDUCED_INPUT_FEATURES = [
58
+ "SK_ID_CURR",
59
+ "EXT_SOURCE_2",
60
+ "EXT_SOURCE_3",
61
+ "AMT_ANNUITY",
62
+ "EXT_SOURCE_1",
63
+ "CODE_GENDER",
64
+ "DAYS_EMPLOYED",
65
+ "AMT_CREDIT",
66
+ "AMT_GOODS_PRICE",
67
+ "DAYS_BIRTH",
68
+ "FLAG_OWN_CAR",
69
+ ]
70
+ OUTLIER_COLUMNS = [
71
+ "CNT_FAM_MEMBERS",
72
+ "AMT_INCOME_TOTAL",
73
+ "AMT_ANNUITY",
74
+ "DAYS_EMPLOYED",
75
+ "OBS_60_CNT_SOCIAL_CIRCLE",
76
+ "OBS_30_CNT_SOCIAL_CIRCLE",
77
+ "DEF_60_CNT_SOCIAL_CIRCLE",
78
+ "DEF_30_CNT_SOCIAL_CIRCLE",
79
+ "REGION_POPULATION_RELATIVE",
80
+ "AMT_REQ_CREDIT_BUREAU_YEAR",
81
+ "AMT_REQ_CREDIT_BUREAU_QRT",
82
+ ]
83
+
84
+
85
+ class PredictionRequest(BaseModel):
86
+ data: dict[str, Any] | list[dict[str, Any]]
87
+
88
+
89
+ @dataclass
90
+ class PreprocessorArtifacts:
91
+ columns_keep: list[str]
92
+ columns_must_not_missing: list[str]
93
+ numeric_medians: dict[str, float]
94
+ categorical_columns: list[str]
95
+ outlier_maxes: dict[str, float]
96
+ numeric_ranges: dict[str, tuple[float, float]]
97
+ features_to_scaled: list[str]
98
+ scaler: MinMaxScaler
99
+ raw_feature_columns: list[str]
100
+ input_feature_columns: list[str]
101
+ required_raw_columns: list[str]
102
+ required_input_columns: list[str]
103
+ numeric_required_columns: list[str]
104
+ correlated_imputation: dict[str, dict[str, float | str]]
105
+
106
+
107
+ app = FastAPI(title="Credit Scoring API", version="0.1.0")
108
+
109
+
110
+ class DummyModel:
111
+ def predict_proba(self, X: pd.DataFrame | np.ndarray) -> np.ndarray:
112
+ count = len(X)
113
+ return np.tile([0.5, 0.5], (count, 1))
114
+
115
+ def predict(self, X: pd.DataFrame | np.ndarray) -> np.ndarray:
116
+ return np.zeros(len(X), dtype=int)
117
+
118
+
119
+ def _json_fallback(obj: Any) -> Any:
120
+ if isinstance(obj, (np.integer, np.floating)):
121
+ return obj.item()
122
+ if isinstance(obj, (np.ndarray,)):
123
+ return obj.tolist()
124
+ if isinstance(obj, (pd.Timestamp,)):
125
+ return obj.isoformat()
126
+ return str(obj)
127
+
128
+
129
+ def _hash_value(value: Any) -> str:
130
+ return hashlib.sha256(str(value).encode("utf-8")).hexdigest()
131
+
132
+
133
+ def _append_log_entries(entries: list[dict[str, Any]]) -> None:
134
+ if not LOG_PREDICTIONS:
135
+ return
136
+ try:
137
+ LOG_DIR.mkdir(parents=True, exist_ok=True)
138
+ log_path = LOG_DIR / LOG_FILE
139
+ with log_path.open("a", encoding="utf-8") as handle:
140
+ for entry in entries:
141
+ handle.write(json.dumps(entry, ensure_ascii=True, default=_json_fallback) + "\n")
142
+ except OSError as exc:
143
+ logger.warning("Failed to write prediction logs: %s", exc)
144
+
145
+
146
+ def _log_prediction_entries(
147
+ request_id: str,
148
+ records: list[dict[str, Any]],
149
+ results: list[dict[str, Any]] | None,
150
+ latency_ms: float,
151
+ threshold: float | None,
152
+ status_code: int,
153
+ preprocessor: PreprocessorArtifacts,
154
+ error: str | None = None,
155
+ ) -> None:
156
+ if not LOG_PREDICTIONS:
157
+ return
158
+ if not records:
159
+ records = [{}]
160
+ timestamp = datetime.now(timezone.utc).isoformat()
161
+ required_cols = preprocessor.required_input_columns
162
+ entries: list[dict[str, Any]] = []
163
+ for idx, record in enumerate(records):
164
+ inputs: dict[str, Any] = {}
165
+ if LOG_INCLUDE_INPUTS:
166
+ inputs = {col: record.get(col) for col in required_cols if col in record}
167
+ if LOG_HASH_SK_ID and "SK_ID_CURR" in inputs:
168
+ inputs["SK_ID_CURR"] = _hash_value(inputs["SK_ID_CURR"])
169
+ entry: dict[str, Any] = {
170
+ "timestamp": timestamp,
171
+ "request_id": request_id,
172
+ "endpoint": "/predict",
173
+ "latency_ms": round(latency_ms, 3),
174
+ "status_code": status_code,
175
+ "model_version": MODEL_VERSION,
176
+ "threshold": threshold,
177
+ "inputs": inputs,
178
+ }
179
+ if results and idx < len(results):
180
+ result = results[idx]
181
+ sk_id = result.get("sk_id_curr")
182
+ entry.update(
183
+ {
184
+ "sk_id_curr": _hash_value(sk_id) if LOG_HASH_SK_ID and sk_id is not None else sk_id,
185
+ "probability": result.get("probability"),
186
+ "prediction": result.get("prediction"),
187
+ }
188
+ )
189
+ if error:
190
+ entry["error"] = error
191
+ entries.append(entry)
192
+ _append_log_entries(entries)
193
+
194
+
195
+ def new_features_creation(df: pd.DataFrame) -> pd.DataFrame:
196
+ df_features = df.copy()
197
+ for col in ENGINEERED_SOURCES:
198
+ if col not in df_features.columns:
199
+ df_features[col] = np.nan
200
+ df_features["DAYS_EMPLOYED_PERC"] = df_features["DAYS_EMPLOYED"] / df_features["DAYS_BIRTH"]
201
+ df_features["INCOME_CREDIT_PERC"] = df_features["AMT_INCOME_TOTAL"] / df_features["AMT_CREDIT"]
202
+ df_features["INCOME_PER_PERSON"] = df_features["AMT_INCOME_TOTAL"] / df_features["CNT_FAM_MEMBERS"]
203
+ df_features["ANNUITY_INCOME_PERC"] = df_features["AMT_ANNUITY"] / df_features["AMT_INCOME_TOTAL"]
204
+ df_features["PAYMENT_RATE"] = df_features["AMT_ANNUITY"] / df_features["AMT_CREDIT"]
205
+ return df_features
206
+
207
+
208
+ def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
209
+ df = pd.read_parquet(data_path)
210
+ raw_feature_columns = df.columns.tolist()
211
+ input_feature_columns = [c for c in raw_feature_columns if c not in ["is_train", "is_test", "TARGET"]]
212
+
213
+ df = new_features_creation(df)
214
+ df.replace([np.inf, -np.inf], np.nan, inplace=True)
215
+
216
+ missing_rate = df.isna().mean()
217
+ columns_keep = missing_rate[missing_rate < 0.60].index.tolist()
218
+ columns_must_not_missing = missing_rate[missing_rate < 0.010].index.tolist()
219
+
220
+ df = df[columns_keep]
221
+ df = df.dropna(subset=columns_must_not_missing)
222
+
223
+ numeric_cols = df.select_dtypes(include=["number"]).columns
224
+ numeric_medians = df[numeric_cols].median().to_dict()
225
+ df[numeric_cols] = df[numeric_cols].fillna(numeric_medians)
226
+
227
+ categorical_columns = df.select_dtypes(include=["object"]).columns.tolist()
228
+ df[categorical_columns] = df[categorical_columns].fillna("Unknown")
229
+
230
+ if "CODE_GENDER" in df.columns:
231
+ df = df[df["CODE_GENDER"] != "XNA"]
232
+
233
+ outlier_maxes = {col: df[col].max() for col in OUTLIER_COLUMNS if col in df.columns}
234
+ for col, max_val in outlier_maxes.items():
235
+ df = df[df[col] != max_val]
236
+
237
+ numeric_ranges = {}
238
+ for col in numeric_cols:
239
+ if col in df.columns:
240
+ numeric_ranges[col] = (float(df[col].min()), float(df[col].max()))
241
+
242
+ df_hot = pd.get_dummies(df, columns=categorical_columns)
243
+ features_to_scaled = [col for col in df_hot.columns if col not in IGNORE_FEATURES]
244
+
245
+ scaler = MinMaxScaler()
246
+ scaler.fit(df_hot[features_to_scaled])
247
+
248
+ required_raw = set(ENGINEERED_SOURCES)
249
+ required_raw.update(col for col in columns_must_not_missing if col in input_feature_columns)
250
+ required_raw.add("SK_ID_CURR")
251
+ if USE_REDUCED_INPUTS:
252
+ required_input = sorted({col for col in REDUCED_INPUT_FEATURES if col in input_feature_columns})
253
+ else:
254
+ required_input = sorted(required_raw)
255
+ numeric_required = sorted(col for col in required_input if col in numeric_medians)
256
+ correlated_imputation = _build_correlated_imputation(
257
+ df,
258
+ input_feature_columns=input_feature_columns,
259
+ numeric_required=numeric_required,
260
+ threshold=CORRELATION_THRESHOLD,
261
+ )
262
+
263
+ return PreprocessorArtifacts(
264
+ columns_keep=columns_keep,
265
+ columns_must_not_missing=columns_must_not_missing,
266
+ numeric_medians={k: float(v) for k, v in numeric_medians.items()},
267
+ categorical_columns=categorical_columns,
268
+ outlier_maxes={k: float(v) for k, v in outlier_maxes.items()},
269
+ numeric_ranges=numeric_ranges,
270
+ features_to_scaled=features_to_scaled,
271
+ scaler=scaler,
272
+ raw_feature_columns=raw_feature_columns,
273
+ input_feature_columns=input_feature_columns,
274
+ required_raw_columns=sorted(required_raw),
275
+ required_input_columns=required_input,
276
+ numeric_required_columns=numeric_required,
277
+ correlated_imputation=correlated_imputation,
278
+ )
279
+
280
+
281
+ def build_fallback_preprocessor() -> PreprocessorArtifacts:
282
+ base = pd.DataFrame(
283
+ [
284
+ {
285
+ "SK_ID_CURR": 100001,
286
+ "EXT_SOURCE_1": 0.45,
287
+ "EXT_SOURCE_2": 0.61,
288
+ "EXT_SOURCE_3": 0.75,
289
+ "AMT_ANNUITY": 24700.5,
290
+ "AMT_CREDIT": 406597.5,
291
+ "AMT_GOODS_PRICE": 351000.0,
292
+ "DAYS_BIRTH": -9461,
293
+ "DAYS_EMPLOYED": -637,
294
+ "CODE_GENDER": "M",
295
+ "FLAG_OWN_CAR": "N",
296
+ "AMT_INCOME_TOTAL": 202500.0,
297
+ "CNT_FAM_MEMBERS": 1,
298
+ "CNT_CHILDREN": 0,
299
+ },
300
+ {
301
+ "SK_ID_CURR": 100002,
302
+ "EXT_SOURCE_1": 0.35,
303
+ "EXT_SOURCE_2": 0.52,
304
+ "EXT_SOURCE_3": 0.68,
305
+ "AMT_ANNUITY": 22000.0,
306
+ "AMT_CREDIT": 350000.0,
307
+ "AMT_GOODS_PRICE": 300000.0,
308
+ "DAYS_BIRTH": -12000,
309
+ "DAYS_EMPLOYED": -1200,
310
+ "CODE_GENDER": "F",
311
+ "FLAG_OWN_CAR": "Y",
312
+ "AMT_INCOME_TOTAL": 180000.0,
313
+ "CNT_FAM_MEMBERS": 2,
314
+ "CNT_CHILDREN": 1,
315
+ },
316
+ ]
317
+ )
318
+
319
+ df = new_features_creation(base)
320
+ df.replace([np.inf, -np.inf], np.nan, inplace=True)
321
+
322
+ columns_keep = df.columns.tolist()
323
+ columns_must_not_missing = [col for col in columns_keep if col not in IGNORE_FEATURES]
324
+
325
+ numeric_cols = df.select_dtypes(include=["number"]).columns
326
+ numeric_medians = df[numeric_cols].median().to_dict()
327
+ df[numeric_cols] = df[numeric_cols].fillna(numeric_medians)
328
+
329
+ categorical_columns = df.select_dtypes(include=["object"]).columns.tolist()
330
+ df[categorical_columns] = df[categorical_columns].fillna("Unknown")
331
+
332
+ df_hot = pd.get_dummies(df, columns=categorical_columns)
333
+ features_to_scaled = [col for col in df_hot.columns if col not in IGNORE_FEATURES]
334
+ scaler = MinMaxScaler()
335
+ scaler.fit(df_hot[features_to_scaled])
336
+
337
+ raw_feature_columns = df.columns.tolist()
338
+ input_feature_columns = [c for c in raw_feature_columns if c not in ["is_train", "is_test", "TARGET"]]
339
+
340
+ required_raw = set(ENGINEERED_SOURCES)
341
+ required_raw.update(col for col in columns_must_not_missing if col in input_feature_columns)
342
+ required_raw.add("SK_ID_CURR")
343
+ required_input = sorted({col for col in REDUCED_INPUT_FEATURES if col in input_feature_columns})
344
+ numeric_required = sorted(col for col in required_input if col in numeric_medians)
345
+
346
+ numeric_ranges = {col: (float(df[col].min()), float(df[col].max())) for col in numeric_cols}
347
+
348
+ return PreprocessorArtifacts(
349
+ columns_keep=columns_keep,
350
+ columns_must_not_missing=columns_must_not_missing,
351
+ numeric_medians={k: float(v) for k, v in numeric_medians.items()},
352
+ categorical_columns=categorical_columns,
353
+ outlier_maxes={},
354
+ numeric_ranges=numeric_ranges,
355
+ features_to_scaled=features_to_scaled,
356
+ scaler=scaler,
357
+ raw_feature_columns=raw_feature_columns,
358
+ input_feature_columns=input_feature_columns,
359
+ required_raw_columns=sorted(required_raw),
360
+ required_input_columns=required_input,
361
+ numeric_required_columns=numeric_required,
362
+ correlated_imputation={},
363
+ )
364
+
365
+
366
+ def load_preprocessor(data_path: Path, artifacts_path: Path) -> PreprocessorArtifacts:
367
+ if artifacts_path.exists():
368
+ preprocessor = joblib.load(artifacts_path)
369
+ updated = False
370
+ required_updated = False
371
+ if not hasattr(preprocessor, "required_input_columns"):
372
+ if USE_REDUCED_INPUTS:
373
+ required_input = _reduce_input_columns(preprocessor)
374
+ else:
375
+ required_input = preprocessor.required_raw_columns
376
+ preprocessor.required_input_columns = required_input
377
+ required_updated = True
378
+ updated = True
379
+ if not hasattr(preprocessor, "numeric_required_columns"):
380
+ preprocessor.numeric_required_columns = sorted(
381
+ col for col in preprocessor.required_input_columns if col in preprocessor.numeric_medians
382
+ )
383
+ updated = True
384
+ if not hasattr(preprocessor, "numeric_ranges"):
385
+ numeric_ranges = _infer_numeric_ranges_from_scaler(preprocessor)
386
+ if numeric_ranges:
387
+ preprocessor.numeric_ranges = numeric_ranges
388
+ updated = True
389
+ else:
390
+ if not data_path.exists():
391
+ raise RuntimeError(f"Data file not found to rebuild preprocessor: {data_path}")
392
+ preprocessor = build_preprocessor(data_path)
393
+ updated = True
394
+ if USE_REDUCED_INPUTS:
395
+ reduced = _reduce_input_columns(preprocessor)
396
+ if preprocessor.required_input_columns != reduced:
397
+ preprocessor.required_input_columns = reduced
398
+ required_updated = True
399
+ updated = True
400
+ else:
401
+ if preprocessor.required_input_columns != preprocessor.required_raw_columns:
402
+ preprocessor.required_input_columns = preprocessor.required_raw_columns
403
+ required_updated = True
404
+ updated = True
405
+ desired_numeric_required = sorted(
406
+ col for col in preprocessor.required_input_columns if col in preprocessor.numeric_medians
407
+ )
408
+ if getattr(preprocessor, "numeric_required_columns", None) != desired_numeric_required:
409
+ preprocessor.numeric_required_columns = desired_numeric_required
410
+ updated = True
411
+ if not hasattr(preprocessor, "correlated_imputation") or required_updated:
412
+ if data_path.exists():
413
+ preprocessor.correlated_imputation = _compute_correlated_imputation(data_path, preprocessor)
414
+ else:
415
+ preprocessor.correlated_imputation = {}
416
+ updated = True
417
+ if updated and CACHE_PREPROCESSOR:
418
+ artifacts_path.parent.mkdir(parents=True, exist_ok=True)
419
+ joblib.dump(preprocessor, artifacts_path)
420
+ return preprocessor
421
+
422
+ if not data_path.exists():
423
+ raise RuntimeError(f"Data file not found to build preprocessor: {data_path}")
424
+
425
+ preprocessor = build_preprocessor(data_path)
426
+ if CACHE_PREPROCESSOR:
427
+ artifacts_path.parent.mkdir(parents=True, exist_ok=True)
428
+ joblib.dump(preprocessor, artifacts_path)
429
+ return preprocessor
430
+
431
+
432
+ def load_model(model_path: Path):
433
+ with model_path.open("rb") as handle:
434
+ return pickle.load(handle)
435
+
436
+
437
+ def _infer_numeric_ranges_from_scaler(preprocessor: PreprocessorArtifacts) -> dict[str, tuple[float, float]]:
438
+ ranges = {}
439
+ scaler = getattr(preprocessor, "scaler", None)
440
+ if scaler is None or not hasattr(scaler, "data_min_") or not hasattr(scaler, "data_max_"):
441
+ return ranges
442
+ for idx, col in enumerate(preprocessor.features_to_scaled):
443
+ if col in preprocessor.numeric_medians:
444
+ ranges[col] = (float(scaler.data_min_[idx]), float(scaler.data_max_[idx]))
445
+ return ranges
446
+
447
+
448
+ def _build_correlated_imputation(
449
+ df: pd.DataFrame,
450
+ *,
451
+ input_feature_columns: list[str],
452
+ numeric_required: list[str],
453
+ threshold: float,
454
+ ) -> dict[str, dict[str, float | str]]:
455
+ if not numeric_required:
456
+ return {}
457
+ numeric_cols = [
458
+ col
459
+ for col in df.select_dtypes(include=["number"]).columns
460
+ if col in input_feature_columns and col not in {"TARGET", "is_train", "is_test", "SK_ID_CURR"}
461
+ ]
462
+ if not numeric_cols:
463
+ return {}
464
+ df_corr = df
465
+ if CORRELATION_SAMPLE_SIZE > 0 and len(df_corr) > CORRELATION_SAMPLE_SIZE:
466
+ df_corr = df_corr.sample(CORRELATION_SAMPLE_SIZE, random_state=42)
467
+ corr = df_corr[numeric_cols].corr()
468
+ correlated = {}
469
+ for col in numeric_cols:
470
+ if col in numeric_required:
471
+ continue
472
+ best_feature = None
473
+ best_corr = 0.0
474
+ for req in numeric_required:
475
+ if req not in corr.columns or col not in corr.index:
476
+ continue
477
+ corr_val = corr.at[col, req]
478
+ if pd.isna(corr_val):
479
+ continue
480
+ if abs(corr_val) > abs(best_corr): # type: ignore
481
+ best_corr = float(corr_val) # type: ignore
482
+ best_feature = req
483
+ if best_feature is None or abs(best_corr) < threshold:
484
+ continue
485
+ proxy_values = df_corr[best_feature].to_numpy()
486
+ if np.nanstd(proxy_values) == 0:
487
+ continue
488
+ slope, intercept = np.polyfit(proxy_values, df_corr[col].to_numpy(), 1)
489
+ correlated[col] = {
490
+ "proxy": best_feature,
491
+ "slope": float(slope),
492
+ "intercept": float(intercept),
493
+ "corr": float(best_corr),
494
+ }
495
+ return correlated
496
+
497
+
498
+ def _reduce_input_columns(preprocessor: PreprocessorArtifacts) -> list[str]:
499
+ cols = [col for col in REDUCED_INPUT_FEATURES if col in preprocessor.input_feature_columns or col == "SK_ID_CURR"]
500
+ if "SK_ID_CURR" not in cols:
501
+ cols.append("SK_ID_CURR")
502
+ return sorted(set(cols))
503
+
504
+
505
+ def _compute_correlated_imputation(
506
+ data_path: Path,
507
+ preprocessor: PreprocessorArtifacts,
508
+ ) -> dict[str, dict[str, float | str]]:
509
+ df = pd.read_parquet(data_path)
510
+ df = new_features_creation(df)
511
+ df.replace([np.inf, -np.inf], np.nan, inplace=True)
512
+
513
+ df = df[preprocessor.columns_keep]
514
+ df = df.dropna(subset=preprocessor.columns_must_not_missing)
515
+
516
+ numeric_cols = df.select_dtypes(include=["number"]).columns
517
+ df[numeric_cols] = df[numeric_cols].fillna(pd.Series(preprocessor.numeric_medians))
518
+
519
+ for col in preprocessor.categorical_columns:
520
+ if col in df.columns:
521
+ df[col] = df[col].fillna("Unknown")
522
+
523
+ if "CODE_GENDER" in df.columns:
524
+ df = df[df["CODE_GENDER"] != "XNA"]
525
+
526
+ for col, max_val in preprocessor.outlier_maxes.items():
527
+ if col in df.columns:
528
+ df = df[df[col] != max_val]
529
+
530
+ return _build_correlated_imputation(
531
+ df,
532
+ input_feature_columns=preprocessor.input_feature_columns,
533
+ numeric_required=preprocessor.numeric_required_columns,
534
+ threshold=CORRELATION_THRESHOLD,
535
+ )
536
+
537
+
538
+ def _ensure_required_columns(df: pd.DataFrame, required_cols: list[str]) -> None:
539
+ missing = [col for col in required_cols if col not in df.columns or df[col].isna().any()]
540
+ if missing:
541
+ raise HTTPException(
542
+ status_code=422,
543
+ detail={
544
+ "message": "Missing required input columns.",
545
+ "missing_columns": missing[:25],
546
+ "missing_count": len(missing),
547
+ },
548
+ )
549
+
550
+
551
+ def _validate_numeric_inputs(df: pd.DataFrame, numeric_cols: list[str]) -> None:
552
+ invalid = []
553
+ for col in numeric_cols:
554
+ coerced = pd.to_numeric(df[col], errors="coerce")
555
+ if coerced.isna().any():
556
+ invalid.append(col)
557
+ if invalid:
558
+ raise HTTPException(
559
+ status_code=422,
560
+ detail={
561
+ "message": "Invalid numeric values provided.",
562
+ "invalid_columns": invalid[:25],
563
+ "invalid_count": len(invalid),
564
+ },
565
+ )
566
+
567
+
568
+ def _validate_numeric_ranges(df: pd.DataFrame, numeric_ranges: dict[str, tuple[float, float]]) -> None:
569
+ if not numeric_ranges:
570
+ return
571
+ out_of_range = []
572
+ for col, (min_val, max_val) in numeric_ranges.items():
573
+ if col not in df.columns:
574
+ continue
575
+ values = pd.to_numeric(df[col], errors="coerce")
576
+ if values.isna().any():
577
+ continue
578
+ if ((values < min_val) | (values > max_val)).any():
579
+ out_of_range.append(col)
580
+ if out_of_range:
581
+ raise HTTPException(
582
+ status_code=422,
583
+ detail={
584
+ "message": "Input contains values outside expected ranges.",
585
+ "out_of_range_columns": out_of_range[:25],
586
+ "out_of_range_count": len(out_of_range),
587
+ },
588
+ )
589
+
590
+
591
+ def _apply_correlated_imputation(df: pd.DataFrame, artifacts: PreprocessorArtifacts) -> None:
592
+ correlated = getattr(artifacts, "correlated_imputation", {}) or {}
593
+ if not correlated:
594
+ return
595
+ for col, info in correlated.items():
596
+ if col not in df.columns or col in artifacts.required_input_columns:
597
+ continue
598
+ proxy = info.get("proxy")
599
+ if proxy is None or proxy not in df.columns:
600
+ continue
601
+ missing = df[col].isna()
602
+ if not missing.any():
603
+ continue
604
+ proxy_values = pd.to_numeric(df[proxy], errors="coerce")
605
+ if proxy_values.isna().any():
606
+ continue
607
+ df.loc[missing, col] = info["slope"] * proxy_values[missing] + info["intercept"]
608
+ if col in artifacts.numeric_ranges:
609
+ min_val, max_val = artifacts.numeric_ranges[col]
610
+ df.loc[missing, col] = df.loc[missing, col].clip(min_val, max_val)
611
+
612
+
613
+ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) -> pd.DataFrame:
614
+ df = df_raw.copy()
615
+
616
+ for col in artifacts.required_input_columns:
617
+ if col not in df.columns:
618
+ df[col] = np.nan
619
+
620
+ _ensure_required_columns(df, artifacts.required_input_columns)
621
+ _validate_numeric_inputs(df, artifacts.numeric_required_columns)
622
+ _validate_numeric_ranges(df, {k: v for k, v in artifacts.numeric_ranges.items() if k in artifacts.numeric_required_columns})
623
+
624
+ df["is_train"] = 0
625
+ df["is_test"] = 1
626
+ if "TARGET" not in df.columns:
627
+ df["TARGET"] = 0
628
+
629
+ df = new_features_creation(df)
630
+ df.replace([np.inf, -np.inf], np.nan, inplace=True)
631
+
632
+ for col in artifacts.columns_keep:
633
+ if col not in df.columns:
634
+ df[col] = np.nan
635
+ df = df[artifacts.columns_keep]
636
+
637
+ _apply_correlated_imputation(df, artifacts)
638
+
639
+ for col, median in artifacts.numeric_medians.items():
640
+ if col in df.columns:
641
+ df[col] = pd.to_numeric(df[col], errors="coerce")
642
+ df[col] = df[col].fillna(median)
643
+
644
+ for col in artifacts.categorical_columns:
645
+ if col in df.columns:
646
+ df[col] = df[col].fillna("Unknown")
647
+
648
+ _ensure_required_columns(df, artifacts.required_input_columns)
649
+
650
+ if "CODE_GENDER" in df.columns and (df["CODE_GENDER"] == "XNA").any():
651
+ raise HTTPException(
652
+ status_code=422,
653
+ detail={"message": "CODE_GENDER cannot be 'XNA' based on training rules."},
654
+ )
655
+
656
+ for col, max_val in artifacts.outlier_maxes.items():
657
+ if col in df.columns and (df[col] >= max_val).any():
658
+ raise HTTPException(
659
+ status_code=422,
660
+ detail={
661
+ "message": "Input contains outlier values removed during training.",
662
+ "outlier_columns": [col],
663
+ },
664
+ )
665
+
666
+ df_hot = pd.get_dummies(df, columns=artifacts.categorical_columns)
667
+ for col in artifacts.features_to_scaled:
668
+ if col not in df_hot.columns:
669
+ df_hot[col] = 0
670
+ df_hot = df_hot[artifacts.features_to_scaled]
671
+
672
+ scaled = artifacts.scaler.transform(df_hot)
673
+ return pd.DataFrame(scaled, columns=artifacts.features_to_scaled, index=df.index)
674
+
675
+
676
+ @app.on_event("startup")
677
+ def startup_event() -> None:
678
+ if not MODEL_PATH.exists():
679
+ if ALLOW_MISSING_ARTIFACTS:
680
+ logger.warning("Model file not found: %s. Using dummy model.", MODEL_PATH)
681
+ app.state.model = DummyModel()
682
+ else:
683
+ raise RuntimeError(f"Model file not found: {MODEL_PATH}")
684
+ else:
685
+ logger.info("Loading model from %s", MODEL_PATH)
686
+ app.state.model = load_model(MODEL_PATH)
687
+
688
+ try:
689
+ logger.info("Loading preprocessor artifacts from %s", ARTIFACTS_PATH)
690
+ app.state.preprocessor = load_preprocessor(DATA_PATH, ARTIFACTS_PATH)
691
+ except RuntimeError as exc:
692
+ if ALLOW_MISSING_ARTIFACTS:
693
+ logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
694
+ app.state.preprocessor = build_fallback_preprocessor()
695
+ else:
696
+ raise
697
+
698
+
699
+ @app.get("/health")
700
+ def health() -> dict[str, str]:
701
+ return {"status": "ok"}
702
+
703
+
704
+ @app.get("/")
705
+ def root() -> dict[str, str]:
706
+ return {"message": "Credit Scoring API. See /docs for Swagger UI."}
707
+
708
+
709
+ @app.get("/favicon.ico")
710
+ def favicon() -> Response:
711
+ return Response(status_code=204)
712
+
713
+
714
+ @app.get("/features")
715
+ def features(include_all: bool = Query(default=False)) -> dict[str, Any]:
716
+ preprocessor: PreprocessorArtifacts = app.state.preprocessor
717
+ optional_features = [col for col in preprocessor.input_feature_columns if col not in preprocessor.required_input_columns]
718
+ correlated = sorted(getattr(preprocessor, "correlated_imputation", {}) or {})
719
+ payload = {
720
+ "required_input_features": preprocessor.required_input_columns,
721
+ "engineered_features": ENGINEERED_FEATURES,
722
+ "model_features_count": len(preprocessor.features_to_scaled),
723
+ "correlation_threshold": CORRELATION_THRESHOLD,
724
+ "correlated_imputation_count": len(correlated),
725
+ "correlated_imputation_features": correlated[:50],
726
+ }
727
+ if include_all:
728
+ payload["input_features"] = preprocessor.input_feature_columns
729
+ payload["optional_input_features"] = optional_features
730
+ else:
731
+ payload["input_features"] = preprocessor.required_input_columns
732
+ payload["optional_input_features"] = []
733
+ payload["optional_input_features_count"] = len(optional_features)
734
+ return payload
735
+
736
+
737
+ @app.post("/predict")
738
+ def predict(
739
+ payload: PredictionRequest,
740
+ threshold: float | None = Query(default=None, ge=0.0, le=1.0),
741
+ ) -> dict[str, Any]:
742
+ model = app.state.model
743
+ preprocessor: PreprocessorArtifacts = app.state.preprocessor
744
+ request_id = str(uuid.uuid4())
745
+ start_time = time.perf_counter()
746
+ records = payload.data if isinstance(payload.data, list) else [payload.data]
747
+
748
+ if not records:
749
+ raise HTTPException(status_code=422, detail={"message": "No input records provided."})
750
+
751
+ try:
752
+ df_raw = pd.DataFrame.from_records(records)
753
+ if "SK_ID_CURR" not in df_raw.columns:
754
+ raise HTTPException(status_code=422, detail={"message": "SK_ID_CURR is required."})
755
+
756
+ sk_ids = df_raw["SK_ID_CURR"].tolist()
757
+ features = preprocess_input(df_raw, preprocessor)
758
+
759
+ if hasattr(model, "predict_proba"):
760
+ proba = model.predict_proba(features)[:, 1]
761
+ use_threshold = DEFAULT_THRESHOLD if threshold is None else threshold
762
+ preds = (proba >= use_threshold).astype(int)
763
+ results = [
764
+ {
765
+ "sk_id_curr": sk_id,
766
+ "probability": float(prob),
767
+ "prediction": int(pred),
768
+ }
769
+ for sk_id, prob, pred in zip(sk_ids, proba, preds)
770
+ ]
771
+ latency_ms = (time.perf_counter() - start_time) * 1000.0
772
+ _log_prediction_entries(
773
+ request_id=request_id,
774
+ records=records,
775
+ results=results,
776
+ latency_ms=latency_ms,
777
+ threshold=use_threshold,
778
+ status_code=200,
779
+ preprocessor=preprocessor,
780
+ )
781
+ return {"predictions": results, "threshold": use_threshold}
782
+
783
+ preds = model.predict(features)
784
+ results = [
785
+ {
786
+ "sk_id_curr": sk_id,
787
+ "prediction": int(pred),
788
+ }
789
+ for sk_id, pred in zip(sk_ids, preds)
790
+ ]
791
+ latency_ms = (time.perf_counter() - start_time) * 1000.0
792
+ _log_prediction_entries(
793
+ request_id=request_id,
794
+ records=records,
795
+ results=results,
796
+ latency_ms=latency_ms,
797
+ threshold=None,
798
+ status_code=200,
799
+ preprocessor=preprocessor,
800
+ )
801
+ return {"predictions": results, "threshold": None}
802
+ except HTTPException as exc:
803
+ latency_ms = (time.perf_counter() - start_time) * 1000.0
804
+ detail = exc.detail if isinstance(exc.detail, dict) else {"message": str(exc.detail)}
805
+ _log_prediction_entries(
806
+ request_id=request_id,
807
+ records=records,
808
+ results=None,
809
+ latency_ms=latency_ms,
810
+ threshold=threshold,
811
+ status_code=exc.status_code,
812
+ preprocessor=preprocessor,
813
+ error=json.dumps(detail, ensure_ascii=True),
814
+ )
815
+ raise
816
+ except Exception as exc:
817
+ latency_ms = (time.perf_counter() - start_time) * 1000.0
818
+ _log_prediction_entries(
819
+ request_id=request_id,
820
+ records=records,
821
+ results=None,
822
+ latency_ms=latency_ms,
823
+ threshold=threshold,
824
+ status_code=500,
825
+ preprocessor=preprocessor,
826
+ error=str(exc),
827
+ )
828
+ raise
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ ENV PYTHONDONTWRITEBYTECODE=1
6
+ ENV PYTHONUNBUFFERED=1
7
+
8
+ COPY requirements.txt .
9
+ RUN pip install --no-cache-dir -r requirements.txt
10
+
11
+ COPY app/ app/
12
+ COPY data/HistGB_final_model.pkl data/
13
+ COPY artifacts/preprocessor.joblib artifacts/
14
+
15
+ EXPOSE 7860
16
+
17
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: OCR Projet 06
3
+ emoji: 🤖
4
+ colorFrom: indigo
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
11
+ # OCR Projet 06 – Credit Scoring
12
+
13
+ [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/stephmnt/OCR_Projet06/deploy.yml)](https://github.com/stephmnt/OCR_Projet06/actions/workflows/deploy.yml)
14
+ [![GitHub Release Date](https://img.shields.io/github/release-date/stephmnt/OCR_Projet06?display_date=published_at&style=flat-square)](https://github.com/stephmnt/OCR_Projet06/releases)
15
+ [![project_license](https://img.shields.io/github/license/stephmnt/OCR_projet06.svg)](https://github.com/stephmnt/OCR_Projet06/blob/main/LICENSE)
16
+
17
+ ## Running MLflow
18
+
19
+ The notebook is configured to use a local MLflow server (`http://127.0.0.1:5000`).
20
+ To see the runs and create the experiment, start the server with the same backend:
21
+
22
+ ```shell
23
+ mlflow server \
24
+ --host 127.0.0.1 \
25
+ --port 5000 \
26
+ --backend-store-uri "file:${PWD}/mlruns" \
27
+ --default-artifact-root "file:${PWD}/mlruns"
28
+ ```
29
+
30
+ To launch only the UI (without the API):
31
+
32
+ ```shell
33
+ mlflow ui --backend-store-uri "file:${PWD}/mlruns" --port 5000
34
+ ```
35
+
36
+ To test serving the model in staging:
37
+
38
+ ```shell
39
+ mlflow models serve -m "models:/credit_scoring_model/Staging" -p 5001 --no-conda
40
+ ```
41
+
42
+ ## API FastAPI
43
+
44
+ The API expects a JSON payload with a `data` key. The value can be a single
45
+ object (one client) or a list of objects (several clients; see the batch sketch
46
+ after the list below). The required features (reduced set) are available via the
47
+ `/features` endpoint. All other fields are optional and are filled with default values.
48
+
49
+ Minimum inputs (10 + `SK_ID_CURR`):
50
+
51
+ - `EXT_SOURCE_2`
52
+ - `EXT_SOURCE_3`
53
+ - `AMT_ANNUITY`
54
+ - `EXT_SOURCE_1`
55
+ - `CODE_GENDER`
56
+ - `DAYS_EMPLOYED`
57
+ - `AMT_CREDIT`
58
+ - `AMT_GOODS_PRICE`
59
+ - `DAYS_BIRTH`
60
+ - `FLAG_OWN_CAR`
61
+
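+ A hedged sketch of a batch call in Python (the field values are illustrative;
+ the response shape matches `app/main.py`; requires the `requests` package):
+
+ ```python
+ # Sketch: score several clients in one /predict call (local API assumed).
+ import requests
+
+ clients = [
+     {"SK_ID_CURR": 100002, "EXT_SOURCE_1": 0.45, "EXT_SOURCE_2": 0.61,
+      "EXT_SOURCE_3": 0.75, "AMT_ANNUITY": 24700.5, "AMT_CREDIT": 406597.5,
+      "AMT_GOODS_PRICE": 351000.0, "DAYS_BIRTH": -9461, "DAYS_EMPLOYED": -637,
+      "CODE_GENDER": "M", "FLAG_OWN_CAR": "N"},
+     {"SK_ID_CURR": 100003, "EXT_SOURCE_1": 0.35, "EXT_SOURCE_2": 0.52,
+      "EXT_SOURCE_3": 0.68, "AMT_ANNUITY": 22000.0, "AMT_CREDIT": 350000.0,
+      "AMT_GOODS_PRICE": 300000.0, "DAYS_BIRTH": -12000, "DAYS_EMPLOYED": -1200,
+      "CODE_GENDER": "F", "FLAG_OWN_CAR": "Y"},
+ ]
+ resp = requests.post(
+     "http://127.0.0.1:7860/predict",
+     params={"threshold": 0.5},
+     json={"data": clients},
+     timeout=30,
+ )
+ resp.raise_for_status()
+ for row in resp.json()["predictions"]:
+     print(row["sk_id_curr"], row["probability"], row["prediction"])
+ ```
+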
62
+ ### Poetry environment (recommended)
63
+
64
+ The `pyproject.toml` file pins versions compatible with a recent stack
65
+ (`numpy>=2`, `pyarrow>=15`, `scikit-learn>=1.6`). The environment targets Python
66
+ 3.11.
67
+
68
+ ```shell
69
+ poetry env use 3.11
70
+ poetry install
71
+ poetry run pytest -q
72
+ poetry run uvicorn app.main:app --reload --port 7860
73
+ ```
74
+
75
+ Important: the `HistGB_final_model.pkl` model must be regenerated with the
76
+ new scikit-learn version (re-run the pickle save cell of
77
+ `P6_MANET_Stephane_notebook_modélisation.ipynb`); a hedged sketch follows below.
78
+
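+ A minimal sketch of that re-save step, standing in for the notebook cell (the
+ notebook's categorical encoding and tuning are omitted; a numeric-only refit is
+ shown purely as an assumption):
+
+ ```python
+ # Sketch only: refit and re-pickle under the current scikit-learn.
+ import pickle
+ import pandas as pd
+ from sklearn.ensemble import HistGradientBoostingClassifier
+
+ df = pd.read_parquet("data/data_final.parquet")
+ train = df[df["is_train"] == 1]
+ X = train.select_dtypes("number").drop(
+     columns=["TARGET", "SK_ID_CURR", "is_train", "is_test"], errors="ignore"
+ )
+ y = train["TARGET"]
+ model = HistGradientBoostingClassifier(random_state=42).fit(X, y)
+ with open("data/HistGB_final_model.pkl", "wb") as handle:
+     pickle.dump(model, handle)
+ ```
+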
79
+ Note: `requirements.txt` is aligned with `pyproject.toml` (same versions).
80
+
81
+ ### Example input (schema + values)
82
+
83
+ Schema:
84
+
85
+ ```json
86
+ {
87
+ "data": {
88
+ "SK_ID_CURR": "int",
89
+ "EXT_SOURCE_2": "float",
90
+ "EXT_SOURCE_3": "float",
91
+ "AMT_ANNUITY": "float",
92
+ "EXT_SOURCE_1": "float",
93
+ "CODE_GENDER": "str",
94
+ "DAYS_EMPLOYED": "int",
95
+ "AMT_CREDIT": "float",
96
+ "AMT_GOODS_PRICE": "float",
97
+ "DAYS_BIRTH": "int",
98
+ "FLAG_OWN_CAR": "str"
99
+ }
100
+ }
101
+ ```
102
+
103
+ Example values:
104
+
105
+ ```json
106
+ {
107
+ "data": {
108
+ "SK_ID_CURR": 100002,
109
+ "EXT_SOURCE_2": 0.61,
110
+ "EXT_SOURCE_3": 0.75,
111
+ "AMT_ANNUITY": 24700.5,
112
+ "EXT_SOURCE_1": 0.45,
113
+ "CODE_GENDER": "M",
114
+ "DAYS_EMPLOYED": -637,
115
+ "AMT_CREDIT": 406597.5,
116
+ "AMT_GOODS_PRICE": 351000.0,
117
+ "DAYS_BIRTH": -9461,
118
+ "FLAG_OWN_CAR": "N"
119
+ }
120
+ }
121
+ ```
122
+
123
+ Note: the API strictly validates the required fields (`/features`). To list
124
+ every possible column: `/features?include_all=true`.
125
+
126
+ ### Live demo (turnkey commands)
127
+
128
+ Start the API:
129
+
130
+ ```shell
131
+ uvicorn app.main:app --reload --port 7860
132
+ ```
133
+
134
+ Check the service:
135
+
136
+ ```shell
137
+ curl -s http://127.0.0.1:7860/health
138
+ ```
139
+
140
+ List the expected features:
141
+
142
+ ```shell
143
+ curl -s http://127.0.0.1:7860/features
144
+ ```
145
+
146
+ Score a single client:
147
+
148
+ ```shell
149
+ curl -s -X POST "http://127.0.0.1:7860/predict?threshold=0.5" \
150
+ -H "Content-Type: application/json" \
151
+ -d '{
152
+ "data": {
153
+ "SK_ID_CURR": 100002,
154
+ "EXT_SOURCE_2": 0.61,
155
+ "EXT_SOURCE_3": 0.75,
156
+ "AMT_ANNUITY": 24700.5,
157
+ "EXT_SOURCE_1": 0.45,
158
+ "CODE_GENDER": "M",
159
+ "DAYS_EMPLOYED": -637,
160
+ "AMT_CREDIT": 406597.5,
161
+ "AMT_GOODS_PRICE": 351000.0,
162
+ "DAYS_BIRTH": -9461,
163
+ "FLAG_OWN_CAR": "N"
164
+ }
165
+ }'
166
+ ```
167
+
168
+ ## Release contents
169
+
170
+ - **Preparation + pipeline**: cleaning/preparation, encoding, imputation, and the training pipeline are present.
171
+ - **Imbalance handling**: undersampling is applied to the final training set.
172
+ - **Multi-model comparison**: baseline, Naive Bayes, Logistic Regression, Decision Tree, Random Forest,
173
+ HistGradientBoosting, LGBM, and XGB are compared.
174
+ - **Cross-validation + tuning**: `StratifiedKFold`, `GridSearchCV`, and Hyperopt are used.
175
+ - **Business score + optimal threshold**: the `custom_score` is the main metric in the comparison tables and the CV, with a computed `best_threshold`.
176
+ - **Explainability**: feature importance, SHAP, and LIME are included.
177
+ - **MLOps (MLflow)**: tracking of params/metrics (including `custom_score` and `best_threshold`), tags,
178
+ registry, and promotion to "Staging".
179
+
180
+ ![Screenshot MLFlow](https://raw.githubusercontent.com/stephmnt/OCR_Projet06/main/screen-mlflow.png)
181
+
182
+ ## Feature reduction
183
+
184
+ Feature reduction: the API uses a SHAP top-10, whereas the brief calls for a reduction driven by a correlation matrix. The correlation analysis is well documented in the exploration notebook, but the list used by the API is not explicitly derived from that matrix. Clarify this in the docs or align the selection with the correlation analysis (a sketch follows below).
185
+
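+ A hedged sketch of a correlation-driven selection that could replace the SHAP
+ top-10 (the top-N value and column handling are illustrative):
+
+ ```python
+ # Sketch: pick the N numeric features most correlated with TARGET.
+ import pandas as pd
+
+ df = pd.read_parquet("data/data_final.parquet")
+ numeric = df.select_dtypes("number").drop(
+     columns=["SK_ID_CURR", "is_train", "is_test"], errors="ignore"
+ )
+ corr = numeric.corr()["TARGET"].drop("TARGET").abs().sort_values(ascending=False)
+ top_n = corr.head(10).index.tolist()
+ print(top_n)
+ ```
+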
186
+ ## Quick glossary
187
+
188
+ - **custom_score**: business metric that penalises false negatives more heavily than false positives (see the sketch after this list).
189
+ - **Optimal threshold**: the probability cut-off used to turn a score into a 0/1 class.
190
+ - **Cross-validation (CV)**: evaluation on several sub-samples to avoid a "lucky" result.
191
+ - **MLflow tracking**: history of runs, parameters, and metrics.
192
+ - **Registry**: MLflow space used to version and promote a model (e.g. to "Staging").
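+
+ A hedged sketch of the idea behind `custom_score` and the threshold search (the
+ 10:1 cost ratio is an assumption, not the project's exact weighting):
+
+ ```python
+ # Sketch only: cost-based score and a grid search for the best threshold.
+ import numpy as np
+
+ def custom_score(y_true, proba, threshold, fn_cost=10.0, fp_cost=1.0):
+     y_true = np.asarray(y_true)
+     y_pred = (np.asarray(proba) >= threshold).astype(int)
+     fn = int(np.sum((y_true == 1) & (y_pred == 0)))
+     fp = int(np.sum((y_true == 0) & (y_pred == 1)))
+     return -(fn_cost * fn + fp_cost * fp)  # higher (less negative) is better
+
+ def best_threshold(y_true, proba, grid=np.linspace(0.05, 0.95, 19)):
+     return float(max(grid, key=lambda t: custom_score(y_true, proba, t)))
+ ```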
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py ADDED
File without changes