Committed by GitHub Actions

Commit 271ec19 · 1 parent: decf87a

Auto-deploy from GitHub Actions

This view is limited to 50 files because the commit contains too many changes.

Files changed (50):
  1. hf_space/.github/workflows/deploy-assets.yml +0 -2
  2. hf_space/.github/workflows/deploy.yml +0 -2
  3. hf_space/.gitignore +0 -3
  4. hf_space/Dockerfile +0 -2
  5. hf_space/README.md +7 -29
  6. hf_space/docs/performance/performance_report.md +7 -8
  7. hf_space/hf_space/app/main.py +155 -11
  8. hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +18 -0
  9. hf_space/hf_space/hf_space/data/xgb_final_model.pkl +3 -0
  10. hf_space/hf_space/hf_space/gradio_app.py +1 -121
  11. hf_space/hf_space/hf_space/hf_space/.gitignore +1 -1
  12. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +6 -2
  13. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +7 -5
  14. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +2 -3
  15. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml +0 -1
  16. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +127 -3
  17. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +8 -8
  18. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/HistGB_final_model.pkl +3 -0
  19. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.env.example +46 -0
  20. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +10 -11
  21. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml +2 -1
  22. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile +1 -0
  23. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +54 -17
  24. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +365 -60
  25. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/performance/performance_report.md +2 -1
  26. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +161 -27
  27. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +108 -19
  28. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py +1 -3
  29. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile +3 -2
  30. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +1 -19
  31. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py +3 -1
  32. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +11 -1
  33. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app_entry.py +19 -0
  34. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile +1 -1
  35. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +17 -8
  36. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitattributes +2 -33
  37. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +69 -0
  38. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml +4 -0
  39. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitignore +3 -1
  40. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +25 -0
  41. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +190 -13
  42. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +96 -0
  43. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +136 -18
  44. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/README.md +13 -0
  45. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/drift_report.html +140 -0
  46. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/logs_storage.png +0 -0
  47. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_ANNUITY.png +0 -0
  48. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_CREDIT.png +0 -0
  49. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_GOODS_PRICE.png +0 -0
  50. hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/CODE_GENDER.png +0 -0
hf_space/.github/workflows/deploy-assets.yml CHANGED
@@ -22,8 +22,6 @@ jobs:
       steps:
         - name: Checkout
           uses: actions/checkout@v4
-          with:
-            lfs: true
 
         - name: Set up Python
           uses: actions/setup-python@v5
hf_space/.github/workflows/deploy.yml CHANGED
@@ -12,8 +12,6 @@ jobs:
       steps:
         - name: Checkout
          uses: actions/checkout@v4
-          with:
-            lfs: true
 
        - name: Set up Python
          uses: actions/setup-python@v5
hf_space/.gitignore CHANGED
@@ -5,10 +5,7 @@ __pycache__/
 logs/
 reports/
 data/*
-!data/*_final_model.pkl
-!data/data_final.parquet
 artifacts/*
-!artifacts/preprocessor.joblib
 .DS_Store
 .vscode/
 .idea/
hf_space/Dockerfile CHANGED
@@ -11,8 +11,6 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY app/ app/
 COPY app_entry.py app.py gradio_app.py ./
 COPY src/ src/
-COPY data/ data/
-COPY artifacts/ artifacts/
 
 EXPOSE 7860
 
hf_space/README.md CHANGED
@@ -381,7 +381,7 @@ python monitoring/drift_report.py \
   --logs logs/predictions.jsonl \
   --reference data/data_final.parquet \
   --output-dir reports \
-  --min-prod-samples 200 \
+  --min-prod-samples 50 \
   --fdr-alpha 0.05 \
   --prod-since "2024-01-01T00:00:00Z" \
   --prod-until "2024-01-31T23:59:59Z"
@@ -391,7 +391,7 @@ The HTML report is generated in `reports/drift_report.html` (with plots in
 `reports/plots/`). On Hugging Face the disk is ephemeral: download the logs
 before analyzing.
 
-Drift is computed only if `n_prod >= --min-prod-samples` (default 200).
+Drift is computed only if `n_prod >= --min-prod-samples` (default 50).
 Otherwise an "Insufficient sample" badge is displayed and alerts are disabled.
 
 Built-in robustness:
@@ -418,20 +418,16 @@ Captures (local snapshot of the reporting + storage):
 
 ## Profiling & Optimization (Step 4)
 
-Inference profiling and benchmark (cProfile + latency) :
+Inference profiling and benchmark (cProfile + latency):
 
-- Now done via the modeling notebook (section TODO 5).
-- The old script is archived in `dev_archive/profiling/profile_inference.py`.
-
-Outputs:
-
-- `docs/performance/benchmark_results.json`
-- `docs/performance/profile_summary.txt`
-- Detailed report: `docs/performance/performance_report.md`
+- Notebook: `notebooks/P6_MANET_Stephane_notebook_modélisation.ipynb` (section TODO 5).
+- Results: `docs/performance/benchmark_results.json`, `docs/performance/profile_summary.txt`, `docs/performance/performance_report.md`.
 
 Local Streamlit dashboard (monitoring + drift):
 
 ```shell
+streamlit run monitoring/streamlit_app.py
+# or
 python -m streamlit run monitoring/streamlit_app.py
 ```
 
@@ -452,21 +448,3 @@ python -m streamlit run monitoring/streamlit_app.py
 - **CI/CD**: tests with coverage (`pytest-cov`), Docker build and deploy to Hugging Face Spaces.
 
 ![Screenshot MLFlow](https://raw.githubusercontent.com/stephmnt/credit-scoring-mlops/main/screen-mlflow.png)
-
-### Priority gaps
-
-* Mission 2 Step 4 not covered: no post-deployment profiling/optimization or gains report; to be delivered with an optimized version.
-
-### Evidence / docs to complete
-
-* Explicit link to the public repo + version/branch strategy to add in README.md.
-* Keep proof of MLflow model registry/serving (registry UI capture or serving command) in addition to screen-mlflow.png.
-* Reference dataset not versioned (data_final.parquet is ignored); document how to obtain it to run drift_report.py.
-* The GitHub Actions badge points to OCR_Projet05 in README.md; fix the URL.
-* GDPR/PII: LOG_HASH_SK_ID is disabled by default in main.py; document how to enable it in prod in README.md.
-
-### Recommended improvements
-
-* Complete the API tests: /logs (auth OK/KO), batch predict, threshold param, missing SK_ID_CURR, outliers in test_api.py.
-* Simplify the ALLOW_MISSING_ARTIFACTS fallback and DummyModel if the artifacts are versioned (clean up main.py and conftest.py).
-* If the evaluator expects a branching strategy, create a feature branch and merge it as proof.
hf_space/docs/performance/performance_report.md CHANGED
@@ -6,11 +6,10 @@ Measure inference latency, identify bottlenecks, and propose optimizations.
 
 ## Setup
 
-- Script (archived): `dev_archive/profiling/profile_inference.py`
-- Current workflow: modeling notebook (section TODO 5)
+- Notebook: `notebooks/P6_MANET_Stephane_notebook_modélisation.ipynb` (section TODO 5)
 - Data: `data/data_final.parquet` (sample)
 - Parameters: `--sample-size 500 --batch-size 100 --runs 2`
-- Model: `HistGB_final_model.pkl`
+- Model: `data/*_final_model.pkl` (e.g. `data/xgb_final_model.pkl`)
 
 Results are saved to:
 
@@ -21,21 +20,21 @@ Results are saved to:
 
 | Scenario | Batch | Mean (ms) | P50 (ms) | P95 (ms) | Throughput (rows/s) |
 | --- | --- | ---:| ---:| ---:| ---:|
-| optimized_preprocess | 100 | 187.37 | 169.96 | 271.41 | 533.71 |
-| legacy_preprocess_alignment | 100 | 273.05 | 264.45 | 357.41 | 366.23 |
+| optimized_preprocess | 100 | 35.73 | 33.77 | 43.09 | 2798.44 |
+| legacy_preprocess_alignment | 100 | 47.57 | 47.19 | 51.23 | 2102.36 |
 
-Observed gain (mean): ~31% latency reduction per batch on the optimized path.
+Observed gain (mean): ~25% latency reduction per batch on the optimized path.
 
 ## Bottlenecks (cProfile)
 
 Excerpt from `docs/performance/profile_summary.txt`:
 
-- `app.main:preprocess_input` accounts for most of the cumulative time (~0.90s out of 1.05s).
+- `app.main:preprocess_input` accounts for most of the cumulative time (see `docs/performance/profile_summary.txt`).
 - Dominant pandas operations:
   - `DataFrame.__setitem__` / `insert`
   - `fillna`, `to_numeric`
   - `get_dummies`
-- `HistGradientBoostingClassifier.predict_proba` is present but not dominant (~0.15s).
+- `predict_proba` is present but not dominant.
 
 ## Applied optimization
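For context, mean/P50/P95 latency and throughput figures like those in the table can be produced with a few lines of timing code. A minimal sketch, assuming a `predict_batch` callable and a pandas sample `df_sample` (illustrative stand-ins, not the notebook's actual code):

```python
# Illustrative benchmark harness; predict_batch and df_sample are assumptions,
# standing in for the project's real model call and sampled data.
import time
import numpy as np

def benchmark(predict_batch, df_sample, batch_size=100, runs=2):
    latencies_ms = []
    for _ in range(runs):
        for start in range(0, len(df_sample), batch_size):
            batch = df_sample.iloc[start:start + batch_size]
            t0 = time.perf_counter()
            predict_batch(batch)  # preprocessing + predict_proba
            latencies_ms.append((time.perf_counter() - t0) * 1000.0)
    lat = np.array(latencies_ms)
    return {
        "mean_ms": float(lat.mean()),
        "p50_ms": float(np.percentile(lat, 50)),
        "p95_ms": float(np.percentile(lat, 95)),
        "throughput_rows_s": batch_size / (lat.mean() / 1000.0),
    }
```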
 
hf_space/hf_space/app/main.py CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import io
 import logging
 import os
 import pickle
@@ -8,6 +9,7 @@ from datetime import datetime, timezone
 import hashlib
 import json
 from pathlib import Path
+import threading
 import time
 from typing import Any
 import uuid
@@ -16,6 +18,7 @@ from collections import deque
 import numpy as np
 import pandas as pd
 from fastapi import FastAPI, Header, HTTPException, Query, Response
+from huggingface_hub import HfApi
 from pydantic import BaseModel
 from sklearn.preprocessing import MinMaxScaler
 import joblib
@@ -78,6 +81,19 @@ HF_CUSTOMER_REPO_ID = os.getenv("HF_CUSTOMER_REPO_ID")
 HF_CUSTOMER_REPO_TYPE = os.getenv("HF_CUSTOMER_REPO_TYPE", "dataset")
 HF_CUSTOMER_FILENAME = os.getenv("HF_CUSTOMER_FILENAME", CUSTOMER_DATA_PATH.name)
 
+HF_LOG_ENABLED = os.getenv("HF_LOG_ENABLED", "1") == "1"
+HF_LOG_DATASET_REPO = os.getenv("HF_LOG_DATASET_REPO")
+HF_LOG_PATH_PREFIX = os.getenv("HF_LOG_PATH_PREFIX", "prod_logs")
+
+HF_LOG_BUFFER_MAX = int(os.getenv("HF_LOG_BUFFER_MAX", "50"))
+HF_LOG_FLUSH_SECONDS = int(os.getenv("HF_LOG_FLUSH_SECONDS", "60"))
+
+_hf_api = HfApi(token=os.getenv("HF_TOKEN")) if os.getenv("HF_TOKEN") else None
+_hf_lock = threading.Lock()
+_hf_buffer: list[dict[str, Any]] = []
+_hf_last_flush = 0.0
+_hf_flusher_started = False
+
 IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
 ENGINEERED_FEATURES = [
     "DAYS_EMPLOYED_ANOM",
@@ -218,6 +234,87 @@ def _hash_value(value: Any) -> str:
     return hashlib.sha256(str(value).encode("utf-8")).hexdigest()
 
 
+def _utc_day() -> str:
+    return datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+
+def _utc_stamp() -> str:
+    return datetime.now(timezone.utc).strftime("%H%M%S")
+
+
+def _start_hf_flusher_if_needed() -> None:
+    global _hf_flusher_started
+    if _hf_flusher_started:
+        return
+    _hf_flusher_started = True
+
+    def _loop() -> None:
+        while True:
+            time.sleep(HF_LOG_FLUSH_SECONDS)
+            with _hf_lock:
+                _flush_hf_locked(force=True)
+
+    threading.Thread(target=_loop, daemon=True).start()
+
+
+def _upload_parquet_part(df: pd.DataFrame) -> None:
+    if not (HF_LOG_ENABLED and _hf_api and HF_LOG_DATASET_REPO):
+        return
+
+    part_path = (
+        f"{HF_LOG_PATH_PREFIX}/date={_utc_day()}/"
+        f"part-{_utc_stamp()}-{uuid.uuid4().hex}.parquet"
+    )
+
+    bio = io.BytesIO()
+    df.to_parquet(bio, index=False)
+
+    for attempt in range(3):
+        try:
+            bio.seek(0)
+            _hf_api.upload_file(
+                path_or_fileobj=bio,
+                path_in_repo=part_path,
+                repo_id=HF_LOG_DATASET_REPO,
+                repo_type="dataset",
+                commit_message=f"Add inference logs {_utc_day()}",
+            )
+            return
+        except Exception:
+            if attempt == 2:
+                raise
+            time.sleep(1.5 * (attempt + 1))
+
+
+def _flush_hf_locked(force: bool = False) -> None:
+    global _hf_buffer, _hf_last_flush
+    if not _hf_buffer:
+        return
+
+    now = time.time()
+    if not force:
+        if len(_hf_buffer) < HF_LOG_BUFFER_MAX and (now - _hf_last_flush) < HF_LOG_FLUSH_SECONDS:
+            return
+
+    df = pd.DataFrame(_hf_buffer)
+    _hf_buffer = []
+    _hf_last_flush = now
+
+    try:
+        _upload_parquet_part(df)
+    except Exception as exc:
+        logger.warning("HF log upload failed: %s", exc)
+
+
+def hf_log_rows(rows: list[dict[str, Any]]) -> None:
+    if not (HF_LOG_ENABLED and _hf_api and HF_LOG_DATASET_REPO):
+        return
+    _start_hf_flusher_if_needed()
+    with _hf_lock:
+        _hf_buffer.extend(rows)
+        _flush_hf_locked(force=False)
+
+
 def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
     if pd.isna(value):  # type: ignore
         return np.nan
@@ -234,7 +331,9 @@ def _ensure_hf_asset(
     repo_type: str,
 ) -> Path | None:
     if local_path.exists():
-        return local_path
+        if not _is_lfs_pointer(local_path):
+            return local_path
+        logger.warning("LFS pointer detected for %s; attempting remote download.", local_path)
     if not repo_id:
         return None
 
@@ -254,6 +353,16 @@
     )
 
 
+def _is_lfs_pointer(path: Path) -> bool:
+    try:
+        with path.open("rb") as handle:
+            head = handle.read(200)
+    except OSError:
+        return False
+    text = head.decode("utf-8", errors="ignore")
+    return text.startswith("version https://git-lfs.github.com/spec/v1")
+
+
 def _normalize_inputs(
     df_raw: pd.DataFrame,
@@ -470,11 +579,39 @@ _log_prediction_entries(
             "prediction": result.get("prediction"),
         }
     )
-    if error:
-        entry["error"] = error
-    entries.append(entry)
+    if error:
+        entry["error"] = error
+    entries.append(entry)
     _append_log_entries(entries)
 
+    flat_rows: list[dict[str, Any]] = []
+    for entry in entries:
+        row = {
+            "timestamp_utc": entry.get("timestamp"),
+            "request_id": entry.get("request_id"),
+            "endpoint": entry.get("endpoint"),
+            "source": entry.get("source"),
+            "status_code": entry.get("status_code"),
+            "latency_ms": entry.get("latency_ms"),
+            "model_version": entry.get("model_version"),
+            "threshold": entry.get("threshold"),
+            "sk_id_curr": entry.get("sk_id_curr"),
+            "probability": entry.get("probability"),
+            "prediction": entry.get("prediction"),
+            "error": entry.get("error"),
+        }
+        inputs = entry.get("inputs") or {}
+        for key, value in inputs.items():
+            row[f"input__{key}"] = value
+
+        dq = entry.get("data_quality") or {}
+        for key, value in dq.items():
+            row[f"dq__{key}"] = value
+
+        flat_rows.append(row)
+
+    hf_log_rows(flat_rows)
+
 
 def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
     df = pd.read_parquet(data_path)
@@ -853,7 +990,7 @@ def _get_customer_reference(preprocessor: PreprocessorArtifacts) -> pd.DataFrame
     if cached is not None:
         return cached
     data_path = CUSTOMER_DATA_PATH
-    if not data_path.exists():
+    if not data_path.exists() or _is_lfs_pointer(data_path):
         downloaded = _ensure_hf_asset(
             data_path,
             HF_CUSTOMER_REPO_ID,
@@ -1362,7 +1499,7 @@ def startup_event() -> None:
     if getattr(app.state, "model", None) is not None and getattr(app.state, "preprocessor", None) is not None:
         return
     model_path = MODEL_PATH
-    if not model_path.exists():
+    if not model_path.exists() or _is_lfs_pointer(model_path):
         downloaded = _ensure_hf_asset(
             model_path,
             HF_MODEL_REPO_ID,
@@ -1371,7 +1508,7 @@
     )
     if downloaded is not None:
        model_path = downloaded
-    if not model_path.exists():
+    if not model_path.exists() or _is_lfs_pointer(model_path):
        if ALLOW_MISSING_ARTIFACTS:
            logger.warning("Model file not found: %s. Using dummy model.", model_path)
            app.state.model = DummyModel()
@@ -1379,10 +1516,17 @@
             raise RuntimeError(f"Model file not found: {model_path}")
     else:
         logger.info("Loading model from %s", model_path)
-        app.state.model = load_model(model_path)
+        try:
+            app.state.model = load_model(model_path)
+        except Exception as exc:
+            if ALLOW_MISSING_ARTIFACTS:
+                logger.warning("Model load failed (%s). Using dummy model.", exc)
+                app.state.model = DummyModel()
+            else:
+                raise
 
     data_path = DATA_PATH
-    if not data_path.exists():
+    if not data_path.exists() or _is_lfs_pointer(data_path):
         downloaded = _ensure_hf_asset(
             data_path,
             HF_CUSTOMER_REPO_ID,
@@ -1393,7 +1537,7 @@
         data_path = downloaded
     try:
         artifacts_path = ARTIFACTS_PATH
-        if not artifacts_path.exists():
+        if not artifacts_path.exists() or _is_lfs_pointer(artifacts_path):
             downloaded = _ensure_hf_asset(
                 artifacts_path,
                 HF_PREPROCESSOR_REPO_ID or None,
@@ -1404,7 +1548,7 @@
                 artifacts_path = downloaded
         logger.info("Loading preprocessor artifacts from %s", artifacts_path)
         app.state.preprocessor = load_preprocessor(data_path, artifacts_path)
-    except RuntimeError as exc:
+    except Exception as exc:
         if ALLOW_MISSING_ARTIFACTS:
             logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
             app.state.preprocessor = build_fallback_preprocessor()
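Context for the `_is_lfs_pointer` change above: when a Space is synced without `lfs: true`, large files arrive as small Git LFS pointer stubs instead of real binaries, so the startup code now re-downloads in that case. A quick illustrative check (hypothetical temp path; the pointer content is copied from the LFS stub added in this commit):

```python
# Illustrative only: what a Git LFS pointer stub looks like, and how the
# startup check above would classify it (the /tmp path is hypothetical).
from pathlib import Path

stub = Path("/tmp/xgb_final_model.pkl")
stub.write_text(
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:0fe10d7c60f50f96a87bafd298f2919653aed37d90a091059017800450e6273b\n"
    "size 1370510\n"
)

head = stub.read_bytes()[:200].decode("utf-8", errors="ignore")
print(head.startswith("version https://git-lfs.github.com/spec/v1"))  # True -> re-download
```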
hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml CHANGED
@@ -59,6 +59,24 @@ jobs:
           model_path = candidates[0]
 
           api = HfApi()
+          existing = api.list_repo_files(
+              repo_id=repo_id,
+              repo_type=repo_type,
+              token=token,
+          )
+          to_delete = [
+              name
+              for name in existing
+              if name.endswith("_final_model.pkl") and name != model_path.name
+          ]
+          for name in to_delete:
+              api.delete_file(
+                  path_in_repo=name,
+                  repo_id=repo_id,
+                  repo_type=repo_type,
+                  token=token,
+                  commit_message=f"Remove {name}",
+              )
           for path in [model_path]:
               api.upload_file(
                   path_or_fileobj=str(path),
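The cleanup step added above keeps the Hub repo down to a single `*_final_model.pkl`: any stale variant is deleted before the current model is uploaded. A small read-only sketch to verify that invariant after a run, assuming the default asset repo used elsewhere in this commit:

```python
# Read-only check (illustrative): list which model pickles the repo holds.
# Assumes the default dataset repo from this commit; requires read access.
from huggingface_hub import HfApi

api = HfApi()
files = api.list_repo_files(
    repo_id="stephmnt/assets-credit-scoring-mlops",
    repo_type="dataset",
)
print([name for name in files if name.endswith("_final_model.pkl")])
# Expected after a successful run: exactly one entry.
```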
hf_space/hf_space/hf_space/data/xgb_final_model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fe10d7c60f50f96a87bafd298f2919653aed37d90a091059017800450e6273b
+size 1370510
hf_space/hf_space/hf_space/gradio_app.py CHANGED
@@ -21,20 +21,10 @@ from app.main import (
     _normalize_inputs,
 )
 
-import io
-import os
-import threading
-import time
-import uuid
-from datetime import datetime, timezone
-
-from huggingface_hub import HfApi
-
 
 def _ensure_startup() -> None:
     if not getattr(app.state, "preprocessor", None):
         startup_event()
-    _start_log_flusher_if_needed()
 
 
 def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
@@ -297,8 +287,7 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
         """
         <p>Renseignez l'identifiant client, le montant du crédit et la durée.</p>
         <p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée. Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction. Le snapshot client affiche quelques informations de référence sur le client.</p>
-        <p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée.</p>
-        <p> Le dataset est disponible sur <a href="https://huggingface.co/datasets/stephmnt/assets-credit-scoring-mlops" rel="noreferrer">Hugging Face</a>.</p>
+        <p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée. Le dataset est disponible sur <a href="https://huggingface.co/datasets/stephmnt/assets-credit-scoring-mlops" rel="noreferrer">Hugging Face</a>.</p>
         """
     )
@@ -328,115 +317,6 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
         outputs=[probability, prediction, shap_table, snapshot],
     )
 
-# =========================
-# HF Dataset logging (Parquet parts)
-# =========================
-
-LOG_ENABLED = os.getenv("LOG_ENABLED", "1") == "1"
-LOG_DATASET_REPO = os.getenv("LOG_DATASET_REPO", "stephmnt/assets-credit-scoring-mlops")
-LOG_PATH_PREFIX = os.getenv("LOG_PATH_PREFIX", "prod_logs")
-HF_TOKEN = os.getenv("HF_TOKEN")  # HF secret (write) on the inference Space
-
-LOG_BUFFER_MAX = int(os.getenv("LOG_BUFFER_MAX", "50"))  # flush as soon as 50 rows
-LOG_FLUSH_SECONDS = int(os.getenv("LOG_FLUSH_SECONDS", "60"))  # flush at least every 60s
-
-_hf_api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
-_log_lock = threading.Lock()
-_log_buffer: list[dict] = []
-_last_flush_ts = 0.0
-_flusher_started = False
-
-
-def _now_utc_iso() -> str:
-    return datetime.now(timezone.utc).isoformat()
-
-
-def _upload_parquet_part(df: pd.DataFrame) -> None:
-    if _hf_api is None:
-        return  # no token => no write
-    day = datetime.now(timezone.utc).strftime("%Y-%m-%d")
-    stamp = datetime.now(timezone.utc).strftime("%H%M%S")
-    part = f"{LOG_PATH_PREFIX}/date={day}/part-{stamp}-{uuid.uuid4().hex}.parquet"
-
-    bio = io.BytesIO()
-    df.to_parquet(bio, index=False)
-    bio.seek(0)
-
-    _hf_api.upload_file(
-        path_or_fileobj=bio,
-        path_in_repo=part,
-        repo_id=LOG_DATASET_REPO,
-        repo_type="dataset",
-        commit_message=f"Add inference logs {day}",
-    )
-
-
-def _flush_logs_locked(force: bool = False) -> None:
-    global _log_buffer, _last_flush_ts
-    if not _log_buffer:
-        return
-
-    now = time.time()
-    if not force:
-        if len(_log_buffer) < LOG_BUFFER_MAX and (now - _last_flush_ts) < LOG_FLUSH_SECONDS:
-            return
-
-    df = pd.DataFrame(_log_buffer)
-    _log_buffer = []
-    _last_flush_ts = now
-
-    try:
-        _upload_parquet_part(df)
-    except Exception:
-        # In prod you could log this to stderr / structlog etc.
-        # We avoid failing the inference call.
-        pass
-
-
-def _start_log_flusher_if_needed() -> None:
-    global _flusher_started
-    if _flusher_started:
-        return
-    _flusher_started = True
-
-    def _loop():
-        while True:
-            time.sleep(LOG_FLUSH_SECONDS)
-            with _log_lock:
-                _flush_logs_locked(force=True)
-
-    t = threading.Thread(target=_loop, daemon=True)
-    t.start()
-
-
-def log_inference_row(row: dict) -> None:
-    if not LOG_ENABLED or _hf_api is None:
-        return
-    with _log_lock:
-        _log_buffer.append(row)
-        _flush_logs_locked(force=False)
-
-# --- Logging (Evidently-friendly) ---
-row = {
-    "timestamp_utc": _now_utc_iso(),
-    "model_version": MODEL_VERSION,
-    "source": "gradio",
-    "sk_id_curr": int(sk_id_curr),
-    "amt_credit_requested": float(amt_credit),
-    "duration_months": int(duration_months),
-    "probability": float(probability),
-    "prediction": int(pred_value),
-}
-# Add a few "business" features useful for drift (cat + num)
-# (you can add more if you want)
-for k, v in snapshot.items():
-    if k == "SK_ID_CURR":
-        continue
-    row[f"cust__{k}"] = v
-
-log_inference_row(row)
-
 
 if __name__ == "__main__":
     _ensure_startup()
     demo.launch()
hf_space/hf_space/hf_space/hf_space/.gitignore CHANGED
@@ -5,7 +5,7 @@ __pycache__/
 logs/
 reports/
 data/*
-!data/HistGB_final_model.pkl
+!data/*_final_model.pkl
 !data/data_final.parquet
 artifacts/*
 !artifacts/preprocessor.joblib
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml CHANGED
@@ -11,6 +11,10 @@ on:
         description: "HF repo type (dataset or model)"
         required: true
         default: "dataset"
+  push:
+    branches: ["main"]
+    paths:
+      - "data/*_final_model.pkl"
 
 jobs:
   upload-assets:
@@ -34,8 +38,8 @@ jobs:
       - name: Upload assets to Hugging Face Hub
         env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          HF_REPO_ID: ${{ inputs.repo_id }}
-          HF_REPO_TYPE: ${{ inputs.repo_type }}
+          HF_REPO_ID: ${{ inputs.repo_id || 'stephmnt/assets-credit-scoring-mlops' }}
+          HF_REPO_TYPE: ${{ inputs.repo_type || 'dataset' }}
        run: |
          python - <<'PY'
          import os
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py CHANGED
@@ -237,22 +237,24 @@ def _ensure_hf_asset(
         return local_path
     if not repo_id:
         return None
-    try:
-        from huggingface_hub import hf_hub_download
-    except ImportError as exc:  # pragma: no cover - optional dependency
-        raise RuntimeError("huggingface_hub is required to download remote assets.") from exc
+
+    from huggingface_hub import hf_hub_download
+
+    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+
     local_path.parent.mkdir(parents=True, exist_ok=True)
     return Path(
         hf_hub_download(
             repo_id=repo_id,
             filename=filename,
             repo_type=repo_type,
+            token=token,  # essential for gated repos
             local_dir=str(local_path.parent),
-            local_dir_use_symlinks=False,
         )
     )
 
 
+
 def _normalize_inputs(
     df_raw: pd.DataFrame,
     preprocessor: PreprocessorArtifacts,
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py CHANGED
@@ -296,10 +296,9 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
     gr.HTML(
         """
         <p>Renseignez l'identifiant client, le montant du crédit et la durée.</p>
-        <p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée.</p>
-        <p>Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction.</p>
-        <p>Le snapshot client affiche quelques informations de référence sur le client.</p>
+        <p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée. Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction. Le snapshot client affiche quelques informations de référence sur le client.</p>
         <p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée.</p>
+        <p> Le dataset est disponible sur <a href="https://huggingface.co/datasets/stephmnt/assets-credit-scoring-mlops" rel="noreferrer">Hugging Face</a>.</p>
         """
     )
 
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml CHANGED
@@ -49,7 +49,6 @@ jobs:
             --exclude 'logs' \
             --exclude 'reports' \
             --exclude 'screen-mlflow.png' \
-            --exclude 'data/*_final_model.pkl' \
             --exclude 'artifacts/preprocessor.joblib' \
             --exclude 'data/*.csv' \
             --exclude 'data/*.parquet' \
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py CHANGED
@@ -21,10 +21,20 @@ from app.main import (
     _normalize_inputs,
 )
 
+import io
+import os
+import threading
+import time
+import uuid
+from datetime import datetime, timezone
+
+from huggingface_hub import HfApi
+
 
 def _ensure_startup() -> None:
     if not getattr(app.state, "preprocessor", None):
         startup_event()
+    _start_log_flusher_if_needed()
 
 
 def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
@@ -283,13 +293,19 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
         </a>
     </div>
     """)
-    gr.Markdown(
-        "Renseignez l'identifiant client, le montant du crédit et la durée. "
+    gr.HTML(
+        """
+        <p>Renseignez l'identifiant client, le montant du crédit et la durée.</p>
+        <p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée.</p>
+        <p>Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction.</p>
+        <p>Le snapshot client affiche quelques informations de référence sur le client.</p>
+        <p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée.</p>
+        """
     )
 
     with gr.Row():
         sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
-        amt_credit = gr.Number(label="Montant du crédit", value=200000)
+        amt_credit = gr.Number(label="Montant du crédit", value=2000000)
         duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
 
     run_btn = gr.Button("Scorer")
@@ -313,6 +329,114 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
         outputs=[probability, prediction, shap_table, snapshot],
     )
 
+# =========================
+# HF Dataset logging (Parquet parts)
+# =========================
+
+LOG_ENABLED = os.getenv("LOG_ENABLED", "1") == "1"
+LOG_DATASET_REPO = os.getenv("LOG_DATASET_REPO", "stephmnt/assets-credit-scoring-mlops")
+LOG_PATH_PREFIX = os.getenv("LOG_PATH_PREFIX", "prod_logs")
+HF_TOKEN = os.getenv("HF_TOKEN")  # HF secret (write) on the inference Space
+
+LOG_BUFFER_MAX = int(os.getenv("LOG_BUFFER_MAX", "50"))  # flush as soon as 50 rows
+LOG_FLUSH_SECONDS = int(os.getenv("LOG_FLUSH_SECONDS", "60"))  # flush at least every 60s
+
+_hf_api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
+_log_lock = threading.Lock()
+_log_buffer: list[dict] = []
+_last_flush_ts = 0.0
+_flusher_started = False
+
+
+def _now_utc_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _upload_parquet_part(df: pd.DataFrame) -> None:
+    if _hf_api is None:
+        return  # no token => no write
+    day = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+    stamp = datetime.now(timezone.utc).strftime("%H%M%S")
+    part = f"{LOG_PATH_PREFIX}/date={day}/part-{stamp}-{uuid.uuid4().hex}.parquet"
+
+    bio = io.BytesIO()
+    df.to_parquet(bio, index=False)
+    bio.seek(0)
+
+    _hf_api.upload_file(
+        path_or_fileobj=bio,
+        path_in_repo=part,
+        repo_id=LOG_DATASET_REPO,
+        repo_type="dataset",
+        commit_message=f"Add inference logs {day}",
+    )
+
+
+def _flush_logs_locked(force: bool = False) -> None:
+    global _log_buffer, _last_flush_ts
+    if not _log_buffer:
+        return
+
+    now = time.time()
+    if not force:
+        if len(_log_buffer) < LOG_BUFFER_MAX and (now - _last_flush_ts) < LOG_FLUSH_SECONDS:
+            return
+
+    df = pd.DataFrame(_log_buffer)
+    _log_buffer = []
+    _last_flush_ts = now
+
+    try:
+        _upload_parquet_part(df)
+    except Exception:
+        # In prod you could log this to stderr / structlog etc.
+        # We avoid failing the inference call.
+        pass
+
+
+def _start_log_flusher_if_needed() -> None:
+    global _flusher_started
+    if _flusher_started:
+        return
+    _flusher_started = True
+
+    def _loop():
+        while True:
+            time.sleep(LOG_FLUSH_SECONDS)
+            with _log_lock:
+                _flush_logs_locked(force=True)
+
+    t = threading.Thread(target=_loop, daemon=True)
+    t.start()
+
+
+def log_inference_row(row: dict) -> None:
+    if not LOG_ENABLED or _hf_api is None:
+        return
+    with _log_lock:
+        _log_buffer.append(row)
+        _flush_logs_locked(force=False)
+
+# --- Logging (Evidently-friendly) ---
+row = {
+    "timestamp_utc": _now_utc_iso(),
+    "model_version": MODEL_VERSION,
+    "source": "gradio",
+    "sk_id_curr": int(sk_id_curr),
+    "amt_credit_requested": float(amt_credit),
+    "duration_months": int(duration_months),
+    "probability": float(probability),
+    "prediction": int(pred_value),
+}
+# Add a few "business" features useful for drift (cat + num)
+# (you can add more if you want)
+for k, v in snapshot.items():
+    if k == "SK_ID_CURR":
+        continue
+    row[f"cust__{k}"] = v
+
+log_inference_row(row)
+
 
 if __name__ == "__main__":
     _ensure_startup()
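The block added above buffers log rows and uploads them as date-partitioned Parquet parts under `prod_logs/` in the dataset repo. To analyze them (e.g. for the drift report), the parts can be pulled back and concatenated; a minimal sketch, assuming read access to the dataset repo:

```python
# Minimal sketch: fetch the Parquet log parts and build one DataFrame.
# Assumes read access to the dataset repo; paths mirror LOG_PATH_PREFIX above.
import glob

import pandas as pd
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="stephmnt/assets-credit-scoring-mlops",
    repo_type="dataset",
    allow_patterns=["prod_logs/**"],
)
parts = glob.glob(f"{local_dir}/prod_logs/**/*.parquet", recursive=True)
logs_df = pd.concat((pd.read_parquet(p) for p in parts), ignore_index=True)
print(logs_df.shape)
```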
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py CHANGED
@@ -219,7 +219,7 @@ def _hash_value(value: Any) -> str:
 
 
 def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
-    if pd.isna(value):
+    if pd.isna(value):  # type: ignore
         return np.nan
     key = str(value).strip().upper()
     if not key:
@@ -265,12 +265,12 @@ def _normalize_inputs(
     unknown_masks: dict[str, pd.Series] = {}
     if "CODE_GENDER" in df.columns:
         raw = df["CODE_GENDER"]
-        normalized = raw.apply(lambda v: _normalize_category_value(v, CODE_GENDER_MAPPING))
+        normalized = raw.apply(lambda v: _normalize_category_value(v, CODE_GENDER_MAPPING))  # type: ignore
         unknown_masks["CODE_GENDER"] = normalized.eq("Unknown") & raw.notna()
         df["CODE_GENDER"] = normalized
     if "FLAG_OWN_CAR" in df.columns:
         raw = df["FLAG_OWN_CAR"]
-        normalized = raw.apply(lambda v: _normalize_category_value(v, FLAG_OWN_CAR_MAPPING))
+        normalized = raw.apply(lambda v: _normalize_category_value(v, FLAG_OWN_CAR_MAPPING))  # type: ignore
         unknown_masks["FLAG_OWN_CAR"] = normalized.eq("Unknown") & raw.notna()
         df["FLAG_OWN_CAR"] = normalized
 
@@ -404,7 +404,7 @@ def _build_minimal_record(
     )
     if "AMT_GOODS_PRICE" in record:
         record["AMT_GOODS_PRICE"] = float(payload.amt_credit)
-    return record
+    return record  # type: ignore
 
 
 def _append_log_entries(entries: list[dict[str, Any]]) -> None:
@@ -1576,7 +1576,7 @@ def _predict_records(
     latency_ms = (time.perf_counter() - start_time) * 1000.0
     _log_prediction_entries(
         request_id=request_id,
-        records=log_records,
+        records=log_records,  # type: ignore
         results=results,
         latency_ms=latency_ms,
         threshold=use_threshold,
@@ -1598,7 +1598,7 @@ def _predict_records(
     latency_ms = (time.perf_counter() - start_time) * 1000.0
     _log_prediction_entries(
         request_id=request_id,
-        records=log_records,
+        records=log_records,  # type: ignore
         results=results,
         latency_ms=latency_ms,
         threshold=None,
@@ -1613,7 +1613,7 @@ def _predict_records(
     detail = exc.detail if isinstance(exc.detail, dict) else {"message": str(exc.detail)}
     _log_prediction_entries(
         request_id=request_id,
-        records=log_records if "log_records" in locals() else records,
+        records=log_records if "log_records" in locals() else records,  # type: ignore
         results=None,
         latency_ms=latency_ms,
         threshold=threshold,
@@ -1628,7 +1628,7 @@ def _predict_records(
     latency_ms = (time.perf_counter() - start_time) * 1000.0
     _log_prediction_entries(
         request_id=request_id,
-        records=log_records if "log_records" in locals() else records,
+        records=log_records if "log_records" in locals() else records,  # type: ignore
         results=None,
         latency_ms=latency_ms,
         threshold=threshold,
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/HistGB_final_model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c7b31d6b2aa9d622717d03b6eaf79e6e21297869ff401f2f61a2d688cc55d6f
+size 411244
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.env.example ADDED
@@ -0,0 +1,46 @@
+# Core paths
+MODEL_PATH=data/HistGB_final_model.pkl
+DATA_PATH=data/data_final.parquet
+ARTIFACTS_PATH=artifacts/preprocessor.joblib
+
+# Prediction behavior
+PREDICTION_THRESHOLD=0.5
+CACHE_PREPROCESSOR=1
+USE_REDUCED_INPUTS=1
+ALLOW_MISSING_ARTIFACTS=0
+MISSING_INDICATOR_MIN_RATE=0.05
+
+# Feature selection (correlation)
+FEATURE_SELECTION_METHOD=correlation
+FEATURE_SELECTION_TOP_N=8
+FEATURE_SELECTION_MIN_CORR=0.02
+CORRELATION_THRESHOLD=0.85
+CORRELATION_SAMPLE_SIZE=50000
+
+# Logging
+LOG_PREDICTIONS=1
+LOG_DIR=logs
+LOG_FILE=predictions.jsonl
+LOG_INCLUDE_INPUTS=1
+LOG_HASH_SK_ID=0
+MODEL_VERSION=HistGB_final_model.pkl
+LOGS_ACCESS_TOKEN=
+
+# Customer reference lookup
+CUSTOMER_DATA_PATH=data/data_final.parquet
+CUSTOMER_LOOKUP_ENABLED=1
+CUSTOMER_LOOKUP_CACHE=1
+
+# Hugging Face assets (optional)
+HF_MODEL_REPO_ID=stephmnt/assets-credit-scoring-mlops
+HF_MODEL_REPO_TYPE=model
+HF_MODEL_FILENAME=HistGB_final_model.pkl
+HF_PREPROCESSOR_REPO_ID=stephmnt/assets-credit-scoring-mlops
+HF_PREPROCESSOR_REPO_TYPE=model
+HF_PREPROCESSOR_FILENAME=preprocessor.joblib
+HF_CUSTOMER_REPO_ID=stephmnt/assets-credit-scoring-mlops
+HF_CUSTOMER_REPO_TYPE=dataset
+HF_CUSTOMER_FILENAME=data_final.parquet
+
+# MLflow
+MLFLOW_TRACKING_URI=http://127.0.0.1:5000
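The app reads these settings with `os.getenv`, so `.env.example` only documents names and defaults. A minimal way to load a local copy, assuming the common `python-dotenv` package (an assumption; it does not appear in this diff):

```python
# Minimal sketch (assumes python-dotenv is installed; not part of this commit).
import os

from dotenv import load_dotenv

load_dotenv(".env")  # copy .env.example to .env first and fill in secrets
print(os.getenv("MODEL_PATH"))            # data/HistGB_final_model.pkl
print(os.getenv("PREDICTION_THRESHOLD"))  # 0.5
```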
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml CHANGED
@@ -46,24 +46,23 @@ jobs:
           repo_type = os.environ["HF_REPO_TYPE"]
           token = os.environ["HF_TOKEN"]
 
-          files = {
-              "data/HistGB_final_model.pkl": "HistGB_final_model.pkl",
-              "artifacts/preprocessor.joblib": "preprocessor.joblib",
-              "data/data_final.parquet": "data_final.parquet",
-          }
+          candidates = sorted(Path("data").glob("*_final_model.pkl"))
+          if not candidates:
+              raise SystemExit("Missing model file: data/*_final_model.pkl")
+          if len(candidates) > 1:
+              names = ", ".join(path.name for path in candidates)
+              raise SystemExit(f"Multiple *_final_model.pkl files found: {names}")
+          model_path = candidates[0]
 
           api = HfApi()
-          for local_path, remote_name in files.items():
-              path = Path(local_path)
-              if not path.exists():
-                  raise SystemExit(f"Missing file: {path}")
+          for path in [model_path]:
               api.upload_file(
                   path_or_fileobj=str(path),
-                  path_in_repo=remote_name,
+                  path_in_repo=path.name,
                   repo_id=repo_id,
                   repo_type=repo_type,
                   token=token,
-                  commit_message=f"Update {remote_name}",
+                  commit_message=f"Update {path.name}",
               )
           print("Assets uploaded.")
           PY
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml CHANGED
@@ -49,10 +49,11 @@ jobs:
             --exclude 'logs' \
             --exclude 'reports' \
             --exclude 'screen-mlflow.png' \
-            --exclude 'data/HistGB_final_model.pkl' \
+            --exclude 'data/*_final_model.pkl' \
             --exclude 'artifacts/preprocessor.joblib' \
             --exclude 'data/*.csv' \
             --exclude 'data/*.parquet' \
+            --exclude 'notebooks/mlflow.db' \
             ./ hf_space/
           cd hf_space
           git add .
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile CHANGED
@@ -10,6 +10,7 @@ RUN pip install --no-cache-dir -r requirements.txt
 
 COPY app/ app/
 COPY app_entry.py app.py gradio_app.py ./
+COPY src/ src/
 COPY data/ data/
 COPY artifacts/ artifacts/
 
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md CHANGED
@@ -14,6 +14,18 @@ pinned: false
 [![GitHub Release Date](https://img.shields.io/github/release-date/stephmnt/credit-scoring-mlops?display_date=published_at&style=flat-square)](https://github.com/stephmnt/credit-scoring-mlops/releases)
 [![project_license](https://img.shields.io/github/license/stephmnt/credit-scoring-mlops.svg)](https://github.com/stephmnt/credit-scoring-mlops/blob/main/LICENSE)

+## Quick structure
+
+- `app/` FastAPI API + inference preprocessing
+- `monitoring/` drift report + Streamlit
+- `notebooks/` exploration + modeling
+- `src/` ML utilities (feature engineering / pipeline)
+- `docs/` evidence & reports (monitoring, perf)
+- `tests/` unit/integration tests
+
+Feature engineering is factored out into `src/features.py` and reused by
+both the notebooks and the API to avoid training-serving skew.
+
 ## Running MLflow

 The notebook is configured to use a local MLflow server (`http://127.0.0.1:5000`).
@@ -75,6 +87,28 @@ pytest -q
 uvicorn app.main:app --reload --port 7860
 ```

+### DEV workflow (notebooks)
+
+Recommended order (dev only):
+
+1. `notebooks/P6_MANET_Stephane_notebook_exploration.ipynb` → generates `data/data_final.parquet` (overwritten).
+2. `notebooks/P6_MANET_Stephane_notebook_compare_tuning_mlflow.ipynb` → comparison + tuning, MLflow logging, writes `reports/best_model.json`.
+3. `notebooks/P6_MANET_Stephane_notebook_modélisation.ipynb` → rebuilds the preprocessor, trains the final model, exports `data/<model>_final_model.pkl`.
+4. Manually trigger the `deploy-assets.yml` workflow to push `data/*_final_model.pkl`.
+
+Note: these notebooks stay dev-only. Production code lives in `app/` and `monitoring/`.
+
+### Configuration (.env)
+
+Duplicate `.env.example` as `.env` if you want to override paths,
+thresholds, or Hugging Face sources.
+The `MISSING_INDICATOR_MIN_RATE` threshold restricts the `is_missing_*`
+columns to features with a NaN rate >= 5% (default).
+
+```shell
+cp .env.example .env
+```
+
 ### Poetry environment (deliverable)

 The deliverable ships `pyproject.toml`, aligned with `requirements.txt`. If needed:
@@ -85,9 +119,9 @@ poetry run pytest -q
 poetry run uvicorn app.main:app --reload --port 7860
 ```

-Important: the `HistGB_final_model.pkl` model must be regenerated with the
+Important: the `*_final_model.pkl` model must be regenerated with the
 scikit-learn version pinned in `requirements.txt` / `pyproject.toml`
-(re-run `P6_MANET_Stephane_notebook_modélisation.ipynb`, pickle-save cell).
+(re-run `notebooks/P6_MANET_Stephane_notebook_modélisation.ipynb`, pickle-save cell).

 ### Input example (schema + values)
@@ -158,10 +192,13 @@ Useful variables:
 ### Data contract (validation)

 - Strict numeric types (invalid values -> 422).
-- Numeric ranges (training min/max) enforced.
+- Numeric ranges (training min/max) enforced, except `SK_ID_CURR` (an ID).
 - Categoricals normalized: `CODE_GENDER` -> {`F`, `M`}, `FLAG_OWN_CAR` -> {`Y`, `N`}.
-- `DAYS_EMPLOYED=365243` sentinel replaced by NaN.
-- Logs enriched via `data_quality` to tell drift apart from data-quality issues.
+- `DAYS_EMPLOYED=365243` sentinel replaced by NaN + `DAYS_EMPLOYED_ANOM` flag.
+- Safe ratios (guarded division by zero) + `DENOM_ZERO_*` flags.
+- Outliers clipped (p1/p99) + `is_outlier_*` flags.
+- Missingness indicators `is_missing_*` for numeric features with a NaN rate >= 5%.
+- Logs enriched via `data_quality` and `source` to tell drift apart from data-quality issues.

 ### Gradio interface (scoring)
@@ -186,13 +223,13 @@ the following variables are defined:

 Example (a single dataset repo with 3 files):

-- `HF_MODEL_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_MODEL_REPO_ID=stephmnt/assets-credit-scoring-mlops`
 - `HF_MODEL_REPO_TYPE=dataset`
-- `HF_MODEL_FILENAME=HistGB_final_model.pkl`
-- `HF_PREPROCESSOR_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_MODEL_FILENAME=histgb_final_model.pkl` (or `lgbm_final_model.pkl` / `xgb_final_model.pkl`)
+- `HF_PREPROCESSOR_REPO_ID=stephmnt/assets-credit-scoring-mlops`
 - `HF_PREPROCESSOR_REPO_TYPE=dataset`
 - `HF_PREPROCESSOR_FILENAME=preprocessor.joblib`
-- `HF_CUSTOMER_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_CUSTOMER_REPO_ID=stephmnt/assets-credit-scoring-mlops`
 - `HF_CUSTOMER_REPO_TYPE=dataset`
 - `HF_CUSTOMER_FILENAME=data_final.parquet`
@@ -311,8 +348,11 @@ Useful variables:
 - `LOG_HASH_SK_ID=1` to anonymize `SK_ID_CURR`

 Logs include one `data_quality` block per request (missing fields,
-invalid types, out-of-range values, unknown categories, `DAYS_EMPLOYED`
-sentinel).
+invalid types, out-of-range values, outliers, unknown categories,
+`DAYS_EMPLOYED` sentinel) plus a `source` field (api/gradio/etc.).
+
+Tip: you can pass an `X-Client-Source` header to tag where requests
+come from (e.g. `gradio`, `test`, `batch`).

 Local example:
@@ -359,6 +399,7 @@ Built-in robustness:
 - Categoricals: PSI with smoothing (`--psi-eps`) + rare categories grouped as OTHER.
 - Numerics: KS corrected by FDR (Benjamini-Hochberg, `--fdr-alpha`).
 - `DAYS_EMPLOYED` sentinel: converted to NaN + rate tracked.
+- Outliers: p1/p99 clipping + rate tracked via `data_quality`.

 The report also includes the predicted-score distribution and the prediction rate
 (option `--score-bins` to adjust the number of bins), plus a section
@@ -379,12 +420,8 @@ Screenshots (local snapshot of the reporting + storage):

 Inference profiling and benchmark (cProfile + latency):

-```shell
-python profiling/profile_inference.py \
-  --sample-size 2000 \
-  --batch-size 128 \
-  --runs 3
-```
+- Now done from the modeling notebook (section TODO 5).
+- The old script is archived at `dev_archive/profiling/profile_inference.py`.

 Outputs:

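The asset-fetch and API-call paths documented in this README can be exercised directly; a minimal sketch (repo and file names as listed above; the JSON field names for `/predict-minimal` are assumed from the Pydantic model, and the API is assumed to run locally on port 7860):

```python
# Sketch: fetch a deployed asset, then score through the API with a tagged source.
import requests
from huggingface_hub import hf_hub_download

# 1) Resolve an asset from the single dataset repo documented above.
model_path = hf_hub_download(
    repo_id="stephmnt/assets-credit-scoring-mlops",
    filename="histgb_final_model.pkl",
    repo_type="dataset",
)
print("cached at:", model_path)  # usable as MODEL_PATH

# 2) Score a minimal record; X-Client-Source shows up as `source` in the logs.
resp = requests.post(
    "http://127.0.0.1:7860/predict-minimal",
    params={"threshold": 0.5},
    json={"sk_id_curr": 100001, "amt_credit": 200000, "duration_months": 60},
    headers={"X-Client-Source": "batch"},
    timeout=30,
)
print(resp.status_code, resp.json())  # 422 signals a data-contract violation
```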
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py CHANGED
@@ -20,9 +20,33 @@ from pydantic import BaseModel
 from sklearn.preprocessing import MinMaxScaler
 import joblib

+from src.features import (
+    add_missingness_indicators,
+    apply_outlier_clipping,
+    compute_outlier_bounds,
+    new_features_creation,
+    select_missing_indicator_columns,
+)
+
 logger = logging.getLogger("uvicorn.error")

-MODEL_PATH = Path(os.getenv("MODEL_PATH", "data/HistGB_final_model.pkl"))
+def _resolve_model_path() -> Path:
+    env_path = os.getenv("MODEL_PATH")
+    if env_path:
+        return Path(env_path)
+    candidates = sorted(Path("data").glob("*_final_model.pkl"))
+    if len(candidates) == 1:
+        return candidates[0]
+    if candidates:
+        logger.warning(
+            "Multiple *_final_model.pkl files found; set MODEL_PATH explicitly. Using %s",
+            candidates[0],
+        )
+        return candidates[0]
+    return Path("data/histgb_final_model.pkl")
+
+
+MODEL_PATH = _resolve_model_path()
 DATA_PATH = Path(os.getenv("DATA_PATH", "data/data_final.parquet"))
 ARTIFACTS_PATH = Path(os.getenv("ARTIFACTS_PATH", "artifacts/preprocessor.joblib"))
 DEFAULT_THRESHOLD = float(os.getenv("PREDICTION_THRESHOLD", "0.5"))
@@ -56,11 +80,17 @@ HF_CUSTOMER_FILENAME = os.getenv("HF_CUSTOMER_FILENAME", CUSTOMER_DATA_PATH.name

 IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
 ENGINEERED_FEATURES = [
+    "DAYS_EMPLOYED_ANOM",
     "DAYS_EMPLOYED_PERC",
     "INCOME_CREDIT_PERC",
     "INCOME_PER_PERSON",
     "ANNUITY_INCOME_PERC",
     "PAYMENT_RATE",
+    "DENOM_ZERO_DAYS_EMPLOYED_PERC",
+    "DENOM_ZERO_INCOME_CREDIT_PERC",
+    "DENOM_ZERO_INCOME_PER_PERSON",
+    "DENOM_ZERO_ANNUITY_INCOME_PERC",
+    "DENOM_ZERO_PAYMENT_RATE",
 ]
 ENGINEERED_SOURCES = [
     "DAYS_EMPLOYED",
@@ -98,6 +128,9 @@ OUTLIER_COLUMNS = [
     "AMT_REQ_CREDIT_BUREAU_YEAR",
     "AMT_REQ_CREDIT_BUREAU_QRT",
 ]
+OUTLIER_LOWER_Q = 0.01
+OUTLIER_UPPER_Q = 0.99
+MISSING_INDICATOR_MIN_RATE = float(os.getenv("MISSING_INDICATOR_MIN_RATE", "0.05"))

 CODE_GENDER_MAPPING = {
     "F": "F",
@@ -143,6 +176,8 @@ class PreprocessorArtifacts:
     numeric_medians: dict[str, float]
     categorical_columns: list[str]
     outlier_maxes: dict[str, float]
+    outlier_bounds: dict[str, tuple[float, float]]
+    missing_indicator_columns: list[str]
     numeric_ranges: dict[str, tuple[float, float]]
     features_to_scaled: list[str]
     scaler: MinMaxScaler
@@ -243,6 +278,7 @@ def _normalize_inputs(
     if "DAYS_EMPLOYED" in df.columns:
         values = pd.to_numeric(df["DAYS_EMPLOYED"], errors="coerce")
         sentinel_mask = values == DAYS_EMPLOYED_SENTINEL
+        df["DAYS_EMPLOYED_ANOM"] = sentinel_mask.astype(int)
         if sentinel_mask.any():
             df.loc[sentinel_mask, "DAYS_EMPLOYED"] = np.nan

@@ -267,6 +303,7 @@ def _build_data_quality_records(
     missing_mask = df_norm[required_cols].isna() if required_cols else pd.DataFrame(index=df_norm.index)
     invalid_masks: dict[str, pd.Series] = {}
     out_of_range_masks: dict[str, pd.Series] = {}
+    outlier_masks: dict[str, pd.Series] = {}

     for col in numeric_required:
         if col not in df_raw.columns:
@@ -283,6 +320,13 @@
         values = pd.to_numeric(df_norm[col], errors="coerce")
         out_of_range_masks[col] = (values < min_val) | (values > max_val)

+    for col, (low, high) in getattr(preprocessor, "outlier_bounds", {}).items():
+        if col not in df_norm.columns:
+            outlier_masks[col] = pd.Series(False, index=df_norm.index)
+            continue
+        values = pd.to_numeric(df_norm[col], errors="coerce")
+        outlier_masks[col] = (values < low) | (values > high)
+
     records: list[dict[str, Any]] = []
     for idx in df_norm.index:
         missing_cols = (
@@ -292,18 +336,26 @@
         )
         invalid_cols = [col for col, mask in invalid_masks.items() if mask.at[idx]]
         out_of_range_cols = [col for col, mask in out_of_range_masks.items() if mask.at[idx]]
+        outlier_cols = [col for col, mask in outlier_masks.items() if mask.at[idx]]
         unknown_cols = [col for col, mask in unknown_masks.items() if mask.at[idx]]
+        unknown_values = {
+            col: df_raw.at[idx, col]
+            for col in unknown_cols
+            if col in df_raw.columns
+        }
         nan_rate = float(missing_mask.loc[idx].mean()) if not missing_mask.empty else 0.0
-        records.append(
-            {
-                "missing_required_columns": missing_cols,
-                "invalid_numeric_columns": invalid_cols,
-                "out_of_range_columns": out_of_range_cols,
-                "unknown_categories": unknown_cols,
-                "days_employed_sentinel": bool(sentinel_mask.at[idx]) if not sentinel_mask.empty else False,
-                "nan_rate": nan_rate,
-            }
-        )
+        record = {
+            "missing_required_columns": missing_cols,
+            "invalid_numeric_columns": invalid_cols,
+            "out_of_range_columns": out_of_range_cols,
+            "outlier_columns": outlier_cols,
+            "unknown_categories": unknown_cols,
+            "days_employed_sentinel": bool(sentinel_mask.at[idx]) if not sentinel_mask.empty else False,
+            "nan_rate": nan_rate,
+        }
+        if unknown_values:
+            record["unknown_category_values"] = unknown_values
+        records.append(record)
     return records

@@ -376,6 +428,7 @@ def _log_prediction_entries(
     threshold: float | None,
     status_code: int,
     preprocessor: PreprocessorArtifacts,
+    source: str | None = None,
     data_quality: list[dict[str, Any]] | None = None,
     error: str | None = None,
 ) -> None:
@@ -400,6 +453,7 @@
             "status_code": status_code,
             "model_version": MODEL_VERSION,
             "threshold": threshold,
+            "source": source or "api",
             "inputs": inputs,
         }
         if data_quality and idx < len(data_quality):
@@ -420,25 +474,16 @@
     _append_log_entries(entries)


-def new_features_creation(df: pd.DataFrame) -> pd.DataFrame:
-    df_features = df.copy()
-    for col in ENGINEERED_SOURCES:
-        if col not in df_features.columns:
-            df_features[col] = np.nan
-    df_features["DAYS_EMPLOYED_PERC"] = df_features["DAYS_EMPLOYED"] / df_features["DAYS_BIRTH"]
-    df_features["INCOME_CREDIT_PERC"] = df_features["AMT_INCOME_TOTAL"] / df_features["AMT_CREDIT"]
-    df_features["INCOME_PER_PERSON"] = df_features["AMT_INCOME_TOTAL"] / df_features["CNT_FAM_MEMBERS"]
-    df_features["ANNUITY_INCOME_PERC"] = df_features["AMT_ANNUITY"] / df_features["AMT_INCOME_TOTAL"]
-    df_features["PAYMENT_RATE"] = df_features["AMT_ANNUITY"] / df_features["AMT_CREDIT"]
-    return df_features
-
-
 def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
     df = pd.read_parquet(data_path)
     raw_feature_columns = df.columns.tolist()
     input_feature_columns = [c for c in raw_feature_columns if c not in ["is_train", "is_test", "TARGET"]]

-    df = new_features_creation(df)
+    df = new_features_creation(
+        df,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
     df.replace([np.inf, -np.inf], np.nan, inplace=True)

     missing_rate = df.isna().mean()
@@ -448,6 +493,26 @@ def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
     df = df[columns_keep]
     df = df.dropna(subset=columns_must_not_missing)

+    if "CODE_GENDER" in df.columns:
+        df = df[df["CODE_GENDER"] != "XNA"]
+
+    missing_indicator_columns = select_missing_indicator_columns(
+        df,
+        exclude_cols=set(IGNORE_FEATURES),
+        min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+    )
+    df = add_missingness_indicators(df, missing_indicator_columns)
+
+    outlier_bounds = compute_outlier_bounds(
+        df,
+        OUTLIER_COLUMNS,
+        lower_q=OUTLIER_LOWER_Q,
+        upper_q=OUTLIER_UPPER_Q,
+    )
+    df = apply_outlier_clipping(df, outlier_bounds)
+
+    columns_keep = df.columns.tolist()
+
     numeric_cols = df.select_dtypes(include=["number"]).columns
     numeric_medians = df[numeric_cols].median().to_dict()
     df[numeric_cols] = df[numeric_cols].fillna(numeric_medians)
@@ -455,12 +520,7 @@
     categorical_columns = df.select_dtypes(include=["object"]).columns.tolist()
     df[categorical_columns] = df[categorical_columns].fillna("Unknown")

-    if "CODE_GENDER" in df.columns:
-        df = df[df["CODE_GENDER"] != "XNA"]
-
-    outlier_maxes = {col: df[col].max() for col in OUTLIER_COLUMNS if col in df.columns}
-    for col, max_val in outlier_maxes.items():
-        df = df[df[col] != max_val]
+    outlier_maxes = {col: bounds[1] for col, bounds in outlier_bounds.items()}

     reduced_input_columns, selection_scores, selection_method = _compute_reduced_inputs(
         df,
@@ -487,7 +547,11 @@
         required_input = _fallback_reduced_inputs(input_feature_columns)
     else:
         required_input = sorted(required_raw)
-    numeric_required = sorted(col for col in required_input if col in numeric_medians)
+    numeric_required = sorted(
+        col
+        for col in required_input
+        if col in numeric_medians and col != "SK_ID_CURR"
+    )
     correlated_imputation = _build_correlated_imputation(
         df,
         input_feature_columns=input_feature_columns,
@@ -501,6 +565,8 @@
         numeric_medians={k: float(v) for k, v in numeric_medians.items()},
         categorical_columns=categorical_columns,
         outlier_maxes={k: float(v) for k, v in outlier_maxes.items()},
+        outlier_bounds={k: (float(v[0]), float(v[1])) for k, v in outlier_bounds.items()},
+        missing_indicator_columns=missing_indicator_columns,
         numeric_ranges=numeric_ranges,
         features_to_scaled=features_to_scaled,
         scaler=scaler,
@@ -554,9 +620,28 @@ def build_fallback_preprocessor() -> PreprocessorArtifacts:
         ]
     )

-    df = new_features_creation(base)
+    df = new_features_creation(
+        base,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
     df.replace([np.inf, -np.inf], np.nan, inplace=True)

+    missing_indicator_columns = select_missing_indicator_columns(
+        df,
+        exclude_cols=set(IGNORE_FEATURES),
+        min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+    )
+    df = add_missingness_indicators(df, missing_indicator_columns)
+
+    outlier_bounds = compute_outlier_bounds(
+        df,
+        OUTLIER_COLUMNS,
+        lower_q=OUTLIER_LOWER_Q,
+        upper_q=OUTLIER_UPPER_Q,
+    )
+    df = apply_outlier_clipping(df, outlier_bounds)
+
     columns_keep = df.columns.tolist()
     columns_must_not_missing = [col for col in columns_keep if col not in IGNORE_FEATURES]

@@ -579,7 +664,9 @@
     required_raw.update(col for col in columns_must_not_missing if col in input_feature_columns)
     required_raw.add("SK_ID_CURR")
     required_input = _fallback_reduced_inputs(input_feature_columns)
-    numeric_required = sorted(col for col in required_input if col in numeric_medians)
+    numeric_required = sorted(
+        col for col in required_input if col in numeric_medians and col != "SK_ID_CURR"
+    )

     numeric_ranges = {col: (float(df[col].min()), float(df[col].max())) for col in numeric_cols}

@@ -588,7 +675,9 @@
         columns_must_not_missing=columns_must_not_missing,
         numeric_medians={k: float(v) for k, v in numeric_medians.items()},
         categorical_columns=categorical_columns,
-        outlier_maxes={},
+        outlier_maxes={k: float(v[1]) for k, v in outlier_bounds.items()},
+        outlier_bounds={k: (float(v[0]), float(v[1])) for k, v in outlier_bounds.items()},
+        missing_indicator_columns=missing_indicator_columns,
         numeric_ranges=numeric_ranges,
         features_to_scaled=features_to_scaled,
         scaler=scaler,
@@ -633,7 +722,9 @@ def load_preprocessor(data_path: Path, artifacts_path: Path) -> PreprocessorArti
         updated = True
     if not hasattr(preprocessor, "numeric_required_columns"):
         preprocessor.numeric_required_columns = sorted(
-            col for col in preprocessor.required_input_columns if col in preprocessor.numeric_medians
+            col
+            for col in preprocessor.required_input_columns
+            if col in preprocessor.numeric_medians and col != "SK_ID_CURR"
         )
         updated = True
     if not hasattr(preprocessor, "numeric_ranges"):
@@ -646,6 +737,56 @@
             raise RuntimeError(f"Data file not found to rebuild preprocessor: {data_path}")
         preprocessor = build_preprocessor(data_path)
         updated = True
+    needs_missing_indicators = (
+        not hasattr(preprocessor, "missing_indicator_columns")
+        or not preprocessor.missing_indicator_columns
+    )
+    needs_outlier_bounds = (
+        not hasattr(preprocessor, "outlier_bounds") or not preprocessor.outlier_bounds
+    )
+    prepared_df = None
+    if (needs_missing_indicators or needs_outlier_bounds) and data_path.exists():
+        prepared_df = pd.read_parquet(data_path)
+        prepared_df = new_features_creation(
+            prepared_df,
+            days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+            engineered_sources=ENGINEERED_SOURCES,
+        )
+        prepared_df.replace([np.inf, -np.inf], np.nan, inplace=True)
+        if preprocessor.columns_keep:
+            prepared_df = prepared_df[preprocessor.columns_keep]
+        if preprocessor.columns_must_not_missing:
+            prepared_df = prepared_df.dropna(subset=preprocessor.columns_must_not_missing)
+        if "CODE_GENDER" in prepared_df.columns:
+            prepared_df = prepared_df[prepared_df["CODE_GENDER"] != "XNA"]
+    if needs_missing_indicators:
+        if prepared_df is not None:
+            preprocessor.missing_indicator_columns = select_missing_indicator_columns(
+                prepared_df,
+                exclude_cols=set(IGNORE_FEATURES),
+                min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+            )
+        else:
+            preprocessor.missing_indicator_columns = []
+        updated = True
+    if needs_outlier_bounds:
+        if prepared_df is not None:
+            preprocessor.outlier_bounds = compute_outlier_bounds(
+                prepared_df,
+                OUTLIER_COLUMNS,
+                lower_q=OUTLIER_LOWER_Q,
+                upper_q=OUTLIER_UPPER_Q,
+            )
+        else:
+            preprocessor.outlier_bounds = {}
+            for col, max_val in getattr(preprocessor, "outlier_maxes", {}).items():
+                min_val = None
+                if hasattr(preprocessor, "numeric_ranges") and col in preprocessor.numeric_ranges:
+                    min_val = preprocessor.numeric_ranges[col][0]
+                if min_val is None:
+                    min_val = float("-inf")
+                preprocessor.outlier_bounds[col] = (float(min_val), float(max_val))
+        updated = True
     if USE_REDUCED_INPUTS:
         reduced = _reduce_input_columns(preprocessor)
         if preprocessor.required_input_columns != reduced:
@@ -658,7 +799,9 @@
             required_updated = True
             updated = True
         desired_numeric_required = sorted(
-            col for col in preprocessor.required_input_columns if col in preprocessor.numeric_medians
+            col
+            for col in preprocessor.required_input_columns
+            if col in preprocessor.numeric_medians and col != "SK_ID_CURR"
        )
         if getattr(preprocessor, "numeric_required_columns", None) != desired_numeric_required:
             preprocessor.numeric_required_columns = desired_numeric_required
@@ -890,7 +1033,11 @@ def _compute_reduced_inputs_from_data(
     if not data_path.exists():
         return _fallback_reduced_inputs(preprocessor.input_feature_columns), {}, "default"
     df = pd.read_parquet(data_path)
-    df = new_features_creation(df)
+    df = new_features_creation(
+        df,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
     df.replace([np.inf, -np.inf], np.nan, inplace=True)

     if preprocessor.columns_keep:
@@ -908,9 +1055,25 @@
     if "CODE_GENDER" in df.columns:
         df = df[df["CODE_GENDER"] != "XNA"]

-    for col, max_val in preprocessor.outlier_maxes.items():
-        if col in df.columns:
-            df = df[df[col] != max_val]
+    if getattr(preprocessor, "missing_indicator_columns", None):
+        df = add_missingness_indicators(df, preprocessor.missing_indicator_columns)
+    else:
+        df = add_missingness_indicators(
+            df,
+            select_missing_indicator_columns(
+                df,
+                exclude_cols=set(IGNORE_FEATURES),
+                min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+            ),
+        )
+
+    outlier_bounds = getattr(preprocessor, "outlier_bounds", {}) or compute_outlier_bounds(
+        df,
+        OUTLIER_COLUMNS,
+        lower_q=OUTLIER_LOWER_Q,
+        upper_q=OUTLIER_UPPER_Q,
+    )
+    df = apply_outlier_clipping(df, outlier_bounds)

     return _compute_reduced_inputs(df, input_feature_columns=preprocessor.input_feature_columns)

@@ -920,7 +1083,11 @@ def _compute_correlated_imputation(
     preprocessor: PreprocessorArtifacts,
 ) -> dict[str, dict[str, float | str]]:
     df = pd.read_parquet(data_path)
-    df = new_features_creation(df)
+    df = new_features_creation(
+        df,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
     df.replace([np.inf, -np.inf], np.nan, inplace=True)

     df = df[preprocessor.columns_keep]
@@ -936,9 +1103,25 @@
     if "CODE_GENDER" in df.columns:
         df = df[df["CODE_GENDER"] != "XNA"]

-    for col, max_val in preprocessor.outlier_maxes.items():
-        if col in df.columns:
-            df = df[df[col] != max_val]
+    if getattr(preprocessor, "missing_indicator_columns", None):
+        df = add_missingness_indicators(df, preprocessor.missing_indicator_columns)
+    else:
+        df = add_missingness_indicators(
+            df,
+            select_missing_indicator_columns(
+                df,
+                exclude_cols=set(IGNORE_FEATURES),
+                min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+            ),
+        )
+
+    outlier_bounds = getattr(preprocessor, "outlier_bounds", {}) or compute_outlier_bounds(
+        df,
+        OUTLIER_COLUMNS,
+        lower_q=OUTLIER_LOWER_Q,
+        upper_q=OUTLIER_UPPER_Q,
+    )
+    df = apply_outlier_clipping(df, outlier_bounds)

     return _build_correlated_imputation(
         df,
@@ -1048,11 +1231,30 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->
     if "TARGET" not in df.columns:
         df["TARGET"] = 0

-    df = new_features_creation(df)
+    df = new_features_creation(
+        df,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
     df.replace([np.inf, -np.inf], np.nan, inplace=True)

     df = df.reindex(columns=artifacts.columns_keep, fill_value=np.nan)

+    indicator_cols = getattr(artifacts, "missing_indicator_columns", None) or select_missing_indicator_columns(
+        df,
+        exclude_cols=set(IGNORE_FEATURES),
+        min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+    )
+    df = add_missingness_indicators(df, indicator_cols)
+
+    outlier_bounds = getattr(artifacts, "outlier_bounds", {}) or compute_outlier_bounds(
+        df,
+        OUTLIER_COLUMNS,
+        lower_q=OUTLIER_LOWER_Q,
+        upper_q=OUTLIER_UPPER_Q,
+    )
+    df = apply_outlier_clipping(df, outlier_bounds)
+
     _apply_correlated_imputation(df, artifacts)

     for col, median in artifacts.numeric_medians.items():
@@ -1072,16 +1274,6 @@
             detail={"message": "CODE_GENDER cannot be 'XNA' based on training rules."},
         )

-    for col, max_val in artifacts.outlier_maxes.items():
-        if col in df.columns and (df[col] >= max_val).any():
-            raise HTTPException(
-                status_code=422,
-                detail={
-                    "message": "Input contains outlier values removed during training.",
-                    "outlier_columns": [col],
-                },
-            )
-
     df_hot = pd.get_dummies(df, columns=artifacts.categorical_columns)
     df_hot = df_hot.reindex(columns=artifacts.features_to_scaled, fill_value=0)

@@ -1089,6 +1281,80 @@
     return pd.DataFrame(scaled, columns=artifacts.features_to_scaled, index=df.index)


+def _prepare_pipeline_input(
+    df_raw: pd.DataFrame,
+    artifacts: PreprocessorArtifacts,
+    model: Any,
+) -> pd.DataFrame:
+    df = df_raw.copy()
+
+    for col in artifacts.required_input_columns:
+        if col not in df.columns:
+            df[col] = np.nan
+
+    allow_missing = {"DAYS_EMPLOYED"}
+    _ensure_required_columns(df, artifacts.required_input_columns, allow_missing=allow_missing)
+    _validate_numeric_inputs(df, artifacts.numeric_required_columns)
+    _validate_numeric_ranges(
+        df,
+        {k: v for k, v in artifacts.numeric_ranges.items() if k in artifacts.numeric_required_columns},
+    )
+
+    df["is_train"] = 0
+    df["is_test"] = 1
+    if "TARGET" not in df.columns:
+        df["TARGET"] = 0
+
+    df = new_features_creation(
+        df,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
+    df.replace([np.inf, -np.inf], np.nan, inplace=True)
+
+    df = df.reindex(columns=artifacts.columns_keep, fill_value=np.nan)
+
+    indicator_cols = getattr(artifacts, "missing_indicator_columns", None) or select_missing_indicator_columns(
+        df,
+        exclude_cols=set(IGNORE_FEATURES),
+        min_missing_rate=MISSING_INDICATOR_MIN_RATE,
+    )
+    df = add_missingness_indicators(df, indicator_cols)
+
+    outlier_bounds = getattr(artifacts, "outlier_bounds", {}) or compute_outlier_bounds(
+        df,
+        OUTLIER_COLUMNS,
+        lower_q=OUTLIER_LOWER_Q,
+        upper_q=OUTLIER_UPPER_Q,
+    )
+    df = apply_outlier_clipping(df, outlier_bounds)
+
+    if "CODE_GENDER" in df.columns and (df["CODE_GENDER"] == "XNA").any():
+        raise HTTPException(
+            status_code=422,
+            detail={"message": "CODE_GENDER cannot be 'XNA' based on training rules."},
+        )
+
+    expected_cols = None
+    if hasattr(model, "named_steps"):
+        preprocessor = model.named_steps.get("preprocessing")
+        expected_cols = getattr(preprocessor, "feature_names_in_", None)
+    if expected_cols is None:
+        expected_cols = [c for c in artifacts.input_feature_columns if c not in IGNORE_FEATURES]
+
+    return df.reindex(columns=expected_cols, fill_value=np.nan)
+
+
+def prepare_inference_features(
+    df_raw: pd.DataFrame,
+    artifacts: PreprocessorArtifacts,
+    model: Any,
+) -> pd.DataFrame:
+    if hasattr(model, "named_steps") and model.named_steps.get("preprocessing") is not None:
+        return _prepare_pipeline_input(df_raw, artifacts, model)
+    return preprocess_input(df_raw, artifacts)
+
+
 @app.on_event("startup")
 def startup_event() -> None:
     if getattr(app.state, "model", None) is not None and getattr(app.state, "preprocessor", None) is not None:
@@ -1183,9 +1449,19 @@ def features(include_all: bool = Query(default=False)) -> dict[str, Any]:
         for col in preprocessor.required_input_columns
         if col in scores
     }
+    missing_indicator_features = [
+        f"is_missing_{col}"
+        for col in getattr(preprocessor, "missing_indicator_columns", []) or []
+    ]
+    outlier_indicator_features = [
+        f"is_outlier_{col}"
+        for col in getattr(preprocessor, "outlier_bounds", {}) or {}
+    ]
     payload = {
         "required_input_features": preprocessor.required_input_columns,
         "engineered_features": ENGINEERED_FEATURES,
+        "missing_indicator_features_count": len(missing_indicator_features),
+        "outlier_indicator_features_count": len(outlier_indicator_features),
         "model_features_count": len(preprocessor.features_to_scaled),
         "feature_selection_method": preprocessor.feature_selection_method,
         "feature_selection_top_n": FEATURE_SELECTION_TOP_N,
@@ -1198,6 +1474,8 @@
     if include_all:
         payload["input_features"] = preprocessor.input_feature_columns
         payload["optional_input_features"] = optional_features
+        payload["missing_indicator_features"] = missing_indicator_features
+        payload["outlier_indicator_features"] = outlier_indicator_features
     else:
         payload["input_features"] = preprocessor.required_input_columns
         payload["optional_input_features"] = []
@@ -1235,8 +1513,28 @@ def logs(

     return Response(content="".join(lines), media_type="application/x-ndjson")

+def _align_features_to_model(features: pd.DataFrame, model: Any) -> pd.DataFrame:
+    expected = getattr(model, "feature_names_in_", None)
+    if expected is None:
+        return features
+    expected = list(expected)
+
+    extra = [c for c in features.columns if c not in expected]
+    missing = [c for c in expected if c not in features.columns]
+    if extra or missing:
+        logger.warning(
+            "Feature mismatch: extra=%s missing=%s",
+            extra[:15],
+            missing[:15],
+        )
+    return features.reindex(columns=expected, fill_value=0)

-def _predict_records(records: list[dict[str, Any]], threshold: float | None) -> dict[str, Any]:
+def _predict_records(
+    records: list[dict[str, Any]],
+    threshold: float | None,
+    *,
+    source: str | None = None,
+) -> dict[str, Any]:
     model = app.state.model
     preprocessor: PreprocessorArtifacts = app.state.preprocessor
     request_id = str(uuid.uuid4())
@@ -1260,7 +1558,8 @@
         raise HTTPException(status_code=422, detail={"message": "SK_ID_CURR is required."})

     sk_ids = df_norm["SK_ID_CURR"].tolist()
-    features = preprocess_input(df_norm, preprocessor)
+    features = prepare_inference_features(df_norm, preprocessor, model)
+    features = _align_features_to_model(features, model)

     if hasattr(model, "predict_proba"):
         proba = model.predict_proba(features)[:, 1]
@@ -1283,6 +1582,7 @@
             threshold=use_threshold,
             status_code=200,
             preprocessor=preprocessor,
+            source=source,
             data_quality=dq_records,
         )
         return {"predictions": results, "threshold": use_threshold}
@@ -1304,6 +1604,7 @@
             threshold=None,
             status_code=200,
             preprocessor=preprocessor,
+            source=source,
             data_quality=dq_records,
         )
         return {"predictions": results, "threshold": None}
@@ -1318,6 +1619,7 @@
             threshold=threshold,
             status_code=exc.status_code,
             preprocessor=preprocessor,
+            source=source,
             data_quality=dq_records if "dq_records" in locals() else None,
             error=json.dumps(detail, ensure_ascii=True),
         )
@@ -1332,6 +1634,7 @@
             threshold=threshold,
             status_code=500,
             preprocessor=preprocessor,
+            source=source,
             data_quality=dq_records if "dq_records" in locals() else None,
             error=str(exc),
         )
@@ -1342,16 +1645,18 @@
 def predict(
     payload: PredictionRequest,
     threshold: float | None = Query(default=None, ge=0.0, le=1.0),
+    x_client_source: str | None = Header(default=None, alias="X-Client-Source"),
 ) -> dict[str, Any]:
     records = payload.data if isinstance(payload.data, list) else [payload.data]
-    return _predict_records(records, threshold)
+    return _predict_records(records, threshold, source=x_client_source)


 @app.post("/predict-minimal")
 def predict_minimal(
     payload: MinimalPredictionRequest,
     threshold: float | None = Query(default=None, ge=0.0, le=1.0),
+    x_client_source: str | None = Header(default=None, alias="X-Client-Source"),
 ) -> dict[str, Any]:
     preprocessor: PreprocessorArtifacts = app.state.preprocessor
     record = _build_minimal_record(payload, preprocessor)
-    return _predict_records([record], threshold)
+    return _predict_records([record], threshold, source=x_client_source)
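`src/features.py` itself is not part of this diff; only its call shapes are visible above. A hypothetical sketch of what those helpers could look like, inferred from how `app/main.py` calls them (the names are real, the bodies are assumptions):

```python
# Hypothetical sketch of the src.features helpers as used by app/main.py.
# The real implementations live in src/features.py (not shown in this diff).
import pandas as pd

def select_missing_indicator_columns(df, *, exclude_cols, min_missing_rate=0.05):
    # Numeric columns whose NaN rate reaches the threshold get an is_missing_* flag.
    rates = df.select_dtypes(include=["number"]).isna().mean()
    return sorted(c for c, r in rates.items() if r >= min_missing_rate and c not in exclude_cols)

def add_missingness_indicators(df, columns):
    out = df.copy()
    for col in columns:
        out[f"is_missing_{col}"] = out[col].isna().astype(int) if col in out.columns else 0
    return out

def compute_outlier_bounds(df, columns, *, lower_q=0.01, upper_q=0.99):
    # Per-column (p1, p99) bounds, matching OUTLIER_LOWER_Q / OUTLIER_UPPER_Q above.
    return {c: (float(df[c].quantile(lower_q)), float(df[c].quantile(upper_q)))
            for c in columns if c in df.columns}

def apply_outlier_clipping(df, bounds):
    # Flag then clip, so extreme inputs are kept (clipped) instead of rejected with 422.
    out = df.copy()
    for col, (low, high) in bounds.items():
        if col in out.columns:
            out[f"is_outlier_{col}"] = ((out[col] < low) | (out[col] > high)).astype(int)
            out[col] = out[col].clip(lower=low, upper=high)
    return out
```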
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/performance/performance_report.md CHANGED
@@ -6,7 +6,8 @@ Measure inference latency, identify bottlenecks, and propose

 ## Setup

-- Script: `profiling/profile_inference.py`
+- Script (archived): `dev_archive/profiling/profile_inference.py`
+- Current workflow: the modeling notebook (section TODO 5)
 - Data: `data/data_final.parquet` (sample)
 - Parameters: `--sample-size 500 --batch-size 100 --runs 2`
 - Model: `HistGB_final_model.pkl`
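To reproduce numbers of this kind outside the notebook, a minimal cProfile + latency harness sketch (parameters mirror the setup above; paths are assumptions, and the model may expect preprocessed features — this is a shape sketch, not the notebook's exact cell):

```python
# Sketch: cProfile + latency benchmark over batched predict_proba calls.
import cProfile
import pickle
import pstats
import time
from pathlib import Path

import pandas as pd

model = pickle.loads(Path("data/histgb_final_model.pkl").read_bytes())  # assumed path
df = pd.read_parquet("data/data_final.parquet").head(500)               # --sample-size 500
X = df.drop(columns=["TARGET", "is_train", "is_test"], errors="ignore")

def run(batch_size: int = 100) -> None:                                 # --batch-size 100
    for start in range(0, len(X), batch_size):
        model.predict_proba(X.iloc[start:start + batch_size])

for _ in range(2):                                                      # --runs 2
    t0 = time.perf_counter()
    run()
    print(f"latency: {time.perf_counter() - t0:.3f}s")

cProfile.run("run()", "profile.out")
pstats.Stats("profile.out").sort_stats("cumulative").print_stats(10)
```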
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py CHANGED
@@ -8,13 +8,17 @@ import pandas as pd
 from fastapi import HTTPException

 from app.main import (
+    DAYS_EMPLOYED_SENTINEL,
+    ENGINEERED_SOURCES,
+    MODEL_VERSION,
     MinimalPredictionRequest,
     app,
+    new_features_creation,
+    prepare_inference_features,
     predict_minimal,
     startup_event,
     _build_minimal_record,
     _normalize_inputs,
-    preprocess_input,
 )

@@ -45,7 +49,7 @@ def _shap_error_table(message: str) -> pd.DataFrame:
         [
             {
                 "feature": message,
-                "value": np.nan,
+                "raw_value": np.nan,
                 "shap_value": np.nan,
             }
         ]
@@ -63,38 +67,171 @@ def _extract_shap_values(shap_values: Any) -> np.ndarray:
     return values


+def _clean_raw_value(value: Any) -> Any:
+    if value is None or pd.isna(value):
+        return None
+    if isinstance(value, (np.integer, np.floating)):
+        return value.item()
+    return value
+
+
+def _strip_feature_prefix(feature_name: str) -> str:
+    return feature_name.split("__", 1)[1] if "__" in feature_name else feature_name
+
+
+def _lookup_raw_value(feature_name: str, raw_df: pd.DataFrame, preprocessor) -> Any:
+    cleaned_name = _strip_feature_prefix(feature_name)
+    if cleaned_name in raw_df.columns:
+        return raw_df.at[0, cleaned_name]
+    for prefix in ("is_missing_", "is_outlier_"):
+        if cleaned_name.startswith(prefix):
+            base = cleaned_name[len(prefix):]
+            if base in raw_df.columns:
+                return raw_df.at[0, base]
+    for col in getattr(preprocessor, "categorical_columns", []):
+        if cleaned_name.startswith(f"{col}_") and col in raw_df.columns:
+            return raw_df.at[0, col]
+    return None
+
+
+def _align_features_to_model(X: Any, model: Any) -> Any:
+    expected = getattr(model, "feature_names_in_", None)
+    if expected is None:
+        return X
+    if isinstance(X, pd.DataFrame):
+        return X.reindex(columns=list(expected), fill_value=0)
+    return X
+
+
+def _model_family(model: Any) -> str:
+    name = type(model).__name__.lower()
+    if "xgb" in name:
+        return "xgb"
+    if "lgbm" in name or "lightgbm" in name:
+        return "lgbm"
+    if "histgradientboosting" in name:
+        return "histgb"
+    return "unknown"
+
+
+def _xgb_pred_contribs(estimator: Any, X: Any) -> np.ndarray:
+    import xgboost as xgb
+
+    if isinstance(X, pd.DataFrame):
+        dm = xgb.DMatrix(X, feature_names=list(X.columns))
+    else:
+        dm = xgb.DMatrix(np.asarray(X))
+
+    booster = estimator.get_booster() if hasattr(estimator, "get_booster") else estimator
+    contrib = booster.predict(dm, pred_contribs=True)
+    return np.asarray(contrib)[:, :-1]
+
+
+def _lgbm_pred_contribs(estimator: Any, X: Any) -> np.ndarray:
+    contrib = estimator.predict(X, pred_contrib=True)
+    return np.asarray(contrib)[:, :-1]
+
+
 def _compute_shap_top_features(record: dict[str, Any], top_k: int = 10) -> pd.DataFrame:
     preprocessor = app.state.preprocessor
+    model = app.state.model
     df_raw = pd.DataFrame.from_records([record])
     df_norm, _, _ = _normalize_inputs(df_raw, preprocessor)
-    features = preprocess_input(df_norm, preprocessor)
+    raw_reference = new_features_creation(
+        df_norm,
+        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
+        engineered_sources=ENGINEERED_SOURCES,
+    )
+    features = prepare_inference_features(df_norm, preprocessor, model)
+    features = _align_features_to_model(features, model)
+
     try:
         import shap
     except ImportError:
         return _shap_error_table("SHAP not installed.")

-    explainer = getattr(app.state, "shap_explainer", None)
-    if explainer is None:
-        try:
-            explainer = shap.TreeExplainer(app.state.model)
-        except Exception:
-            explainer = shap.Explainer(app.state.model, features)
-        app.state.shap_explainer = explainer
-
-    try:
-        explanation = explainer(features)
-        values = _extract_shap_values(explanation.values)
-    except Exception:
-        values = _extract_shap_values(explainer.shap_values(features))
+    estimator = model
+    X_shap = features
+    if hasattr(model, "named_steps") and model.named_steps.get("preprocessing") is not None:
+        estimator = model.named_steps.get("estimator", model)
+        pipeline_preprocessor = model.named_steps["preprocessing"]
+        try:
+            X_shap = pipeline_preprocessor.transform(features)
+        except Exception as exc:
+            return _shap_error_table(f"SHAP preprocessing failed: {exc}")
+        try:
+            import scipy.sparse as sp
+            if sp.issparse(X_shap):
+                X_shap = X_shap.toarray()
+        except Exception:
+            pass
+        try:
+            feature_names = pipeline_preprocessor.get_feature_names_out()
+        except Exception:
+            feature_names = None
+        if feature_names is not None:
+            X_shap = pd.DataFrame(X_shap, columns=feature_names)
+
+    family = _model_family(estimator)
+
+    values: np.ndarray | None = None
+
+    # 1) Native contributions (best choice for XGB/LGBM)
+    try:
+        if family == "xgb":
+            values = _xgb_pred_contribs(estimator, X_shap)
+        elif family == "lgbm":
+            values = _lgbm_pred_contribs(estimator, X_shap)
+    except Exception:
+        values = None
+
+    # 2) SHAP fallback (mainly useful for HistGB / unknown models)
+    if values is None:
+        cache = getattr(app.state, "shap_explainer_cache", {})
+        key = f"{MODEL_VERSION}:{type(estimator).__name__}"
+        explainer = cache.get(key)
+
+        if explainer is None:
+            try:
+                import shap
+                predict_fn = (
+                    (lambda X: estimator.predict_proba(X)[:, 1])
+                    if hasattr(estimator, "predict_proba")
+                    else (lambda X: estimator.predict(X))
+                )
+
+                # Avoid a degenerate background (a single row)
+                if isinstance(X_shap, pd.DataFrame):
+                    bg = pd.concat([X_shap] * 50, ignore_index=True)
+                else:
+                    bg = np.repeat(np.asarray(X_shap), repeats=50, axis=0)
+
+                explainer = shap.Explainer(predict_fn, bg)
+            except Exception as exc:
+                return _shap_error_table(f"SHAP explainer init failed: {exc}")
+
+            cache[key] = explainer
+            app.state.shap_explainer_cache = cache
+
+        try:
+            import shap
+            explanation = explainer(X_shap)
+            values = _extract_shap_values(explanation.values)
+        except Exception as exc:
+            return _shap_error_table(f"SHAP failed: {exc}")

     shap_row = values[0]
-    feature_values = features.iloc[0].to_numpy()
+    if isinstance(X_shap, pd.DataFrame):
+        feature_values = X_shap.iloc[0].to_numpy()
+        feature_names = X_shap.columns
+    else:
+        feature_values = np.asarray(X_shap)[0]
+        feature_names = [f"feature_{idx}" for idx in range(len(feature_values))]
     top_idx = np.argsort(np.abs(shap_row))[::-1][:top_k]
     rows = [
         {
-            "feature": str(features.columns[idx]),
-            "value": float(feature_values[idx]),
-            "shap_value": float(shap_row[idx]),
+            "feature": str(feature_names[idx]),
+            "raw_value": _clean_raw_value(
+                _lookup_raw_value(str(feature_names[idx]), raw_reference, preprocessor)
+            ),
+            "shap_value": float(np.round(shap_row[idx], 6)),
         }
         for idx in top_idx
     ]
@@ -105,8 +242,7 @@ def score_minimal(
     sk_id_curr: float,
     amt_credit: float,
     duration_months: float,
-    threshold: float,
-) -> tuple[float | None, str, float | None, pd.DataFrame, dict[str, Any]]:
+) -> tuple[float | None, str, pd.DataFrame, dict[str, Any]]:
     _ensure_startup()
     try:
         payload = MinimalPredictionRequest(
@@ -115,7 +251,7 @@
             duration_months=int(duration_months),
         )
         record = _build_minimal_record(payload, app.state.preprocessor)
-        response = predict_minimal(payload, threshold=float(threshold))
+        response = predict_minimal(payload, threshold=None, x_client_source="gradio")
         result = response["predictions"][0]
         probability = float(result.get("probability", 0.0))
         pred_value = int(result.get("prediction", 0))
@@ -128,11 +264,11 @@
                 "DURATION_MONTHS": int(duration_months),
             }
         )
-        return probability, label, float(response.get("threshold", 0.0)), shap_table, snapshot
     except HTTPException as exc:
-        return None, f"Erreur: {exc.detail}", None, _shap_error_table("No SHAP available."), {"error": exc.detail}
     except Exception as exc:  # pragma: no cover - UI fallback
-        return None, f"Erreur: {exc}", None, _shap_error_table("No SHAP available."), {"error": str(exc)}


 with gr.Blocks(title="Credit scoring MLOps") as demo:
@@ -155,19 +291,17 @@
         sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
         amt_credit = gr.Number(label="Montant du crédit", value=200000)
         duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
-        threshold = gr.Slider(label="Seuil", minimum=0.0, maximum=1.0, value=0.5, step=0.01)

     run_btn = gr.Button("Scorer")

     with gr.Row():
         probability = gr.Number(label="Probabilité de défaut")
         prediction = gr.Textbox(label="Prédiction")
-        threshold_used = gr.Number(label="Seuil utilisé")

     shap_table = gr.Dataframe(
-        headers=["feature", "value", "shap_value"],
         label="Top 10 SHAP (local)",
-        datatype=["str", "number", "number"],
         interactive=False,
     )
@@ -175,8 +309,8 @@
     run_btn.click(
         score_minimal,
-        inputs=[sk_id_curr, amt_credit, duration_months, threshold],
-        outputs=[probability, prediction, threshold_used, shap_table, snapshot],
     )
265
  }
266
  )
267
+ return probability, label, shap_table, snapshot
268
  except HTTPException as exc:
269
+ return None, f"Erreur: {exc.detail}", _shap_error_table("No SHAP available."), {"error": exc.detail}
270
  except Exception as exc: # pragma: no cover - UI fallback
271
+ return None, f"Erreur: {exc}", _shap_error_table("No SHAP available."), {"error": str(exc)}
272
 
273
 
274
  with gr.Blocks(title="Credit scoring MLOps") as demo:
 
291
  sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
292
  amt_credit = gr.Number(label="Montant du crédit", value=200000)
293
  duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
 
294
 
295
  run_btn = gr.Button("Scorer")
296
 
297
  with gr.Row():
298
  probability = gr.Number(label="Probabilité de défaut")
299
  prediction = gr.Textbox(label="Prédiction")
 
300
 
301
  shap_table = gr.Dataframe(
302
+ headers=["feature", "raw_value", "shap_value"],
303
  label="Top 10 SHAP (local)",
304
+ datatype=["str", "str", "number"],
305
  interactive=False,
306
  )
307
 
 
309
 
310
  run_btn.click(
311
  score_minimal,
312
+ inputs=[sk_id_curr, amt_credit, duration_months],
313
+ outputs=[probability, prediction, shap_table, snapshot],
314
  )
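The `[:, :-1]` slice in `_xgb_pred_contribs` and `_lgbm_pred_contribs` drops the last column because native `pred_contribs` output carries one column per feature plus a trailing bias term; the per-row sum of all columns reproduces the model's raw margin. A minimal sketch of that invariant (standalone, not part of the app; the toy dataset and model are assumptions):

```python
# Toy check: XGBoost's pred_contribs=True returns (n_features + 1) columns,
# the last being the bias; row sums equal the raw margin (log-odds).
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
model = xgb.XGBClassifier(n_estimators=20, max_depth=3, eval_metric="logloss").fit(X, y)

dm = xgb.DMatrix(X[:3])
contrib = model.get_booster().predict(dm, pred_contribs=True)   # shape (3, 6)
margin = model.get_booster().predict(dm, output_margin=True)    # raw log-odds

assert contrib.shape[1] == X.shape[1] + 1                       # features + bias column
assert np.allclose(contrib.sum(axis=1), margin, atol=1e-4)      # additivity holds
```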
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py CHANGED
@@ -3,9 +3,19 @@ from __future__ import annotations
 from typing import Any

 import gradio as gr
+import numpy as np
+import pandas as pd
 from fastapi import HTTPException

-from app.main import MinimalPredictionRequest, app, predict_minimal, startup_event
+from app.main import (
+    MinimalPredictionRequest,
+    app,
+    predict_minimal,
+    startup_event,
+    _build_minimal_record,
+    _normalize_inputs,
+    preprocess_input,
+)


 def _ensure_startup() -> None:
@@ -30,12 +40,73 @@ def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
     return snapshot


+def _shap_error_table(message: str) -> pd.DataFrame:
+    return pd.DataFrame(
+        [
+            {
+                "feature": message,
+                "value": np.nan,
+                "shap_value": np.nan,
+            }
+        ]
+    )
+
+
+def _extract_shap_values(shap_values: Any) -> np.ndarray:
+    if isinstance(shap_values, list):
+        shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0]
+    values = np.asarray(shap_values)
+    if values.ndim == 3:
+        values = values[:, :, 1]
+    if values.ndim == 1:
+        values = values.reshape(1, -1)
+    return values
+
+
+def _compute_shap_top_features(record: dict[str, Any], top_k: int = 10) -> pd.DataFrame:
+    preprocessor = app.state.preprocessor
+    df_raw = pd.DataFrame.from_records([record])
+    df_norm, _, _ = _normalize_inputs(df_raw, preprocessor)
+    features = preprocess_input(df_norm, preprocessor)
+    try:
+        import shap
+    except ImportError:
+        return _shap_error_table("SHAP not installed.")
+
+    explainer = getattr(app.state, "shap_explainer", None)
+    if explainer is None:
+        try:
+            explainer = shap.TreeExplainer(app.state.model)
+        except Exception:
+            explainer = shap.Explainer(app.state.model, features)
+        app.state.shap_explainer = explainer
+
+    try:
+        explanation = explainer(features)
+        values = _extract_shap_values(explanation.values)
+    except Exception:
+        values = _extract_shap_values(explainer.shap_values(features))
+
+    shap_row = values[0]
+    feature_values = features.iloc[0].to_numpy()
+    top_idx = np.argsort(np.abs(shap_row))[::-1][:top_k]
+    rows = [
+        {
+            "feature": str(features.columns[idx]),
+            "value": float(feature_values[idx]),
+            "shap_value": float(shap_row[idx]),
+        }
+        for idx in top_idx
+    ]
+    return pd.DataFrame(rows)
+
+
 def score_minimal(
     sk_id_curr: float,
     amt_credit: float,
     duration_months: float,
     threshold: float,
-) -> tuple[float | None, str, float | None, dict[str, Any]]:
+) -> tuple[float | None, str, float | None, pd.DataFrame, dict[str, Any]]:
     _ensure_startup()
     try:
         payload = MinimalPredictionRequest(
@@ -43,11 +114,13 @@ def score_minimal(
             amt_credit=float(amt_credit),
             duration_months=int(duration_months),
         )
+        record = _build_minimal_record(payload, app.state.preprocessor)
         response = predict_minimal(payload, threshold=float(threshold))
         result = response["predictions"][0]
         probability = float(result.get("probability", 0.0))
         pred_value = int(result.get("prediction", 0))
         label = "Default (1)" if pred_value == 1 else "No default (0)"
+        shap_table = _compute_shap_top_features(record, top_k=10)
         snapshot = _customer_snapshot(int(sk_id_curr))
         snapshot.update(
             {
@@ -55,39 +128,55 @@ def score_minimal(
                 "DURATION_MONTHS": int(duration_months),
             }
         )
-        return probability, label, float(response.get("threshold", 0.0)), snapshot
+        return probability, label, float(response.get("threshold", 0.0)), shap_table, snapshot
     except HTTPException as exc:
-        return None, f"Erreur: {exc.detail}", None, {"error": exc.detail}
+        return None, f"Erreur: {exc.detail}", None, _shap_error_table("No SHAP available."), {"error": exc.detail}
     except Exception as exc:  # pragma: no cover - UI fallback
-        return None, f"Erreur: {exc}", None, {"error": str(exc)}
-
-
-with gr.Blocks(title="Credit Scoring - Minimal Inputs") as demo:
-    gr.Markdown("# Credit Scoring - Minimal Inputs")
+        return None, f"Erreur: {exc}", None, _shap_error_table("No SHAP available."), {"error": str(exc)}
+
+
+with gr.Blocks(title="Credit scoring MLOps") as demo:
+    gr.Markdown("# Credit scoring MLOps")
+    gr.HTML("""
+    <div style="display:flex; gap:0.5rem; flex-wrap:wrap;">
+      <a href="https://github.com/stephmnt/credit-scoring-mlops/releases" target="_blank" rel="noreferrer">
+        <img src="https://img.shields.io/github/v/release/stephmnt/credit-scoring-mlops" alt="GitHub Release" />
+      </a>
+      <a href="https://github.com/stephmnt/credit-scoring-mlops/actions/workflows/deploy.yml" target="_blank" rel="noreferrer">
+        <img src="https://img.shields.io/github/actions/workflow/status/stephmnt/credit-scoring-mlops/deploy.yml" alt="GitHub Actions Workflow Status" />
+      </a>
+    </div>
+    """)
     gr.Markdown(
-        "Renseignez l'identifiant client, le montant du credit et la duree. "
-        "Les autres features proviennent des donnees clients reference."
+        "Renseignez l'identifiant client, le montant du crédit et la durée. "
     )

     with gr.Row():
-        sk_id_curr = gr.Number(label="SK_ID_CURR", precision=0, value=100001)
-        amt_credit = gr.Number(label="AMT_CREDIT", value=200000)
-        duration_months = gr.Number(label="Duree (mois)", precision=0, value=60)
+        sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
+        amt_credit = gr.Number(label="Montant du crédit", value=200000)
+        duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
         threshold = gr.Slider(label="Seuil", minimum=0.0, maximum=1.0, value=0.5, step=0.01)

     run_btn = gr.Button("Scorer")

     with gr.Row():
-        probability = gr.Number(label="Probabilite de defaut")
-        prediction = gr.Textbox(label="Decision")
-        threshold_used = gr.Number(label="Seuil utilise")
+        probability = gr.Number(label="Probabilité de défaut")
+        prediction = gr.Textbox(label="Prédiction")
+        threshold_used = gr.Number(label="Seuil utilisé")
+
+    shap_table = gr.Dataframe(
+        headers=["feature", "value", "shap_value"],
+        label="Top 10 SHAP (local)",
+        datatype=["str", "number", "number"],
+        interactive=False,
+    )

-    snapshot = gr.JSON(label="Snapshot client (reference)")
+    snapshot = gr.JSON(label="Snapshot client (référence)")

     run_btn.click(
         score_minimal,
         inputs=[sk_id_curr, amt_credit, duration_months, threshold],
-        outputs=[probability, prediction, threshold_used, snapshot],
+        outputs=[probability, prediction, threshold_used, shap_table, snapshot],
     )
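The `_extract_shap_values` helper above exists because `shap` returns different layouts depending on the explainer and model: a per-class list from older `TreeExplainer` calls, a 3-D `(n_samples, n_features, n_classes)` array from newer `Explainer` objects, or a flat vector for a single explained row. A small shape-only sketch of what the normalization does (illustrative, not the app's code path):

```python
import numpy as np

def normalize(shap_values):
    # Mirrors _extract_shap_values: pick the positive class and force 2-D output.
    if isinstance(shap_values, list):
        shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0]
    values = np.asarray(shap_values)
    if values.ndim == 3:
        values = values[:, :, 1]
    if values.ndim == 1:
        values = values.reshape(1, -1)
    return values

row = np.arange(4.0)
assert normalize([row, row]).shape == (1, 4)                           # per-class list
assert normalize(row).shape == (1, 4)                                  # flat vector
assert normalize(np.stack([row, row], axis=-1)[None]).shape == (1, 4)  # 3-D array
```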
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py CHANGED
@@ -1,3 +1 @@
-"""Expose combined ASGI app for HF Spaces default loader."""
-
-from app_entry import app, demo  # re-export for uvicorn app:app
+"""Package marker for the FastAPI app package."""
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile CHANGED
@@ -9,8 +9,9 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt

 COPY app/ app/
-COPY data/HistGB_final_model.pkl data/
-COPY artifacts/preprocessor.joblib artifacts/
+COPY app_entry.py app.py gradio_app.py ./
+COPY data/ data/
+COPY artifacts/ artifacts/

 EXPOSE 7860
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py CHANGED
@@ -1,22 +1,4 @@
-from fastapi import FastAPI
-import gradio as gr
-
-from app.main import app as api_app
-from app.main import startup_event
-from gradio_app import demo
-
-
-root_app = FastAPI()
-root_app.mount("/api", api_app)
-root_app = gr.mount_gradio_app(root_app, demo, path="/")
-
-
-@root_app.on_event("startup")
-def _startup() -> None:
-    startup_event()
-
-
-app = root_app
+from app_entry import app, demo  # re-export for HF Spaces


 if __name__ == "__main__":
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py CHANGED
@@ -1 +1,3 @@
-# Package marker for app module.
+"""Expose combined ASGI app for HF Spaces default loader."""
+
+from app_entry import app, demo  # re-export for uvicorn app:app
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py CHANGED
@@ -1113,6 +1113,16 @@ def startup_event() -> None:
     logger.info("Loading model from %s", model_path)
     app.state.model = load_model(model_path)

+    data_path = DATA_PATH
+    if not data_path.exists():
+        downloaded = _ensure_hf_asset(
+            data_path,
+            HF_CUSTOMER_REPO_ID,
+            HF_CUSTOMER_FILENAME,
+            HF_CUSTOMER_REPO_TYPE,
+        )
+        if downloaded is not None:
+            data_path = downloaded
     try:
         artifacts_path = ARTIFACTS_PATH
         if not artifacts_path.exists():
@@ -1125,7 +1135,7 @@ def startup_event() -> None:
         if downloaded is not None:
             artifacts_path = downloaded
         logger.info("Loading preprocessor artifacts from %s", artifacts_path)
-        app.state.preprocessor = load_preprocessor(DATA_PATH, artifacts_path)
+        app.state.preprocessor = load_preprocessor(data_path, artifacts_path)
     except RuntimeError as exc:
         if ALLOW_MISSING_ARTIFACTS:
             logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app_entry.py ADDED
@@ -0,0 +1,19 @@
+from fastapi import FastAPI
+import gradio as gr
+
+from app.main import app as api_app
+from app.main import startup_event
+from gradio_app import demo
+
+
+root_app = FastAPI()
+root_app.mount("/api", api_app)
+root_app = gr.mount_gradio_app(root_app, demo, path="/")
+
+
+@root_app.on_event("startup")
+def _startup() -> None:
+    startup_event()
+
+
+app = root_app
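`app_entry.py` is the piece that makes a single ASGI process serve both surfaces: the FastAPI API mounted under `/api` and the Gradio UI at `/`. A quick in-process smoke test of that layout (a sketch; assumes the model and preprocessor artifacts resolve at startup, e.g. with `ALLOW_MISSING_ARTIFACTS=1`):

```python
from fastapi.testclient import TestClient

from app_entry import app

# Entering the context manager runs the startup hook that loads
# the model and preprocessor before any request is served.
with TestClient(app) as client:
    print(client.get("/api/health").json())  # API is mounted under /api
    print(client.get("/").status_code)       # Gradio UI is served at the root
```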
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile CHANGED
@@ -14,4 +14,4 @@ COPY artifacts/preprocessor.joblib artifacts/

 EXPOSE 7860

-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "app_entry:app", "--host", "0.0.0.0", "--port", "7860"]
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md CHANGED
@@ -198,29 +198,38 @@ Example (a single dataset repo with 3 files):

 ### Live demo (ready-to-run commands)

-Start the API:
+Start the API (no UI):

 ```shell
 uvicorn app.main:app --reload --port 7860
 ```

+Start the Gradio UI + API (under `/api`):
+
+```shell
+uvicorn app_entry:app --reload --port 7860
+```
+
 Check the service (HF):

 ```shell
 BASE_URL="https://stephmnt-credit-scoring-mlops.hf.space"
-curl -s "${BASE_URL}/health"
+API_BASE="${BASE_URL}/api"
+curl -s "${API_BASE}/health"
 ```

+Note: on HF Spaces, the Gradio UI is served at the root and the API under `/api`.
+
 List the expected features (HF):

 ```shell
-curl -s "${BASE_URL}/features"
+curl -s "${API_BASE}/features"
 ```

 Score a single client (HF):

 ```shell
-curl -s -X POST "${BASE_URL}/predict?threshold=0.5" \
+curl -s -X POST "${API_BASE}/predict?threshold=0.5" \
   -H "Content-Type: application/json" \
   -d '{
     "data": {
@@ -242,7 +251,7 @@ Score several clients (batch, HF):

 ```shell
-curl -s -X POST "${BASE_URL}/predict?threshold=0.45" \
+curl -s -X POST "${API_BASE}/predict?threshold=0.45" \
   -H "Content-Type: application/json" \
   -d '{
     "data": [
@@ -279,7 +288,7 @@ Example error (missing required field, HF):

 ```shell
-curl -s -X POST "${BASE_URL}/predict" \
+curl -s -X POST "${API_BASE}/predict" \
   -H "Content-Type: application/json" \
   -d '{
     "data": {
@@ -316,13 +325,13 @@ Fetch the logs (HF):
 Configure `LOGS_ACCESS_TOKEN` in the Space secrets, then:

 ```shell
-curl -s -H "X-Logs-Token: $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
+curl -s -H "X-Logs-Token: $LOGS_ACCESS_TOKEN" "${API_BASE}/logs?tail=200"
 ```

 Alternative:

 ```shell
-curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
+curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${API_BASE}/logs?tail=200"
 ```

 After a few requests, generate the drift report:
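The same health and prediction checks can be driven from Python instead of curl (a sketch; assumes the `requests` package is installed and the Space is awake):

```python
import requests

API_BASE = "https://stephmnt-credit-scoring-mlops.hf.space/api"

# Health probe against the mounted API.
print(requests.get(f"{API_BASE}/health", timeout=30).json())

# Minimal prediction: id + amount + duration, other features from reference data.
resp = requests.post(
    f"{API_BASE}/predict-minimal",
    params={"threshold": 0.5},
    json={"sk_id_curr": 100001, "amt_credit": 200000, "duration_months": 60},
    timeout=60,
)
print(resp.status_code, resp.json())
```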
 
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitattributes CHANGED
@@ -1,35 +1,4 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
+data/HistGB_final_model.pkl filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml ADDED
@@ -0,0 +1,69 @@
+name: deploy-assets
+
+on:
+  workflow_dispatch:
+    inputs:
+      repo_id:
+        description: "HF repo id (e.g. stephmnt/assets-credit-scoring-mlops)"
+        required: true
+        default: "stephmnt/assets-credit-scoring-mlops"
+      repo_type:
+        description: "HF repo type (dataset or model)"
+        required: true
+        default: "dataset"
+
+jobs:
+  upload-assets:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install huggingface_hub
+
+      - name: Upload assets to Hugging Face Hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_REPO_ID: ${{ inputs.repo_id }}
+          HF_REPO_TYPE: ${{ inputs.repo_type }}
+        run: |
+          python - <<'PY'
+          import os
+          from pathlib import Path
+          from huggingface_hub import HfApi
+
+          repo_id = os.environ["HF_REPO_ID"]
+          repo_type = os.environ["HF_REPO_TYPE"]
+          token = os.environ["HF_TOKEN"]
+
+          files = {
+              "data/HistGB_final_model.pkl": "HistGB_final_model.pkl",
+              "artifacts/preprocessor.joblib": "preprocessor.joblib",
+              "data/data_final.parquet": "data_final.parquet",
+          }
+
+          api = HfApi()
+          for local_path, remote_name in files.items():
+              path = Path(local_path)
+              if not path.exists():
+                  raise SystemExit(f"Missing file: {path}")
+              api.upload_file(
+                  path_or_fileobj=str(path),
+                  path_in_repo=remote_name,
+                  repo_id=repo_id,
+                  repo_type=repo_type,
+                  token=token,
+                  commit_message=f"Update {remote_name}",
+              )
+          print("Assets uploaded.")
+          PY
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml CHANGED
@@ -12,6 +12,8 @@ jobs:
     steps:
       - name: Checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Set up Python
        uses: actions/setup-python@v5
@@ -47,6 +49,8 @@ jobs:
          --exclude 'logs' \
          --exclude 'reports' \
          --exclude 'screen-mlflow.png' \
+          --exclude 'data/HistGB_final_model.pkl' \
+          --exclude 'artifacts/preprocessor.joblib' \
          --exclude 'data/*.csv' \
          --exclude 'data/*.parquet' \
          ./ hf_space/
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitignore CHANGED
@@ -6,6 +6,7 @@ logs/
 reports/
 data/*
 !data/HistGB_final_model.pkl
+!data/data_final.parquet
 artifacts/*
 !artifacts/preprocessor.joblib
 .DS_Store
@@ -18,7 +19,8 @@ mlruns/
 *.code-workspace
 presentation_projet08.pptx
 rapport_projet06.md
-
+rapport_template.md
+data_final.parquet
 ## https://github.com/github/gitignore/blob/e8554d85bf62e38d6db966a50d2064ac025fd82a/Python.gitignore

 # Byte-compiled / optimized / DLL files
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py CHANGED
@@ -0,0 +1,25 @@
+from fastapi import FastAPI
+import gradio as gr
+
+from app.main import app as api_app
+from app.main import startup_event
+from gradio_app import demo
+
+
+root_app = FastAPI()
+root_app.mount("/api", api_app)
+root_app = gr.mount_gradio_app(root_app, demo, path="/")
+
+
+@root_app.on_event("startup")
+def _startup() -> None:
+    startup_event()
+
+
+app = root_app
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=7860)
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py CHANGED
@@ -41,6 +41,18 @@ LOG_INCLUDE_INPUTS = os.getenv("LOG_INCLUDE_INPUTS", "1") == "1"
 LOG_HASH_SK_ID = os.getenv("LOG_HASH_SK_ID", "0") == "1"
 MODEL_VERSION = os.getenv("MODEL_VERSION", MODEL_PATH.name)
 LOGS_ACCESS_TOKEN = os.getenv("LOGS_ACCESS_TOKEN")
+CUSTOMER_DATA_PATH = Path(os.getenv("CUSTOMER_DATA_PATH", str(DATA_PATH)))
+CUSTOMER_LOOKUP_ENABLED = os.getenv("CUSTOMER_LOOKUP_ENABLED", "1") == "1"
+CUSTOMER_LOOKUP_CACHE = os.getenv("CUSTOMER_LOOKUP_CACHE", "1") == "1"
+HF_MODEL_REPO_ID = os.getenv("HF_MODEL_REPO_ID")
+HF_MODEL_REPO_TYPE = os.getenv("HF_MODEL_REPO_TYPE", "model")
+HF_MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", MODEL_PATH.name)
+HF_PREPROCESSOR_REPO_ID = os.getenv("HF_PREPROCESSOR_REPO_ID", HF_MODEL_REPO_ID or "")
+HF_PREPROCESSOR_REPO_TYPE = os.getenv("HF_PREPROCESSOR_REPO_TYPE", HF_MODEL_REPO_TYPE)
+HF_PREPROCESSOR_FILENAME = os.getenv("HF_PREPROCESSOR_FILENAME", ARTIFACTS_PATH.name)
+HF_CUSTOMER_REPO_ID = os.getenv("HF_CUSTOMER_REPO_ID")
+HF_CUSTOMER_REPO_TYPE = os.getenv("HF_CUSTOMER_REPO_TYPE", "dataset")
+HF_CUSTOMER_FILENAME = os.getenv("HF_CUSTOMER_FILENAME", CUSTOMER_DATA_PATH.name)

 IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
 ENGINEERED_FEATURES = [
@@ -117,6 +129,13 @@ class PredictionRequest(BaseModel):
     data: dict[str, Any] | list[dict[str, Any]]


+class MinimalPredictionRequest(BaseModel):
+    sk_id_curr: int
+    amt_credit: float
+    duration_months: int | None = None
+    amt_annuity: float | None = None
+
+
 @dataclass
 class PreprocessorArtifacts:
     columns_keep: list[str]
@@ -173,6 +192,32 @@ def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
     return mapping.get(key, "Unknown")


+def _ensure_hf_asset(
+    local_path: Path,
+    repo_id: str | None,
+    filename: str,
+    repo_type: str,
+) -> Path | None:
+    if local_path.exists():
+        return local_path
+    if not repo_id:
+        return None
+    try:
+        from huggingface_hub import hf_hub_download
+    except ImportError as exc:  # pragma: no cover - optional dependency
+        raise RuntimeError("huggingface_hub is required to download remote assets.") from exc
+    local_path.parent.mkdir(parents=True, exist_ok=True)
+    return Path(
+        hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            repo_type=repo_type,
+            local_dir=str(local_path.parent),
+            local_dir_use_symlinks=False,
+        )
+    )
+
+
 def _normalize_inputs(
     df_raw: pd.DataFrame,
     preprocessor: PreprocessorArtifacts,
@@ -262,6 +307,54 @@ def _build_data_quality_records(
     return records


+def _build_minimal_record(
+    payload: MinimalPredictionRequest,
+    preprocessor: PreprocessorArtifacts,
+) -> dict[str, Any]:
+    reference = _get_customer_reference(preprocessor)
+    if reference is None:
+        raise HTTPException(
+            status_code=503,
+            detail={"message": "Customer reference data is not available."},
+        )
+    sk_id = int(payload.sk_id_curr)
+    if sk_id not in reference.index:
+        raise HTTPException(
+            status_code=404,
+            detail={"message": f"Client {sk_id} not found in reference data."},
+        )
+    record = reference.loc[sk_id].to_dict()
+    record["SK_ID_CURR"] = sk_id
+    if payload.amt_credit <= 0:
+        raise HTTPException(
+            status_code=422,
+            detail={"message": "AMT_CREDIT must be positive."},
+        )
+    record["AMT_CREDIT"] = float(payload.amt_credit)
+    if payload.amt_annuity is not None:
+        if payload.amt_annuity <= 0:
+            raise HTTPException(
+                status_code=422,
+                detail={"message": "AMT_ANNUITY must be positive."},
+            )
+        record["AMT_ANNUITY"] = float(payload.amt_annuity)
+    elif payload.duration_months is not None:
+        if payload.duration_months <= 0:
+            raise HTTPException(
+                status_code=422,
+                detail={"message": "duration_months must be positive."},
+            )
+        record["AMT_ANNUITY"] = float(payload.amt_credit) / float(payload.duration_months)
+    else:
+        raise HTTPException(
+            status_code=422,
+            detail={"message": "Provide duration_months or amt_annuity."},
+        )
+    if "AMT_GOODS_PRICE" in record:
+        record["AMT_GOODS_PRICE"] = float(payload.amt_credit)
+    return record
+
+
 def _append_log_entries(entries: list[dict[str, Any]]) -> None:
     if not LOG_PREDICTIONS:
         return
@@ -596,6 +689,41 @@ def load_model(model_path: Path):
     return pickle.load(handle)


+def _load_customer_reference(
+    data_path: Path,
+    preprocessor: PreprocessorArtifacts,
+) -> pd.DataFrame:
+    columns = list(preprocessor.input_feature_columns)
+    if "SK_ID_CURR" not in columns:
+        columns.insert(0, "SK_ID_CURR")
+    df = pd.read_parquet(data_path, columns=columns)
+    df = df.drop_duplicates(subset=["SK_ID_CURR"], keep="last").set_index("SK_ID_CURR")
+    return df
+
+
+def _get_customer_reference(preprocessor: PreprocessorArtifacts) -> pd.DataFrame | None:
+    if not CUSTOMER_LOOKUP_ENABLED:
+        return None
+    cached = getattr(app.state, "customer_reference", None)
+    if cached is not None:
+        return cached
+    data_path = CUSTOMER_DATA_PATH
+    if not data_path.exists():
+        downloaded = _ensure_hf_asset(
+            data_path,
+            HF_CUSTOMER_REPO_ID,
+            HF_CUSTOMER_FILENAME,
+            HF_CUSTOMER_REPO_TYPE,
+        )
+        if downloaded is None:
+            return None
+        data_path = downloaded
+    ref = _load_customer_reference(data_path, preprocessor)
+    if CUSTOMER_LOOKUP_CACHE:
+        app.state.customer_reference = ref
+    return ref
+
+
 def _infer_numeric_ranges_from_scaler(preprocessor: PreprocessorArtifacts) -> dict[str, tuple[float, float]]:
     ranges = {}
     scaler = getattr(preprocessor, "scaler", None)
@@ -963,19 +1091,41 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->

 @app.on_event("startup")
 def startup_event() -> None:
-    if not MODEL_PATH.exists():
+    if getattr(app.state, "model", None) is not None and getattr(app.state, "preprocessor", None) is not None:
+        return
+    model_path = MODEL_PATH
+    if not model_path.exists():
+        downloaded = _ensure_hf_asset(
+            model_path,
+            HF_MODEL_REPO_ID,
+            HF_MODEL_FILENAME,
+            HF_MODEL_REPO_TYPE,
+        )
+        if downloaded is not None:
+            model_path = downloaded
+    if not model_path.exists():
         if ALLOW_MISSING_ARTIFACTS:
-            logger.warning("Model file not found: %s. Using dummy model.", MODEL_PATH)
+            logger.warning("Model file not found: %s. Using dummy model.", model_path)
             app.state.model = DummyModel()
         else:
-            raise RuntimeError(f"Model file not found: {MODEL_PATH}")
+            raise RuntimeError(f"Model file not found: {model_path}")
     else:
-        logger.info("Loading model from %s", MODEL_PATH)
-        app.state.model = load_model(MODEL_PATH)
+        logger.info("Loading model from %s", model_path)
+        app.state.model = load_model(model_path)

     try:
-        logger.info("Loading preprocessor artifacts from %s", ARTIFACTS_PATH)
-        app.state.preprocessor = load_preprocessor(DATA_PATH, ARTIFACTS_PATH)
+        artifacts_path = ARTIFACTS_PATH
+        if not artifacts_path.exists():
+            downloaded = _ensure_hf_asset(
+                artifacts_path,
+                HF_PREPROCESSOR_REPO_ID or None,
+                HF_PREPROCESSOR_FILENAME,
+                HF_PREPROCESSOR_REPO_TYPE,
+            )
+            if downloaded is not None:
+                artifacts_path = downloaded
+        logger.info("Loading preprocessor artifacts from %s", artifacts_path)
+        app.state.preprocessor = load_preprocessor(DATA_PATH, artifacts_path)
     except RuntimeError as exc:
         if ALLOW_MISSING_ARTIFACTS:
             logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
@@ -983,6 +1133,19 @@ def startup_event() -> None:
         else:
             raise

+    app.state.customer_reference = None
+    if CUSTOMER_LOOKUP_ENABLED and CUSTOMER_LOOKUP_CACHE:
+        try:
+            ref = _get_customer_reference(app.state.preprocessor)
+            if ref is not None:
+                logger.info("Loaded customer reference data (%s rows)", len(ref))
+            else:
+                logger.warning("Customer reference data not available.")
+        except Exception as exc:  # pragma: no cover - optional cache load
+            logger.warning("Failed to load customer reference data: %s", exc)
+    elif CUSTOMER_LOOKUP_ENABLED:
+        logger.info("Customer lookup enabled without cache (on-demand load).")
+

 @app.get("/health")
 def health() -> dict[str, str]:
@@ -1063,16 +1226,11 @@ def logs(
     return Response(content="".join(lines), media_type="application/x-ndjson")


-@app.post("/predict")
-def predict(
-    payload: PredictionRequest,
-    threshold: float | None = Query(default=None, ge=0.0, le=1.0),
-) -> dict[str, Any]:
+def _predict_records(records: list[dict[str, Any]], threshold: float | None) -> dict[str, Any]:
     model = app.state.model
     preprocessor: PreprocessorArtifacts = app.state.preprocessor
     request_id = str(uuid.uuid4())
     start_time = time.perf_counter()
-    records = payload.data if isinstance(payload.data, list) else [payload.data]

     if not records:
         raise HTTPException(status_code=422, detail={"message": "No input records provided."})
@@ -1168,3 +1326,22 @@ def predict(
         error=str(exc),
     )
     raise
+
+
+@app.post("/predict")
+def predict(
+    payload: PredictionRequest,
+    threshold: float | None = Query(default=None, ge=0.0, le=1.0),
+) -> dict[str, Any]:
+    records = payload.data if isinstance(payload.data, list) else [payload.data]
+    return _predict_records(records, threshold)
+
+
+@app.post("/predict-minimal")
+def predict_minimal(
+    payload: MinimalPredictionRequest,
+    threshold: float | None = Query(default=None, ge=0.0, le=1.0),
+) -> dict[str, Any]:
+    preprocessor: PreprocessorArtifacts = app.state.preprocessor
+    record = _build_minimal_record(payload, preprocessor)
+    return _predict_records([record], threshold)
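`predict_minimal` reuses `_predict_records`, so the minimal endpoint and the full `/predict` share one inference path. An in-process call (a sketch; assumes the customer reference contains client 100001, otherwise the endpoint answers 404/503 as coded above):

```python
from fastapi.testclient import TestClient

from app.main import app

with TestClient(app) as client:  # the context manager fires startup_event
    resp = client.post(
        "/predict-minimal",
        params={"threshold": 0.5},
        json={"sk_id_curr": 100001, "amt_credit": 200000, "duration_months": 60},
    )
    print(resp.status_code, resp.json())

# With duration_months=60, _build_minimal_record derives
# AMT_ANNUITY = 200000 / 60 ≈ 3333.33.
```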
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py ADDED
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+from typing import Any
+
+import gradio as gr
+from fastapi import HTTPException
+
+from app.main import MinimalPredictionRequest, app, predict_minimal, startup_event
+
+
+def _ensure_startup() -> None:
+    if not getattr(app.state, "preprocessor", None):
+        startup_event()
+
+
+def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
+    reference = getattr(app.state, "customer_reference", None)
+    if reference is None or sk_id_curr not in reference.index:
+        return {}
+    row = reference.loc[sk_id_curr]
+    snapshot: dict[str, Any] = {"SK_ID_CURR": int(sk_id_curr)}
+    if "CODE_GENDER" in row:
+        snapshot["CODE_GENDER"] = row["CODE_GENDER"]
+    if "FLAG_OWN_CAR" in row:
+        snapshot["FLAG_OWN_CAR"] = row["FLAG_OWN_CAR"]
+    if "AMT_INCOME_TOTAL" in row:
+        snapshot["AMT_INCOME_TOTAL"] = float(row["AMT_INCOME_TOTAL"])
+    if "DAYS_BIRTH" in row:
+        snapshot["AGE_YEARS"] = round(abs(float(row["DAYS_BIRTH"])) / 365.25, 1)
+    return snapshot
+
+
+def score_minimal(
+    sk_id_curr: float,
+    amt_credit: float,
+    duration_months: float,
+    threshold: float,
+) -> tuple[float | None, str, float | None, dict[str, Any]]:
+    _ensure_startup()
+    try:
+        payload = MinimalPredictionRequest(
+            sk_id_curr=int(sk_id_curr),
+            amt_credit=float(amt_credit),
+            duration_months=int(duration_months),
+        )
+        response = predict_minimal(payload, threshold=float(threshold))
+        result = response["predictions"][0]
+        probability = float(result.get("probability", 0.0))
+        pred_value = int(result.get("prediction", 0))
+        label = "Default (1)" if pred_value == 1 else "No default (0)"
+        snapshot = _customer_snapshot(int(sk_id_curr))
+        snapshot.update(
+            {
+                "AMT_CREDIT_REQUESTED": float(amt_credit),
+                "DURATION_MONTHS": int(duration_months),
+            }
+        )
+        return probability, label, float(response.get("threshold", 0.0)), snapshot
+    except HTTPException as exc:
+        return None, f"Erreur: {exc.detail}", None, {"error": exc.detail}
+    except Exception as exc:  # pragma: no cover - UI fallback
+        return None, f"Erreur: {exc}", None, {"error": str(exc)}
+
+
+with gr.Blocks(title="Credit Scoring - Minimal Inputs") as demo:
+    gr.Markdown("# Credit Scoring - Minimal Inputs")
+    gr.Markdown(
+        "Renseignez l'identifiant client, le montant du credit et la duree. "
+        "Les autres features proviennent des donnees clients reference."
+    )
+
+    with gr.Row():
+        sk_id_curr = gr.Number(label="SK_ID_CURR", precision=0, value=100001)
+        amt_credit = gr.Number(label="AMT_CREDIT", value=200000)
+        duration_months = gr.Number(label="Duree (mois)", precision=0, value=60)
+        threshold = gr.Slider(label="Seuil", minimum=0.0, maximum=1.0, value=0.5, step=0.01)
+
+    run_btn = gr.Button("Scorer")
+
+    with gr.Row():
+        probability = gr.Number(label="Probabilite de defaut")
+        prediction = gr.Textbox(label="Decision")
+        threshold_used = gr.Number(label="Seuil utilise")
+
+    snapshot = gr.JSON(label="Snapshot client (reference)")
+
+    run_btn.click(
+        score_minimal,
+        inputs=[sk_id_curr, amt_credit, duration_months, threshold],
+        outputs=[probability, prediction, threshold_used, snapshot],
+    )
+
+
+if __name__ == "__main__":
+    _ensure_startup()
+    demo.launch()
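The UI callback can also be driven without launching Gradio, which is handy for quick local checks (a sketch; same assumptions about reference data as the endpoint itself):

```python
from gradio_app import _ensure_startup, score_minimal

_ensure_startup()  # load model + preprocessor once, as the UI would
probability, label, threshold_used, snapshot = score_minimal(
    sk_id_curr=100001, amt_credit=200000, duration_months=60, threshold=0.5
)
print(label, probability, threshold_used, snapshot)
```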
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: OCR Projet 06
+title: Credit scoring MLOps
 emoji: 🤖
 colorFrom: indigo
 colorTo: green
@@ -8,7 +8,7 @@ app_port: 7860
 pinned: false
 ---

-# OCR Projet 06 – Crédit
+# Credit scoring MLOps

 [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/stephmnt/credit-scoring-mlops/deploy.yml)](https://github.com/stephmnt/credit-scoring-mlops/actions/workflows/deploy.yml)
 [![GitHub Release Date](https://img.shields.io/github/release-date/stephmnt/credit-scoring-mlops?display_date=published_at&style=flat-square)](https://github.com/stephmnt/credit-scoring-mlops/releases)
@@ -62,24 +62,33 @@ Useful parameters (feature selection):
 - `FEATURE_SELECTION_TOP_N` (default: `8`)
 - `FEATURE_SELECTION_MIN_CORR` (default: `0.02`)

-### Poetry environment (recommended)
+### pip environment (dev)

-The `pyproject.toml` file pins versions compatible with a recent stack
-(`numpy>=2`, `pyarrow>=15`, `scikit-learn>=1.6`). The environment targets
-Python 3.11.
+Local development uses pip and `requirements.txt` (pinned versions), with
+Python 3.11+.

 ```shell
-poetry env use 3.11
-poetry install
+python3 -m venv .venv
+source .venv/bin/activate
+python -m pip install -r requirements.txt
+pytest -q
+uvicorn app.main:app --reload --port 7860
+```
+
+### Poetry environment (deliverable)
+
+The deliverable ships `pyproject.toml`, aligned with `requirements.txt`. If needed:
+
+```shell
+poetry install --with dev
 poetry run pytest -q
 poetry run uvicorn app.main:app --reload --port 7860
 ```

 Important: the `HistGB_final_model.pkl` model must be regenerated with the
-new scikit-learn version (re-run
-`P6_MANET_Stephane_notebook_modélisation.ipynb`, pickle-save cell).
-
-Note: `requirements.txt` is aligned with `pyproject.toml` (same versions).
+scikit-learn version pinned in `requirements.txt` / `pyproject.toml`
+(re-run `P6_MANET_Stephane_notebook_modélisation.ipynb`, pickle-save cell).

 ### Example input (schema + values)

@@ -123,9 +132,70 @@ Example values:
 }
 ```

+### Minimal prediction (existing client)
+
+`POST /predict-minimal` endpoint: the user supplies a client id, a credit
+amount and a duration. The remaining features come from the customer
+reference data (`CUSTOMER_DATA_PATH`, default `data/data_final.parquet`).
+If the reference is missing, the API returns 503.
+
+```shell
+curl -s -X POST "${BASE_URL}/predict-minimal" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "sk_id_curr": 100001,
+    "amt_credit": 200000,
+    "duration_months": 60
+  }'
+```
+
+Useful variables:
+
+- `CUSTOMER_LOOKUP_ENABLED=1` enables the client lookup (default: 1)
+- `CUSTOMER_DATA_PATH=data/data_final.parquet`
+- `CUSTOMER_LOOKUP_CACHE=1` keeps the reference in memory
+
+### Data contract (validation)
+
+- Strict numeric types (invalid values -> 422).
+- Numeric ranges (training min/max) are checked.
+- Categoricals normalized: `CODE_GENDER` -> {`F`, `M`}, `FLAG_OWN_CAR` -> {`Y`, `N`}.
+- The `DAYS_EMPLOYED=365243` sentinel is replaced with NaN.
+- Logs are enriched with `data_quality` to tell drift from data-quality
+  issues (see the illustrative sketch after this diff).
+
+### Gradio interface (scoring)
+
+```shell
+python gradio_app.py
+```
+
+On Hugging Face Spaces, `app.py` launches the Gradio UI automatically.
+
 Note: the API strictly validates the required fields (`/features`). To list
 every possible column: `/features?include_all=true`.

+### Hugging Face (heavy assets)
+
+Binary files (model, preprocessor, data_final) are not pushed to the Space.
+They are downloaded at runtime from the Hugging Face Hub when the following
+variables are set:
+
+- `HF_MODEL_REPO_ID` + `HF_MODEL_FILENAME` + `HF_MODEL_REPO_TYPE`
+- `HF_PREPROCESSOR_REPO_ID` + `HF_PREPROCESSOR_FILENAME` + `HF_PREPROCESSOR_REPO_TYPE`
+- `HF_CUSTOMER_REPO_ID` + `HF_CUSTOMER_FILENAME` + `HF_CUSTOMER_REPO_TYPE`
+
+Example (a single dataset repo with 3 files):
+
+- `HF_MODEL_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_MODEL_REPO_TYPE=dataset`
+- `HF_MODEL_FILENAME=HistGB_final_model.pkl`
+- `HF_PREPROCESSOR_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_PREPROCESSOR_REPO_TYPE=dataset`
+- `HF_PREPROCESSOR_FILENAME=preprocessor.joblib`
+- `HF_CUSTOMER_REPO_ID=stephmnt/credit-scoring-mlops-assets`
+- `HF_CUSTOMER_REPO_TYPE=dataset`
+- `HF_CUSTOMER_FILENAME=data_final.parquet`
+
 ### Live demo (ready-to-run commands)

 Start the API:
@@ -231,6 +301,10 @@ Useful variables:
 - `LOGS_ACCESS_TOKEN` to protect the `/logs` endpoint
 - `LOG_HASH_SK_ID=1` to anonymize `SK_ID_CURR`

+Logs include one `data_quality` block per request (missing fields, invalid
+types, out-of-range values, unknown categories, `DAYS_EMPLOYED` sentinel).
+
 Local example:

@@ -251,27 +325,70 @@ Alternative:
 curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
 ```

-Apres quelques requêtes, gélérer le rapport de drift :
+Apres quelques requêtes, générer le rapport de drift :

 ```shell
 python monitoring/drift_report.py \
   --logs logs/predictions.jsonl \
   --reference data/data_final.parquet \
-  --output-dir reports
+  --output-dir reports \
+  --min-prod-samples 200 \
+  --fdr-alpha 0.05 \
+  --prod-since "2024-01-01T00:00:00Z" \
+  --prod-until "2024-01-31T23:59:59Z"
 ```

 The HTML report is generated in `reports/drift_report.html` (with plots in
 `reports/plots/`). On Hugging Face the disk is ephemeral: download the logs
 before analyzing.

+Drift is only computed when `n_prod >= --min-prod-samples` (default 200).
+Otherwise an "insufficient sample" badge is displayed and alerts are disabled.
+
+Built-in robustness (see the PSI sketch after this diff):
+
+- Categoricals: PSI with smoothing (`--psi-eps`) + rare categories grouped (OTHER).
+- Numerics: KS corrected by FDR (Benjamini-Hochberg, `--fdr-alpha`).
+- `DAYS_EMPLOYED` sentinel: converted to NaN + rate tracked.
+
 The report also includes the distribution of predicted scores and the prediction rate
-(option `--score-bins` to adjust the number of bins).
+(option `--score-bins` to adjust the number of bins), plus a Data Quality section
+when the logs contain `data_quality` (types, NaN, out-of-range, unknown categories).
+
+To simulate sliding windows, use `--prod-since` / `--prod-until` with the
+log timestamps.
+
+Drift runbook: `docs/monitoring/runbook.md`.

 Captures (local snapshot of reporting + storage):

 - Report: `docs/monitoring/drift_report.html` + `docs/monitoring/plots/`
 - Logs storage: `docs/monitoring/logs_storage.png`

+## Profiling & Optimization (Step 4)
+
+Inference profiling and benchmark (cProfile + latency):
+
+```shell
+python profiling/profile_inference.py \
+  --sample-size 2000 \
+  --batch-size 128 \
+  --runs 3
+```
+
+Outputs:
+
+- `docs/performance/benchmark_results.json`
+- `docs/performance/profile_summary.txt`
+- Detailed report: `docs/performance/performance_report.md`
+
+Local Streamlit dashboard (monitoring + drift):
+
+```shell
+python -m streamlit run monitoring/streamlit_app.py
+```
+
 ## Release contents
@@ -282,8 +399,10 @@ Captures (local snapshot of reporting + storage):
 - **Business score + optimal threshold**: `custom_score` is the main metric of the comparison tables and CV, with a computed `best_threshold`.
 - **Explainability**: feature importance, SHAP and LIME are included.
 - **Feature selection by correlation**: top-N numerics + a small categorical set, exposed via `/features`.
-- **Monitoring & drift**: HTML report with KS/PSI + predicted-score distribution and prediction rate
-  (snapshots in `docs/monitoring/`).
+- **Gradio interface**: minimal form (client id + amount + duration) based on the customer reference data.
+- **Monitoring & drift**: HTML report with volume gating, robust PSI, KS + FDR, data quality and
+  score distribution (snapshots in `docs/monitoring/`).
+- **Profiling & optimization**: inference benchmark + cProfile profile (`docs/performance/` folder).
 - **CI/CD**: tests with coverage (`pytest-cov`), Docker build and deploy to Hugging Face Spaces.

 ![Screenshot MLFlow](https://raw.githubusercontent.com/stephmnt/credit-scoring-mlops/main/screen-mlflow.png)
@@ -304,5 +423,4 @@

 * Complete the API tests: /logs (auth OK/KO), batch predict, threshold param, missing SK_ID_CURR, outliers in test_api.py.
 * Simplify the ALLOW_MISSING_ARTIFACTS fallback and DummyModel if the artifacts are versioned (clean up main.py and conftest.py).
-* Unify dependency management (Poetry vs requirements.txt) and align pyproject.toml / requirements.txt.
 * If the assessor expects a branching strategy, create a feature branch and merge it as evidence.
 
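A small illustrative sketch of the data-contract rules listed in the README hunk above (the helper name and exact behavior are assumptions; the real validation lives in `app/main.py`):

```python
import numpy as np
import pandas as pd

DAYS_EMPLOYED_SENTINEL = 365243  # "no employment info" sentinel in the source data

def normalize_record(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # Sentinel handling: 365243 means missing, so it becomes NaN.
    out["DAYS_EMPLOYED"] = out["DAYS_EMPLOYED"].replace(DAYS_EMPLOYED_SENTINEL, np.nan)
    # Categorical normalization: anything outside the known labels is bucketed.
    out["CODE_GENDER"] = out["CODE_GENDER"].where(out["CODE_GENDER"].isin(["F", "M"]), "Unknown")
    out["FLAG_OWN_CAR"] = out["FLAG_OWN_CAR"].where(out["FLAG_OWN_CAR"].isin(["Y", "N"]), "Unknown")
    return out

sample = pd.DataFrame([{"DAYS_EMPLOYED": 365243, "CODE_GENDER": "XNA", "FLAG_OWN_CAR": "Y"}])
print(normalize_record(sample))  # DAYS_EMPLOYED -> NaN, CODE_GENDER -> Unknown
```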
167
+
168
+ ```shell
169
+ python gradio_app.py
170
+ ```
171
+
172
+ Sur Hugging Face Spaces, `app.py` lance l'UI Gradio automatiquement.
173
+
174
  Note : l'API valide strictement les champs requis (`/features`). Pour afficher
175
  toutes les colonnes possibles : `/features?include_all=true`.
176
 
177
+ ### Hugging Face (assets lourds)
178
+
179
+ Les fichiers binaires (modele, preprocessor, data_final) ne sont pas pushes
180
+ dans le Space. Ils sont telecharges a l'execution via Hugging Face Hub si les
181
+ variables suivantes sont definies :
182
+
183
+ - `HF_MODEL_REPO_ID` + `HF_MODEL_FILENAME` + `HF_MODEL_REPO_TYPE`
184
+ - `HF_PREPROCESSOR_REPO_ID` + `HF_PREPROCESSOR_FILENAME` + `HF_PREPROCESSOR_REPO_TYPE`
185
+ - `HF_CUSTOMER_REPO_ID` + `HF_CUSTOMER_FILENAME` + `HF_CUSTOMER_REPO_TYPE`
186
+
187
+ Exemple (un seul repo dataset avec 3 fichiers) :
188
+
189
+ - `HF_MODEL_REPO_ID=stephmnt/credit-scoring-mlops-assets`
190
+ - `HF_MODEL_REPO_TYPE=dataset`
191
+ - `HF_MODEL_FILENAME=HistGB_final_model.pkl`
192
+ - `HF_PREPROCESSOR_REPO_ID=stephmnt/credit-scoring-mlops-assets`
193
+ - `HF_PREPROCESSOR_REPO_TYPE=dataset`
194
+ - `HF_PREPROCESSOR_FILENAME=preprocessor.joblib`
195
+ - `HF_CUSTOMER_REPO_ID=stephmnt/credit-scoring-mlops-assets`
196
+ - `HF_CUSTOMER_REPO_TYPE=dataset`
197
+ - `HF_CUSTOMER_FILENAME=data_final.parquet`
198
+
199
  ### Demo live (commandes cles en main)
200
 
201
  Lancer l'API :
 
301
  - `LOGS_ACCESS_TOKEN` pour proteger l'endpoint `/logs`
302
  - `LOG_HASH_SK_ID=1` pour anonymiser `SK_ID_CURR`
303
 
304
+ Les logs incluent un bloc `data_quality` par requete (champs manquants,
305
+ types invalides, out-of-range, categories inconnues, sentinelle
306
+ `DAYS_EMPLOYED`).
307
+
308
  Exemple local :
309
 
310
  ```shell
 
325
  curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
326
  ```
327
 
328
+ Apres quelques requêtes, générer le rapport de drift :
329
 
330
  ```shell
331
  python monitoring/drift_report.py \
332
  --logs logs/predictions.jsonl \
333
  --reference data/data_final.parquet \
334
+ --output-dir reports \
335
+ --min-prod-samples 200 \
336
+ --fdr-alpha 0.05 \
337
+ --prod-since "2024-01-01T00:00:00Z" \
338
+ --prod-until "2024-01-31T23:59:59Z"
339
  ```
340
 
341
  Le rapport HTML est généré dans `reports/drift_report.html` (avec des plots dans
342
  `reports/plots/`). Sur Hugging Face, le disque est éphemère : télécharger les logs
343
  avant d'analyser.
344
 
345
+ Le drift est calcule uniquement si `n_prod >= --min-prod-samples` (defaut 200).
346
+ Sinon, un badge "Sample insuffisant" est affiche et les alertes sont desactivees.
347
+
348
+ Robustesse integree:
349
+
350
+ - Categoriels: PSI avec lissage (`--psi-eps`) + categories rares regroupees (OTHER).
351
+ - Numeriques: KS corrige par FDR (Benjamini-Hochberg, `--fdr-alpha`).
352
+ - Sentinel `DAYS_EMPLOYED`: converti en NaN + taux suivi.
353
+
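A PSI-with-smoothing sketch matching the robustness notes above (illustrative; `monitoring/drift_report.py` holds the real implementation and may differ in details):

```python
import numpy as np

def psi(ref_counts: np.ndarray, prod_counts: np.ndarray, eps: float = 1e-4) -> float:
    # Convert counts to proportions and add epsilon so empty bins stay finite,
    # then renormalize; PSI = sum((prod - ref) * ln(prod / ref)).
    ref = ref_counts / ref_counts.sum() + eps
    prod = prod_counts / prod_counts.sum() + eps
    ref, prod = ref / ref.sum(), prod / prod.sum()
    return float(np.sum((prod - ref) * np.log(prod / ref)))

# Without smoothing, the last bin (empty in the reference) would make PSI infinite.
print(psi(np.array([80.0, 15.0, 5.0, 0.0]), np.array([70.0, 20.0, 5.0, 5.0])))
```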
 The report also includes the predicted-score distribution and the prediction rate
+ (use `--score-bins` to adjust the number of bins), plus a Data Quality section
+ when the logs contain `data_quality` (types, NaN, out-of-range values, unknown
+ categories).
+
+ To simulate sliding windows, pass `--prod-since` / `--prod-until` matching the
+ log timestamps.
+
+ Drift runbook: `docs/monitoring/runbook.md`.

 Screenshots (local snapshot of the reporting + storage):

 - Report: `docs/monitoring/drift_report.html` + `docs/monitoring/plots/`
 - Log storage: `docs/monitoring/logs_storage.png`

+ ## Profiling & Optimization (Step 4)
+
+ Inference profiling and benchmark (cProfile + latency):
+
+ ```shell
+ python profiling/profile_inference.py \
+   --sample-size 2000 \
+   --batch-size 128 \
+   --runs 3
+ ```
+
+ Outputs (a sketch of the harness follows):
+
+ - `docs/performance/benchmark_results.json`
+ - `docs/performance/profile_summary.txt`
+ - Detailed report: `docs/performance/performance_report.md`
+
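+ A minimal sketch of what such a harness measures, assuming a hypothetical
+ `predict` callable; it is not the exact `profile_inference.py` code:
+
+ ```python
+ # Hypothetical harness: batched latency benchmark + cProfile capture.
+ import cProfile
+ import io
+ import pstats
+ import time
+
+ import numpy as np
+
+ def benchmark(predict, X, batch_size=128, runs=3):
+     """Time predict() over batches; return p50/p95 latency in milliseconds."""
+     latencies_ms = []
+     for _ in range(runs):
+         for start in range(0, len(X), batch_size):
+             t0 = time.perf_counter()
+             predict(X[start:start + batch_size])
+             latencies_ms.append((time.perf_counter() - t0) * 1000.0)
+     lat = np.asarray(latencies_ms)
+     return {"p50_ms": float(np.percentile(lat, 50)),
+             "p95_ms": float(np.percentile(lat, 95))}
+
+ def profile_once(predict, X, top=15):
+     """Run predict() under cProfile and return the top cumulative-time lines."""
+     profiler = cProfile.Profile()
+     profiler.enable()
+     predict(X)
+     profiler.disable()
+     buffer = io.StringIO()
+     pstats.Stats(profiler, stream=buffer).sort_stats("cumulative").print_stats(top)
+     return buffer.getvalue()
+ ```
+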
+ Local Streamlit dashboard (monitoring + drift):
+
+ ```shell
+ python -m streamlit run monitoring/streamlit_app.py
+ ```
+
 ## Release contents

 - **Preparation + pipeline**: cleaning/preparation, encoding, imputation, and the training pipeline are included.
…
 - **Business score + optimal threshold**: the `custom_score` is the main metric in the comparison tables and cross-validation, with a computed `best_threshold`.
 - **Explainability**: feature importance, SHAP, and LIME are included.
 - **Feature selection by correlation**: top-N numerics + a small categorical set, exposed via `/features`.
+ - **Gradio interface**: minimal form (client id + amount + duration) built on the customer reference table.
+ - **Monitoring & drift**: HTML report with volume gating, robust PSI, KS + FDR, data quality, and
+   predicted-score distribution (snapshots in `docs/monitoring/`).
+ - **Profiling & optimization**: inference benchmark + cProfile profile (see `docs/performance/`).
 - **CI/CD**: tests with coverage (`pytest-cov`), Docker build, and deploy to Hugging Face Spaces.

  ![Screenshot MLFlow](https://raw.githubusercontent.com/stephmnt/credit-scoring-mlops/main/screen-mlflow.png)
 
…

 * Complete the API tests: /logs (auth OK/KO), batch predict, the threshold parameter, missing SK_ID_CURR, and outliers in test_api.py.
 * Simplify the ALLOW_MISSING_ARTIFACTS fallback and DummyModel if the artifacts are versioned (clean up main.py and conftest.py).

 * If the evaluator expects a branching strategy, create a feature branch and merge it as evidence.
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/README.md ADDED
@@ -0,0 +1,13 @@
+ # Monitoring Captures
+
+ These files are snapshot artifacts for the monitoring deliverable.
+
+ - drift_report.html: report generated by monitoring/drift_report.py (sample-size 5000).
+ - runbook.md: triage steps and actions when a drift alert appears.
+ - plots/: feature drift plots + score distribution + prediction rate.
+ - predictions_sample.jsonl: sanitized example of production logs.
+ - logs_storage.png: snapshot of the logging storage format.
+
+ Notes:
+ - Drift alerts are gated by minimum production volume (see report badge).
+ - Data quality metrics appear when logs include `data_quality`.
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/drift_report.html ADDED
@@ -0,0 +1,140 @@
+ <!doctype html>
+ <html>
+ <head>
+ <meta charset="utf-8" />
+ <title>Drift Report</title>
+ <style>
+ body { font-family: Arial, sans-serif; margin: 24px; }
+ table { border-collapse: collapse; width: 100%; }
+ th, td { border: 1px solid #ddd; padding: 8px; }
+ th { background: #f3f3f3; }
+ img { max-width: 720px; }
+ </style>
+ </head>
+ <body>
+ <h2>Production Monitoring Summary</h2>
+ <ul>
+ <li>Total calls: 1</li>
+ <li>Error rate: 0.00%</li>
+ <li>Latency p50: 82.04 ms</li>
+ <li>Latency p95: 82.04 ms</li>
+ </ul>
+ <h2>Score Monitoring</h2>
+ <ul>
+ <li>Score mean: 0.3755</li>
+ <li>Score p50: 0.3755</li>
+ <li>Score p95: 0.3755</li>
+ <li>Score min: 0.3755</li>
+ <li>Score max: 0.3755</li>
+ <li>Predicted default rate: 0.00%</li>
+ </ul>
+ <img src='plots/score_distribution.png' />
+ <img src='plots/prediction_rate.png' />
+ <h2>Data Drift Summary</h2>
+ <table border="1" class="dataframe">
+ <thead>
+ <tr style="text-align: right;">
+ <th>feature</th>
+ <th>type</th>
+ <th>ks_stat</th>
+ <th>p_value</th>
+ <th>drift_detected</th>
+ <th>psi</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>EXT_SOURCE_2</td>
+ <td>numeric</td>
+ <td>0.5905</td>
+ <td>0.819238</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>EXT_SOURCE_3</td>
+ <td>numeric</td>
+ <td>0.9047</td>
+ <td>0.191111</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>AMT_ANNUITY</td>
+ <td>numeric</td>
+ <td>0.5184</td>
+ <td>0.963407</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>EXT_SOURCE_1</td>
+ <td>numeric</td>
+ <td>0.5822</td>
+ <td>0.836199</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>CODE_GENDER</td>
+ <td>categorical</td>
+ <td>NaN</td>
+ <td>NaN</td>
+ <td>True</td>
+ <td>9.6538</td>
+ </tr>
+ <tr>
+ <td>DAYS_EMPLOYED</td>
+ <td>numeric</td>
+ <td>0.6508</td>
+ <td>0.698660</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>AMT_CREDIT</td>
+ <td>numeric</td>
+ <td>0.5996</td>
+ <td>0.801040</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>AMT_GOODS_PRICE</td>
+ <td>numeric</td>
+ <td>0.6115</td>
+ <td>0.777177</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>DAYS_BIRTH</td>
+ <td>numeric</td>
+ <td>0.9474</td>
+ <td>0.105579</td>
+ <td>False</td>
+ <td>NaN</td>
+ </tr>
+ <tr>
+ <td>FLAG_OWN_CAR</td>
+ <td>categorical</td>
+ <td>NaN</td>
+ <td>NaN</td>
+ <td>True</td>
+ <td>4.3985</td>
+ </tr>
+ </tbody>
+ </table>
+ <h2>Feature Distributions</h2>
+ <h4>EXT_SOURCE_2</h4><img src='plots/EXT_SOURCE_2.png' />
+ <h4>EXT_SOURCE_3</h4><img src='plots/EXT_SOURCE_3.png' />
+ <h4>AMT_ANNUITY</h4><img src='plots/AMT_ANNUITY.png' />
+ <h4>EXT_SOURCE_1</h4><img src='plots/EXT_SOURCE_1.png' />
+ <h4>CODE_GENDER</h4><img src='plots/CODE_GENDER.png' />
+ <h4>DAYS_EMPLOYED</h4><img src='plots/DAYS_EMPLOYED.png' />
+ <h4>AMT_CREDIT</h4><img src='plots/AMT_CREDIT.png' />
+ <h4>AMT_GOODS_PRICE</h4><img src='plots/AMT_GOODS_PRICE.png' />
+ <h4>DAYS_BIRTH</h4><img src='plots/DAYS_BIRTH.png' />
+ <h4>FLAG_OWN_CAR</h4><img src='plots/FLAG_OWN_CAR.png' />
+ </body>
+ </html>
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/logs_storage.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_ANNUITY.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_CREDIT.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_GOODS_PRICE.png ADDED
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/CODE_GENDER.png ADDED