Spaces:
Runtime error
Runtime error
GitHub Actions committed on
Commit ·
271ec19
1
Parent(s): decf87a
Auto-deploy from GitHub Actions
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- hf_space/.github/workflows/deploy-assets.yml +0 -2
- hf_space/.github/workflows/deploy.yml +0 -2
- hf_space/.gitignore +0 -3
- hf_space/Dockerfile +0 -2
- hf_space/README.md +7 -29
- hf_space/docs/performance/performance_report.md +7 -8
- hf_space/hf_space/app/main.py +155 -11
- hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +18 -0
- hf_space/hf_space/hf_space/data/xgb_final_model.pkl +3 -0
- hf_space/hf_space/hf_space/gradio_app.py +1 -121
- hf_space/hf_space/hf_space/hf_space/.gitignore +1 -1
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +6 -2
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +7 -5
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +2 -3
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml +0 -1
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +127 -3
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +8 -8
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/HistGB_final_model.pkl +3 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.env.example +46 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +10 -11
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml +2 -1
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile +1 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +54 -17
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +365 -60
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/performance/performance_report.md +2 -1
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +161 -27
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +108 -19
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py +1 -3
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile +3 -2
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +1 -19
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py +3 -1
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +11 -1
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app_entry.py +19 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile +1 -1
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +17 -8
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitattributes +2 -33
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml +69 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml +4 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitignore +3 -1
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +25 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py +190 -13
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py +96 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +136 -18
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/README.md +13 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/drift_report.html +140 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/logs_storage.png +0 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_ANNUITY.png +0 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_CREDIT.png +0 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_GOODS_PRICE.png +0 -0
- hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/CODE_GENDER.png +0 -0
hf_space/.github/workflows/deploy-assets.yml
CHANGED
|
@@ -22,8 +22,6 @@ jobs:
|
|
| 22 |
steps:
|
| 23 |
- name: Checkout
|
| 24 |
uses: actions/checkout@v4
|
| 25 |
-
with:
|
| 26 |
-
lfs: true
|
| 27 |
|
| 28 |
- name: Set up Python
|
| 29 |
uses: actions/setup-python@v5
|
|
|
|
| 22 |
steps:
|
| 23 |
- name: Checkout
|
| 24 |
uses: actions/checkout@v4
|
|
|
|
|
|
|
| 25 |
|
| 26 |
- name: Set up Python
|
| 27 |
uses: actions/setup-python@v5
|
hf_space/.github/workflows/deploy.yml
CHANGED
|
@@ -12,8 +12,6 @@ jobs:
|
|
| 12 |
steps:
|
| 13 |
- name: Checkout
|
| 14 |
uses: actions/checkout@v4
|
| 15 |
-
with:
|
| 16 |
-
lfs: true
|
| 17 |
|
| 18 |
- name: Set up Python
|
| 19 |
uses: actions/setup-python@v5
|
|
|
|
| 12 |
steps:
|
| 13 |
- name: Checkout
|
| 14 |
uses: actions/checkout@v4
|
|
|
|
|
|
|
| 15 |
|
| 16 |
- name: Set up Python
|
| 17 |
uses: actions/setup-python@v5
|
hf_space/.gitignore
CHANGED
|
@@ -5,10 +5,7 @@ __pycache__/
|
|
| 5 |
logs/
|
| 6 |
reports/
|
| 7 |
data/*
|
| 8 |
-
!data/*_final_model.pkl
|
| 9 |
-
!data/data_final.parquet
|
| 10 |
artifacts/*
|
| 11 |
-
!artifacts/preprocessor.joblib
|
| 12 |
.DS_Store
|
| 13 |
.vscode/
|
| 14 |
.idea/
|
|
|
|
| 5 |
logs/
|
| 6 |
reports/
|
| 7 |
data/*
|
|
|
|
|
|
|
| 8 |
artifacts/*
|
|
|
|
| 9 |
.DS_Store
|
| 10 |
.vscode/
|
| 11 |
.idea/
|
hf_space/Dockerfile
CHANGED
|
@@ -11,8 +11,6 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|
| 11 |
COPY app/ app/
|
| 12 |
COPY app_entry.py app.py gradio_app.py ./
|
| 13 |
COPY src/ src/
|
| 14 |
-
COPY data/ data/
|
| 15 |
-
COPY artifacts/ artifacts/
|
| 16 |
|
| 17 |
EXPOSE 7860
|
| 18 |
|
|
|
|
| 11 |
COPY app/ app/
|
| 12 |
COPY app_entry.py app.py gradio_app.py ./
|
| 13 |
COPY src/ src/
|
|
|
|
|
|
|
| 14 |
|
| 15 |
EXPOSE 7860
|
| 16 |
|
hf_space/README.md
CHANGED
|
@@ -381,7 +381,7 @@ python monitoring/drift_report.py \
|
|
| 381 |
--logs logs/predictions.jsonl \
|
| 382 |
--reference data/data_final.parquet \
|
| 383 |
--output-dir reports \
|
| 384 |
-
--min-prod-samples
|
| 385 |
--fdr-alpha 0.05 \
|
| 386 |
--prod-since "2024-01-01T00:00:00Z" \
|
| 387 |
--prod-until "2024-01-31T23:59:59Z"
|
|
@@ -391,7 +391,7 @@ Le rapport HTML est généré dans `reports/drift_report.html` (avec des plots d
|
|
| 391 |
`reports/plots/`). Sur Hugging Face, le disque est éphemère : télécharger les logs
|
| 392 |
avant d'analyser.
|
| 393 |
|
| 394 |
-
Le drift est calcule uniquement si `n_prod >= --min-prod-samples` (defaut
|
| 395 |
Sinon, un badge "Sample insuffisant" est affiche et les alertes sont desactivees.
|
| 396 |
|
| 397 |
Robustesse integree:
|
|
@@ -418,20 +418,16 @@ Captures (snapshot local du reporting + stockage):
|
|
| 418 |
|
| 419 |
## Profiling & Optimisation (Etape 4)
|
| 420 |
|
| 421 |
-
Profiling et benchmark d'inference (cProfile + latence)
|
| 422 |
|
| 423 |
-
-
|
| 424 |
-
-
|
| 425 |
-
|
| 426 |
-
Sorties:
|
| 427 |
-
|
| 428 |
-
- `docs/performance/benchmark_results.json`
|
| 429 |
-
- `docs/performance/profile_summary.txt`
|
| 430 |
-
- Rapport detaille: `docs/performance/performance_report.md`
|
| 431 |
|
| 432 |
Dashboard local Streamlit (monitoring + drift):
|
| 433 |
|
| 434 |
```shell
|
|
|
|
|
|
|
| 435 |
python -m streamlit run monitoring/streamlit_app.py
|
| 436 |
```
|
| 437 |
|
|
@@ -452,21 +448,3 @@ python -m streamlit run monitoring/streamlit_app.py
|
|
| 452 |
- **CI/CD** : tests avec couverture (`pytest-cov`), build Docker et deploy vers Hugging Face Spaces.
|
| 453 |
|
| 454 |

|
| 455 |
-
|
| 456 |
-
### Manques prioritaires
|
| 457 |
-
|
| 458 |
-
* Mission 2 Étape 4 non couverte: pas de profiling/optimisation post‑déploiement ni rapport de gains, à livrer avec une version optimisée.
|
| 459 |
-
|
| 460 |
-
### Preuves / doc à compléter
|
| 461 |
-
|
| 462 |
-
* Lien explicite vers le dépôt public + stratégie de versions/branches à ajouter dans README.md.
|
| 463 |
-
* Preuve de model registry/serving MLflow à conserver (capture UI registry ou commande de serving) en plus de screen-mlflow.png.
|
| 464 |
-
* Dataset de référence non versionné (data_final.parquet est ignoré), documenter l’obtention pour exécuter drift_report.py.
|
| 465 |
-
* Badge GitHub Actions pointe vers OCR_Projet05 dans README.md, corriger l’URL.
|
| 466 |
-
* RGPD/PII: LOG_HASH_SK_ID est désactivé par défaut dans main.py, préciser l’activation en prod dans README.md.
|
| 467 |
-
|
| 468 |
-
### Améliorations recommandées
|
| 469 |
-
|
| 470 |
-
* Compléter les tests API: /logs (auth OK/KO), batch predict, param threshold, SK_ID_CURR manquant, outliers dans test_api.py.
|
| 471 |
-
* Simplifier le fallback ALLOW_MISSING_ARTIFACTS et DummyModel si les artefacts sont versionnés (nettoyer main.py et conftest.py).
|
| 472 |
-
* Si l’évaluateur attend une stratégie de branches, créer une branche feature et fusionner pour preuve.
|
|
|
|
| 381 |
--logs logs/predictions.jsonl \
|
| 382 |
--reference data/data_final.parquet \
|
| 383 |
--output-dir reports \
|
| 384 |
+
--min-prod-samples 50 \
|
| 385 |
--fdr-alpha 0.05 \
|
| 386 |
--prod-since "2024-01-01T00:00:00Z" \
|
| 387 |
--prod-until "2024-01-31T23:59:59Z"
|
|
|
|
| 391 |
`reports/plots/`). Sur Hugging Face, le disque est éphemère : télécharger les logs
|
| 392 |
avant d'analyser.
|
| 393 |
|
| 394 |
+
Le drift est calcule uniquement si `n_prod >= --min-prod-samples` (defaut 50).
|
| 395 |
Sinon, un badge "Sample insuffisant" est affiche et les alertes sont desactivees.
|
| 396 |
|
| 397 |
Robustesse integree:
|
|
|
|
| 418 |
|
| 419 |
## Profiling & Optimisation (Etape 4)
|
| 420 |
|
| 421 |
+
Profiling et benchmark d'inference (cProfile + latence):
|
| 422 |
|
| 423 |
+
- Notebook: `notebooks/P6_MANET_Stephane_notebook_modélisation.ipynb` (section TODO 5).
|
| 424 |
+
- Resultats: `docs/performance/benchmark_results.json`, `docs/performance/profile_summary.txt`, `docs/performance/performance_report.md`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 425 |
|
| 426 |
Dashboard local Streamlit (monitoring + drift):
|
| 427 |
|
| 428 |
```shell
|
| 429 |
+
streamlit run monitoring/streamlit_app.py
|
| 430 |
+
# ou
|
| 431 |
python -m streamlit run monitoring/streamlit_app.py
|
| 432 |
```
|
| 433 |
|
|
|
|
| 448 |
- **CI/CD** : tests avec couverture (`pytest-cov`), build Docker et deploy vers Hugging Face Spaces.
|
| 449 |
|
| 450 |

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
hf_space/docs/performance/performance_report.md
CHANGED
|
@@ -6,11 +6,10 @@ Mesurer la latence d'inference, identifier les goulots d'etranglement et propose
|
|
| 6 |
|
| 7 |
## Setup
|
| 8 |
|
| 9 |
-
-
|
| 10 |
-
- Workflow courant: notebook modélisation (section TODO 5)
|
| 11 |
- Donnees: `data/data_final.parquet` (echantillon)
|
| 12 |
- Parametres: `--sample-size 500 --batch-size 100 --runs 2`
|
| 13 |
-
- Modele: `
|
| 14 |
|
| 15 |
Les resultats sont sauvegardes dans:
|
| 16 |
|
|
@@ -21,21 +20,21 @@ Les resultats sont sauvegardes dans:
|
|
| 21 |
|
| 22 |
| Scenario | Batch | Mean (ms) | P50 (ms) | P95 (ms) | Throughput (rows/s) |
|
| 23 |
| --- | --- | ---:| ---:| ---:| ---:|
|
| 24 |
-
| optimized_preprocess | 100 |
|
| 25 |
-
| legacy_preprocess_alignment | 100 |
|
| 26 |
|
| 27 |
-
Gain observe (moyenne): ~
|
| 28 |
|
| 29 |
## Goulots d'etranglement (cProfile)
|
| 30 |
|
| 31 |
Extrait `docs/performance/profile_summary.txt`:
|
| 32 |
|
| 33 |
-
- `app.main:preprocess_input` represente l'essentiel du temps cumule (
|
| 34 |
- Operations pandas dominantes:
|
| 35 |
- `DataFrame.__setitem__` / `insert`
|
| 36 |
- `fillna`, `to_numeric`
|
| 37 |
- `get_dummies`
|
| 38 |
-
- `
|
| 39 |
|
| 40 |
## Optimisation appliquee
|
| 41 |
|
|
|
|
| 6 |
|
| 7 |
## Setup
|
| 8 |
|
| 9 |
+
- Notebook: `notebooks/P6_MANET_Stephane_notebook_modélisation.ipynb` (section TODO 5)
|
|
|
|
| 10 |
- Donnees: `data/data_final.parquet` (echantillon)
|
| 11 |
- Parametres: `--sample-size 500 --batch-size 100 --runs 2`
|
| 12 |
+
- Modele: `data/*_final_model.pkl` (ex: `data/xgb_final_model.pkl`)
|
| 13 |
|
| 14 |
Les resultats sont sauvegardes dans:
|
| 15 |
|
|
|
|
| 20 |
|
| 21 |
| Scenario | Batch | Mean (ms) | P50 (ms) | P95 (ms) | Throughput (rows/s) |
|
| 22 |
| --- | --- | ---:| ---:| ---:| ---:|
|
| 23 |
+
| optimized_preprocess | 100 | 35.73 | 33.77 | 43.09 | 2798.44 |
|
| 24 |
+
| legacy_preprocess_alignment | 100 | 47.57 | 47.19 | 51.23 | 2102.36 |
|
| 25 |
|
| 26 |
+
Gain observe (moyenne): ~25% de reduction de latence par batch sur le chemin optimise.
|
| 27 |
|
| 28 |
## Goulots d'etranglement (cProfile)
|
| 29 |
|
| 30 |
Extrait `docs/performance/profile_summary.txt`:
|
| 31 |
|
| 32 |
+
- `app.main:preprocess_input` represente l'essentiel du temps cumule (voir `docs/performance/profile_summary.txt`).
|
| 33 |
- Operations pandas dominantes:
|
| 34 |
- `DataFrame.__setitem__` / `insert`
|
| 35 |
- `fillna`, `to_numeric`
|
| 36 |
- `get_dummies`
|
| 37 |
+
- `predict_proba` est present mais non majoritaire.
|
| 38 |
|
| 39 |
## Optimisation appliquee
|
| 40 |
|
hf_space/hf_space/app/main.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
|
|
|
| 3 |
import logging
|
| 4 |
import os
|
| 5 |
import pickle
|
|
@@ -8,6 +9,7 @@ from datetime import datetime, timezone
|
|
| 8 |
import hashlib
|
| 9 |
import json
|
| 10 |
from pathlib import Path
|
|
|
|
| 11 |
import time
|
| 12 |
from typing import Any
|
| 13 |
import uuid
|
|
@@ -16,6 +18,7 @@ from collections import deque
|
|
| 16 |
import numpy as np
|
| 17 |
import pandas as pd
|
| 18 |
from fastapi import FastAPI, Header, HTTPException, Query, Response
|
|
|
|
| 19 |
from pydantic import BaseModel
|
| 20 |
from sklearn.preprocessing import MinMaxScaler
|
| 21 |
import joblib
|
|
@@ -78,6 +81,19 @@ HF_CUSTOMER_REPO_ID = os.getenv("HF_CUSTOMER_REPO_ID")
|
|
| 78 |
HF_CUSTOMER_REPO_TYPE = os.getenv("HF_CUSTOMER_REPO_TYPE", "dataset")
|
| 79 |
HF_CUSTOMER_FILENAME = os.getenv("HF_CUSTOMER_FILENAME", CUSTOMER_DATA_PATH.name)
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
|
| 82 |
ENGINEERED_FEATURES = [
|
| 83 |
"DAYS_EMPLOYED_ANOM",
|
|
@@ -218,6 +234,87 @@ def _hash_value(value: Any) -> str:
|
|
| 218 |
return hashlib.sha256(str(value).encode("utf-8")).hexdigest()
|
| 219 |
|
| 220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
|
| 222 |
if pd.isna(value): # type: ignore
|
| 223 |
return np.nan
|
|
@@ -234,7 +331,9 @@ def _ensure_hf_asset(
|
|
| 234 |
repo_type: str,
|
| 235 |
) -> Path | None:
|
| 236 |
if local_path.exists():
|
| 237 |
-
|
|
|
|
|
|
|
| 238 |
if not repo_id:
|
| 239 |
return None
|
| 240 |
|
|
@@ -254,6 +353,16 @@ def _ensure_hf_asset(
|
|
| 254 |
)
|
| 255 |
|
| 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
def _normalize_inputs(
|
| 259 |
df_raw: pd.DataFrame,
|
|
@@ -470,11 +579,39 @@ def _log_prediction_entries(
|
|
| 470 |
"prediction": result.get("prediction"),
|
| 471 |
}
|
| 472 |
)
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
_append_log_entries(entries)
|
| 477 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 478 |
|
| 479 |
def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
|
| 480 |
df = pd.read_parquet(data_path)
|
|
@@ -853,7 +990,7 @@ def _get_customer_reference(preprocessor: PreprocessorArtifacts) -> pd.DataFrame
|
|
| 853 |
if cached is not None:
|
| 854 |
return cached
|
| 855 |
data_path = CUSTOMER_DATA_PATH
|
| 856 |
-
if not data_path.exists():
|
| 857 |
downloaded = _ensure_hf_asset(
|
| 858 |
data_path,
|
| 859 |
HF_CUSTOMER_REPO_ID,
|
|
@@ -1362,7 +1499,7 @@ def startup_event() -> None:
|
|
| 1362 |
if getattr(app.state, "model", None) is not None and getattr(app.state, "preprocessor", None) is not None:
|
| 1363 |
return
|
| 1364 |
model_path = MODEL_PATH
|
| 1365 |
-
if not model_path.exists():
|
| 1366 |
downloaded = _ensure_hf_asset(
|
| 1367 |
model_path,
|
| 1368 |
HF_MODEL_REPO_ID,
|
|
@@ -1371,7 +1508,7 @@ def startup_event() -> None:
|
|
| 1371 |
)
|
| 1372 |
if downloaded is not None:
|
| 1373 |
model_path = downloaded
|
| 1374 |
-
if not model_path.exists():
|
| 1375 |
if ALLOW_MISSING_ARTIFACTS:
|
| 1376 |
logger.warning("Model file not found: %s. Using dummy model.", model_path)
|
| 1377 |
app.state.model = DummyModel()
|
|
@@ -1379,10 +1516,17 @@ def startup_event() -> None:
|
|
| 1379 |
raise RuntimeError(f"Model file not found: {model_path}")
|
| 1380 |
else:
|
| 1381 |
logger.info("Loading model from %s", model_path)
|
| 1382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1383 |
|
| 1384 |
data_path = DATA_PATH
|
| 1385 |
-
if not data_path.exists():
|
| 1386 |
downloaded = _ensure_hf_asset(
|
| 1387 |
data_path,
|
| 1388 |
HF_CUSTOMER_REPO_ID,
|
|
@@ -1393,7 +1537,7 @@ def startup_event() -> None:
|
|
| 1393 |
data_path = downloaded
|
| 1394 |
try:
|
| 1395 |
artifacts_path = ARTIFACTS_PATH
|
| 1396 |
-
if not artifacts_path.exists():
|
| 1397 |
downloaded = _ensure_hf_asset(
|
| 1398 |
artifacts_path,
|
| 1399 |
HF_PREPROCESSOR_REPO_ID or None,
|
|
@@ -1404,7 +1548,7 @@ def startup_event() -> None:
|
|
| 1404 |
artifacts_path = downloaded
|
| 1405 |
logger.info("Loading preprocessor artifacts from %s", artifacts_path)
|
| 1406 |
app.state.preprocessor = load_preprocessor(data_path, artifacts_path)
|
| 1407 |
-
except
|
| 1408 |
if ALLOW_MISSING_ARTIFACTS:
|
| 1409 |
logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
|
| 1410 |
app.state.preprocessor = build_fallback_preprocessor()
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
import io
|
| 4 |
import logging
|
| 5 |
import os
|
| 6 |
import pickle
|
|
|
|
| 9 |
import hashlib
|
| 10 |
import json
|
| 11 |
from pathlib import Path
|
| 12 |
+
import threading
|
| 13 |
import time
|
| 14 |
from typing import Any
|
| 15 |
import uuid
|
|
|
|
| 18 |
import numpy as np
|
| 19 |
import pandas as pd
|
| 20 |
from fastapi import FastAPI, Header, HTTPException, Query, Response
|
| 21 |
+
from huggingface_hub import HfApi
|
| 22 |
from pydantic import BaseModel
|
| 23 |
from sklearn.preprocessing import MinMaxScaler
|
| 24 |
import joblib
|
|
|
|
| 81 |
HF_CUSTOMER_REPO_TYPE = os.getenv("HF_CUSTOMER_REPO_TYPE", "dataset")
|
| 82 |
HF_CUSTOMER_FILENAME = os.getenv("HF_CUSTOMER_FILENAME", CUSTOMER_DATA_PATH.name)
|
| 83 |
|
| 84 |
+
HF_LOG_ENABLED = os.getenv("HF_LOG_ENABLED", "1") == "1"
|
| 85 |
+
HF_LOG_DATASET_REPO = os.getenv("HF_LOG_DATASET_REPO")
|
| 86 |
+
HF_LOG_PATH_PREFIX = os.getenv("HF_LOG_PATH_PREFIX", "prod_logs")
|
| 87 |
+
|
| 88 |
+
HF_LOG_BUFFER_MAX = int(os.getenv("HF_LOG_BUFFER_MAX", "50"))
|
| 89 |
+
HF_LOG_FLUSH_SECONDS = int(os.getenv("HF_LOG_FLUSH_SECONDS", "60"))
|
| 90 |
+
|
| 91 |
+
_hf_api = HfApi(token=os.getenv("HF_TOKEN")) if os.getenv("HF_TOKEN") else None
|
| 92 |
+
_hf_lock = threading.Lock()
|
| 93 |
+
_hf_buffer: list[dict[str, Any]] = []
|
| 94 |
+
_hf_last_flush = 0.0
|
| 95 |
+
_hf_flusher_started = False
|
| 96 |
+
|
| 97 |
IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
|
| 98 |
ENGINEERED_FEATURES = [
|
| 99 |
"DAYS_EMPLOYED_ANOM",
|
|
|
|
| 234 |
return hashlib.sha256(str(value).encode("utf-8")).hexdigest()
|
| 235 |
|
| 236 |
|
| 237 |
+
def _utc_day() -> str:
|
| 238 |
+
return datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def _utc_stamp() -> str:
|
| 242 |
+
return datetime.now(timezone.utc).strftime("%H%M%S")
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def _start_hf_flusher_if_needed() -> None:
|
| 246 |
+
global _hf_flusher_started
|
| 247 |
+
if _hf_flusher_started:
|
| 248 |
+
return
|
| 249 |
+
_hf_flusher_started = True
|
| 250 |
+
|
| 251 |
+
def _loop() -> None:
|
| 252 |
+
while True:
|
| 253 |
+
time.sleep(HF_LOG_FLUSH_SECONDS)
|
| 254 |
+
with _hf_lock:
|
| 255 |
+
_flush_hf_locked(force=True)
|
| 256 |
+
|
| 257 |
+
threading.Thread(target=_loop, daemon=True).start()
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def _upload_parquet_part(df: pd.DataFrame) -> None:
|
| 261 |
+
if not (HF_LOG_ENABLED and _hf_api and HF_LOG_DATASET_REPO):
|
| 262 |
+
return
|
| 263 |
+
|
| 264 |
+
part_path = (
|
| 265 |
+
f"{HF_LOG_PATH_PREFIX}/date={_utc_day()}/"
|
| 266 |
+
f"part-{_utc_stamp()}-{uuid.uuid4().hex}.parquet"
|
| 267 |
+
)
|
| 268 |
+
|
| 269 |
+
bio = io.BytesIO()
|
| 270 |
+
df.to_parquet(bio, index=False)
|
| 271 |
+
|
| 272 |
+
for attempt in range(3):
|
| 273 |
+
try:
|
| 274 |
+
bio.seek(0)
|
| 275 |
+
_hf_api.upload_file(
|
| 276 |
+
path_or_fileobj=bio,
|
| 277 |
+
path_in_repo=part_path,
|
| 278 |
+
repo_id=HF_LOG_DATASET_REPO,
|
| 279 |
+
repo_type="dataset",
|
| 280 |
+
commit_message=f"Add inference logs {_utc_day()}",
|
| 281 |
+
)
|
| 282 |
+
return
|
| 283 |
+
except Exception:
|
| 284 |
+
if attempt == 2:
|
| 285 |
+
raise
|
| 286 |
+
time.sleep(1.5 * (attempt + 1))
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
def _flush_hf_locked(force: bool = False) -> None:
|
| 290 |
+
global _hf_buffer, _hf_last_flush
|
| 291 |
+
if not _hf_buffer:
|
| 292 |
+
return
|
| 293 |
+
|
| 294 |
+
now = time.time()
|
| 295 |
+
if not force:
|
| 296 |
+
if len(_hf_buffer) < HF_LOG_BUFFER_MAX and (now - _hf_last_flush) < HF_LOG_FLUSH_SECONDS:
|
| 297 |
+
return
|
| 298 |
+
|
| 299 |
+
df = pd.DataFrame(_hf_buffer)
|
| 300 |
+
_hf_buffer = []
|
| 301 |
+
_hf_last_flush = now
|
| 302 |
+
|
| 303 |
+
try:
|
| 304 |
+
_upload_parquet_part(df)
|
| 305 |
+
except Exception as exc:
|
| 306 |
+
logger.warning("HF log upload failed: %s", exc)
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
def hf_log_rows(rows: list[dict[str, Any]]) -> None:
|
| 310 |
+
if not (HF_LOG_ENABLED and _hf_api and HF_LOG_DATASET_REPO):
|
| 311 |
+
return
|
| 312 |
+
_start_hf_flusher_if_needed()
|
| 313 |
+
with _hf_lock:
|
| 314 |
+
_hf_buffer.extend(rows)
|
| 315 |
+
_flush_hf_locked(force=False)
|
| 316 |
+
|
| 317 |
+
|
| 318 |
def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
|
| 319 |
if pd.isna(value): # type: ignore
|
| 320 |
return np.nan
|
|
|
|
| 331 |
repo_type: str,
|
| 332 |
) -> Path | None:
|
| 333 |
if local_path.exists():
|
| 334 |
+
if not _is_lfs_pointer(local_path):
|
| 335 |
+
return local_path
|
| 336 |
+
logger.warning("LFS pointer detected for %s; attempting remote download.", local_path)
|
| 337 |
if not repo_id:
|
| 338 |
return None
|
| 339 |
|
|
|
|
| 353 |
)
|
| 354 |
|
| 355 |
|
| 356 |
+
def _is_lfs_pointer(path: Path) -> bool:
|
| 357 |
+
try:
|
| 358 |
+
with path.open("rb") as handle:
|
| 359 |
+
head = handle.read(200)
|
| 360 |
+
except OSError:
|
| 361 |
+
return False
|
| 362 |
+
text = head.decode("utf-8", errors="ignore")
|
| 363 |
+
return text.startswith("version https://git-lfs.github.com/spec/v1")
|
| 364 |
+
|
| 365 |
+
|
| 366 |
|
| 367 |
def _normalize_inputs(
|
| 368 |
df_raw: pd.DataFrame,
|
|
|
|
| 579 |
"prediction": result.get("prediction"),
|
| 580 |
}
|
| 581 |
)
|
| 582 |
+
if error:
|
| 583 |
+
entry["error"] = error
|
| 584 |
+
entries.append(entry)
|
| 585 |
_append_log_entries(entries)
|
| 586 |
|
| 587 |
+
flat_rows: list[dict[str, Any]] = []
|
| 588 |
+
for entry in entries:
|
| 589 |
+
row = {
|
| 590 |
+
"timestamp_utc": entry.get("timestamp"),
|
| 591 |
+
"request_id": entry.get("request_id"),
|
| 592 |
+
"endpoint": entry.get("endpoint"),
|
| 593 |
+
"source": entry.get("source"),
|
| 594 |
+
"status_code": entry.get("status_code"),
|
| 595 |
+
"latency_ms": entry.get("latency_ms"),
|
| 596 |
+
"model_version": entry.get("model_version"),
|
| 597 |
+
"threshold": entry.get("threshold"),
|
| 598 |
+
"sk_id_curr": entry.get("sk_id_curr"),
|
| 599 |
+
"probability": entry.get("probability"),
|
| 600 |
+
"prediction": entry.get("prediction"),
|
| 601 |
+
"error": entry.get("error"),
|
| 602 |
+
}
|
| 603 |
+
inputs = entry.get("inputs") or {}
|
| 604 |
+
for key, value in inputs.items():
|
| 605 |
+
row[f"input__{key}"] = value
|
| 606 |
+
|
| 607 |
+
dq = entry.get("data_quality") or {}
|
| 608 |
+
for key, value in dq.items():
|
| 609 |
+
row[f"dq__{key}"] = value
|
| 610 |
+
|
| 611 |
+
flat_rows.append(row)
|
| 612 |
+
|
| 613 |
+
hf_log_rows(flat_rows)
|
| 614 |
+
|
| 615 |
|
| 616 |
def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
|
| 617 |
df = pd.read_parquet(data_path)
|
|
|
|
| 990 |
if cached is not None:
|
| 991 |
return cached
|
| 992 |
data_path = CUSTOMER_DATA_PATH
|
| 993 |
+
if not data_path.exists() or _is_lfs_pointer(data_path):
|
| 994 |
downloaded = _ensure_hf_asset(
|
| 995 |
data_path,
|
| 996 |
HF_CUSTOMER_REPO_ID,
|
|
|
|
| 1499 |
if getattr(app.state, "model", None) is not None and getattr(app.state, "preprocessor", None) is not None:
|
| 1500 |
return
|
| 1501 |
model_path = MODEL_PATH
|
| 1502 |
+
if not model_path.exists() or _is_lfs_pointer(model_path):
|
| 1503 |
downloaded = _ensure_hf_asset(
|
| 1504 |
model_path,
|
| 1505 |
HF_MODEL_REPO_ID,
|
|
|
|
| 1508 |
)
|
| 1509 |
if downloaded is not None:
|
| 1510 |
model_path = downloaded
|
| 1511 |
+
if not model_path.exists() or _is_lfs_pointer(model_path):
|
| 1512 |
if ALLOW_MISSING_ARTIFACTS:
|
| 1513 |
logger.warning("Model file not found: %s. Using dummy model.", model_path)
|
| 1514 |
app.state.model = DummyModel()
|
|
|
|
| 1516 |
raise RuntimeError(f"Model file not found: {model_path}")
|
| 1517 |
else:
|
| 1518 |
logger.info("Loading model from %s", model_path)
|
| 1519 |
+
try:
|
| 1520 |
+
app.state.model = load_model(model_path)
|
| 1521 |
+
except Exception as exc:
|
| 1522 |
+
if ALLOW_MISSING_ARTIFACTS:
|
| 1523 |
+
logger.warning("Model load failed (%s). Using dummy model.", exc)
|
| 1524 |
+
app.state.model = DummyModel()
|
| 1525 |
+
else:
|
| 1526 |
+
raise
|
| 1527 |
|
| 1528 |
data_path = DATA_PATH
|
| 1529 |
+
if not data_path.exists() or _is_lfs_pointer(data_path):
|
| 1530 |
downloaded = _ensure_hf_asset(
|
| 1531 |
data_path,
|
| 1532 |
HF_CUSTOMER_REPO_ID,
|
|
|
|
| 1537 |
data_path = downloaded
|
| 1538 |
try:
|
| 1539 |
artifacts_path = ARTIFACTS_PATH
|
| 1540 |
+
if not artifacts_path.exists() or _is_lfs_pointer(artifacts_path):
|
| 1541 |
downloaded = _ensure_hf_asset(
|
| 1542 |
artifacts_path,
|
| 1543 |
HF_PREPROCESSOR_REPO_ID or None,
|
|
|
|
| 1548 |
artifacts_path = downloaded
|
| 1549 |
logger.info("Loading preprocessor artifacts from %s", artifacts_path)
|
| 1550 |
app.state.preprocessor = load_preprocessor(data_path, artifacts_path)
|
| 1551 |
+
except Exception as exc:
|
| 1552 |
if ALLOW_MISSING_ARTIFACTS:
|
| 1553 |
logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
|
| 1554 |
app.state.preprocessor = build_fallback_preprocessor()
|
hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml
CHANGED
|
@@ -59,6 +59,24 @@ jobs:
|
|
| 59 |
model_path = candidates[0]
|
| 60 |
|
| 61 |
api = HfApi()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
for path in [model_path]:
|
| 63 |
api.upload_file(
|
| 64 |
path_or_fileobj=str(path),
|
|
|
|
| 59 |
model_path = candidates[0]
|
| 60 |
|
| 61 |
api = HfApi()
|
| 62 |
+
existing = api.list_repo_files(
|
| 63 |
+
repo_id=repo_id,
|
| 64 |
+
repo_type=repo_type,
|
| 65 |
+
token=token,
|
| 66 |
+
)
|
| 67 |
+
to_delete = [
|
| 68 |
+
name
|
| 69 |
+
for name in existing
|
| 70 |
+
if name.endswith("_final_model.pkl") and name != model_path.name
|
| 71 |
+
]
|
| 72 |
+
for name in to_delete:
|
| 73 |
+
api.delete_file(
|
| 74 |
+
path_in_repo=name,
|
| 75 |
+
repo_id=repo_id,
|
| 76 |
+
repo_type=repo_type,
|
| 77 |
+
token=token,
|
| 78 |
+
commit_message=f"Remove {name}",
|
| 79 |
+
)
|
| 80 |
for path in [model_path]:
|
| 81 |
api.upload_file(
|
| 82 |
path_or_fileobj=str(path),
|
hf_space/hf_space/hf_space/data/xgb_final_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0fe10d7c60f50f96a87bafd298f2919653aed37d90a091059017800450e6273b
|
| 3 |
+
size 1370510
|
hf_space/hf_space/hf_space/gradio_app.py
CHANGED
|
@@ -21,20 +21,10 @@ from app.main import (
|
|
| 21 |
_normalize_inputs,
|
| 22 |
)
|
| 23 |
|
| 24 |
-
import io
|
| 25 |
-
import os
|
| 26 |
-
import threading
|
| 27 |
-
import time
|
| 28 |
-
import uuid
|
| 29 |
-
from datetime import datetime, timezone
|
| 30 |
-
|
| 31 |
-
from huggingface_hub import HfApi
|
| 32 |
-
|
| 33 |
|
| 34 |
def _ensure_startup() -> None:
|
| 35 |
if not getattr(app.state, "preprocessor", None):
|
| 36 |
startup_event()
|
| 37 |
-
_start_log_flusher_if_needed()
|
| 38 |
|
| 39 |
|
| 40 |
def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
|
|
@@ -297,8 +287,7 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
|
|
| 297 |
"""
|
| 298 |
<p>Renseignez l'identifiant client, le montant du crédit et la durée.</p>
|
| 299 |
<p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée. Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction. Le snapshot client affiche quelques informations de référence sur le client.</p>
|
| 300 |
-
<p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée.</p>
|
| 301 |
-
<p> Le dataset est disponible sur <a href="https://huggingface.co/datasets/stephmnt/assets-credit-scoring-mlops" rel="noreferrer">Hugging Face</a>.</p>
|
| 302 |
"""
|
| 303 |
)
|
| 304 |
|
|
@@ -328,115 +317,6 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
|
|
| 328 |
outputs=[probability, prediction, shap_table, snapshot],
|
| 329 |
)
|
| 330 |
|
| 331 |
-
# =========================
|
| 332 |
-
# HF Dataset logging (Parquet parts)
|
| 333 |
-
# =========================
|
| 334 |
-
|
| 335 |
-
LOG_ENABLED = os.getenv("LOG_ENABLED", "1") == "1"
|
| 336 |
-
LOG_DATASET_REPO = os.getenv("LOG_DATASET_REPO", "stephmnt/assets-credit-scoring-mlops")
|
| 337 |
-
LOG_PATH_PREFIX = os.getenv("LOG_PATH_PREFIX", "prod_logs")
|
| 338 |
-
HF_TOKEN = os.getenv("HF_TOKEN") # Secret HF (write) sur le Space inference
|
| 339 |
-
|
| 340 |
-
LOG_BUFFER_MAX = int(os.getenv("LOG_BUFFER_MAX", "50")) # flush dès 50 lignes
|
| 341 |
-
LOG_FLUSH_SECONDS = int(os.getenv("LOG_FLUSH_SECONDS", "60")) # flush au moins toutes les 60s
|
| 342 |
-
|
| 343 |
-
_hf_api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
|
| 344 |
-
_log_lock = threading.Lock()
|
| 345 |
-
_log_buffer: list[dict] = []
|
| 346 |
-
_last_flush_ts = 0.0
|
| 347 |
-
_flusher_started = False
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
def _now_utc_iso() -> str:
|
| 351 |
-
return datetime.now(timezone.utc).isoformat()
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
def _upload_parquet_part(df: pd.DataFrame) -> None:
|
| 355 |
-
if _hf_api is None:
|
| 356 |
-
return # pas de token => pas de write
|
| 357 |
-
day = datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
| 358 |
-
stamp = datetime.now(timezone.utc).strftime("%H%M%S")
|
| 359 |
-
part = f"{LOG_PATH_PREFIX}/date={day}/part-{stamp}-{uuid.uuid4().hex}.parquet"
|
| 360 |
-
|
| 361 |
-
bio = io.BytesIO()
|
| 362 |
-
df.to_parquet(bio, index=False)
|
| 363 |
-
bio.seek(0)
|
| 364 |
-
|
| 365 |
-
_hf_api.upload_file(
|
| 366 |
-
path_or_fileobj=bio,
|
| 367 |
-
path_in_repo=part,
|
| 368 |
-
repo_id=LOG_DATASET_REPO,
|
| 369 |
-
repo_type="dataset",
|
| 370 |
-
commit_message=f"Add inference logs {day}",
|
| 371 |
-
)
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
def _flush_logs_locked(force: bool = False) -> None:
|
| 375 |
-
global _log_buffer, _last_flush_ts
|
| 376 |
-
if not _log_buffer:
|
| 377 |
-
return
|
| 378 |
-
|
| 379 |
-
now = time.time()
|
| 380 |
-
if not force:
|
| 381 |
-
if len(_log_buffer) < LOG_BUFFER_MAX and (now - _last_flush_ts) < LOG_FLUSH_SECONDS:
|
| 382 |
-
return
|
| 383 |
-
|
| 384 |
-
df = pd.DataFrame(_log_buffer)
|
| 385 |
-
_log_buffer = []
|
| 386 |
-
_last_flush_ts = now
|
| 387 |
-
|
| 388 |
-
try:
|
| 389 |
-
_upload_parquet_part(df)
|
| 390 |
-
except Exception:
|
| 391 |
-
# En prod tu peux logger ça en stderr / structlog etc.
|
| 392 |
-
# On évite de faire échouer l'inférence.
|
| 393 |
-
pass
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
def _start_log_flusher_if_needed() -> None:
|
| 397 |
-
global _flusher_started
|
| 398 |
-
if _flusher_started:
|
| 399 |
-
return
|
| 400 |
-
_flusher_started = True
|
| 401 |
-
|
| 402 |
-
def _loop():
|
| 403 |
-
while True:
|
| 404 |
-
time.sleep(LOG_FLUSH_SECONDS)
|
| 405 |
-
with _log_lock:
|
| 406 |
-
_flush_logs_locked(force=True)
|
| 407 |
-
|
| 408 |
-
t = threading.Thread(target=_loop, daemon=True)
|
| 409 |
-
t.start()
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
def log_inference_row(row: dict) -> None:
|
| 413 |
-
if not LOG_ENABLED or _hf_api is None:
|
| 414 |
-
return
|
| 415 |
-
with _log_lock:
|
| 416 |
-
_log_buffer.append(row)
|
| 417 |
-
_flush_logs_locked(force=False)
|
| 418 |
-
|
| 419 |
-
# --- Logging (Evidently-friendly) ---
|
| 420 |
-
row = {
|
| 421 |
-
"timestamp_utc": _now_utc_iso(),
|
| 422 |
-
"model_version": MODEL_VERSION,
|
| 423 |
-
"source": "gradio",
|
| 424 |
-
"sk_id_curr": int(sk_id_curr),
|
| 425 |
-
"amt_credit_requested": float(amt_credit),
|
| 426 |
-
"duration_months": int(duration_months),
|
| 427 |
-
"probability": float(probability),
|
| 428 |
-
"prediction": int(pred_value),
|
| 429 |
-
}
|
| 430 |
-
# Ajoute quelques features "business" utiles au drift (cat + num)
|
| 431 |
-
# (tu peux en ajouter plus si tu veux)
|
| 432 |
-
for k, v in snapshot.items():
|
| 433 |
-
if k == "SK_ID_CURR":
|
| 434 |
-
continue
|
| 435 |
-
row[f"cust__{k}"] = v
|
| 436 |
-
|
| 437 |
-
log_inference_row(row)
|
| 438 |
-
|
| 439 |
-
|
| 440 |
if __name__ == "__main__":
|
| 441 |
_ensure_startup()
|
| 442 |
demo.launch()
|
|
|
|
| 21 |
_normalize_inputs,
|
| 22 |
)
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def _ensure_startup() -> None:
|
| 26 |
if not getattr(app.state, "preprocessor", None):
|
| 27 |
startup_event()
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
|
|
|
|
| 287 |
"""
|
| 288 |
<p>Renseignez l'identifiant client, le montant du crédit et la durée.</p>
|
| 289 |
<p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée. Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction. Le snapshot client affiche quelques informations de référence sur le client.</p>
|
| 290 |
+
<p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée. Le dataset est disponible sur <a href="https://huggingface.co/datasets/stephmnt/assets-credit-scoring-mlops" rel="noreferrer">Hugging Face</a>.</p>
|
|
|
|
| 291 |
"""
|
| 292 |
)
|
| 293 |
|
|
|
|
| 317 |
outputs=[probability, prediction, shap_table, snapshot],
|
| 318 |
)
|
| 319 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
if __name__ == "__main__":
|
| 321 |
_ensure_startup()
|
| 322 |
demo.launch()
|
hf_space/hf_space/hf_space/hf_space/.gitignore
CHANGED
|
@@ -5,7 +5,7 @@ __pycache__/
|
|
| 5 |
logs/
|
| 6 |
reports/
|
| 7 |
data/*
|
| 8 |
-
!data/
|
| 9 |
!data/data_final.parquet
|
| 10 |
artifacts/*
|
| 11 |
!artifacts/preprocessor.joblib
|
|
|
|
| 5 |
logs/
|
| 6 |
reports/
|
| 7 |
data/*
|
| 8 |
+
!data/*_final_model.pkl
|
| 9 |
!data/data_final.parquet
|
| 10 |
artifacts/*
|
| 11 |
!artifacts/preprocessor.joblib
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml
CHANGED
|
@@ -11,6 +11,10 @@ on:
|
|
| 11 |
description: "HF repo type (dataset or model)"
|
| 12 |
required: true
|
| 13 |
default: "dataset"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
jobs:
|
| 16 |
upload-assets:
|
|
@@ -34,8 +38,8 @@ jobs:
|
|
| 34 |
- name: Upload assets to Hugging Face Hub
|
| 35 |
env:
|
| 36 |
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 37 |
-
HF_REPO_ID: ${{ inputs.repo_id }}
|
| 38 |
-
HF_REPO_TYPE: ${{ inputs.repo_type }}
|
| 39 |
run: |
|
| 40 |
python - <<'PY'
|
| 41 |
import os
|
|
|
|
| 11 |
description: "HF repo type (dataset or model)"
|
| 12 |
required: true
|
| 13 |
default: "dataset"
|
| 14 |
+
push:
|
| 15 |
+
branches: ["main"]
|
| 16 |
+
paths:
|
| 17 |
+
- "data/*_final_model.pkl"
|
| 18 |
|
| 19 |
jobs:
|
| 20 |
upload-assets:
|
|
|
|
| 38 |
- name: Upload assets to Hugging Face Hub
|
| 39 |
env:
|
| 40 |
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 41 |
+
HF_REPO_ID: ${{ inputs.repo_id || 'stephmnt/assets-credit-scoring-mlops' }}
|
| 42 |
+
HF_REPO_TYPE: ${{ inputs.repo_type || 'dataset' }}
|
| 43 |
run: |
|
| 44 |
python - <<'PY'
|
| 45 |
import os
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py
CHANGED
|
@@ -237,22 +237,24 @@ def _ensure_hf_asset(
|
|
| 237 |
return local_path
|
| 238 |
if not repo_id:
|
| 239 |
return None
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
|
|
|
| 244 |
local_path.parent.mkdir(parents=True, exist_ok=True)
|
| 245 |
return Path(
|
| 246 |
hf_hub_download(
|
| 247 |
repo_id=repo_id,
|
| 248 |
filename=filename,
|
| 249 |
repo_type=repo_type,
|
|
|
|
| 250 |
local_dir=str(local_path.parent),
|
| 251 |
-
local_dir_use_symlinks=False,
|
| 252 |
)
|
| 253 |
)
|
| 254 |
|
| 255 |
|
|
|
|
| 256 |
def _normalize_inputs(
|
| 257 |
df_raw: pd.DataFrame,
|
| 258 |
preprocessor: PreprocessorArtifacts,
|
|
|
|
| 237 |
return local_path
|
| 238 |
if not repo_id:
|
| 239 |
return None
|
| 240 |
+
|
| 241 |
+
from huggingface_hub import hf_hub_download
|
| 242 |
+
|
| 243 |
+
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
|
| 244 |
+
|
| 245 |
local_path.parent.mkdir(parents=True, exist_ok=True)
|
| 246 |
return Path(
|
| 247 |
hf_hub_download(
|
| 248 |
repo_id=repo_id,
|
| 249 |
filename=filename,
|
| 250 |
repo_type=repo_type,
|
| 251 |
+
token=token, # ✅ essentiel pour repo gated
|
| 252 |
local_dir=str(local_path.parent),
|
|
|
|
| 253 |
)
|
| 254 |
)
|
| 255 |
|
| 256 |
|
| 257 |
+
|
| 258 |
def _normalize_inputs(
|
| 259 |
df_raw: pd.DataFrame,
|
| 260 |
preprocessor: PreprocessorArtifacts,
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py
CHANGED
|
@@ -296,10 +296,9 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
|
|
| 296 |
gr.HTML(
|
| 297 |
"""
|
| 298 |
<p>Renseignez l'identifiant client, le montant du crédit et la durée.</p>
|
| 299 |
-
<p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée.</p>
|
| 300 |
-
<p>Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction.</p>
|
| 301 |
-
<p>Le snapshot client affiche quelques informations de référence sur le client.</p>
|
| 302 |
<p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée.</p>
|
|
|
|
| 303 |
"""
|
| 304 |
)
|
| 305 |
|
|
|
|
| 296 |
gr.HTML(
|
| 297 |
"""
|
| 298 |
<p>Renseignez l'identifiant client, le montant du crédit et la durée.</p>
|
| 299 |
+
<p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée. Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction. Le snapshot client affiche quelques informations de référence sur le client.</p>
|
|
|
|
|
|
|
| 300 |
<p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée.</p>
|
| 301 |
+
<p> Le dataset est disponible sur <a href="https://huggingface.co/datasets/stephmnt/assets-credit-scoring-mlops" rel="noreferrer">Hugging Face</a>.</p>
|
| 302 |
"""
|
| 303 |
)
|
| 304 |
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml
CHANGED
|
@@ -49,7 +49,6 @@ jobs:
|
|
| 49 |
--exclude 'logs' \
|
| 50 |
--exclude 'reports' \
|
| 51 |
--exclude 'screen-mlflow.png' \
|
| 52 |
-
--exclude 'data/*_final_model.pkl' \
|
| 53 |
--exclude 'artifacts/preprocessor.joblib' \
|
| 54 |
--exclude 'data/*.csv' \
|
| 55 |
--exclude 'data/*.parquet' \
|
|
|
|
| 49 |
--exclude 'logs' \
|
| 50 |
--exclude 'reports' \
|
| 51 |
--exclude 'screen-mlflow.png' \
|
|
|
|
| 52 |
--exclude 'artifacts/preprocessor.joblib' \
|
| 53 |
--exclude 'data/*.csv' \
|
| 54 |
--exclude 'data/*.parquet' \
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py
CHANGED
|
@@ -21,10 +21,20 @@ from app.main import (
|
|
| 21 |
_normalize_inputs,
|
| 22 |
)
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def _ensure_startup() -> None:
|
| 26 |
if not getattr(app.state, "preprocessor", None):
|
| 27 |
startup_event()
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
|
|
@@ -283,13 +293,19 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
|
|
| 283 |
</a>
|
| 284 |
</div>
|
| 285 |
""")
|
| 286 |
-
gr.
|
| 287 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
)
|
| 289 |
|
| 290 |
with gr.Row():
|
| 291 |
sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
|
| 292 |
-
amt_credit = gr.Number(label="Montant du crédit", value=
|
| 293 |
duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
|
| 294 |
|
| 295 |
run_btn = gr.Button("Scorer")
|
|
@@ -313,6 +329,114 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
|
|
| 313 |
outputs=[probability, prediction, shap_table, snapshot],
|
| 314 |
)
|
| 315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
|
| 317 |
if __name__ == "__main__":
|
| 318 |
_ensure_startup()
|
|
|
|
| 21 |
_normalize_inputs,
|
| 22 |
)
|
| 23 |
|
| 24 |
+
import io
|
| 25 |
+
import os
|
| 26 |
+
import threading
|
| 27 |
+
import time
|
| 28 |
+
import uuid
|
| 29 |
+
from datetime import datetime, timezone
|
| 30 |
+
|
| 31 |
+
from huggingface_hub import HfApi
|
| 32 |
+
|
| 33 |
|
| 34 |
def _ensure_startup() -> None:
|
| 35 |
if not getattr(app.state, "preprocessor", None):
|
| 36 |
startup_event()
|
| 37 |
+
_start_log_flusher_if_needed()
|
| 38 |
|
| 39 |
|
| 40 |
def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
|
|
|
|
| 293 |
</a>
|
| 294 |
</div>
|
| 295 |
""")
|
| 296 |
+
gr.HTML(
|
| 297 |
+
"""
|
| 298 |
+
<p>Renseignez l'identifiant client, le montant du crédit et la durée.</p>
|
| 299 |
+
<p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée.</p>
|
| 300 |
+
<p>Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction.</p>
|
| 301 |
+
<p>Le snapshot client affiche quelques informations de référence sur le client.</p>
|
| 302 |
+
<p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée.</p>
|
| 303 |
+
"""
|
| 304 |
)
|
| 305 |
|
| 306 |
with gr.Row():
|
| 307 |
sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
|
| 308 |
+
amt_credit = gr.Number(label="Montant du crédit", value=2000000)
|
| 309 |
duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
|
| 310 |
|
| 311 |
run_btn = gr.Button("Scorer")
|
|
|
|
| 329 |
outputs=[probability, prediction, shap_table, snapshot],
|
| 330 |
)
|
| 331 |
|
| 332 |
+
# =========================
|
| 333 |
+
# HF Dataset logging (Parquet parts)
|
| 334 |
+
# =========================
|
| 335 |
+
|
| 336 |
+
LOG_ENABLED = os.getenv("LOG_ENABLED", "1") == "1"
|
| 337 |
+
LOG_DATASET_REPO = os.getenv("LOG_DATASET_REPO", "stephmnt/assets-credit-scoring-mlops")
|
| 338 |
+
LOG_PATH_PREFIX = os.getenv("LOG_PATH_PREFIX", "prod_logs")
|
| 339 |
+
HF_TOKEN = os.getenv("HF_TOKEN") # Secret HF (write) sur le Space inference
|
| 340 |
+
|
| 341 |
+
LOG_BUFFER_MAX = int(os.getenv("LOG_BUFFER_MAX", "50")) # flush dès 50 lignes
|
| 342 |
+
LOG_FLUSH_SECONDS = int(os.getenv("LOG_FLUSH_SECONDS", "60")) # flush au moins toutes les 60s
|
| 343 |
+
|
| 344 |
+
_hf_api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
|
| 345 |
+
_log_lock = threading.Lock()
|
| 346 |
+
_log_buffer: list[dict] = []
|
| 347 |
+
_last_flush_ts = 0.0
|
| 348 |
+
_flusher_started = False
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def _now_utc_iso() -> str:
|
| 352 |
+
return datetime.now(timezone.utc).isoformat()
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def _upload_parquet_part(df: pd.DataFrame) -> None:
|
| 356 |
+
if _hf_api is None:
|
| 357 |
+
return # pas de token => pas de write
|
| 358 |
+
day = datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
| 359 |
+
stamp = datetime.now(timezone.utc).strftime("%H%M%S")
|
| 360 |
+
part = f"{LOG_PATH_PREFIX}/date={day}/part-{stamp}-{uuid.uuid4().hex}.parquet"
|
| 361 |
+
|
| 362 |
+
bio = io.BytesIO()
|
| 363 |
+
df.to_parquet(bio, index=False)
|
| 364 |
+
bio.seek(0)
|
| 365 |
+
|
| 366 |
+
_hf_api.upload_file(
|
| 367 |
+
path_or_fileobj=bio,
|
| 368 |
+
path_in_repo=part,
|
| 369 |
+
repo_id=LOG_DATASET_REPO,
|
| 370 |
+
repo_type="dataset",
|
| 371 |
+
commit_message=f"Add inference logs {day}",
|
| 372 |
+
)
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def _flush_logs_locked(force: bool = False) -> None:
|
| 376 |
+
global _log_buffer, _last_flush_ts
|
| 377 |
+
if not _log_buffer:
|
| 378 |
+
return
|
| 379 |
+
|
| 380 |
+
now = time.time()
|
| 381 |
+
if not force:
|
| 382 |
+
if len(_log_buffer) < LOG_BUFFER_MAX and (now - _last_flush_ts) < LOG_FLUSH_SECONDS:
|
| 383 |
+
return
|
| 384 |
+
|
| 385 |
+
df = pd.DataFrame(_log_buffer)
|
| 386 |
+
_log_buffer = []
|
| 387 |
+
_last_flush_ts = now
|
| 388 |
+
|
| 389 |
+
try:
|
| 390 |
+
_upload_parquet_part(df)
|
| 391 |
+
except Exception:
|
| 392 |
+
# En prod tu peux logger ça en stderr / structlog etc.
|
| 393 |
+
# On évite de faire échouer l'inférence.
|
| 394 |
+
pass
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
def _start_log_flusher_if_needed() -> None:
|
| 398 |
+
global _flusher_started
|
| 399 |
+
if _flusher_started:
|
| 400 |
+
return
|
| 401 |
+
_flusher_started = True
|
| 402 |
+
|
| 403 |
+
def _loop():
|
| 404 |
+
while True:
|
| 405 |
+
time.sleep(LOG_FLUSH_SECONDS)
|
| 406 |
+
with _log_lock:
|
| 407 |
+
_flush_logs_locked(force=True)
|
| 408 |
+
|
| 409 |
+
t = threading.Thread(target=_loop, daemon=True)
|
| 410 |
+
t.start()
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
def log_inference_row(row: dict) -> None:
|
| 414 |
+
if not LOG_ENABLED or _hf_api is None:
|
| 415 |
+
return
|
| 416 |
+
with _log_lock:
|
| 417 |
+
_log_buffer.append(row)
|
| 418 |
+
_flush_logs_locked(force=False)
|
| 419 |
+
|
| 420 |
+
# --- Logging (Evidently-friendly) ---
|
| 421 |
+
row = {
|
| 422 |
+
"timestamp_utc": _now_utc_iso(),
|
| 423 |
+
"model_version": MODEL_VERSION,
|
| 424 |
+
"source": "gradio",
|
| 425 |
+
"sk_id_curr": int(sk_id_curr),
|
| 426 |
+
"amt_credit_requested": float(amt_credit),
|
| 427 |
+
"duration_months": int(duration_months),
|
| 428 |
+
"probability": float(probability),
|
| 429 |
+
"prediction": int(pred_value),
|
| 430 |
+
}
|
| 431 |
+
# Ajoute quelques features "business" utiles au drift (cat + num)
|
| 432 |
+
# (tu peux en ajouter plus si tu veux)
|
| 433 |
+
for k, v in snapshot.items():
|
| 434 |
+
if k == "SK_ID_CURR":
|
| 435 |
+
continue
|
| 436 |
+
row[f"cust__{k}"] = v
|
| 437 |
+
|
| 438 |
+
log_inference_row(row)
|
| 439 |
+
|
| 440 |
|
| 441 |
if __name__ == "__main__":
|
| 442 |
_ensure_startup()
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py
CHANGED
|
@@ -219,7 +219,7 @@ def _hash_value(value: Any) -> str:
|
|
| 219 |
|
| 220 |
|
| 221 |
def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
|
| 222 |
-
if pd.isna(value):
|
| 223 |
return np.nan
|
| 224 |
key = str(value).strip().upper()
|
| 225 |
if not key:
|
|
@@ -265,12 +265,12 @@ def _normalize_inputs(
|
|
| 265 |
unknown_masks: dict[str, pd.Series] = {}
|
| 266 |
if "CODE_GENDER" in df.columns:
|
| 267 |
raw = df["CODE_GENDER"]
|
| 268 |
-
normalized = raw.apply(lambda v: _normalize_category_value(v, CODE_GENDER_MAPPING))
|
| 269 |
unknown_masks["CODE_GENDER"] = normalized.eq("Unknown") & raw.notna()
|
| 270 |
df["CODE_GENDER"] = normalized
|
| 271 |
if "FLAG_OWN_CAR" in df.columns:
|
| 272 |
raw = df["FLAG_OWN_CAR"]
|
| 273 |
-
normalized = raw.apply(lambda v: _normalize_category_value(v, FLAG_OWN_CAR_MAPPING))
|
| 274 |
unknown_masks["FLAG_OWN_CAR"] = normalized.eq("Unknown") & raw.notna()
|
| 275 |
df["FLAG_OWN_CAR"] = normalized
|
| 276 |
|
|
@@ -404,7 +404,7 @@ def _build_minimal_record(
|
|
| 404 |
)
|
| 405 |
if "AMT_GOODS_PRICE" in record:
|
| 406 |
record["AMT_GOODS_PRICE"] = float(payload.amt_credit)
|
| 407 |
-
return record
|
| 408 |
|
| 409 |
|
| 410 |
def _append_log_entries(entries: list[dict[str, Any]]) -> None:
|
|
@@ -1576,7 +1576,7 @@ def _predict_records(
|
|
| 1576 |
latency_ms = (time.perf_counter() - start_time) * 1000.0
|
| 1577 |
_log_prediction_entries(
|
| 1578 |
request_id=request_id,
|
| 1579 |
-
records=log_records,
|
| 1580 |
results=results,
|
| 1581 |
latency_ms=latency_ms,
|
| 1582 |
threshold=use_threshold,
|
|
@@ -1598,7 +1598,7 @@ def _predict_records(
|
|
| 1598 |
latency_ms = (time.perf_counter() - start_time) * 1000.0
|
| 1599 |
_log_prediction_entries(
|
| 1600 |
request_id=request_id,
|
| 1601 |
-
records=log_records,
|
| 1602 |
results=results,
|
| 1603 |
latency_ms=latency_ms,
|
| 1604 |
threshold=None,
|
|
@@ -1613,7 +1613,7 @@ def _predict_records(
|
|
| 1613 |
detail = exc.detail if isinstance(exc.detail, dict) else {"message": str(exc.detail)}
|
| 1614 |
_log_prediction_entries(
|
| 1615 |
request_id=request_id,
|
| 1616 |
-
records=log_records if "log_records" in locals() else records,
|
| 1617 |
results=None,
|
| 1618 |
latency_ms=latency_ms,
|
| 1619 |
threshold=threshold,
|
|
@@ -1628,7 +1628,7 @@ def _predict_records(
|
|
| 1628 |
latency_ms = (time.perf_counter() - start_time) * 1000.0
|
| 1629 |
_log_prediction_entries(
|
| 1630 |
request_id=request_id,
|
| 1631 |
-
records=log_records if "log_records" in locals() else records,
|
| 1632 |
results=None,
|
| 1633 |
latency_ms=latency_ms,
|
| 1634 |
threshold=threshold,
|
|
|
|
| 219 |
|
| 220 |
|
| 221 |
def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
|
| 222 |
+
if pd.isna(value): # type: ignore
|
| 223 |
return np.nan
|
| 224 |
key = str(value).strip().upper()
|
| 225 |
if not key:
|
|
|
|
| 265 |
unknown_masks: dict[str, pd.Series] = {}
|
| 266 |
if "CODE_GENDER" in df.columns:
|
| 267 |
raw = df["CODE_GENDER"]
|
| 268 |
+
normalized = raw.apply(lambda v: _normalize_category_value(v, CODE_GENDER_MAPPING)) # type: ignore
|
| 269 |
unknown_masks["CODE_GENDER"] = normalized.eq("Unknown") & raw.notna()
|
| 270 |
df["CODE_GENDER"] = normalized
|
| 271 |
if "FLAG_OWN_CAR" in df.columns:
|
| 272 |
raw = df["FLAG_OWN_CAR"]
|
| 273 |
+
normalized = raw.apply(lambda v: _normalize_category_value(v, FLAG_OWN_CAR_MAPPING)) # type: ignore
|
| 274 |
unknown_masks["FLAG_OWN_CAR"] = normalized.eq("Unknown") & raw.notna()
|
| 275 |
df["FLAG_OWN_CAR"] = normalized
|
| 276 |
|
|
|
|
| 404 |
)
|
| 405 |
if "AMT_GOODS_PRICE" in record:
|
| 406 |
record["AMT_GOODS_PRICE"] = float(payload.amt_credit)
|
| 407 |
+
return record # type: ignore
|
| 408 |
|
| 409 |
|
| 410 |
def _append_log_entries(entries: list[dict[str, Any]]) -> None:
|
|
|
|
| 1576 |
latency_ms = (time.perf_counter() - start_time) * 1000.0
|
| 1577 |
_log_prediction_entries(
|
| 1578 |
request_id=request_id,
|
| 1579 |
+
records=log_records, # type: ignore
|
| 1580 |
results=results,
|
| 1581 |
latency_ms=latency_ms,
|
| 1582 |
threshold=use_threshold,
|
|
|
|
| 1598 |
latency_ms = (time.perf_counter() - start_time) * 1000.0
|
| 1599 |
_log_prediction_entries(
|
| 1600 |
request_id=request_id,
|
| 1601 |
+
records=log_records, # type: ignore
|
| 1602 |
results=results,
|
| 1603 |
latency_ms=latency_ms,
|
| 1604 |
threshold=None,
|
|
|
|
| 1613 |
detail = exc.detail if isinstance(exc.detail, dict) else {"message": str(exc.detail)}
|
| 1614 |
_log_prediction_entries(
|
| 1615 |
request_id=request_id,
|
| 1616 |
+
records=log_records if "log_records" in locals() else records, # type: ignore
|
| 1617 |
results=None,
|
| 1618 |
latency_ms=latency_ms,
|
| 1619 |
threshold=threshold,
|
|
|
|
| 1628 |
latency_ms = (time.perf_counter() - start_time) * 1000.0
|
| 1629 |
_log_prediction_entries(
|
| 1630 |
request_id=request_id,
|
| 1631 |
+
records=log_records if "log_records" in locals() else records, # type: ignore
|
| 1632 |
results=None,
|
| 1633 |
latency_ms=latency_ms,
|
| 1634 |
threshold=threshold,
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/HistGB_final_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c7b31d6b2aa9d622717d03b6eaf79e6e21297869ff401f2f61a2d688cc55d6f
|
| 3 |
+
size 411244
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.env.example
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core paths
|
| 2 |
+
MODEL_PATH=data/HistGB_final_model.pkl
|
| 3 |
+
DATA_PATH=data/data_final.parquet
|
| 4 |
+
ARTIFACTS_PATH=artifacts/preprocessor.joblib
|
| 5 |
+
|
| 6 |
+
# Prediction behavior
|
| 7 |
+
PREDICTION_THRESHOLD=0.5
|
| 8 |
+
CACHE_PREPROCESSOR=1
|
| 9 |
+
USE_REDUCED_INPUTS=1
|
| 10 |
+
ALLOW_MISSING_ARTIFACTS=0
|
| 11 |
+
MISSING_INDICATOR_MIN_RATE=0.05
|
| 12 |
+
|
| 13 |
+
# Feature selection (correlation)
|
| 14 |
+
FEATURE_SELECTION_METHOD=correlation
|
| 15 |
+
FEATURE_SELECTION_TOP_N=8
|
| 16 |
+
FEATURE_SELECTION_MIN_CORR=0.02
|
| 17 |
+
CORRELATION_THRESHOLD=0.85
|
| 18 |
+
CORRELATION_SAMPLE_SIZE=50000
|
| 19 |
+
|
| 20 |
+
# Logging
|
| 21 |
+
LOG_PREDICTIONS=1
|
| 22 |
+
LOG_DIR=logs
|
| 23 |
+
LOG_FILE=predictions.jsonl
|
| 24 |
+
LOG_INCLUDE_INPUTS=1
|
| 25 |
+
LOG_HASH_SK_ID=0
|
| 26 |
+
MODEL_VERSION=HistGB_final_model.pkl
|
| 27 |
+
LOGS_ACCESS_TOKEN=
|
| 28 |
+
|
| 29 |
+
# Customer reference lookup
|
| 30 |
+
CUSTOMER_DATA_PATH=data/data_final.parquet
|
| 31 |
+
CUSTOMER_LOOKUP_ENABLED=1
|
| 32 |
+
CUSTOMER_LOOKUP_CACHE=1
|
| 33 |
+
|
| 34 |
+
# Hugging Face assets (optional)
|
| 35 |
+
HF_MODEL_REPO_ID=stephmnt/assets-credit-scoring-mlops
|
| 36 |
+
HF_MODEL_REPO_TYPE=model
|
| 37 |
+
HF_MODEL_FILENAME=HistGB_final_model.pkl
|
| 38 |
+
HF_PREPROCESSOR_REPO_ID=stephmnt/assets-credit-scoring-mlops
|
| 39 |
+
HF_PREPROCESSOR_REPO_TYPE=model
|
| 40 |
+
HF_PREPROCESSOR_FILENAME=preprocessor.joblib
|
| 41 |
+
HF_CUSTOMER_REPO_ID=stephmnt/assets-credit-scoring-mlops
|
| 42 |
+
HF_CUSTOMER_REPO_TYPE=dataset
|
| 43 |
+
HF_CUSTOMER_FILENAME=data_final.parquet
|
| 44 |
+
|
| 45 |
+
# MLflow
|
| 46 |
+
MLFLOW_TRACKING_URI=http://127.0.0.1:5000
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml
CHANGED
|
@@ -46,24 +46,23 @@ jobs:
|
|
| 46 |
repo_type = os.environ["HF_REPO_TYPE"]
|
| 47 |
token = os.environ["HF_TOKEN"]
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
"
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
| 54 |
|
| 55 |
api = HfApi()
|
| 56 |
-
for
|
| 57 |
-
path = Path(local_path)
|
| 58 |
-
if not path.exists():
|
| 59 |
-
raise SystemExit(f"Missing file: {path}")
|
| 60 |
api.upload_file(
|
| 61 |
path_or_fileobj=str(path),
|
| 62 |
-
path_in_repo=
|
| 63 |
repo_id=repo_id,
|
| 64 |
repo_type=repo_type,
|
| 65 |
token=token,
|
| 66 |
-
commit_message=f"Update {
|
| 67 |
)
|
| 68 |
print("Assets uploaded.")
|
| 69 |
PY
|
|
|
|
| 46 |
repo_type = os.environ["HF_REPO_TYPE"]
|
| 47 |
token = os.environ["HF_TOKEN"]
|
| 48 |
|
| 49 |
+
candidates = sorted(Path("data").glob("*_final_model.pkl"))
|
| 50 |
+
if not candidates:
|
| 51 |
+
raise SystemExit("Missing model file: data/*_final_model.pkl")
|
| 52 |
+
if len(candidates) > 1:
|
| 53 |
+
names = ", ".join(path.name for path in candidates)
|
| 54 |
+
raise SystemExit(f"Multiple *_final_model.pkl files found: {names}")
|
| 55 |
+
model_path = candidates[0]
|
| 56 |
|
| 57 |
api = HfApi()
|
| 58 |
+
for path in [model_path]:
|
|
|
|
|
|
|
|
|
|
| 59 |
api.upload_file(
|
| 60 |
path_or_fileobj=str(path),
|
| 61 |
+
path_in_repo=path.name,
|
| 62 |
repo_id=repo_id,
|
| 63 |
repo_type=repo_type,
|
| 64 |
token=token,
|
| 65 |
+
commit_message=f"Update {path.name}",
|
| 66 |
)
|
| 67 |
print("Assets uploaded.")
|
| 68 |
PY
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml
CHANGED
|
@@ -49,10 +49,11 @@ jobs:
|
|
| 49 |
--exclude 'logs' \
|
| 50 |
--exclude 'reports' \
|
| 51 |
--exclude 'screen-mlflow.png' \
|
| 52 |
-
--exclude 'data/
|
| 53 |
--exclude 'artifacts/preprocessor.joblib' \
|
| 54 |
--exclude 'data/*.csv' \
|
| 55 |
--exclude 'data/*.parquet' \
|
|
|
|
| 56 |
./ hf_space/
|
| 57 |
cd hf_space
|
| 58 |
git add .
|
|
|
|
| 49 |
--exclude 'logs' \
|
| 50 |
--exclude 'reports' \
|
| 51 |
--exclude 'screen-mlflow.png' \
|
| 52 |
+
--exclude 'data/*_final_model.pkl' \
|
| 53 |
--exclude 'artifacts/preprocessor.joblib' \
|
| 54 |
--exclude 'data/*.csv' \
|
| 55 |
--exclude 'data/*.parquet' \
|
| 56 |
+
--exclude 'notebooks/mlflow.db' \
|
| 57 |
./ hf_space/
|
| 58 |
cd hf_space
|
| 59 |
git add .
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile
CHANGED
|
@@ -10,6 +10,7 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|
| 10 |
|
| 11 |
COPY app/ app/
|
| 12 |
COPY app_entry.py app.py gradio_app.py ./
|
|
|
|
| 13 |
COPY data/ data/
|
| 14 |
COPY artifacts/ artifacts/
|
| 15 |
|
|
|
|
| 10 |
|
| 11 |
COPY app/ app/
|
| 12 |
COPY app_entry.py app.py gradio_app.py ./
|
| 13 |
+
COPY src/ src/
|
| 14 |
COPY data/ data/
|
| 15 |
COPY artifacts/ artifacts/
|
| 16 |
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md
CHANGED
|
@@ -14,6 +14,18 @@ pinned: false
|
|
| 14 |
[](https://github.com/stephmnt/credit-scoring-mlops/releases)
|
| 15 |
[](https://github.com/stephmnt/credit-scoring-mlops/blob/main/LICENSE)
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
## Lancer MLFlow
|
| 18 |
|
| 19 |
Le notebook est configure pour utiliser un serveur MLflow local (`http://127.0.0.1:5000`).
|
|
@@ -75,6 +87,28 @@ pytest -q
|
|
| 75 |
uvicorn app.main:app --reload --port 7860
|
| 76 |
```
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
### Environnement Poetry (livrable)
|
| 79 |
|
| 80 |
Le livrable inclut `pyproject.toml`, aligne sur `requirements.txt`. Si besoin :
|
|
@@ -85,9 +119,9 @@ poetry run pytest -q
|
|
| 85 |
poetry run uvicorn app.main:app --reload --port 7860
|
| 86 |
```
|
| 87 |
|
| 88 |
-
Important : le modele `
|
| 89 |
version de scikit-learn definie dans `requirements.txt` / `pyproject.toml`
|
| 90 |
-
(re-execution de `P6_MANET_Stephane_notebook_modélisation.ipynb`, cellule de
|
| 91 |
sauvegarde pickle).
|
| 92 |
|
| 93 |
### Exemple d'input (schema + valeurs)
|
|
@@ -158,10 +192,13 @@ Variables utiles :
|
|
| 158 |
### Data contract (validation)
|
| 159 |
|
| 160 |
- Types numeriques stricts (invalides -> 422).
|
| 161 |
-
- Ranges numeriques (min/max entrainement) controles.
|
| 162 |
- Categoriels normalises: `CODE_GENDER` -> {`F`, `M`}, `FLAG_OWN_CAR` -> {`Y`, `N`}.
|
| 163 |
-
- Sentinelle `DAYS_EMPLOYED=365243` remplacee par NaN.
|
| 164 |
-
-
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
### Interface Gradio (scoring)
|
| 167 |
|
|
@@ -186,13 +223,13 @@ variables suivantes sont definies :
|
|
| 186 |
|
| 187 |
Exemple (un seul repo dataset avec 3 fichiers) :
|
| 188 |
|
| 189 |
-
- `HF_MODEL_REPO_ID=stephmnt/credit-scoring-mlops
|
| 190 |
- `HF_MODEL_REPO_TYPE=dataset`
|
| 191 |
-
- `HF_MODEL_FILENAME=
|
| 192 |
-
- `HF_PREPROCESSOR_REPO_ID=stephmnt/credit-scoring-mlops
|
| 193 |
- `HF_PREPROCESSOR_REPO_TYPE=dataset`
|
| 194 |
- `HF_PREPROCESSOR_FILENAME=preprocessor.joblib`
|
| 195 |
-
- `HF_CUSTOMER_REPO_ID=stephmnt/credit-scoring-mlops
|
| 196 |
- `HF_CUSTOMER_REPO_TYPE=dataset`
|
| 197 |
- `HF_CUSTOMER_FILENAME=data_final.parquet`
|
| 198 |
|
|
@@ -311,8 +348,11 @@ Variables utiles :
|
|
| 311 |
- `LOG_HASH_SK_ID=1` pour anonymiser `SK_ID_CURR`
|
| 312 |
|
| 313 |
Les logs incluent un bloc `data_quality` par requete (champs manquants,
|
| 314 |
-
types invalides, out-of-range, categories inconnues, sentinelle
|
| 315 |
-
`DAYS_EMPLOYED`).
|
|
|
|
|
|
|
|
|
|
| 316 |
|
| 317 |
Exemple local :
|
| 318 |
|
|
@@ -359,6 +399,7 @@ Robustesse integree:
|
|
| 359 |
- Categoriels: PSI avec lissage (`--psi-eps`) + categories rares regroupees (OTHER).
|
| 360 |
- Numeriques: KS corrige par FDR (Benjamini-Hochberg, `--fdr-alpha`).
|
| 361 |
- Sentinel `DAYS_EMPLOYED`: converti en NaN + taux suivi.
|
|
|
|
| 362 |
|
| 363 |
Le rapport inclut aussi la distribution des scores predits et le taux de prediction
|
| 364 |
(option `--score-bins` pour ajuster le nombre de bins), ainsi qu'une section
|
|
@@ -379,12 +420,8 @@ Captures (snapshot local du reporting + stockage):
|
|
| 379 |
|
| 380 |
Profiling et benchmark d'inference (cProfile + latence) :
|
| 381 |
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
--sample-size 2000 \
|
| 385 |
-
--batch-size 128 \
|
| 386 |
-
--runs 3
|
| 387 |
-
```
|
| 388 |
|
| 389 |
Sorties:
|
| 390 |
|
|
|
|
| 14 |
[](https://github.com/stephmnt/credit-scoring-mlops/releases)
|
| 15 |
[](https://github.com/stephmnt/credit-scoring-mlops/blob/main/LICENSE)
|
| 16 |
|
| 17 |
+
## Structure rapide
|
| 18 |
+
|
| 19 |
+
- `app/` API FastAPI + preprocessing inference
|
| 20 |
+
- `monitoring/` rapport drift + Streamlit
|
| 21 |
+
- `notebooks/` exploration + modelisation
|
| 22 |
+
- `src/` utilitaires ML (feature engineering / pipeline)
|
| 23 |
+
- `docs/` preuves & rapports (monitoring, perf)
|
| 24 |
+
- `tests/` tests unitaires/integration
|
| 25 |
+
|
| 26 |
+
Le feature engineering est factorise dans `src/features.py` et reutilise
|
| 27 |
+
par le notebook et l'API pour eviter le training-serving skew.
|
| 28 |
+
|
| 29 |
## Lancer MLFlow
|
| 30 |
|
| 31 |
Le notebook est configure pour utiliser un serveur MLflow local (`http://127.0.0.1:5000`).
|
|
|
|
| 87 |
uvicorn app.main:app --reload --port 7860
|
| 88 |
```
|
| 89 |
|
| 90 |
+
### Workflow DEV (notebooks)
|
| 91 |
+
|
| 92 |
+
Ordre recommande (dev uniquement) :
|
| 93 |
+
|
| 94 |
+
1. `notebooks/P6_MANET_Stephane_notebook_exploration.ipynb` → genere `data/data_final.parquet` (ecrase).
|
| 95 |
+
2. `notebooks/P6_MANET_Stephane_notebook_compare_tuning_mlflow.ipynb` → compare+tuning, log MLflow, ecrit `reports/best_model.json`.
|
| 96 |
+
3. `notebooks/P6_MANET_Stephane_notebook_modélisation.ipynb` → rebuild preprocessor, entraine le modele final, exporte `data/<model>_final_model.pkl`.
|
| 97 |
+
4. Lancer manuellement le workflow `deploy-assets.yml` pour pousser `data/*_final_model.pkl`.
|
| 98 |
+
|
| 99 |
+
Note : ces notebooks restent dev-only. Le code prod reste dans `app/` et `monitoring/`.
|
| 100 |
+
|
| 101 |
+
### Configuration (.env)
|
| 102 |
+
|
| 103 |
+
Dupliquez `.env.example` en `.env` si vous voulez surcharger les chemins,
|
| 104 |
+
seuils ou sources Hugging Face.
|
| 105 |
+
Le seuil `MISSING_INDICATOR_MIN_RATE` limite les colonnes `is_missing_*`
|
| 106 |
+
aux features avec un taux de NaN >= 5% (par defaut).
|
| 107 |
+
|
| 108 |
+
```shell
|
| 109 |
+
cp .env.example .env
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
### Environnement Poetry (livrable)
|
| 113 |
|
| 114 |
Le livrable inclut `pyproject.toml`, aligne sur `requirements.txt`. Si besoin :
|
|
|
|
| 119 |
poetry run uvicorn app.main:app --reload --port 7860
|
| 120 |
```
|
| 121 |
|
| 122 |
+
Important : le modele `*_final_model.pkl` doit etre regenere avec la
|
| 123 |
version de scikit-learn definie dans `requirements.txt` / `pyproject.toml`
|
| 124 |
+
(re-execution de `notebooks/P6_MANET_Stephane_notebook_modélisation.ipynb`, cellule de
|
| 125 |
sauvegarde pickle).
|
| 126 |
|
| 127 |
### Exemple d'input (schema + valeurs)
|
|
|
|
| 192 |
### Data contract (validation)
|
| 193 |
|
| 194 |
- Types numeriques stricts (invalides -> 422).
|
| 195 |
+
- Ranges numeriques (min/max entrainement) controles, hors `SK_ID_CURR` (ID).
|
| 196 |
- Categoriels normalises: `CODE_GENDER` -> {`F`, `M`}, `FLAG_OWN_CAR` -> {`Y`, `N`}.
|
| 197 |
+
- Sentinelle `DAYS_EMPLOYED=365243` remplacee par NaN + flag `DAYS_EMPLOYED_ANOM`.
|
| 198 |
+
- Ratios securises (division par zero) + flags `DENOM_ZERO_*`.
|
| 199 |
+
- Outliers clippees (p1/p99) + flags `is_outlier_*`.
|
| 200 |
+
- Missingness indicators `is_missing_*` pour les numeriques avec taux de NaN >= 5%.
|
| 201 |
+
- Logs enrichis via `data_quality` et `source` pour distinguer drift vs qualite de donnees.
|
| 202 |
|
| 203 |
### Interface Gradio (scoring)
|
| 204 |
|
|
|
|
| 223 |
|
| 224 |
Exemple (un seul repo dataset avec 3 fichiers) :
|
| 225 |
|
| 226 |
+
- `HF_MODEL_REPO_ID=stephmnt/assets-credit-scoring-mlops`
|
| 227 |
- `HF_MODEL_REPO_TYPE=dataset`
|
| 228 |
+
- `HF_MODEL_FILENAME=histgb_final_model.pkl` (ou `lgbm_final_model.pkl` / `xgb_final_model.pkl`)
|
| 229 |
+
- `HF_PREPROCESSOR_REPO_ID=stephmnt/assets-credit-scoring-mlops`
|
| 230 |
- `HF_PREPROCESSOR_REPO_TYPE=dataset`
|
| 231 |
- `HF_PREPROCESSOR_FILENAME=preprocessor.joblib`
|
| 232 |
+
- `HF_CUSTOMER_REPO_ID=stephmnt/assets-credit-scoring-mlops`
|
| 233 |
- `HF_CUSTOMER_REPO_TYPE=dataset`
|
| 234 |
- `HF_CUSTOMER_FILENAME=data_final.parquet`
|
| 235 |
|
|
|
|
| 348 |
- `LOG_HASH_SK_ID=1` pour anonymiser `SK_ID_CURR`
|
| 349 |
|
| 350 |
Les logs incluent un bloc `data_quality` par requete (champs manquants,
|
| 351 |
+
types invalides, out-of-range, outliers, categories inconnues, sentinelle
|
| 352 |
+
`DAYS_EMPLOYED`) et un champ `source` (api/gradio/etc.).
|
| 353 |
+
|
| 354 |
+
Astuce : vous pouvez passer un header `X-Client-Source` pour tagger la source
|
| 355 |
+
des requetes (ex: `gradio`, `test`, `batch`).
|
| 356 |
|
| 357 |
Exemple local :
|
| 358 |
|
|
|
|
| 399 |
- Categoriels: PSI avec lissage (`--psi-eps`) + categories rares regroupees (OTHER).
|
| 400 |
- Numeriques: KS corrige par FDR (Benjamini-Hochberg, `--fdr-alpha`).
|
| 401 |
- Sentinel `DAYS_EMPLOYED`: converti en NaN + taux suivi.
|
| 402 |
+
- Outliers: clipping p1/p99 + taux via `data_quality`.
|
| 403 |
|
| 404 |
Le rapport inclut aussi la distribution des scores predits et le taux de prediction
|
| 405 |
(option `--score-bins` pour ajuster le nombre de bins), ainsi qu'une section
|
|
|
|
| 420 |
|
| 421 |
Profiling et benchmark d'inference (cProfile + latence) :
|
| 422 |
|
| 423 |
+
- Desormais via le notebook modélisation (section TODO 5).
|
| 424 |
+
- L'ancien script est archive dans `dev_archive/profiling/profile_inference.py`.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 425 |
|
| 426 |
Sorties:
|
| 427 |
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py
CHANGED
|
@@ -20,9 +20,33 @@ from pydantic import BaseModel
|
|
| 20 |
from sklearn.preprocessing import MinMaxScaler
|
| 21 |
import joblib
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
logger = logging.getLogger("uvicorn.error")
|
| 24 |
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
DATA_PATH = Path(os.getenv("DATA_PATH", "data/data_final.parquet"))
|
| 27 |
ARTIFACTS_PATH = Path(os.getenv("ARTIFACTS_PATH", "artifacts/preprocessor.joblib"))
|
| 28 |
DEFAULT_THRESHOLD = float(os.getenv("PREDICTION_THRESHOLD", "0.5"))
|
|
@@ -56,11 +80,17 @@ HF_CUSTOMER_FILENAME = os.getenv("HF_CUSTOMER_FILENAME", CUSTOMER_DATA_PATH.name
|
|
| 56 |
|
| 57 |
IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
|
| 58 |
ENGINEERED_FEATURES = [
|
|
|
|
| 59 |
"DAYS_EMPLOYED_PERC",
|
| 60 |
"INCOME_CREDIT_PERC",
|
| 61 |
"INCOME_PER_PERSON",
|
| 62 |
"ANNUITY_INCOME_PERC",
|
| 63 |
"PAYMENT_RATE",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
]
|
| 65 |
ENGINEERED_SOURCES = [
|
| 66 |
"DAYS_EMPLOYED",
|
|
@@ -98,6 +128,9 @@ OUTLIER_COLUMNS = [
|
|
| 98 |
"AMT_REQ_CREDIT_BUREAU_YEAR",
|
| 99 |
"AMT_REQ_CREDIT_BUREAU_QRT",
|
| 100 |
]
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
CODE_GENDER_MAPPING = {
|
| 103 |
"F": "F",
|
|
@@ -143,6 +176,8 @@ class PreprocessorArtifacts:
|
|
| 143 |
numeric_medians: dict[str, float]
|
| 144 |
categorical_columns: list[str]
|
| 145 |
outlier_maxes: dict[str, float]
|
|
|
|
|
|
|
| 146 |
numeric_ranges: dict[str, tuple[float, float]]
|
| 147 |
features_to_scaled: list[str]
|
| 148 |
scaler: MinMaxScaler
|
|
@@ -243,6 +278,7 @@ def _normalize_inputs(
|
|
| 243 |
if "DAYS_EMPLOYED" in df.columns:
|
| 244 |
values = pd.to_numeric(df["DAYS_EMPLOYED"], errors="coerce")
|
| 245 |
sentinel_mask = values == DAYS_EMPLOYED_SENTINEL
|
|
|
|
| 246 |
if sentinel_mask.any():
|
| 247 |
df.loc[sentinel_mask, "DAYS_EMPLOYED"] = np.nan
|
| 248 |
|
|
@@ -267,6 +303,7 @@ def _build_data_quality_records(
|
|
| 267 |
missing_mask = df_norm[required_cols].isna() if required_cols else pd.DataFrame(index=df_norm.index)
|
| 268 |
invalid_masks: dict[str, pd.Series] = {}
|
| 269 |
out_of_range_masks: dict[str, pd.Series] = {}
|
|
|
|
| 270 |
|
| 271 |
for col in numeric_required:
|
| 272 |
if col not in df_raw.columns:
|
|
@@ -283,6 +320,13 @@ def _build_data_quality_records(
|
|
| 283 |
values = pd.to_numeric(df_norm[col], errors="coerce")
|
| 284 |
out_of_range_masks[col] = (values < min_val) | (values > max_val)
|
| 285 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
records: list[dict[str, Any]] = []
|
| 287 |
for idx in df_norm.index:
|
| 288 |
missing_cols = (
|
|
@@ -292,18 +336,26 @@ def _build_data_quality_records(
|
|
| 292 |
)
|
| 293 |
invalid_cols = [col for col, mask in invalid_masks.items() if mask.at[idx]]
|
| 294 |
out_of_range_cols = [col for col, mask in out_of_range_masks.items() if mask.at[idx]]
|
|
|
|
| 295 |
unknown_cols = [col for col, mask in unknown_masks.items() if mask.at[idx]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
nan_rate = float(missing_mask.loc[idx].mean()) if not missing_mask.empty else 0.0
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
|
|
|
|
|
|
| 307 |
return records
|
| 308 |
|
| 309 |
|
|
@@ -376,6 +428,7 @@ def _log_prediction_entries(
|
|
| 376 |
threshold: float | None,
|
| 377 |
status_code: int,
|
| 378 |
preprocessor: PreprocessorArtifacts,
|
|
|
|
| 379 |
data_quality: list[dict[str, Any]] | None = None,
|
| 380 |
error: str | None = None,
|
| 381 |
) -> None:
|
|
@@ -400,6 +453,7 @@ def _log_prediction_entries(
|
|
| 400 |
"status_code": status_code,
|
| 401 |
"model_version": MODEL_VERSION,
|
| 402 |
"threshold": threshold,
|
|
|
|
| 403 |
"inputs": inputs,
|
| 404 |
}
|
| 405 |
if data_quality and idx < len(data_quality):
|
|
@@ -420,25 +474,16 @@ def _log_prediction_entries(
|
|
| 420 |
_append_log_entries(entries)
|
| 421 |
|
| 422 |
|
| 423 |
-
def new_features_creation(df: pd.DataFrame) -> pd.DataFrame:
|
| 424 |
-
df_features = df.copy()
|
| 425 |
-
for col in ENGINEERED_SOURCES:
|
| 426 |
-
if col not in df_features.columns:
|
| 427 |
-
df_features[col] = np.nan
|
| 428 |
-
df_features["DAYS_EMPLOYED_PERC"] = df_features["DAYS_EMPLOYED"] / df_features["DAYS_BIRTH"]
|
| 429 |
-
df_features["INCOME_CREDIT_PERC"] = df_features["AMT_INCOME_TOTAL"] / df_features["AMT_CREDIT"]
|
| 430 |
-
df_features["INCOME_PER_PERSON"] = df_features["AMT_INCOME_TOTAL"] / df_features["CNT_FAM_MEMBERS"]
|
| 431 |
-
df_features["ANNUITY_INCOME_PERC"] = df_features["AMT_ANNUITY"] / df_features["AMT_INCOME_TOTAL"]
|
| 432 |
-
df_features["PAYMENT_RATE"] = df_features["AMT_ANNUITY"] / df_features["AMT_CREDIT"]
|
| 433 |
-
return df_features
|
| 434 |
-
|
| 435 |
-
|
| 436 |
def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
|
| 437 |
df = pd.read_parquet(data_path)
|
| 438 |
raw_feature_columns = df.columns.tolist()
|
| 439 |
input_feature_columns = [c for c in raw_feature_columns if c not in ["is_train", "is_test", "TARGET"]]
|
| 440 |
|
| 441 |
-
df = new_features_creation(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 443 |
|
| 444 |
missing_rate = df.isna().mean()
|
|
@@ -448,6 +493,26 @@ def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
|
|
| 448 |
df = df[columns_keep]
|
| 449 |
df = df.dropna(subset=columns_must_not_missing)
|
| 450 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
numeric_cols = df.select_dtypes(include=["number"]).columns
|
| 452 |
numeric_medians = df[numeric_cols].median().to_dict()
|
| 453 |
df[numeric_cols] = df[numeric_cols].fillna(numeric_medians)
|
|
@@ -455,12 +520,7 @@ def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
|
|
| 455 |
categorical_columns = df.select_dtypes(include=["object"]).columns.tolist()
|
| 456 |
df[categorical_columns] = df[categorical_columns].fillna("Unknown")
|
| 457 |
|
| 458 |
-
|
| 459 |
-
df = df[df["CODE_GENDER"] != "XNA"]
|
| 460 |
-
|
| 461 |
-
outlier_maxes = {col: df[col].max() for col in OUTLIER_COLUMNS if col in df.columns}
|
| 462 |
-
for col, max_val in outlier_maxes.items():
|
| 463 |
-
df = df[df[col] != max_val]
|
| 464 |
|
| 465 |
reduced_input_columns, selection_scores, selection_method = _compute_reduced_inputs(
|
| 466 |
df,
|
|
@@ -487,7 +547,11 @@ def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
|
|
| 487 |
required_input = _fallback_reduced_inputs(input_feature_columns)
|
| 488 |
else:
|
| 489 |
required_input = sorted(required_raw)
|
| 490 |
-
numeric_required = sorted(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 491 |
correlated_imputation = _build_correlated_imputation(
|
| 492 |
df,
|
| 493 |
input_feature_columns=input_feature_columns,
|
|
@@ -501,6 +565,8 @@ def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
|
|
| 501 |
numeric_medians={k: float(v) for k, v in numeric_medians.items()},
|
| 502 |
categorical_columns=categorical_columns,
|
| 503 |
outlier_maxes={k: float(v) for k, v in outlier_maxes.items()},
|
|
|
|
|
|
|
| 504 |
numeric_ranges=numeric_ranges,
|
| 505 |
features_to_scaled=features_to_scaled,
|
| 506 |
scaler=scaler,
|
|
@@ -554,9 +620,28 @@ def build_fallback_preprocessor() -> PreprocessorArtifacts:
|
|
| 554 |
]
|
| 555 |
)
|
| 556 |
|
| 557 |
-
df = new_features_creation(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 559 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 560 |
columns_keep = df.columns.tolist()
|
| 561 |
columns_must_not_missing = [col for col in columns_keep if col not in IGNORE_FEATURES]
|
| 562 |
|
|
@@ -579,7 +664,9 @@ def build_fallback_preprocessor() -> PreprocessorArtifacts:
|
|
| 579 |
required_raw.update(col for col in columns_must_not_missing if col in input_feature_columns)
|
| 580 |
required_raw.add("SK_ID_CURR")
|
| 581 |
required_input = _fallback_reduced_inputs(input_feature_columns)
|
| 582 |
-
numeric_required = sorted(
|
|
|
|
|
|
|
| 583 |
|
| 584 |
numeric_ranges = {col: (float(df[col].min()), float(df[col].max())) for col in numeric_cols}
|
| 585 |
|
|
@@ -588,7 +675,9 @@ def build_fallback_preprocessor() -> PreprocessorArtifacts:
|
|
| 588 |
columns_must_not_missing=columns_must_not_missing,
|
| 589 |
numeric_medians={k: float(v) for k, v in numeric_medians.items()},
|
| 590 |
categorical_columns=categorical_columns,
|
| 591 |
-
outlier_maxes={},
|
|
|
|
|
|
|
| 592 |
numeric_ranges=numeric_ranges,
|
| 593 |
features_to_scaled=features_to_scaled,
|
| 594 |
scaler=scaler,
|
|
@@ -633,7 +722,9 @@ def load_preprocessor(data_path: Path, artifacts_path: Path) -> PreprocessorArti
|
|
| 633 |
updated = True
|
| 634 |
if not hasattr(preprocessor, "numeric_required_columns"):
|
| 635 |
preprocessor.numeric_required_columns = sorted(
|
| 636 |
-
col
|
|
|
|
|
|
|
| 637 |
)
|
| 638 |
updated = True
|
| 639 |
if not hasattr(preprocessor, "numeric_ranges"):
|
|
@@ -646,6 +737,56 @@ def load_preprocessor(data_path: Path, artifacts_path: Path) -> PreprocessorArti
|
|
| 646 |
raise RuntimeError(f"Data file not found to rebuild preprocessor: {data_path}")
|
| 647 |
preprocessor = build_preprocessor(data_path)
|
| 648 |
updated = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 649 |
if USE_REDUCED_INPUTS:
|
| 650 |
reduced = _reduce_input_columns(preprocessor)
|
| 651 |
if preprocessor.required_input_columns != reduced:
|
|
@@ -658,7 +799,9 @@ def load_preprocessor(data_path: Path, artifacts_path: Path) -> PreprocessorArti
|
|
| 658 |
required_updated = True
|
| 659 |
updated = True
|
| 660 |
desired_numeric_required = sorted(
|
| 661 |
-
col
|
|
|
|
|
|
|
| 662 |
)
|
| 663 |
if getattr(preprocessor, "numeric_required_columns", None) != desired_numeric_required:
|
| 664 |
preprocessor.numeric_required_columns = desired_numeric_required
|
|
@@ -890,7 +1033,11 @@ def _compute_reduced_inputs_from_data(
|
|
| 890 |
if not data_path.exists():
|
| 891 |
return _fallback_reduced_inputs(preprocessor.input_feature_columns), {}, "default"
|
| 892 |
df = pd.read_parquet(data_path)
|
| 893 |
-
df = new_features_creation(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 894 |
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 895 |
|
| 896 |
if preprocessor.columns_keep:
|
|
@@ -908,9 +1055,25 @@ def _compute_reduced_inputs_from_data(
|
|
| 908 |
if "CODE_GENDER" in df.columns:
|
| 909 |
df = df[df["CODE_GENDER"] != "XNA"]
|
| 910 |
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 914 |
|
| 915 |
return _compute_reduced_inputs(df, input_feature_columns=preprocessor.input_feature_columns)
|
| 916 |
|
|
@@ -920,7 +1083,11 @@ def _compute_correlated_imputation(
|
|
| 920 |
preprocessor: PreprocessorArtifacts,
|
| 921 |
) -> dict[str, dict[str, float | str]]:
|
| 922 |
df = pd.read_parquet(data_path)
|
| 923 |
-
df = new_features_creation(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 924 |
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 925 |
|
| 926 |
df = df[preprocessor.columns_keep]
|
|
@@ -936,9 +1103,25 @@ def _compute_correlated_imputation(
|
|
| 936 |
if "CODE_GENDER" in df.columns:
|
| 937 |
df = df[df["CODE_GENDER"] != "XNA"]
|
| 938 |
|
| 939 |
-
|
| 940 |
-
|
| 941 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 942 |
|
| 943 |
return _build_correlated_imputation(
|
| 944 |
df,
|
|
@@ -1048,11 +1231,30 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->
|
|
| 1048 |
if "TARGET" not in df.columns:
|
| 1049 |
df["TARGET"] = 0
|
| 1050 |
|
| 1051 |
-
df = new_features_creation(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1052 |
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 1053 |
|
| 1054 |
df = df.reindex(columns=artifacts.columns_keep, fill_value=np.nan)
|
| 1055 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1056 |
_apply_correlated_imputation(df, artifacts)
|
| 1057 |
|
| 1058 |
for col, median in artifacts.numeric_medians.items():
|
|
@@ -1072,16 +1274,6 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->
|
|
| 1072 |
detail={"message": "CODE_GENDER cannot be 'XNA' based on training rules."},
|
| 1073 |
)
|
| 1074 |
|
| 1075 |
-
for col, max_val in artifacts.outlier_maxes.items():
|
| 1076 |
-
if col in df.columns and (df[col] >= max_val).any():
|
| 1077 |
-
raise HTTPException(
|
| 1078 |
-
status_code=422,
|
| 1079 |
-
detail={
|
| 1080 |
-
"message": "Input contains outlier values removed during training.",
|
| 1081 |
-
"outlier_columns": [col],
|
| 1082 |
-
},
|
| 1083 |
-
)
|
| 1084 |
-
|
| 1085 |
df_hot = pd.get_dummies(df, columns=artifacts.categorical_columns)
|
| 1086 |
df_hot = df_hot.reindex(columns=artifacts.features_to_scaled, fill_value=0)
|
| 1087 |
|
|
@@ -1089,6 +1281,80 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->
|
|
| 1089 |
return pd.DataFrame(scaled, columns=artifacts.features_to_scaled, index=df.index)
|
| 1090 |
|
| 1091 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1092 |
@app.on_event("startup")
|
| 1093 |
def startup_event() -> None:
|
| 1094 |
if getattr(app.state, "model", None) is not None and getattr(app.state, "preprocessor", None) is not None:
|
|
@@ -1183,9 +1449,19 @@ def features(include_all: bool = Query(default=False)) -> dict[str, Any]:
|
|
| 1183 |
for col in preprocessor.required_input_columns
|
| 1184 |
if col in scores
|
| 1185 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1186 |
payload = {
|
| 1187 |
"required_input_features": preprocessor.required_input_columns,
|
| 1188 |
"engineered_features": ENGINEERED_FEATURES,
|
|
|
|
|
|
|
| 1189 |
"model_features_count": len(preprocessor.features_to_scaled),
|
| 1190 |
"feature_selection_method": preprocessor.feature_selection_method,
|
| 1191 |
"feature_selection_top_n": FEATURE_SELECTION_TOP_N,
|
|
@@ -1198,6 +1474,8 @@ def features(include_all: bool = Query(default=False)) -> dict[str, Any]:
|
|
| 1198 |
if include_all:
|
| 1199 |
payload["input_features"] = preprocessor.input_feature_columns
|
| 1200 |
payload["optional_input_features"] = optional_features
|
|
|
|
|
|
|
| 1201 |
else:
|
| 1202 |
payload["input_features"] = preprocessor.required_input_columns
|
| 1203 |
payload["optional_input_features"] = []
|
|
@@ -1235,8 +1513,28 @@ def logs(
|
|
| 1235 |
|
| 1236 |
return Response(content="".join(lines), media_type="application/x-ndjson")
|
| 1237 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1238 |
|
| 1239 |
-
def _predict_records(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1240 |
model = app.state.model
|
| 1241 |
preprocessor: PreprocessorArtifacts = app.state.preprocessor
|
| 1242 |
request_id = str(uuid.uuid4())
|
|
@@ -1260,7 +1558,8 @@ def _predict_records(records: list[dict[str, Any]], threshold: float | None) ->
|
|
| 1260 |
raise HTTPException(status_code=422, detail={"message": "SK_ID_CURR is required."})
|
| 1261 |
|
| 1262 |
sk_ids = df_norm["SK_ID_CURR"].tolist()
|
| 1263 |
-
features =
|
|
|
|
| 1264 |
|
| 1265 |
if hasattr(model, "predict_proba"):
|
| 1266 |
proba = model.predict_proba(features)[:, 1]
|
|
@@ -1283,6 +1582,7 @@ def _predict_records(records: list[dict[str, Any]], threshold: float | None) ->
|
|
| 1283 |
threshold=use_threshold,
|
| 1284 |
status_code=200,
|
| 1285 |
preprocessor=preprocessor,
|
|
|
|
| 1286 |
data_quality=dq_records,
|
| 1287 |
)
|
| 1288 |
return {"predictions": results, "threshold": use_threshold}
|
|
@@ -1304,6 +1604,7 @@ def _predict_records(records: list[dict[str, Any]], threshold: float | None) ->
|
|
| 1304 |
threshold=None,
|
| 1305 |
status_code=200,
|
| 1306 |
preprocessor=preprocessor,
|
|
|
|
| 1307 |
data_quality=dq_records,
|
| 1308 |
)
|
| 1309 |
return {"predictions": results, "threshold": None}
|
|
@@ -1318,6 +1619,7 @@ def _predict_records(records: list[dict[str, Any]], threshold: float | None) ->
|
|
| 1318 |
threshold=threshold,
|
| 1319 |
status_code=exc.status_code,
|
| 1320 |
preprocessor=preprocessor,
|
|
|
|
| 1321 |
data_quality=dq_records if "dq_records" in locals() else None,
|
| 1322 |
error=json.dumps(detail, ensure_ascii=True),
|
| 1323 |
)
|
|
@@ -1332,6 +1634,7 @@ def _predict_records(records: list[dict[str, Any]], threshold: float | None) ->
|
|
| 1332 |
threshold=threshold,
|
| 1333 |
status_code=500,
|
| 1334 |
preprocessor=preprocessor,
|
|
|
|
| 1335 |
data_quality=dq_records if "dq_records" in locals() else None,
|
| 1336 |
error=str(exc),
|
| 1337 |
)
|
|
@@ -1342,16 +1645,18 @@ def _predict_records(records: list[dict[str, Any]], threshold: float | None) ->
|
|
| 1342 |
def predict(
|
| 1343 |
payload: PredictionRequest,
|
| 1344 |
threshold: float | None = Query(default=None, ge=0.0, le=1.0),
|
|
|
|
| 1345 |
) -> dict[str, Any]:
|
| 1346 |
records = payload.data if isinstance(payload.data, list) else [payload.data]
|
| 1347 |
-
return _predict_records(records, threshold)
|
| 1348 |
|
| 1349 |
|
| 1350 |
@app.post("/predict-minimal")
|
| 1351 |
def predict_minimal(
|
| 1352 |
payload: MinimalPredictionRequest,
|
| 1353 |
threshold: float | None = Query(default=None, ge=0.0, le=1.0),
|
|
|
|
| 1354 |
) -> dict[str, Any]:
|
| 1355 |
preprocessor: PreprocessorArtifacts = app.state.preprocessor
|
| 1356 |
record = _build_minimal_record(payload, preprocessor)
|
| 1357 |
-
return _predict_records([record], threshold)
|
|
|
|
| 20 |
from sklearn.preprocessing import MinMaxScaler
|
| 21 |
import joblib
|
| 22 |
|
| 23 |
+
from src.features import (
|
| 24 |
+
add_missingness_indicators,
|
| 25 |
+
apply_outlier_clipping,
|
| 26 |
+
compute_outlier_bounds,
|
| 27 |
+
new_features_creation,
|
| 28 |
+
select_missing_indicator_columns,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
logger = logging.getLogger("uvicorn.error")
|
| 32 |
|
| 33 |
+
def _resolve_model_path() -> Path:
|
| 34 |
+
env_path = os.getenv("MODEL_PATH")
|
| 35 |
+
if env_path:
|
| 36 |
+
return Path(env_path)
|
| 37 |
+
candidates = sorted(Path("data").glob("*_final_model.pkl"))
|
| 38 |
+
if len(candidates) == 1:
|
| 39 |
+
return candidates[0]
|
| 40 |
+
if candidates:
|
| 41 |
+
logger.warning(
|
| 42 |
+
"Multiple *_final_model.pkl files found; set MODEL_PATH explicitly. Using %s",
|
| 43 |
+
candidates[0],
|
| 44 |
+
)
|
| 45 |
+
return candidates[0]
|
| 46 |
+
return Path("data/histgb_final_model.pkl")
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
MODEL_PATH = _resolve_model_path()
|
| 50 |
DATA_PATH = Path(os.getenv("DATA_PATH", "data/data_final.parquet"))
|
| 51 |
ARTIFACTS_PATH = Path(os.getenv("ARTIFACTS_PATH", "artifacts/preprocessor.joblib"))
|
| 52 |
DEFAULT_THRESHOLD = float(os.getenv("PREDICTION_THRESHOLD", "0.5"))
|
|
|
|
| 80 |
|
| 81 |
IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
|
| 82 |
ENGINEERED_FEATURES = [
|
| 83 |
+
"DAYS_EMPLOYED_ANOM",
|
| 84 |
"DAYS_EMPLOYED_PERC",
|
| 85 |
"INCOME_CREDIT_PERC",
|
| 86 |
"INCOME_PER_PERSON",
|
| 87 |
"ANNUITY_INCOME_PERC",
|
| 88 |
"PAYMENT_RATE",
|
| 89 |
+
"DENOM_ZERO_DAYS_EMPLOYED_PERC",
|
| 90 |
+
"DENOM_ZERO_INCOME_CREDIT_PERC",
|
| 91 |
+
"DENOM_ZERO_INCOME_PER_PERSON",
|
| 92 |
+
"DENOM_ZERO_ANNUITY_INCOME_PERC",
|
| 93 |
+
"DENOM_ZERO_PAYMENT_RATE",
|
| 94 |
]
|
| 95 |
ENGINEERED_SOURCES = [
|
| 96 |
"DAYS_EMPLOYED",
|
|
|
|
| 128 |
"AMT_REQ_CREDIT_BUREAU_YEAR",
|
| 129 |
"AMT_REQ_CREDIT_BUREAU_QRT",
|
| 130 |
]
|
| 131 |
+
OUTLIER_LOWER_Q = 0.01
|
| 132 |
+
OUTLIER_UPPER_Q = 0.99
|
| 133 |
+
MISSING_INDICATOR_MIN_RATE = float(os.getenv("MISSING_INDICATOR_MIN_RATE", "0.05"))
|
| 134 |
|
| 135 |
CODE_GENDER_MAPPING = {
|
| 136 |
"F": "F",
|
|
|
|
| 176 |
numeric_medians: dict[str, float]
|
| 177 |
categorical_columns: list[str]
|
| 178 |
outlier_maxes: dict[str, float]
|
| 179 |
+
outlier_bounds: dict[str, tuple[float, float]]
|
| 180 |
+
missing_indicator_columns: list[str]
|
| 181 |
numeric_ranges: dict[str, tuple[float, float]]
|
| 182 |
features_to_scaled: list[str]
|
| 183 |
scaler: MinMaxScaler
|
|
|
|
| 278 |
if "DAYS_EMPLOYED" in df.columns:
|
| 279 |
values = pd.to_numeric(df["DAYS_EMPLOYED"], errors="coerce")
|
| 280 |
sentinel_mask = values == DAYS_EMPLOYED_SENTINEL
|
| 281 |
+
df["DAYS_EMPLOYED_ANOM"] = sentinel_mask.astype(int)
|
| 282 |
if sentinel_mask.any():
|
| 283 |
df.loc[sentinel_mask, "DAYS_EMPLOYED"] = np.nan
|
| 284 |
|
|
|
|
| 303 |
missing_mask = df_norm[required_cols].isna() if required_cols else pd.DataFrame(index=df_norm.index)
|
| 304 |
invalid_masks: dict[str, pd.Series] = {}
|
| 305 |
out_of_range_masks: dict[str, pd.Series] = {}
|
| 306 |
+
outlier_masks: dict[str, pd.Series] = {}
|
| 307 |
|
| 308 |
for col in numeric_required:
|
| 309 |
if col not in df_raw.columns:
|
|
|
|
| 320 |
values = pd.to_numeric(df_norm[col], errors="coerce")
|
| 321 |
out_of_range_masks[col] = (values < min_val) | (values > max_val)
|
| 322 |
|
| 323 |
+
for col, (low, high) in getattr(preprocessor, "outlier_bounds", {}).items():
|
| 324 |
+
if col not in df_norm.columns:
|
| 325 |
+
outlier_masks[col] = pd.Series(False, index=df_norm.index)
|
| 326 |
+
continue
|
| 327 |
+
values = pd.to_numeric(df_norm[col], errors="coerce")
|
| 328 |
+
outlier_masks[col] = (values < low) | (values > high)
|
| 329 |
+
|
| 330 |
records: list[dict[str, Any]] = []
|
| 331 |
for idx in df_norm.index:
|
| 332 |
missing_cols = (
|
|
|
|
| 336 |
)
|
| 337 |
invalid_cols = [col for col, mask in invalid_masks.items() if mask.at[idx]]
|
| 338 |
out_of_range_cols = [col for col, mask in out_of_range_masks.items() if mask.at[idx]]
|
| 339 |
+
outlier_cols = [col for col, mask in outlier_masks.items() if mask.at[idx]]
|
| 340 |
unknown_cols = [col for col, mask in unknown_masks.items() if mask.at[idx]]
|
| 341 |
+
unknown_values = {
|
| 342 |
+
col: df_raw.at[idx, col]
|
| 343 |
+
for col in unknown_cols
|
| 344 |
+
if col in df_raw.columns
|
| 345 |
+
}
|
| 346 |
nan_rate = float(missing_mask.loc[idx].mean()) if not missing_mask.empty else 0.0
|
| 347 |
+
record = {
|
| 348 |
+
"missing_required_columns": missing_cols,
|
| 349 |
+
"invalid_numeric_columns": invalid_cols,
|
| 350 |
+
"out_of_range_columns": out_of_range_cols,
|
| 351 |
+
"outlier_columns": outlier_cols,
|
| 352 |
+
"unknown_categories": unknown_cols,
|
| 353 |
+
"days_employed_sentinel": bool(sentinel_mask.at[idx]) if not sentinel_mask.empty else False,
|
| 354 |
+
"nan_rate": nan_rate,
|
| 355 |
+
}
|
| 356 |
+
if unknown_values:
|
| 357 |
+
record["unknown_category_values"] = unknown_values
|
| 358 |
+
records.append(record)
|
| 359 |
return records
|
| 360 |
|
| 361 |
|
|
|
|
| 428 |
threshold: float | None,
|
| 429 |
status_code: int,
|
| 430 |
preprocessor: PreprocessorArtifacts,
|
| 431 |
+
source: str | None = None,
|
| 432 |
data_quality: list[dict[str, Any]] | None = None,
|
| 433 |
error: str | None = None,
|
| 434 |
) -> None:
|
|
|
|
| 453 |
"status_code": status_code,
|
| 454 |
"model_version": MODEL_VERSION,
|
| 455 |
"threshold": threshold,
|
| 456 |
+
"source": source or "api",
|
| 457 |
"inputs": inputs,
|
| 458 |
}
|
| 459 |
if data_quality and idx < len(data_quality):
|
|
|
|
| 474 |
_append_log_entries(entries)
|
| 475 |
|
| 476 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 477 |
def build_preprocessor(data_path: Path) -> PreprocessorArtifacts:
|
| 478 |
df = pd.read_parquet(data_path)
|
| 479 |
raw_feature_columns = df.columns.tolist()
|
| 480 |
input_feature_columns = [c for c in raw_feature_columns if c not in ["is_train", "is_test", "TARGET"]]
|
| 481 |
|
| 482 |
+
df = new_features_creation(
|
| 483 |
+
df,
|
| 484 |
+
days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
|
| 485 |
+
engineered_sources=ENGINEERED_SOURCES,
|
| 486 |
+
)
|
| 487 |
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 488 |
|
| 489 |
missing_rate = df.isna().mean()
|
|
|
|
| 493 |
df = df[columns_keep]
|
| 494 |
df = df.dropna(subset=columns_must_not_missing)
|
| 495 |
|
| 496 |
+
if "CODE_GENDER" in df.columns:
|
| 497 |
+
df = df[df["CODE_GENDER"] != "XNA"]
|
| 498 |
+
|
| 499 |
+
missing_indicator_columns = select_missing_indicator_columns(
|
| 500 |
+
df,
|
| 501 |
+
exclude_cols=set(IGNORE_FEATURES),
|
| 502 |
+
min_missing_rate=MISSING_INDICATOR_MIN_RATE,
|
| 503 |
+
)
|
| 504 |
+
df = add_missingness_indicators(df, missing_indicator_columns)
|
| 505 |
+
|
| 506 |
+
outlier_bounds = compute_outlier_bounds(
|
| 507 |
+
df,
|
| 508 |
+
OUTLIER_COLUMNS,
|
| 509 |
+
lower_q=OUTLIER_LOWER_Q,
|
| 510 |
+
upper_q=OUTLIER_UPPER_Q,
|
| 511 |
+
)
|
| 512 |
+
df = apply_outlier_clipping(df, outlier_bounds)
|
| 513 |
+
|
| 514 |
+
columns_keep = df.columns.tolist()
|
| 515 |
+
|
| 516 |
numeric_cols = df.select_dtypes(include=["number"]).columns
|
| 517 |
numeric_medians = df[numeric_cols].median().to_dict()
|
| 518 |
df[numeric_cols] = df[numeric_cols].fillna(numeric_medians)
|
|
|
|
| 520 |
categorical_columns = df.select_dtypes(include=["object"]).columns.tolist()
|
| 521 |
df[categorical_columns] = df[categorical_columns].fillna("Unknown")
|
| 522 |
|
| 523 |
+
outlier_maxes = {col: bounds[1] for col, bounds in outlier_bounds.items()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
|
| 525 |
reduced_input_columns, selection_scores, selection_method = _compute_reduced_inputs(
|
| 526 |
df,
|
|
|
|
| 547 |
required_input = _fallback_reduced_inputs(input_feature_columns)
|
| 548 |
else:
|
| 549 |
required_input = sorted(required_raw)
|
| 550 |
+
numeric_required = sorted(
|
| 551 |
+
col
|
| 552 |
+
for col in required_input
|
| 553 |
+
if col in numeric_medians and col != "SK_ID_CURR"
|
| 554 |
+
)
|
| 555 |
correlated_imputation = _build_correlated_imputation(
|
| 556 |
df,
|
| 557 |
input_feature_columns=input_feature_columns,
|
|
|
|
| 565 |
numeric_medians={k: float(v) for k, v in numeric_medians.items()},
|
| 566 |
categorical_columns=categorical_columns,
|
| 567 |
outlier_maxes={k: float(v) for k, v in outlier_maxes.items()},
|
| 568 |
+
outlier_bounds={k: (float(v[0]), float(v[1])) for k, v in outlier_bounds.items()},
|
| 569 |
+
missing_indicator_columns=missing_indicator_columns,
|
| 570 |
numeric_ranges=numeric_ranges,
|
| 571 |
features_to_scaled=features_to_scaled,
|
| 572 |
scaler=scaler,
|
|
|
|
| 620 |
]
|
| 621 |
)
|
| 622 |
|
| 623 |
+
df = new_features_creation(
|
| 624 |
+
base,
|
| 625 |
+
days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
|
| 626 |
+
engineered_sources=ENGINEERED_SOURCES,
|
| 627 |
+
)
|
| 628 |
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 629 |
|
| 630 |
+
missing_indicator_columns = select_missing_indicator_columns(
|
| 631 |
+
df,
|
| 632 |
+
exclude_cols=set(IGNORE_FEATURES),
|
| 633 |
+
min_missing_rate=MISSING_INDICATOR_MIN_RATE,
|
| 634 |
+
)
|
| 635 |
+
df = add_missingness_indicators(df, missing_indicator_columns)
|
| 636 |
+
|
| 637 |
+
outlier_bounds = compute_outlier_bounds(
|
| 638 |
+
df,
|
| 639 |
+
OUTLIER_COLUMNS,
|
| 640 |
+
lower_q=OUTLIER_LOWER_Q,
|
| 641 |
+
upper_q=OUTLIER_UPPER_Q,
|
| 642 |
+
)
|
| 643 |
+
df = apply_outlier_clipping(df, outlier_bounds)
|
| 644 |
+
|
| 645 |
columns_keep = df.columns.tolist()
|
| 646 |
columns_must_not_missing = [col for col in columns_keep if col not in IGNORE_FEATURES]
|
| 647 |
|
|
|
|
| 664 |
required_raw.update(col for col in columns_must_not_missing if col in input_feature_columns)
|
| 665 |
required_raw.add("SK_ID_CURR")
|
| 666 |
required_input = _fallback_reduced_inputs(input_feature_columns)
|
| 667 |
+
numeric_required = sorted(
|
| 668 |
+
col for col in required_input if col in numeric_medians and col != "SK_ID_CURR"
|
| 669 |
+
)
|
| 670 |
|
| 671 |
numeric_ranges = {col: (float(df[col].min()), float(df[col].max())) for col in numeric_cols}
|
| 672 |
|
|
|
|
| 675 |
columns_must_not_missing=columns_must_not_missing,
|
| 676 |
numeric_medians={k: float(v) for k, v in numeric_medians.items()},
|
| 677 |
categorical_columns=categorical_columns,
|
| 678 |
+
outlier_maxes={k: float(v[1]) for k, v in outlier_bounds.items()},
|
| 679 |
+
outlier_bounds={k: (float(v[0]), float(v[1])) for k, v in outlier_bounds.items()},
|
| 680 |
+
missing_indicator_columns=missing_indicator_columns,
|
| 681 |
numeric_ranges=numeric_ranges,
|
| 682 |
features_to_scaled=features_to_scaled,
|
| 683 |
scaler=scaler,
|
|
|
|
| 722 |
updated = True
|
| 723 |
if not hasattr(preprocessor, "numeric_required_columns"):
|
| 724 |
preprocessor.numeric_required_columns = sorted(
|
| 725 |
+
col
|
| 726 |
+
for col in preprocessor.required_input_columns
|
| 727 |
+
if col in preprocessor.numeric_medians and col != "SK_ID_CURR"
|
| 728 |
)
|
| 729 |
updated = True
|
| 730 |
if not hasattr(preprocessor, "numeric_ranges"):
|
|
|
|
| 737 |
raise RuntimeError(f"Data file not found to rebuild preprocessor: {data_path}")
|
| 738 |
preprocessor = build_preprocessor(data_path)
|
| 739 |
updated = True
|
| 740 |
+
needs_missing_indicators = (
|
| 741 |
+
not hasattr(preprocessor, "missing_indicator_columns")
|
| 742 |
+
or not preprocessor.missing_indicator_columns
|
| 743 |
+
)
|
| 744 |
+
needs_outlier_bounds = (
|
| 745 |
+
not hasattr(preprocessor, "outlier_bounds") or not preprocessor.outlier_bounds
|
| 746 |
+
)
|
| 747 |
+
prepared_df = None
|
| 748 |
+
if (needs_missing_indicators or needs_outlier_bounds) and data_path.exists():
|
| 749 |
+
prepared_df = pd.read_parquet(data_path)
|
| 750 |
+
prepared_df = new_features_creation(
|
| 751 |
+
prepared_df,
|
| 752 |
+
days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
|
| 753 |
+
engineered_sources=ENGINEERED_SOURCES,
|
| 754 |
+
)
|
| 755 |
+
prepared_df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 756 |
+
if preprocessor.columns_keep:
|
| 757 |
+
prepared_df = prepared_df[preprocessor.columns_keep]
|
| 758 |
+
if preprocessor.columns_must_not_missing:
|
| 759 |
+
prepared_df = prepared_df.dropna(subset=preprocessor.columns_must_not_missing)
|
| 760 |
+
if "CODE_GENDER" in prepared_df.columns:
|
| 761 |
+
prepared_df = prepared_df[prepared_df["CODE_GENDER"] != "XNA"]
|
| 762 |
+
if needs_missing_indicators:
|
| 763 |
+
if prepared_df is not None:
|
| 764 |
+
preprocessor.missing_indicator_columns = select_missing_indicator_columns(
|
| 765 |
+
prepared_df,
|
| 766 |
+
exclude_cols=set(IGNORE_FEATURES),
|
| 767 |
+
min_missing_rate=MISSING_INDICATOR_MIN_RATE,
|
| 768 |
+
)
|
| 769 |
+
else:
|
| 770 |
+
preprocessor.missing_indicator_columns = []
|
| 771 |
+
updated = True
|
| 772 |
+
if needs_outlier_bounds:
|
| 773 |
+
if prepared_df is not None:
|
| 774 |
+
preprocessor.outlier_bounds = compute_outlier_bounds(
|
| 775 |
+
prepared_df,
|
| 776 |
+
OUTLIER_COLUMNS,
|
| 777 |
+
lower_q=OUTLIER_LOWER_Q,
|
| 778 |
+
upper_q=OUTLIER_UPPER_Q,
|
| 779 |
+
)
|
| 780 |
+
else:
|
| 781 |
+
preprocessor.outlier_bounds = {}
|
| 782 |
+
for col, max_val in getattr(preprocessor, "outlier_maxes", {}).items():
|
| 783 |
+
min_val = None
|
| 784 |
+
if hasattr(preprocessor, "numeric_ranges") and col in preprocessor.numeric_ranges:
|
| 785 |
+
min_val = preprocessor.numeric_ranges[col][0]
|
| 786 |
+
if min_val is None:
|
| 787 |
+
min_val = float("-inf")
|
| 788 |
+
preprocessor.outlier_bounds[col] = (float(min_val), float(max_val))
|
| 789 |
+
updated = True
|
| 790 |
if USE_REDUCED_INPUTS:
|
| 791 |
reduced = _reduce_input_columns(preprocessor)
|
| 792 |
if preprocessor.required_input_columns != reduced:
|
|
|
|
| 799 |
required_updated = True
|
| 800 |
updated = True
|
| 801 |
desired_numeric_required = sorted(
|
| 802 |
+
col
|
| 803 |
+
for col in preprocessor.required_input_columns
|
| 804 |
+
if col in preprocessor.numeric_medians and col != "SK_ID_CURR"
|
| 805 |
)
|
| 806 |
if getattr(preprocessor, "numeric_required_columns", None) != desired_numeric_required:
|
| 807 |
preprocessor.numeric_required_columns = desired_numeric_required
|
|
|
|
| 1033 |
if not data_path.exists():
|
| 1034 |
return _fallback_reduced_inputs(preprocessor.input_feature_columns), {}, "default"
|
| 1035 |
df = pd.read_parquet(data_path)
|
| 1036 |
+
df = new_features_creation(
|
| 1037 |
+
df,
|
| 1038 |
+
days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
|
| 1039 |
+
engineered_sources=ENGINEERED_SOURCES,
|
| 1040 |
+
)
|
| 1041 |
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 1042 |
|
| 1043 |
if preprocessor.columns_keep:
|
|
|
|
| 1055 |
if "CODE_GENDER" in df.columns:
|
| 1056 |
df = df[df["CODE_GENDER"] != "XNA"]
|
| 1057 |
|
| 1058 |
+
if getattr(preprocessor, "missing_indicator_columns", None):
|
| 1059 |
+
df = add_missingness_indicators(df, preprocessor.missing_indicator_columns)
|
| 1060 |
+
else:
|
| 1061 |
+
df = add_missingness_indicators(
|
| 1062 |
+
df,
|
| 1063 |
+
select_missing_indicator_columns(
|
| 1064 |
+
df,
|
| 1065 |
+
exclude_cols=set(IGNORE_FEATURES),
|
| 1066 |
+
min_missing_rate=MISSING_INDICATOR_MIN_RATE,
|
| 1067 |
+
),
|
| 1068 |
+
)
|
| 1069 |
+
|
| 1070 |
+
outlier_bounds = getattr(preprocessor, "outlier_bounds", {}) or compute_outlier_bounds(
|
| 1071 |
+
df,
|
| 1072 |
+
OUTLIER_COLUMNS,
|
| 1073 |
+
lower_q=OUTLIER_LOWER_Q,
|
| 1074 |
+
upper_q=OUTLIER_UPPER_Q,
|
| 1075 |
+
)
|
| 1076 |
+
df = apply_outlier_clipping(df, outlier_bounds)
|
| 1077 |
|
| 1078 |
return _compute_reduced_inputs(df, input_feature_columns=preprocessor.input_feature_columns)
|
| 1079 |
|
|
|
|
| 1083 |
preprocessor: PreprocessorArtifacts,
|
| 1084 |
) -> dict[str, dict[str, float | str]]:
|
| 1085 |
df = pd.read_parquet(data_path)
|
| 1086 |
+
df = new_features_creation(
|
| 1087 |
+
df,
|
| 1088 |
+
days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
|
| 1089 |
+
engineered_sources=ENGINEERED_SOURCES,
|
| 1090 |
+
)
|
| 1091 |
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 1092 |
|
| 1093 |
df = df[preprocessor.columns_keep]
|
|
|
|
| 1103 |
if "CODE_GENDER" in df.columns:
|
| 1104 |
df = df[df["CODE_GENDER"] != "XNA"]
|
| 1105 |
|
| 1106 |
+
if getattr(preprocessor, "missing_indicator_columns", None):
|
| 1107 |
+
df = add_missingness_indicators(df, preprocessor.missing_indicator_columns)
|
| 1108 |
+
else:
|
| 1109 |
+
df = add_missingness_indicators(
|
| 1110 |
+
df,
|
| 1111 |
+
select_missing_indicator_columns(
|
| 1112 |
+
df,
|
| 1113 |
+
exclude_cols=set(IGNORE_FEATURES),
|
| 1114 |
+
min_missing_rate=MISSING_INDICATOR_MIN_RATE,
|
| 1115 |
+
),
|
| 1116 |
+
)
|
| 1117 |
+
|
| 1118 |
+
outlier_bounds = getattr(preprocessor, "outlier_bounds", {}) or compute_outlier_bounds(
|
| 1119 |
+
df,
|
| 1120 |
+
OUTLIER_COLUMNS,
|
| 1121 |
+
lower_q=OUTLIER_LOWER_Q,
|
| 1122 |
+
upper_q=OUTLIER_UPPER_Q,
|
| 1123 |
+
)
|
| 1124 |
+
df = apply_outlier_clipping(df, outlier_bounds)
|
| 1125 |
|
| 1126 |
return _build_correlated_imputation(
|
| 1127 |
df,
|
|
|
|
| 1231 |
if "TARGET" not in df.columns:
|
| 1232 |
df["TARGET"] = 0
|
| 1233 |
|
| 1234 |
+
df = new_features_creation(
|
| 1235 |
+
df,
|
| 1236 |
+
days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
|
| 1237 |
+
engineered_sources=ENGINEERED_SOURCES,
|
| 1238 |
+
)
|
| 1239 |
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 1240 |
|
| 1241 |
df = df.reindex(columns=artifacts.columns_keep, fill_value=np.nan)
|
| 1242 |
|
| 1243 |
+
indicator_cols = getattr(artifacts, "missing_indicator_columns", None) or select_missing_indicator_columns(
|
| 1244 |
+
df,
|
| 1245 |
+
exclude_cols=set(IGNORE_FEATURES),
|
| 1246 |
+
min_missing_rate=MISSING_INDICATOR_MIN_RATE,
|
| 1247 |
+
)
|
| 1248 |
+
df = add_missingness_indicators(df, indicator_cols)
|
| 1249 |
+
|
| 1250 |
+
outlier_bounds = getattr(artifacts, "outlier_bounds", {}) or compute_outlier_bounds(
|
| 1251 |
+
df,
|
| 1252 |
+
OUTLIER_COLUMNS,
|
| 1253 |
+
lower_q=OUTLIER_LOWER_Q,
|
| 1254 |
+
upper_q=OUTLIER_UPPER_Q,
|
| 1255 |
+
)
|
| 1256 |
+
df = apply_outlier_clipping(df, outlier_bounds)
|
| 1257 |
+
|
| 1258 |
_apply_correlated_imputation(df, artifacts)
|
| 1259 |
|
| 1260 |
for col, median in artifacts.numeric_medians.items():
|
|
|
|
| 1274 |
detail={"message": "CODE_GENDER cannot be 'XNA' based on training rules."},
|
| 1275 |
)
|
| 1276 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1277 |
df_hot = pd.get_dummies(df, columns=artifacts.categorical_columns)
|
| 1278 |
df_hot = df_hot.reindex(columns=artifacts.features_to_scaled, fill_value=0)
|
| 1279 |
|
|
|
|
| 1281 |
return pd.DataFrame(scaled, columns=artifacts.features_to_scaled, index=df.index)
|
| 1282 |
|
| 1283 |
|
| 1284 |
+
def _prepare_pipeline_input(
|
| 1285 |
+
df_raw: pd.DataFrame,
|
| 1286 |
+
artifacts: PreprocessorArtifacts,
|
| 1287 |
+
model: Any,
|
| 1288 |
+
) -> pd.DataFrame:
|
| 1289 |
+
df = df_raw.copy()
|
| 1290 |
+
|
| 1291 |
+
for col in artifacts.required_input_columns:
|
| 1292 |
+
if col not in df.columns:
|
| 1293 |
+
df[col] = np.nan
|
| 1294 |
+
|
| 1295 |
+
allow_missing = {"DAYS_EMPLOYED"}
|
| 1296 |
+
_ensure_required_columns(df, artifacts.required_input_columns, allow_missing=allow_missing)
|
| 1297 |
+
_validate_numeric_inputs(df, artifacts.numeric_required_columns)
|
| 1298 |
+
_validate_numeric_ranges(
|
| 1299 |
+
df,
|
| 1300 |
+
{k: v for k, v in artifacts.numeric_ranges.items() if k in artifacts.numeric_required_columns},
|
| 1301 |
+
)
|
| 1302 |
+
|
| 1303 |
+
df["is_train"] = 0
|
| 1304 |
+
df["is_test"] = 1
|
| 1305 |
+
if "TARGET" not in df.columns:
|
| 1306 |
+
df["TARGET"] = 0
|
| 1307 |
+
|
| 1308 |
+
df = new_features_creation(
|
| 1309 |
+
df,
|
| 1310 |
+
days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
|
| 1311 |
+
engineered_sources=ENGINEERED_SOURCES,
|
| 1312 |
+
)
|
| 1313 |
+
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 1314 |
+
|
| 1315 |
+
df = df.reindex(columns=artifacts.columns_keep, fill_value=np.nan)
|
| 1316 |
+
|
| 1317 |
+
indicator_cols = getattr(artifacts, "missing_indicator_columns", None) or select_missing_indicator_columns(
|
| 1318 |
+
df,
|
| 1319 |
+
exclude_cols=set(IGNORE_FEATURES),
|
| 1320 |
+
min_missing_rate=MISSING_INDICATOR_MIN_RATE,
|
| 1321 |
+
)
|
| 1322 |
+
df = add_missingness_indicators(df, indicator_cols)
|
| 1323 |
+
|
| 1324 |
+
outlier_bounds = getattr(artifacts, "outlier_bounds", {}) or compute_outlier_bounds(
|
| 1325 |
+
df,
|
| 1326 |
+
OUTLIER_COLUMNS,
|
| 1327 |
+
lower_q=OUTLIER_LOWER_Q,
|
| 1328 |
+
upper_q=OUTLIER_UPPER_Q,
|
| 1329 |
+
)
|
| 1330 |
+
df = apply_outlier_clipping(df, outlier_bounds)
|
| 1331 |
+
|
| 1332 |
+
if "CODE_GENDER" in df.columns and (df["CODE_GENDER"] == "XNA").any():
|
| 1333 |
+
raise HTTPException(
|
| 1334 |
+
status_code=422,
|
| 1335 |
+
detail={"message": "CODE_GENDER cannot be 'XNA' based on training rules."},
|
| 1336 |
+
)
|
| 1337 |
+
|
| 1338 |
+
expected_cols = None
|
| 1339 |
+
if hasattr(model, "named_steps"):
|
| 1340 |
+
preprocessor = model.named_steps.get("preprocessing")
|
| 1341 |
+
expected_cols = getattr(preprocessor, "feature_names_in_", None)
|
| 1342 |
+
if expected_cols is None:
|
| 1343 |
+
expected_cols = [c for c in artifacts.input_feature_columns if c not in IGNORE_FEATURES]
|
| 1344 |
+
|
| 1345 |
+
return df.reindex(columns=expected_cols, fill_value=np.nan)
|
| 1346 |
+
|
| 1347 |
+
|
| 1348 |
+
def prepare_inference_features(
|
| 1349 |
+
df_raw: pd.DataFrame,
|
| 1350 |
+
artifacts: PreprocessorArtifacts,
|
| 1351 |
+
model: Any,
|
| 1352 |
+
) -> pd.DataFrame:
|
| 1353 |
+
if hasattr(model, "named_steps") and model.named_steps.get("preprocessing") is not None:
|
| 1354 |
+
return _prepare_pipeline_input(df_raw, artifacts, model)
|
| 1355 |
+
return preprocess_input(df_raw, artifacts)
|
| 1356 |
+
|
| 1357 |
+
|
| 1358 |
@app.on_event("startup")
|
| 1359 |
def startup_event() -> None:
|
| 1360 |
if getattr(app.state, "model", None) is not None and getattr(app.state, "preprocessor", None) is not None:
|
|
|
|
| 1449 |
for col in preprocessor.required_input_columns
|
| 1450 |
if col in scores
|
| 1451 |
}
|
| 1452 |
+
missing_indicator_features = [
|
| 1453 |
+
f"is_missing_{col}"
|
| 1454 |
+
for col in getattr(preprocessor, "missing_indicator_columns", []) or []
|
| 1455 |
+
]
|
| 1456 |
+
outlier_indicator_features = [
|
| 1457 |
+
f"is_outlier_{col}"
|
| 1458 |
+
for col in getattr(preprocessor, "outlier_bounds", {}) or {}
|
| 1459 |
+
]
|
| 1460 |
payload = {
|
| 1461 |
"required_input_features": preprocessor.required_input_columns,
|
| 1462 |
"engineered_features": ENGINEERED_FEATURES,
|
| 1463 |
+
"missing_indicator_features_count": len(missing_indicator_features),
|
| 1464 |
+
"outlier_indicator_features_count": len(outlier_indicator_features),
|
| 1465 |
"model_features_count": len(preprocessor.features_to_scaled),
|
| 1466 |
"feature_selection_method": preprocessor.feature_selection_method,
|
| 1467 |
"feature_selection_top_n": FEATURE_SELECTION_TOP_N,
|
|
|
|
| 1474 |
if include_all:
|
| 1475 |
payload["input_features"] = preprocessor.input_feature_columns
|
| 1476 |
payload["optional_input_features"] = optional_features
|
| 1477 |
+
payload["missing_indicator_features"] = missing_indicator_features
|
| 1478 |
+
payload["outlier_indicator_features"] = outlier_indicator_features
|
| 1479 |
else:
|
| 1480 |
payload["input_features"] = preprocessor.required_input_columns
|
| 1481 |
payload["optional_input_features"] = []
|
|
|
|
| 1513 |
|
| 1514 |
return Response(content="".join(lines), media_type="application/x-ndjson")
|
| 1515 |
|
| 1516 |
+
def _align_features_to_model(features: pd.DataFrame, model: Any) -> pd.DataFrame:
|
| 1517 |
+
expected = getattr(model, "feature_names_in_", None)
|
| 1518 |
+
if expected is None:
|
| 1519 |
+
return features
|
| 1520 |
+
expected = list(expected)
|
| 1521 |
+
|
| 1522 |
+
extra = [c for c in features.columns if c not in expected]
|
| 1523 |
+
missing = [c for c in expected if c not in features.columns]
|
| 1524 |
+
if extra or missing:
|
| 1525 |
+
logger.warning(
|
| 1526 |
+
"Feature mismatch: extra=%s missing=%s",
|
| 1527 |
+
extra[:15],
|
| 1528 |
+
missing[:15],
|
| 1529 |
+
)
|
| 1530 |
+
return features.reindex(columns=expected, fill_value=0)
|
| 1531 |
|
| 1532 |
+
def _predict_records(
|
| 1533 |
+
records: list[dict[str, Any]],
|
| 1534 |
+
threshold: float | None,
|
| 1535 |
+
*,
|
| 1536 |
+
source: str | None = None,
|
| 1537 |
+
) -> dict[str, Any]:
|
| 1538 |
model = app.state.model
|
| 1539 |
preprocessor: PreprocessorArtifacts = app.state.preprocessor
|
| 1540 |
request_id = str(uuid.uuid4())
|
|
|
|
| 1558 |
raise HTTPException(status_code=422, detail={"message": "SK_ID_CURR is required."})
|
| 1559 |
|
| 1560 |
sk_ids = df_norm["SK_ID_CURR"].tolist()
|
| 1561 |
+
features = prepare_inference_features(df_norm, preprocessor, model)
|
| 1562 |
+
features = _align_features_to_model(features, model)
|
| 1563 |
|
| 1564 |
if hasattr(model, "predict_proba"):
|
| 1565 |
proba = model.predict_proba(features)[:, 1]
|
|
|
|
| 1582 |
threshold=use_threshold,
|
| 1583 |
status_code=200,
|
| 1584 |
preprocessor=preprocessor,
|
| 1585 |
+
source=source,
|
| 1586 |
data_quality=dq_records,
|
| 1587 |
)
|
| 1588 |
return {"predictions": results, "threshold": use_threshold}
|
|
|
|
| 1604 |
threshold=None,
|
| 1605 |
status_code=200,
|
| 1606 |
preprocessor=preprocessor,
|
| 1607 |
+
source=source,
|
| 1608 |
data_quality=dq_records,
|
| 1609 |
)
|
| 1610 |
return {"predictions": results, "threshold": None}
|
|
|
|
| 1619 |
threshold=threshold,
|
| 1620 |
status_code=exc.status_code,
|
| 1621 |
preprocessor=preprocessor,
|
| 1622 |
+
source=source,
|
| 1623 |
data_quality=dq_records if "dq_records" in locals() else None,
|
| 1624 |
error=json.dumps(detail, ensure_ascii=True),
|
| 1625 |
)
|
|
|
|
| 1634 |
threshold=threshold,
|
| 1635 |
status_code=500,
|
| 1636 |
preprocessor=preprocessor,
|
| 1637 |
+
source=source,
|
| 1638 |
data_quality=dq_records if "dq_records" in locals() else None,
|
| 1639 |
error=str(exc),
|
| 1640 |
)
|
|
|
|
| 1645 |
def predict(
|
| 1646 |
payload: PredictionRequest,
|
| 1647 |
threshold: float | None = Query(default=None, ge=0.0, le=1.0),
|
| 1648 |
+
x_client_source: str | None = Header(default=None, alias="X-Client-Source"),
|
| 1649 |
) -> dict[str, Any]:
|
| 1650 |
records = payload.data if isinstance(payload.data, list) else [payload.data]
|
| 1651 |
+
return _predict_records(records, threshold, source=x_client_source)
|
| 1652 |
|
| 1653 |
|
| 1654 |
@app.post("/predict-minimal")
|
| 1655 |
def predict_minimal(
|
| 1656 |
payload: MinimalPredictionRequest,
|
| 1657 |
threshold: float | None = Query(default=None, ge=0.0, le=1.0),
|
| 1658 |
+
x_client_source: str | None = Header(default=None, alias="X-Client-Source"),
|
| 1659 |
) -> dict[str, Any]:
|
| 1660 |
preprocessor: PreprocessorArtifacts = app.state.preprocessor
|
| 1661 |
record = _build_minimal_record(payload, preprocessor)
|
| 1662 |
+
return _predict_records([record], threshold, source=x_client_source)
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/performance/performance_report.md
CHANGED
|
@@ -6,7 +6,8 @@ Mesurer la latence d'inference, identifier les goulots d'etranglement et propose
|
|
| 6 |
|
| 7 |
## Setup
|
| 8 |
|
| 9 |
-
- Script: `profiling/profile_inference.py`
|
|
|
|
| 10 |
- Donnees: `data/data_final.parquet` (echantillon)
|
| 11 |
- Parametres: `--sample-size 500 --batch-size 100 --runs 2`
|
| 12 |
- Modele: `HistGB_final_model.pkl`
|
|
|
|
| 6 |
|
| 7 |
## Setup
|
| 8 |
|
| 9 |
+
- Script (archivé): `dev_archive/profiling/profile_inference.py`
|
| 10 |
+
- Workflow courant: notebook modélisation (section TODO 5)
|
| 11 |
- Donnees: `data/data_final.parquet` (echantillon)
|
| 12 |
- Parametres: `--sample-size 500 --batch-size 100 --runs 2`
|
| 13 |
- Modele: `HistGB_final_model.pkl`
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py
CHANGED
|
@@ -8,13 +8,17 @@ import pandas as pd
|
|
| 8 |
from fastapi import HTTPException
|
| 9 |
|
| 10 |
from app.main import (
|
|
|
|
|
|
|
|
|
|
| 11 |
MinimalPredictionRequest,
|
| 12 |
app,
|
|
|
|
|
|
|
| 13 |
predict_minimal,
|
| 14 |
startup_event,
|
| 15 |
_build_minimal_record,
|
| 16 |
_normalize_inputs,
|
| 17 |
-
preprocess_input,
|
| 18 |
)
|
| 19 |
|
| 20 |
|
|
@@ -45,7 +49,7 @@ def _shap_error_table(message: str) -> pd.DataFrame:
|
|
| 45 |
[
|
| 46 |
{
|
| 47 |
"feature": message,
|
| 48 |
-
"
|
| 49 |
"shap_value": np.nan,
|
| 50 |
}
|
| 51 |
]
|
|
@@ -63,38 +67,171 @@ def _extract_shap_values(shap_values: Any) -> np.ndarray:
|
|
| 63 |
return values
|
| 64 |
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
def _compute_shap_top_features(record: dict[str, Any], top_k: int = 10) -> pd.DataFrame:
|
| 67 |
preprocessor = app.state.preprocessor
|
|
|
|
| 68 |
df_raw = pd.DataFrame.from_records([record])
|
| 69 |
df_norm, _, _ = _normalize_inputs(df_raw, preprocessor)
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
try:
|
| 72 |
import shap
|
| 73 |
except ImportError:
|
| 74 |
return _shap_error_table("SHAP not installed.")
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
try:
|
| 79 |
-
|
| 80 |
except Exception:
|
| 81 |
-
|
| 82 |
-
|
|
|
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
try:
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
| 87 |
except Exception:
|
| 88 |
-
values =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
shap_row = values[0]
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
top_idx = np.argsort(np.abs(shap_row))[::-1][:top_k]
|
| 93 |
rows = [
|
| 94 |
{
|
| 95 |
-
"feature": str(
|
| 96 |
-
"
|
| 97 |
-
|
|
|
|
|
|
|
| 98 |
}
|
| 99 |
for idx in top_idx
|
| 100 |
]
|
|
@@ -105,8 +242,7 @@ def score_minimal(
|
|
| 105 |
sk_id_curr: float,
|
| 106 |
amt_credit: float,
|
| 107 |
duration_months: float,
|
| 108 |
-
|
| 109 |
-
) -> tuple[float | None, str, float | None, pd.DataFrame, dict[str, Any]]:
|
| 110 |
_ensure_startup()
|
| 111 |
try:
|
| 112 |
payload = MinimalPredictionRequest(
|
|
@@ -115,7 +251,7 @@ def score_minimal(
|
|
| 115 |
duration_months=int(duration_months),
|
| 116 |
)
|
| 117 |
record = _build_minimal_record(payload, app.state.preprocessor)
|
| 118 |
-
response = predict_minimal(payload, threshold=
|
| 119 |
result = response["predictions"][0]
|
| 120 |
probability = float(result.get("probability", 0.0))
|
| 121 |
pred_value = int(result.get("prediction", 0))
|
|
@@ -128,11 +264,11 @@ def score_minimal(
|
|
| 128 |
"DURATION_MONTHS": int(duration_months),
|
| 129 |
}
|
| 130 |
)
|
| 131 |
-
return probability, label,
|
| 132 |
except HTTPException as exc:
|
| 133 |
-
return None, f"Erreur: {exc.detail}",
|
| 134 |
except Exception as exc: # pragma: no cover - UI fallback
|
| 135 |
-
return None, f"Erreur: {exc}",
|
| 136 |
|
| 137 |
|
| 138 |
with gr.Blocks(title="Credit scoring MLOps") as demo:
|
|
@@ -155,19 +291,17 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
|
|
| 155 |
sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
|
| 156 |
amt_credit = gr.Number(label="Montant du crédit", value=200000)
|
| 157 |
duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
|
| 158 |
-
threshold = gr.Slider(label="Seuil", minimum=0.0, maximum=1.0, value=0.5, step=0.01)
|
| 159 |
|
| 160 |
run_btn = gr.Button("Scorer")
|
| 161 |
|
| 162 |
with gr.Row():
|
| 163 |
probability = gr.Number(label="Probabilité de défaut")
|
| 164 |
prediction = gr.Textbox(label="Prédiction")
|
| 165 |
-
threshold_used = gr.Number(label="Seuil utilisé")
|
| 166 |
|
| 167 |
shap_table = gr.Dataframe(
|
| 168 |
-
headers=["feature", "
|
| 169 |
label="Top 10 SHAP (local)",
|
| 170 |
-
datatype=["str", "
|
| 171 |
interactive=False,
|
| 172 |
)
|
| 173 |
|
|
@@ -175,8 +309,8 @@ with gr.Blocks(title="Credit scoring MLOps") as demo:
|
|
| 175 |
|
| 176 |
run_btn.click(
|
| 177 |
score_minimal,
|
| 178 |
-
inputs=[sk_id_curr, amt_credit, duration_months
|
| 179 |
-
outputs=[probability, prediction,
|
| 180 |
)
|
| 181 |
|
| 182 |
|
|
|
|
| 8 |
from fastapi import HTTPException
|
| 9 |
|
| 10 |
from app.main import (
|
| 11 |
+
DAYS_EMPLOYED_SENTINEL,
|
| 12 |
+
ENGINEERED_SOURCES,
|
| 13 |
+
MODEL_VERSION,
|
| 14 |
MinimalPredictionRequest,
|
| 15 |
app,
|
| 16 |
+
new_features_creation,
|
| 17 |
+
prepare_inference_features,
|
| 18 |
predict_minimal,
|
| 19 |
startup_event,
|
| 20 |
_build_minimal_record,
|
| 21 |
_normalize_inputs,
|
|
|
|
| 22 |
)
|
| 23 |
|
| 24 |
|
|
|
|
| 49 |
[
|
| 50 |
{
|
| 51 |
"feature": message,
|
| 52 |
+
"raw_value": np.nan,
|
| 53 |
"shap_value": np.nan,
|
| 54 |
}
|
| 55 |
]
|
|
|
|
| 67 |
return values
|
| 68 |
|
| 69 |
|
| 70 |
+
def _clean_raw_value(value: Any) -> Any:
|
| 71 |
+
if value is None or pd.isna(value):
|
| 72 |
+
return None
|
| 73 |
+
if isinstance(value, (np.integer, np.floating)):
|
| 74 |
+
return value.item()
|
| 75 |
+
return value
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _strip_feature_prefix(feature_name: str) -> str:
|
| 79 |
+
return feature_name.split("__", 1)[1] if "__" in feature_name else feature_name
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def _lookup_raw_value(feature_name: str, raw_df: pd.DataFrame, preprocessor) -> Any:
|
| 83 |
+
cleaned_name = _strip_feature_prefix(feature_name)
|
| 84 |
+
if cleaned_name in raw_df.columns:
|
| 85 |
+
return raw_df.at[0, cleaned_name]
|
| 86 |
+
for prefix in ("is_missing_", "is_outlier_"):
|
| 87 |
+
if cleaned_name.startswith(prefix):
|
| 88 |
+
base = cleaned_name[len(prefix):]
|
| 89 |
+
if base in raw_df.columns:
|
| 90 |
+
return raw_df.at[0, base]
|
| 91 |
+
for col in getattr(preprocessor, "categorical_columns", []):
|
| 92 |
+
if cleaned_name.startswith(f"{col}_") and col in raw_df.columns:
|
| 93 |
+
return raw_df.at[0, col]
|
| 94 |
+
return None
|
| 95 |
+
|
| 96 |
+
def _align_features_to_model(X: Any, model: Any) -> Any:
|
| 97 |
+
expected = getattr(model, "feature_names_in_", None)
|
| 98 |
+
if expected is None:
|
| 99 |
+
return X
|
| 100 |
+
if isinstance(X, pd.DataFrame):
|
| 101 |
+
return X.reindex(columns=list(expected), fill_value=0)
|
| 102 |
+
return X
|
| 103 |
+
|
| 104 |
+
def _model_family(model: Any) -> str:
|
| 105 |
+
name = type(model).__name__.lower()
|
| 106 |
+
if "xgb" in name:
|
| 107 |
+
return "xgb"
|
| 108 |
+
if "lgbm" in name or "lightgbm" in name:
|
| 109 |
+
return "lgbm"
|
| 110 |
+
if "histgradientboosting" in name:
|
| 111 |
+
return "histgb"
|
| 112 |
+
return "unknown"
|
| 113 |
+
|
| 114 |
+
def _xgb_pred_contribs(estimator: Any, X: Any) -> np.ndarray:
|
| 115 |
+
import xgboost as xgb
|
| 116 |
+
|
| 117 |
+
if isinstance(X, pd.DataFrame):
|
| 118 |
+
dm = xgb.DMatrix(X, feature_names=list(X.columns))
|
| 119 |
+
else:
|
| 120 |
+
dm = xgb.DMatrix(np.asarray(X))
|
| 121 |
+
|
| 122 |
+
booster = estimator.get_booster() if hasattr(estimator, "get_booster") else estimator
|
| 123 |
+
contrib = booster.predict(dm, pred_contribs=True)
|
| 124 |
+
return np.asarray(contrib)[:, :-1]
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def _lgbm_pred_contribs(estimator: Any, X: Any) -> np.ndarray:
|
| 128 |
+
contrib = estimator.predict(X, pred_contrib=True)
|
| 129 |
+
return np.asarray(contrib)[:, :-1]
|
| 130 |
+
|
| 131 |
+
|
| 132 |
def _compute_shap_top_features(record: dict[str, Any], top_k: int = 10) -> pd.DataFrame:
|
| 133 |
preprocessor = app.state.preprocessor
|
| 134 |
+
model = app.state.model
|
| 135 |
df_raw = pd.DataFrame.from_records([record])
|
| 136 |
df_norm, _, _ = _normalize_inputs(df_raw, preprocessor)
|
| 137 |
+
raw_reference = new_features_creation(
|
| 138 |
+
df_norm,
|
| 139 |
+
days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
|
| 140 |
+
engineered_sources=ENGINEERED_SOURCES,
|
| 141 |
+
)
|
| 142 |
+
features = prepare_inference_features(df_norm, preprocessor, model)
|
| 143 |
+
features = _align_features_to_model(features, model)
|
| 144 |
+
|
| 145 |
try:
|
| 146 |
import shap
|
| 147 |
except ImportError:
|
| 148 |
return _shap_error_table("SHAP not installed.")
|
| 149 |
|
| 150 |
+
estimator = model
|
| 151 |
+
X_shap = features
|
| 152 |
+
if hasattr(model, "named_steps") and model.named_steps.get("preprocessing") is not None:
|
| 153 |
+
estimator = model.named_steps.get("estimator", model)
|
| 154 |
+
pipeline_preprocessor = model.named_steps["preprocessing"]
|
| 155 |
+
try:
|
| 156 |
+
X_shap = pipeline_preprocessor.transform(features)
|
| 157 |
+
except Exception as exc:
|
| 158 |
+
return _shap_error_table(f"SHAP preprocessing failed: {exc}")
|
| 159 |
+
try:
|
| 160 |
+
import scipy.sparse as sp
|
| 161 |
+
if sp.issparse(X_shap):
|
| 162 |
+
X_shap = X_shap.toarray()
|
| 163 |
+
except Exception:
|
| 164 |
+
pass
|
| 165 |
try:
|
| 166 |
+
feature_names = pipeline_preprocessor.get_feature_names_out()
|
| 167 |
except Exception:
|
| 168 |
+
feature_names = None
|
| 169 |
+
if feature_names is not None:
|
| 170 |
+
X_shap = pd.DataFrame(X_shap, columns=feature_names)
|
| 171 |
|
| 172 |
+
family = _model_family(estimator)
|
| 173 |
+
|
| 174 |
+
values: np.ndarray | None = None
|
| 175 |
+
|
| 176 |
+
# 1) Contributions natives (meilleur choix pour XGB/LGBM)
|
| 177 |
try:
|
| 178 |
+
if family == "xgb":
|
| 179 |
+
values = _xgb_pred_contribs(estimator, X_shap)
|
| 180 |
+
elif family == "lgbm":
|
| 181 |
+
values = _lgbm_pred_contribs(estimator, X_shap)
|
| 182 |
except Exception:
|
| 183 |
+
values = None
|
| 184 |
+
|
| 185 |
+
# 2) Fallback SHAP (utile surtout pour HistGB / inconnus)
|
| 186 |
+
if values is None:
|
| 187 |
+
cache = getattr(app.state, "shap_explainer_cache", {})
|
| 188 |
+
key = f"{MODEL_VERSION}:{type(estimator).__name__}"
|
| 189 |
+
explainer = cache.get(key)
|
| 190 |
+
|
| 191 |
+
if explainer is None:
|
| 192 |
+
try:
|
| 193 |
+
import shap
|
| 194 |
+
predict_fn = (
|
| 195 |
+
(lambda X: estimator.predict_proba(X)[:, 1])
|
| 196 |
+
if hasattr(estimator, "predict_proba")
|
| 197 |
+
else (lambda X: estimator.predict(X))
|
| 198 |
+
)
|
| 199 |
+
|
| 200 |
+
# Evite le background dégénéré (1 seule ligne)
|
| 201 |
+
if isinstance(X_shap, pd.DataFrame):
|
| 202 |
+
bg = pd.concat([X_shap] * 50, ignore_index=True)
|
| 203 |
+
else:
|
| 204 |
+
bg = np.repeat(np.asarray(X_shap), repeats=50, axis=0)
|
| 205 |
+
|
| 206 |
+
explainer = shap.Explainer(predict_fn, bg)
|
| 207 |
+
except Exception as exc:
|
| 208 |
+
return _shap_error_table(f"SHAP explainer init failed: {exc}")
|
| 209 |
+
|
| 210 |
+
cache[key] = explainer
|
| 211 |
+
app.state.shap_explainer_cache = cache
|
| 212 |
+
|
| 213 |
+
try:
|
| 214 |
+
import shap
|
| 215 |
+
explanation = explainer(X_shap)
|
| 216 |
+
values = _extract_shap_values(explanation.values)
|
| 217 |
+
except Exception as exc:
|
| 218 |
+
return _shap_error_table(f"SHAP failed: {exc}")
|
| 219 |
|
| 220 |
shap_row = values[0]
|
| 221 |
+
if isinstance(X_shap, pd.DataFrame):
|
| 222 |
+
feature_values = X_shap.iloc[0].to_numpy()
|
| 223 |
+
feature_names = X_shap.columns
|
| 224 |
+
else:
|
| 225 |
+
feature_values = np.asarray(X_shap)[0]
|
| 226 |
+
feature_names = [f"feature_{idx}" for idx in range(len(feature_values))]
|
| 227 |
top_idx = np.argsort(np.abs(shap_row))[::-1][:top_k]
|
| 228 |
rows = [
|
| 229 |
{
|
| 230 |
+
"feature": str(feature_names[idx]),
|
| 231 |
+
"raw_value": _clean_raw_value(
|
| 232 |
+
_lookup_raw_value(str(feature_names[idx]), raw_reference, preprocessor)
|
| 233 |
+
),
|
| 234 |
+
"shap_value": float(np.round(shap_row[idx], 6)),
|
| 235 |
}
|
| 236 |
for idx in top_idx
|
| 237 |
]
|
|
|
|
| 242 |
sk_id_curr: float,
|
| 243 |
amt_credit: float,
|
| 244 |
duration_months: float,
|
| 245 |
+
) -> tuple[float | None, str, pd.DataFrame, dict[str, Any]]:
|
|
|
|
| 246 |
_ensure_startup()
|
| 247 |
try:
|
| 248 |
payload = MinimalPredictionRequest(
|
|
|
|
| 251 |
duration_months=int(duration_months),
|
| 252 |
)
|
| 253 |
record = _build_minimal_record(payload, app.state.preprocessor)
|
| 254 |
+
response = predict_minimal(payload, threshold=None, x_client_source="gradio")
|
| 255 |
result = response["predictions"][0]
|
| 256 |
probability = float(result.get("probability", 0.0))
|
| 257 |
pred_value = int(result.get("prediction", 0))
|
|
|
|
| 264 |
"DURATION_MONTHS": int(duration_months),
|
| 265 |
}
|
| 266 |
)
|
| 267 |
+
return probability, label, shap_table, snapshot
|
| 268 |
except HTTPException as exc:
|
| 269 |
+
return None, f"Erreur: {exc.detail}", _shap_error_table("No SHAP available."), {"error": exc.detail}
|
| 270 |
except Exception as exc: # pragma: no cover - UI fallback
|
| 271 |
+
return None, f"Erreur: {exc}", _shap_error_table("No SHAP available."), {"error": str(exc)}
|
| 272 |
|
| 273 |
|
| 274 |
with gr.Blocks(title="Credit scoring MLOps") as demo:
|
|
|
|
| 291 |
sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
|
| 292 |
amt_credit = gr.Number(label="Montant du crédit", value=200000)
|
| 293 |
duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
|
|
|
|
| 294 |
|
| 295 |
run_btn = gr.Button("Scorer")
|
| 296 |
|
| 297 |
with gr.Row():
|
| 298 |
probability = gr.Number(label="Probabilité de défaut")
|
| 299 |
prediction = gr.Textbox(label="Prédiction")
|
|
|
|
| 300 |
|
| 301 |
shap_table = gr.Dataframe(
|
| 302 |
+
headers=["feature", "raw_value", "shap_value"],
|
| 303 |
label="Top 10 SHAP (local)",
|
| 304 |
+
datatype=["str", "str", "number"],
|
| 305 |
interactive=False,
|
| 306 |
)
|
| 307 |
|
|
|
|
| 309 |
|
| 310 |
run_btn.click(
|
| 311 |
score_minimal,
|
| 312 |
+
inputs=[sk_id_curr, amt_credit, duration_months],
|
| 313 |
+
outputs=[probability, prediction, shap_table, snapshot],
|
| 314 |
)
|
| 315 |
|
| 316 |
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py
CHANGED
|
@@ -3,9 +3,19 @@ from __future__ import annotations
|
|
| 3 |
from typing import Any
|
| 4 |
|
| 5 |
import gradio as gr
|
|
|
|
|
|
|
| 6 |
from fastapi import HTTPException
|
| 7 |
|
| 8 |
-
from app.main import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
def _ensure_startup() -> None:
|
|
@@ -30,12 +40,73 @@ def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
|
|
| 30 |
return snapshot
|
| 31 |
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def score_minimal(
|
| 34 |
sk_id_curr: float,
|
| 35 |
amt_credit: float,
|
| 36 |
duration_months: float,
|
| 37 |
threshold: float,
|
| 38 |
-
) -> tuple[float | None, str, float | None, dict[str, Any]]:
|
| 39 |
_ensure_startup()
|
| 40 |
try:
|
| 41 |
payload = MinimalPredictionRequest(
|
|
@@ -43,11 +114,13 @@ def score_minimal(
|
|
| 43 |
amt_credit=float(amt_credit),
|
| 44 |
duration_months=int(duration_months),
|
| 45 |
)
|
|
|
|
| 46 |
response = predict_minimal(payload, threshold=float(threshold))
|
| 47 |
result = response["predictions"][0]
|
| 48 |
probability = float(result.get("probability", 0.0))
|
| 49 |
pred_value = int(result.get("prediction", 0))
|
| 50 |
label = "Default (1)" if pred_value == 1 else "No default (0)"
|
|
|
|
| 51 |
snapshot = _customer_snapshot(int(sk_id_curr))
|
| 52 |
snapshot.update(
|
| 53 |
{
|
|
@@ -55,39 +128,55 @@ def score_minimal(
|
|
| 55 |
"DURATION_MONTHS": int(duration_months),
|
| 56 |
}
|
| 57 |
)
|
| 58 |
-
return probability, label, float(response.get("threshold", 0.0)), snapshot
|
| 59 |
except HTTPException as exc:
|
| 60 |
-
return None, f"Erreur: {exc.detail}", None, {"error": exc.detail}
|
| 61 |
except Exception as exc: # pragma: no cover - UI fallback
|
| 62 |
-
return None, f"Erreur: {exc}", None, {"error": str(exc)}
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
with gr.Blocks(title="Credit
|
| 66 |
-
gr.Markdown("# Credit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
gr.Markdown(
|
| 68 |
-
"Renseignez l'identifiant client, le montant du
|
| 69 |
-
"Les autres features proviennent des donnees clients reference."
|
| 70 |
)
|
| 71 |
|
| 72 |
with gr.Row():
|
| 73 |
-
sk_id_curr = gr.Number(label="
|
| 74 |
-
amt_credit = gr.Number(label="
|
| 75 |
-
duration_months = gr.Number(label="
|
| 76 |
threshold = gr.Slider(label="Seuil", minimum=0.0, maximum=1.0, value=0.5, step=0.01)
|
| 77 |
|
| 78 |
run_btn = gr.Button("Scorer")
|
| 79 |
|
| 80 |
with gr.Row():
|
| 81 |
-
probability = gr.Number(label="
|
| 82 |
-
prediction = gr.Textbox(label="
|
| 83 |
-
threshold_used = gr.Number(label="Seuil
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
snapshot = gr.JSON(label="Snapshot client (
|
| 86 |
|
| 87 |
run_btn.click(
|
| 88 |
score_minimal,
|
| 89 |
inputs=[sk_id_curr, amt_credit, duration_months, threshold],
|
| 90 |
-
outputs=[probability, prediction, threshold_used, snapshot],
|
| 91 |
)
|
| 92 |
|
| 93 |
|
|
|
|
| 3 |
from typing import Any
|
| 4 |
|
| 5 |
import gradio as gr
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
from fastapi import HTTPException
|
| 9 |
|
| 10 |
+
from app.main import (
|
| 11 |
+
MinimalPredictionRequest,
|
| 12 |
+
app,
|
| 13 |
+
predict_minimal,
|
| 14 |
+
startup_event,
|
| 15 |
+
_build_minimal_record,
|
| 16 |
+
_normalize_inputs,
|
| 17 |
+
preprocess_input,
|
| 18 |
+
)
|
| 19 |
|
| 20 |
|
| 21 |
def _ensure_startup() -> None:
|
|
|
|
| 40 |
return snapshot
|
| 41 |
|
| 42 |
|
| 43 |
+
def _shap_error_table(message: str) -> pd.DataFrame:
|
| 44 |
+
return pd.DataFrame(
|
| 45 |
+
[
|
| 46 |
+
{
|
| 47 |
+
"feature": message,
|
| 48 |
+
"value": np.nan,
|
| 49 |
+
"shap_value": np.nan,
|
| 50 |
+
}
|
| 51 |
+
]
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _extract_shap_values(shap_values: Any) -> np.ndarray:
|
| 56 |
+
if isinstance(shap_values, list):
|
| 57 |
+
shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0]
|
| 58 |
+
values = np.asarray(shap_values)
|
| 59 |
+
if values.ndim == 3:
|
| 60 |
+
values = values[:, :, 1]
|
| 61 |
+
if values.ndim == 1:
|
| 62 |
+
values = values.reshape(1, -1)
|
| 63 |
+
return values
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _compute_shap_top_features(record: dict[str, Any], top_k: int = 10) -> pd.DataFrame:
|
| 67 |
+
preprocessor = app.state.preprocessor
|
| 68 |
+
df_raw = pd.DataFrame.from_records([record])
|
| 69 |
+
df_norm, _, _ = _normalize_inputs(df_raw, preprocessor)
|
| 70 |
+
features = preprocess_input(df_norm, preprocessor)
|
| 71 |
+
try:
|
| 72 |
+
import shap
|
| 73 |
+
except ImportError:
|
| 74 |
+
return _shap_error_table("SHAP not installed.")
|
| 75 |
+
|
| 76 |
+
explainer = getattr(app.state, "shap_explainer", None)
|
| 77 |
+
if explainer is None:
|
| 78 |
+
try:
|
| 79 |
+
explainer = shap.TreeExplainer(app.state.model)
|
| 80 |
+
except Exception:
|
| 81 |
+
explainer = shap.Explainer(app.state.model, features)
|
| 82 |
+
app.state.shap_explainer = explainer
|
| 83 |
+
|
| 84 |
+
try:
|
| 85 |
+
explanation = explainer(features)
|
| 86 |
+
values = _extract_shap_values(explanation.values)
|
| 87 |
+
except Exception:
|
| 88 |
+
values = _extract_shap_values(explainer.shap_values(features))
|
| 89 |
+
|
| 90 |
+
shap_row = values[0]
|
| 91 |
+
feature_values = features.iloc[0].to_numpy()
|
| 92 |
+
top_idx = np.argsort(np.abs(shap_row))[::-1][:top_k]
|
| 93 |
+
rows = [
|
| 94 |
+
{
|
| 95 |
+
"feature": str(features.columns[idx]),
|
| 96 |
+
"value": float(feature_values[idx]),
|
| 97 |
+
"shap_value": float(shap_row[idx]),
|
| 98 |
+
}
|
| 99 |
+
for idx in top_idx
|
| 100 |
+
]
|
| 101 |
+
return pd.DataFrame(rows)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
def score_minimal(
|
| 105 |
sk_id_curr: float,
|
| 106 |
amt_credit: float,
|
| 107 |
duration_months: float,
|
| 108 |
threshold: float,
|
| 109 |
+
) -> tuple[float | None, str, float | None, pd.DataFrame, dict[str, Any]]:
|
| 110 |
_ensure_startup()
|
| 111 |
try:
|
| 112 |
payload = MinimalPredictionRequest(
|
|
|
|
| 114 |
amt_credit=float(amt_credit),
|
| 115 |
duration_months=int(duration_months),
|
| 116 |
)
|
| 117 |
+
record = _build_minimal_record(payload, app.state.preprocessor)
|
| 118 |
response = predict_minimal(payload, threshold=float(threshold))
|
| 119 |
result = response["predictions"][0]
|
| 120 |
probability = float(result.get("probability", 0.0))
|
| 121 |
pred_value = int(result.get("prediction", 0))
|
| 122 |
label = "Default (1)" if pred_value == 1 else "No default (0)"
|
| 123 |
+
shap_table = _compute_shap_top_features(record, top_k=10)
|
| 124 |
snapshot = _customer_snapshot(int(sk_id_curr))
|
| 125 |
snapshot.update(
|
| 126 |
{
|
|
|
|
| 128 |
"DURATION_MONTHS": int(duration_months),
|
| 129 |
}
|
| 130 |
)
|
| 131 |
+
return probability, label, float(response.get("threshold", 0.0)), shap_table, snapshot
|
| 132 |
except HTTPException as exc:
|
| 133 |
+
return None, f"Erreur: {exc.detail}", None, _shap_error_table("No SHAP available."), {"error": exc.detail}
|
| 134 |
except Exception as exc: # pragma: no cover - UI fallback
|
| 135 |
+
return None, f"Erreur: {exc}", None, _shap_error_table("No SHAP available."), {"error": str(exc)}
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
with gr.Blocks(title="Credit scoring MLOps") as demo:
|
| 139 |
+
gr.Markdown("# Credit scoring MLOps")
|
| 140 |
+
gr.HTML("""
|
| 141 |
+
<div style="display:flex; gap:0.5rem; flex-wrap:wrap;">
|
| 142 |
+
<a href="https://github.com/stephmnt/credit-scoring-mlops/releases" target="_blank" rel="noreferrer">
|
| 143 |
+
<img src="https://img.shields.io/github/v/release/stephmnt/credit-scoring-mlops" alt="GitHub Release" />
|
| 144 |
+
</a>
|
| 145 |
+
<a href="https://github.com/stephmnt/credit-scoring-mlops/actions/workflows/deploy.yml" target="_blank" rel="noreferrer">
|
| 146 |
+
<img src="https://img.shields.io/github/actions/workflow/status/stephmnt/credit-scoring-mlops/deploy.yml" alt="GitHub Actions Workflow Status" />
|
| 147 |
+
</a>
|
| 148 |
+
</div>
|
| 149 |
+
""")
|
| 150 |
gr.Markdown(
|
| 151 |
+
"Renseignez l'identifiant client, le montant du crédit et la durée. "
|
|
|
|
| 152 |
)
|
| 153 |
|
| 154 |
with gr.Row():
|
| 155 |
+
sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
|
| 156 |
+
amt_credit = gr.Number(label="Montant du crédit", value=200000)
|
| 157 |
+
duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
|
| 158 |
threshold = gr.Slider(label="Seuil", minimum=0.0, maximum=1.0, value=0.5, step=0.01)
|
| 159 |
|
| 160 |
run_btn = gr.Button("Scorer")
|
| 161 |
|
| 162 |
with gr.Row():
|
| 163 |
+
probability = gr.Number(label="Probabilité de défaut")
|
| 164 |
+
prediction = gr.Textbox(label="Prédiction")
|
| 165 |
+
threshold_used = gr.Number(label="Seuil utilisé")
|
| 166 |
+
|
| 167 |
+
shap_table = gr.Dataframe(
|
| 168 |
+
headers=["feature", "value", "shap_value"],
|
| 169 |
+
label="Top 10 SHAP (local)",
|
| 170 |
+
datatype=["str", "number", "number"],
|
| 171 |
+
interactive=False,
|
| 172 |
+
)
|
| 173 |
|
| 174 |
+
snapshot = gr.JSON(label="Snapshot client (référence)")
|
| 175 |
|
| 176 |
run_btn.click(
|
| 177 |
score_minimal,
|
| 178 |
inputs=[sk_id_curr, amt_credit, duration_months, threshold],
|
| 179 |
+
outputs=[probability, prediction, threshold_used, shap_table, snapshot],
|
| 180 |
)
|
| 181 |
|
| 182 |
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py
CHANGED
|
@@ -1,3 +1 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
from app_entry import app, demo # re-export for uvicorn app:app
|
|
|
|
| 1 |
+
"""Package marker for the FastAPI app package."""
|
|
|
|
|
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile
CHANGED
|
@@ -9,8 +9,9 @@ COPY requirements.txt .
|
|
| 9 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 10 |
|
| 11 |
COPY app/ app/
|
| 12 |
-
COPY
|
| 13 |
-
COPY
|
|
|
|
| 14 |
|
| 15 |
EXPOSE 7860
|
| 16 |
|
|
|
|
| 9 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 10 |
|
| 11 |
COPY app/ app/
|
| 12 |
+
COPY app_entry.py app.py gradio_app.py ./
|
| 13 |
+
COPY data/ data/
|
| 14 |
+
COPY artifacts/ artifacts/
|
| 15 |
|
| 16 |
EXPOSE 7860
|
| 17 |
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py
CHANGED
|
@@ -1,22 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
-
import gradio as gr
|
| 3 |
-
|
| 4 |
-
from app.main import app as api_app
|
| 5 |
-
from app.main import startup_event
|
| 6 |
-
from gradio_app import demo
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
root_app = FastAPI()
|
| 10 |
-
root_app.mount("/api", api_app)
|
| 11 |
-
root_app = gr.mount_gradio_app(root_app, demo, path="/")
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
@root_app.on_event("startup")
|
| 15 |
-
def _startup() -> None:
|
| 16 |
-
startup_event()
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
app = root_app
|
| 20 |
|
| 21 |
|
| 22 |
if __name__ == "__main__":
|
|
|
|
| 1 |
+
from app_entry import app, demo # re-export for HF Spaces
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
if __name__ == "__main__":
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/__init__.py
CHANGED
|
@@ -1 +1,3 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Expose combined ASGI app for HF Spaces default loader."""
|
| 2 |
+
|
| 3 |
+
from app_entry import app, demo # re-export for uvicorn app:app
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py
CHANGED
|
@@ -1113,6 +1113,16 @@ def startup_event() -> None:
|
|
| 1113 |
logger.info("Loading model from %s", model_path)
|
| 1114 |
app.state.model = load_model(model_path)
|
| 1115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1116 |
try:
|
| 1117 |
artifacts_path = ARTIFACTS_PATH
|
| 1118 |
if not artifacts_path.exists():
|
|
@@ -1125,7 +1135,7 @@ def startup_event() -> None:
|
|
| 1125 |
if downloaded is not None:
|
| 1126 |
artifacts_path = downloaded
|
| 1127 |
logger.info("Loading preprocessor artifacts from %s", artifacts_path)
|
| 1128 |
-
app.state.preprocessor = load_preprocessor(
|
| 1129 |
except RuntimeError as exc:
|
| 1130 |
if ALLOW_MISSING_ARTIFACTS:
|
| 1131 |
logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
|
|
|
|
| 1113 |
logger.info("Loading model from %s", model_path)
|
| 1114 |
app.state.model = load_model(model_path)
|
| 1115 |
|
| 1116 |
+
data_path = DATA_PATH
|
| 1117 |
+
if not data_path.exists():
|
| 1118 |
+
downloaded = _ensure_hf_asset(
|
| 1119 |
+
data_path,
|
| 1120 |
+
HF_CUSTOMER_REPO_ID,
|
| 1121 |
+
HF_CUSTOMER_FILENAME,
|
| 1122 |
+
HF_CUSTOMER_REPO_TYPE,
|
| 1123 |
+
)
|
| 1124 |
+
if downloaded is not None:
|
| 1125 |
+
data_path = downloaded
|
| 1126 |
try:
|
| 1127 |
artifacts_path = ARTIFACTS_PATH
|
| 1128 |
if not artifacts_path.exists():
|
|
|
|
| 1135 |
if downloaded is not None:
|
| 1136 |
artifacts_path = downloaded
|
| 1137 |
logger.info("Loading preprocessor artifacts from %s", artifacts_path)
|
| 1138 |
+
app.state.preprocessor = load_preprocessor(data_path, artifacts_path)
|
| 1139 |
except RuntimeError as exc:
|
| 1140 |
if ALLOW_MISSING_ARTIFACTS:
|
| 1141 |
logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app_entry.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
+
import gradio as gr
|
| 3 |
+
|
| 4 |
+
from app.main import app as api_app
|
| 5 |
+
from app.main import startup_event
|
| 6 |
+
from gradio_app import demo
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
root_app = FastAPI()
|
| 10 |
+
root_app.mount("/api", api_app)
|
| 11 |
+
root_app = gr.mount_gradio_app(root_app, demo, path="/")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@root_app.on_event("startup")
|
| 15 |
+
def _startup() -> None:
|
| 16 |
+
startup_event()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
app = root_app
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile
CHANGED
|
@@ -14,4 +14,4 @@ COPY artifacts/preprocessor.joblib artifacts/
|
|
| 14 |
|
| 15 |
EXPOSE 7860
|
| 16 |
|
| 17 |
-
CMD ["uvicorn", "
|
|
|
|
| 14 |
|
| 15 |
EXPOSE 7860
|
| 16 |
|
| 17 |
+
CMD ["uvicorn", "app_entry:app", "--host", "0.0.0.0", "--port", "7860"]
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md
CHANGED
|
@@ -198,29 +198,38 @@ Exemple (un seul repo dataset avec 3 fichiers) :
|
|
| 198 |
|
| 199 |
### Demo live (commandes cles en main)
|
| 200 |
|
| 201 |
-
Lancer l'API :
|
| 202 |
|
| 203 |
```shell
|
| 204 |
uvicorn app.main:app --reload --port 7860
|
| 205 |
```
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
Verifier le service (HF) :
|
| 208 |
|
| 209 |
```shell
|
| 210 |
BASE_URL="https://stephmnt-credit-scoring-mlops.hf.space"
|
| 211 |
-
|
|
|
|
| 212 |
```
|
| 213 |
|
|
|
|
|
|
|
| 214 |
Voir les features attendues (HF) :
|
| 215 |
|
| 216 |
```shell
|
| 217 |
-
curl -s "${
|
| 218 |
```
|
| 219 |
|
| 220 |
Predire un client (HF) :
|
| 221 |
|
| 222 |
```shell
|
| 223 |
-
curl -s -X POST "${
|
| 224 |
-H "Content-Type: application/json" \
|
| 225 |
-d '{
|
| 226 |
"data": {
|
|
@@ -242,7 +251,7 @@ curl -s -X POST "${BASE_URL}/predict?threshold=0.5" \
|
|
| 242 |
Predire plusieurs clients (batch, HF) :
|
| 243 |
|
| 244 |
```shell
|
| 245 |
-
curl -s -X POST "${
|
| 246 |
-H "Content-Type: application/json" \
|
| 247 |
-d '{
|
| 248 |
"data": [
|
|
@@ -279,7 +288,7 @@ curl -s -X POST "${BASE_URL}/predict?threshold=0.45" \
|
|
| 279 |
Exemple d'erreur (champ requis manquant, HF) :
|
| 280 |
|
| 281 |
```shell
|
| 282 |
-
curl -s -X POST "${
|
| 283 |
-H "Content-Type: application/json" \
|
| 284 |
-d '{
|
| 285 |
"data": {
|
|
@@ -316,13 +325,13 @@ Recuperer les logs (HF) :
|
|
| 316 |
Configurer `LOGS_ACCESS_TOKEN` dans les secrets du Space, puis :
|
| 317 |
|
| 318 |
```shell
|
| 319 |
-
curl -s -H "X-Logs-Token: $LOGS_ACCESS_TOKEN" "${
|
| 320 |
```
|
| 321 |
|
| 322 |
Alternative :
|
| 323 |
|
| 324 |
```shell
|
| 325 |
-
curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${
|
| 326 |
```
|
| 327 |
|
| 328 |
Apres quelques requêtes, générer le rapport de drift :
|
|
|
|
| 198 |
|
| 199 |
### Demo live (commandes cles en main)
|
| 200 |
|
| 201 |
+
Lancer l'API (sans UI) :
|
| 202 |
|
| 203 |
```shell
|
| 204 |
uvicorn app.main:app --reload --port 7860
|
| 205 |
```
|
| 206 |
|
| 207 |
+
Lancer l'UI Gradio + API (chemin `/api`) :
|
| 208 |
+
|
| 209 |
+
```shell
|
| 210 |
+
uvicorn app_entry:app --reload --port 7860
|
| 211 |
+
```
|
| 212 |
+
|
| 213 |
Verifier le service (HF) :
|
| 214 |
|
| 215 |
```shell
|
| 216 |
BASE_URL="https://stephmnt-credit-scoring-mlops.hf.space"
|
| 217 |
+
API_BASE="${BASE_URL}/api"
|
| 218 |
+
curl -s "${API_BASE}/health"
|
| 219 |
```
|
| 220 |
|
| 221 |
+
Note : sur HF Spaces, l'UI Gradio est a la racine, l'API est sous `/api`.
|
| 222 |
+
|
| 223 |
Voir les features attendues (HF) :
|
| 224 |
|
| 225 |
```shell
|
| 226 |
+
curl -s "${API_BASE}/features"
|
| 227 |
```
|
| 228 |
|
| 229 |
Predire un client (HF) :
|
| 230 |
|
| 231 |
```shell
|
| 232 |
+
curl -s -X POST "${API_BASE}/predict?threshold=0.5" \
|
| 233 |
-H "Content-Type: application/json" \
|
| 234 |
-d '{
|
| 235 |
"data": {
|
|
|
|
| 251 |
Predire plusieurs clients (batch, HF) :
|
| 252 |
|
| 253 |
```shell
|
| 254 |
+
curl -s -X POST "${API_BASE}/predict?threshold=0.45" \
|
| 255 |
-H "Content-Type: application/json" \
|
| 256 |
-d '{
|
| 257 |
"data": [
|
|
|
|
| 288 |
Exemple d'erreur (champ requis manquant, HF) :
|
| 289 |
|
| 290 |
```shell
|
| 291 |
+
curl -s -X POST "${API_BASE}/predict" \
|
| 292 |
-H "Content-Type: application/json" \
|
| 293 |
-d '{
|
| 294 |
"data": {
|
|
|
|
| 325 |
Configurer `LOGS_ACCESS_TOKEN` dans les secrets du Space, puis :
|
| 326 |
|
| 327 |
```shell
|
| 328 |
+
curl -s -H "X-Logs-Token: $LOGS_ACCESS_TOKEN" "${API_BASE}/logs?tail=200"
|
| 329 |
```
|
| 330 |
|
| 331 |
Alternative :
|
| 332 |
|
| 333 |
```shell
|
| 334 |
+
curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${API_BASE}/logs?tail=200"
|
| 335 |
```
|
| 336 |
|
| 337 |
Apres quelques requêtes, générer le rapport de drift :
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitattributes
CHANGED
|
@@ -1,35 +1,4 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
data/HistGB_final_model.pkl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 3 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy-assets.yml
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: deploy-assets
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
workflow_dispatch:
|
| 5 |
+
inputs:
|
| 6 |
+
repo_id:
|
| 7 |
+
description: "HF repo id (e.g. stephmnt/assets-credit-scoring-mlops)"
|
| 8 |
+
required: true
|
| 9 |
+
default: "stephmnt/assets-credit-scoring-mlops"
|
| 10 |
+
repo_type:
|
| 11 |
+
description: "HF repo type (dataset or model)"
|
| 12 |
+
required: true
|
| 13 |
+
default: "dataset"
|
| 14 |
+
|
| 15 |
+
jobs:
|
| 16 |
+
upload-assets:
|
| 17 |
+
runs-on: ubuntu-latest
|
| 18 |
+
steps:
|
| 19 |
+
- name: Checkout
|
| 20 |
+
uses: actions/checkout@v4
|
| 21 |
+
with:
|
| 22 |
+
lfs: true
|
| 23 |
+
|
| 24 |
+
- name: Set up Python
|
| 25 |
+
uses: actions/setup-python@v5
|
| 26 |
+
with:
|
| 27 |
+
python-version: "3.11"
|
| 28 |
+
|
| 29 |
+
- name: Install dependencies
|
| 30 |
+
run: |
|
| 31 |
+
python -m pip install --upgrade pip
|
| 32 |
+
python -m pip install huggingface_hub
|
| 33 |
+
|
| 34 |
+
- name: Upload assets to Hugging Face Hub
|
| 35 |
+
env:
|
| 36 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 37 |
+
HF_REPO_ID: ${{ inputs.repo_id }}
|
| 38 |
+
HF_REPO_TYPE: ${{ inputs.repo_type }}
|
| 39 |
+
run: |
|
| 40 |
+
python - <<'PY'
|
| 41 |
+
import os
|
| 42 |
+
from pathlib import Path
|
| 43 |
+
from huggingface_hub import HfApi
|
| 44 |
+
|
| 45 |
+
repo_id = os.environ["HF_REPO_ID"]
|
| 46 |
+
repo_type = os.environ["HF_REPO_TYPE"]
|
| 47 |
+
token = os.environ["HF_TOKEN"]
|
| 48 |
+
|
| 49 |
+
files = {
|
| 50 |
+
"data/HistGB_final_model.pkl": "HistGB_final_model.pkl",
|
| 51 |
+
"artifacts/preprocessor.joblib": "preprocessor.joblib",
|
| 52 |
+
"data/data_final.parquet": "data_final.parquet",
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
api = HfApi()
|
| 56 |
+
for local_path, remote_name in files.items():
|
| 57 |
+
path = Path(local_path)
|
| 58 |
+
if not path.exists():
|
| 59 |
+
raise SystemExit(f"Missing file: {path}")
|
| 60 |
+
api.upload_file(
|
| 61 |
+
path_or_fileobj=str(path),
|
| 62 |
+
path_in_repo=remote_name,
|
| 63 |
+
repo_id=repo_id,
|
| 64 |
+
repo_type=repo_type,
|
| 65 |
+
token=token,
|
| 66 |
+
commit_message=f"Update {remote_name}",
|
| 67 |
+
)
|
| 68 |
+
print("Assets uploaded.")
|
| 69 |
+
PY
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.github/workflows/deploy.yml
CHANGED
|
@@ -12,6 +12,8 @@ jobs:
|
|
| 12 |
steps:
|
| 13 |
- name: Checkout
|
| 14 |
uses: actions/checkout@v4
|
|
|
|
|
|
|
| 15 |
|
| 16 |
- name: Set up Python
|
| 17 |
uses: actions/setup-python@v5
|
|
@@ -47,6 +49,8 @@ jobs:
|
|
| 47 |
--exclude 'logs' \
|
| 48 |
--exclude 'reports' \
|
| 49 |
--exclude 'screen-mlflow.png' \
|
|
|
|
|
|
|
| 50 |
--exclude 'data/*.csv' \
|
| 51 |
--exclude 'data/*.parquet' \
|
| 52 |
./ hf_space/
|
|
|
|
| 12 |
steps:
|
| 13 |
- name: Checkout
|
| 14 |
uses: actions/checkout@v4
|
| 15 |
+
with:
|
| 16 |
+
lfs: true
|
| 17 |
|
| 18 |
- name: Set up Python
|
| 19 |
uses: actions/setup-python@v5
|
|
|
|
| 49 |
--exclude 'logs' \
|
| 50 |
--exclude 'reports' \
|
| 51 |
--exclude 'screen-mlflow.png' \
|
| 52 |
+
--exclude 'data/HistGB_final_model.pkl' \
|
| 53 |
+
--exclude 'artifacts/preprocessor.joblib' \
|
| 54 |
--exclude 'data/*.csv' \
|
| 55 |
--exclude 'data/*.parquet' \
|
| 56 |
./ hf_space/
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitignore
CHANGED
|
@@ -6,6 +6,7 @@ logs/
|
|
| 6 |
reports/
|
| 7 |
data/*
|
| 8 |
!data/HistGB_final_model.pkl
|
|
|
|
| 9 |
artifacts/*
|
| 10 |
!artifacts/preprocessor.joblib
|
| 11 |
.DS_Store
|
|
@@ -18,7 +19,8 @@ mlruns/
|
|
| 18 |
*.code-workspace
|
| 19 |
presentation_projet08.pptx
|
| 20 |
rapport_projet06.md
|
| 21 |
-
|
|
|
|
| 22 |
## https://github.com/github/gitignore/blob/e8554d85bf62e38d6db966a50d2064ac025fd82a/Python.gitignore
|
| 23 |
|
| 24 |
# Byte-compiled / optimized / DLL files
|
|
|
|
| 6 |
reports/
|
| 7 |
data/*
|
| 8 |
!data/HistGB_final_model.pkl
|
| 9 |
+
!data/data_final.parquet
|
| 10 |
artifacts/*
|
| 11 |
!artifacts/preprocessor.joblib
|
| 12 |
.DS_Store
|
|
|
|
| 19 |
*.code-workspace
|
| 20 |
presentation_projet08.pptx
|
| 21 |
rapport_projet06.md
|
| 22 |
+
rapport_template.md
|
| 23 |
+
data_final.parquet
|
| 24 |
## https://github.com/github/gitignore/blob/e8554d85bf62e38d6db966a50d2064ac025fd82a/Python.gitignore
|
| 25 |
|
| 26 |
# Byte-compiled / optimized / DLL files
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py
CHANGED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
+
import gradio as gr
|
| 3 |
+
|
| 4 |
+
from app.main import app as api_app
|
| 5 |
+
from app.main import startup_event
|
| 6 |
+
from gradio_app import demo
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
root_app = FastAPI()
|
| 10 |
+
root_app.mount("/api", api_app)
|
| 11 |
+
root_app = gr.mount_gradio_app(root_app, demo, path="/")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@root_app.on_event("startup")
|
| 15 |
+
def _startup() -> None:
|
| 16 |
+
startup_event()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
app = root_app
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
if __name__ == "__main__":
|
| 23 |
+
import uvicorn
|
| 24 |
+
|
| 25 |
+
uvicorn.run(app, host="0.0.0.0", port=7860)
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app/main.py
CHANGED
|
@@ -41,6 +41,18 @@ LOG_INCLUDE_INPUTS = os.getenv("LOG_INCLUDE_INPUTS", "1") == "1"
|
|
| 41 |
LOG_HASH_SK_ID = os.getenv("LOG_HASH_SK_ID", "0") == "1"
|
| 42 |
MODEL_VERSION = os.getenv("MODEL_VERSION", MODEL_PATH.name)
|
| 43 |
LOGS_ACCESS_TOKEN = os.getenv("LOGS_ACCESS_TOKEN")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
|
| 46 |
ENGINEERED_FEATURES = [
|
|
@@ -117,6 +129,13 @@ class PredictionRequest(BaseModel):
|
|
| 117 |
data: dict[str, Any] | list[dict[str, Any]]
|
| 118 |
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
@dataclass
|
| 121 |
class PreprocessorArtifacts:
|
| 122 |
columns_keep: list[str]
|
|
@@ -173,6 +192,32 @@ def _normalize_category_value(value: object, mapping: dict[str, str]) -> object:
|
|
| 173 |
return mapping.get(key, "Unknown")
|
| 174 |
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
def _normalize_inputs(
|
| 177 |
df_raw: pd.DataFrame,
|
| 178 |
preprocessor: PreprocessorArtifacts,
|
|
@@ -262,6 +307,54 @@ def _build_data_quality_records(
|
|
| 262 |
return records
|
| 263 |
|
| 264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
def _append_log_entries(entries: list[dict[str, Any]]) -> None:
|
| 266 |
if not LOG_PREDICTIONS:
|
| 267 |
return
|
|
@@ -596,6 +689,41 @@ def load_model(model_path: Path):
|
|
| 596 |
return pickle.load(handle)
|
| 597 |
|
| 598 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
def _infer_numeric_ranges_from_scaler(preprocessor: PreprocessorArtifacts) -> dict[str, tuple[float, float]]:
|
| 600 |
ranges = {}
|
| 601 |
scaler = getattr(preprocessor, "scaler", None)
|
|
@@ -963,19 +1091,41 @@ def preprocess_input(df_raw: pd.DataFrame, artifacts: PreprocessorArtifacts) ->
|
|
| 963 |
|
| 964 |
@app.on_event("startup")
|
| 965 |
def startup_event() -> None:
|
| 966 |
-
if not
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 967 |
if ALLOW_MISSING_ARTIFACTS:
|
| 968 |
-
logger.warning("Model file not found: %s. Using dummy model.",
|
| 969 |
app.state.model = DummyModel()
|
| 970 |
else:
|
| 971 |
-
raise RuntimeError(f"Model file not found: {
|
| 972 |
else:
|
| 973 |
-
logger.info("Loading model from %s",
|
| 974 |
-
app.state.model = load_model(
|
| 975 |
|
| 976 |
try:
|
| 977 |
-
|
| 978 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 979 |
except RuntimeError as exc:
|
| 980 |
if ALLOW_MISSING_ARTIFACTS:
|
| 981 |
logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
|
|
@@ -983,6 +1133,19 @@ def startup_event() -> None:
|
|
| 983 |
else:
|
| 984 |
raise
|
| 985 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 986 |
|
| 987 |
@app.get("/health")
|
| 988 |
def health() -> dict[str, str]:
|
|
@@ -1063,16 +1226,11 @@ def logs(
|
|
| 1063 |
return Response(content="".join(lines), media_type="application/x-ndjson")
|
| 1064 |
|
| 1065 |
|
| 1066 |
-
|
| 1067 |
-
def predict(
|
| 1068 |
-
payload: PredictionRequest,
|
| 1069 |
-
threshold: float | None = Query(default=None, ge=0.0, le=1.0),
|
| 1070 |
-
) -> dict[str, Any]:
|
| 1071 |
model = app.state.model
|
| 1072 |
preprocessor: PreprocessorArtifacts = app.state.preprocessor
|
| 1073 |
request_id = str(uuid.uuid4())
|
| 1074 |
start_time = time.perf_counter()
|
| 1075 |
-
records = payload.data if isinstance(payload.data, list) else [payload.data]
|
| 1076 |
|
| 1077 |
if not records:
|
| 1078 |
raise HTTPException(status_code=422, detail={"message": "No input records provided."})
|
|
@@ -1168,3 +1326,22 @@ def predict(
|
|
| 1168 |
error=str(exc),
|
| 1169 |
)
|
| 1170 |
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
LOG_HASH_SK_ID = os.getenv("LOG_HASH_SK_ID", "0") == "1"
|
| 42 |
MODEL_VERSION = os.getenv("MODEL_VERSION", MODEL_PATH.name)
|
| 43 |
LOGS_ACCESS_TOKEN = os.getenv("LOGS_ACCESS_TOKEN")
|
| 44 |
+
CUSTOMER_DATA_PATH = Path(os.getenv("CUSTOMER_DATA_PATH", str(DATA_PATH)))
|
| 45 |
+
CUSTOMER_LOOKUP_ENABLED = os.getenv("CUSTOMER_LOOKUP_ENABLED", "1") == "1"
|
| 46 |
+
CUSTOMER_LOOKUP_CACHE = os.getenv("CUSTOMER_LOOKUP_CACHE", "1") == "1"
|
| 47 |
+
HF_MODEL_REPO_ID = os.getenv("HF_MODEL_REPO_ID")
|
| 48 |
+
HF_MODEL_REPO_TYPE = os.getenv("HF_MODEL_REPO_TYPE", "model")
|
| 49 |
+
HF_MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", MODEL_PATH.name)
|
| 50 |
+
HF_PREPROCESSOR_REPO_ID = os.getenv("HF_PREPROCESSOR_REPO_ID", HF_MODEL_REPO_ID or "")
|
| 51 |
+
HF_PREPROCESSOR_REPO_TYPE = os.getenv("HF_PREPROCESSOR_REPO_TYPE", HF_MODEL_REPO_TYPE)
|
| 52 |
+
HF_PREPROCESSOR_FILENAME = os.getenv("HF_PREPROCESSOR_FILENAME", ARTIFACTS_PATH.name)
|
| 53 |
+
HF_CUSTOMER_REPO_ID = os.getenv("HF_CUSTOMER_REPO_ID")
|
| 54 |
+
HF_CUSTOMER_REPO_TYPE = os.getenv("HF_CUSTOMER_REPO_TYPE", "dataset")
|
| 55 |
+
HF_CUSTOMER_FILENAME = os.getenv("HF_CUSTOMER_FILENAME", CUSTOMER_DATA_PATH.name)
|
| 56 |
|
| 57 |
IGNORE_FEATURES = ["is_train", "is_test", "TARGET", "SK_ID_CURR"]
|
| 58 |
ENGINEERED_FEATURES = [
|
|
|
|
| 129 |
data: dict[str, Any] | list[dict[str, Any]]
|
| 130 |
|
| 131 |
|
| 132 |
+
class MinimalPredictionRequest(BaseModel):
|
| 133 |
+
sk_id_curr: int
|
| 134 |
+
amt_credit: float
|
| 135 |
+
duration_months: int | None = None
|
| 136 |
+
amt_annuity: float | None = None
|
| 137 |
+
|
| 138 |
+
|
| 139 |
@dataclass
|
| 140 |
class PreprocessorArtifacts:
|
| 141 |
columns_keep: list[str]
|
|
|
|
| 192 |
return mapping.get(key, "Unknown")
|
| 193 |
|
| 194 |
|
| 195 |
+
def _ensure_hf_asset(
|
| 196 |
+
local_path: Path,
|
| 197 |
+
repo_id: str | None,
|
| 198 |
+
filename: str,
|
| 199 |
+
repo_type: str,
|
| 200 |
+
) -> Path | None:
|
| 201 |
+
if local_path.exists():
|
| 202 |
+
return local_path
|
| 203 |
+
if not repo_id:
|
| 204 |
+
return None
|
| 205 |
+
try:
|
| 206 |
+
from huggingface_hub import hf_hub_download
|
| 207 |
+
except ImportError as exc: # pragma: no cover - optional dependency
|
| 208 |
+
raise RuntimeError("huggingface_hub is required to download remote assets.") from exc
|
| 209 |
+
local_path.parent.mkdir(parents=True, exist_ok=True)
|
| 210 |
+
return Path(
|
| 211 |
+
hf_hub_download(
|
| 212 |
+
repo_id=repo_id,
|
| 213 |
+
filename=filename,
|
| 214 |
+
repo_type=repo_type,
|
| 215 |
+
local_dir=str(local_path.parent),
|
| 216 |
+
local_dir_use_symlinks=False,
|
| 217 |
+
)
|
| 218 |
+
)
|
| 219 |
+
|
| 220 |
+
|
| 221 |
def _normalize_inputs(
|
| 222 |
df_raw: pd.DataFrame,
|
| 223 |
preprocessor: PreprocessorArtifacts,
|
|
|
|
| 307 |
return records
|
| 308 |
|
| 309 |
|
| 310 |
+
def _build_minimal_record(
|
| 311 |
+
payload: MinimalPredictionRequest,
|
| 312 |
+
preprocessor: PreprocessorArtifacts,
|
| 313 |
+
) -> dict[str, Any]:
|
| 314 |
+
reference = _get_customer_reference(preprocessor)
|
| 315 |
+
if reference is None:
|
| 316 |
+
raise HTTPException(
|
| 317 |
+
status_code=503,
|
| 318 |
+
detail={"message": "Customer reference data is not available."},
|
| 319 |
+
)
|
| 320 |
+
sk_id = int(payload.sk_id_curr)
|
| 321 |
+
if sk_id not in reference.index:
|
| 322 |
+
raise HTTPException(
|
| 323 |
+
status_code=404,
|
| 324 |
+
detail={"message": f"Client {sk_id} not found in reference data."},
|
| 325 |
+
)
|
| 326 |
+
record = reference.loc[sk_id].to_dict()
|
| 327 |
+
record["SK_ID_CURR"] = sk_id
|
| 328 |
+
if payload.amt_credit <= 0:
|
| 329 |
+
raise HTTPException(
|
| 330 |
+
status_code=422,
|
| 331 |
+
detail={"message": "AMT_CREDIT must be positive."},
|
| 332 |
+
)
|
| 333 |
+
record["AMT_CREDIT"] = float(payload.amt_credit)
|
| 334 |
+
if payload.amt_annuity is not None:
|
| 335 |
+
if payload.amt_annuity <= 0:
|
| 336 |
+
raise HTTPException(
|
| 337 |
+
status_code=422,
|
| 338 |
+
detail={"message": "AMT_ANNUITY must be positive."},
|
| 339 |
+
)
|
| 340 |
+
record["AMT_ANNUITY"] = float(payload.amt_annuity)
|
| 341 |
+
elif payload.duration_months is not None:
|
| 342 |
+
if payload.duration_months <= 0:
|
| 343 |
+
raise HTTPException(
|
| 344 |
+
status_code=422,
|
| 345 |
+
detail={"message": "duration_months must be positive."},
|
| 346 |
+
)
|
| 347 |
+
record["AMT_ANNUITY"] = float(payload.amt_credit) / float(payload.duration_months)
|
| 348 |
+
else:
|
| 349 |
+
raise HTTPException(
|
| 350 |
+
status_code=422,
|
| 351 |
+
detail={"message": "Provide duration_months or amt_annuity."},
|
| 352 |
+
)
|
| 353 |
+
if "AMT_GOODS_PRICE" in record:
|
| 354 |
+
record["AMT_GOODS_PRICE"] = float(payload.amt_credit)
|
| 355 |
+
return record
|
| 356 |
+
|
| 357 |
+
|
| 358 |
def _append_log_entries(entries: list[dict[str, Any]]) -> None:
|
| 359 |
if not LOG_PREDICTIONS:
|
| 360 |
return
|
|
|
|
| 689 |
return pickle.load(handle)
|
| 690 |
|
| 691 |
|
| 692 |
+
def _load_customer_reference(
|
| 693 |
+
data_path: Path,
|
| 694 |
+
preprocessor: PreprocessorArtifacts,
|
| 695 |
+
) -> pd.DataFrame:
|
| 696 |
+
columns = list(preprocessor.input_feature_columns)
|
| 697 |
+
if "SK_ID_CURR" not in columns:
|
| 698 |
+
columns.insert(0, "SK_ID_CURR")
|
| 699 |
+
df = pd.read_parquet(data_path, columns=columns)
|
| 700 |
+
df = df.drop_duplicates(subset=["SK_ID_CURR"], keep="last").set_index("SK_ID_CURR")
|
| 701 |
+
return df
|
| 702 |
+
|
| 703 |
+
|
| 704 |
+
def _get_customer_reference(preprocessor: PreprocessorArtifacts) -> pd.DataFrame | None:
|
| 705 |
+
if not CUSTOMER_LOOKUP_ENABLED:
|
| 706 |
+
return None
|
| 707 |
+
cached = getattr(app.state, "customer_reference", None)
|
| 708 |
+
if cached is not None:
|
| 709 |
+
return cached
|
| 710 |
+
data_path = CUSTOMER_DATA_PATH
|
| 711 |
+
if not data_path.exists():
|
| 712 |
+
downloaded = _ensure_hf_asset(
|
| 713 |
+
data_path,
|
| 714 |
+
HF_CUSTOMER_REPO_ID,
|
| 715 |
+
HF_CUSTOMER_FILENAME,
|
| 716 |
+
HF_CUSTOMER_REPO_TYPE,
|
| 717 |
+
)
|
| 718 |
+
if downloaded is None:
|
| 719 |
+
return None
|
| 720 |
+
data_path = downloaded
|
| 721 |
+
ref = _load_customer_reference(data_path, preprocessor)
|
| 722 |
+
if CUSTOMER_LOOKUP_CACHE:
|
| 723 |
+
app.state.customer_reference = ref
|
| 724 |
+
return ref
|
| 725 |
+
|
| 726 |
+
|
| 727 |
def _infer_numeric_ranges_from_scaler(preprocessor: PreprocessorArtifacts) -> dict[str, tuple[float, float]]:
|
| 728 |
ranges = {}
|
| 729 |
scaler = getattr(preprocessor, "scaler", None)
|
|
|
|
| 1091 |
|
| 1092 |
@app.on_event("startup")
|
| 1093 |
def startup_event() -> None:
|
| 1094 |
+
if getattr(app.state, "model", None) is not None and getattr(app.state, "preprocessor", None) is not None:
|
| 1095 |
+
return
|
| 1096 |
+
model_path = MODEL_PATH
|
| 1097 |
+
if not model_path.exists():
|
| 1098 |
+
downloaded = _ensure_hf_asset(
|
| 1099 |
+
model_path,
|
| 1100 |
+
HF_MODEL_REPO_ID,
|
| 1101 |
+
HF_MODEL_FILENAME,
|
| 1102 |
+
HF_MODEL_REPO_TYPE,
|
| 1103 |
+
)
|
| 1104 |
+
if downloaded is not None:
|
| 1105 |
+
model_path = downloaded
|
| 1106 |
+
if not model_path.exists():
|
| 1107 |
if ALLOW_MISSING_ARTIFACTS:
|
| 1108 |
+
logger.warning("Model file not found: %s. Using dummy model.", model_path)
|
| 1109 |
app.state.model = DummyModel()
|
| 1110 |
else:
|
| 1111 |
+
raise RuntimeError(f"Model file not found: {model_path}")
|
| 1112 |
else:
|
| 1113 |
+
logger.info("Loading model from %s", model_path)
|
| 1114 |
+
app.state.model = load_model(model_path)
|
| 1115 |
|
| 1116 |
try:
|
| 1117 |
+
artifacts_path = ARTIFACTS_PATH
|
| 1118 |
+
if not artifacts_path.exists():
|
| 1119 |
+
downloaded = _ensure_hf_asset(
|
| 1120 |
+
artifacts_path,
|
| 1121 |
+
HF_PREPROCESSOR_REPO_ID or None,
|
| 1122 |
+
HF_PREPROCESSOR_FILENAME,
|
| 1123 |
+
HF_PREPROCESSOR_REPO_TYPE,
|
| 1124 |
+
)
|
| 1125 |
+
if downloaded is not None:
|
| 1126 |
+
artifacts_path = downloaded
|
| 1127 |
+
logger.info("Loading preprocessor artifacts from %s", artifacts_path)
|
| 1128 |
+
app.state.preprocessor = load_preprocessor(DATA_PATH, artifacts_path)
|
| 1129 |
except RuntimeError as exc:
|
| 1130 |
if ALLOW_MISSING_ARTIFACTS:
|
| 1131 |
logger.warning("Preprocessor artifacts missing (%s). Using fallback preprocessor.", exc)
|
|
|
|
| 1133 |
else:
|
| 1134 |
raise
|
| 1135 |
|
| 1136 |
+
app.state.customer_reference = None
|
| 1137 |
+
if CUSTOMER_LOOKUP_ENABLED and CUSTOMER_LOOKUP_CACHE:
|
| 1138 |
+
try:
|
| 1139 |
+
ref = _get_customer_reference(app.state.preprocessor)
|
| 1140 |
+
if ref is not None:
|
| 1141 |
+
logger.info("Loaded customer reference data (%s rows)", len(ref))
|
| 1142 |
+
else:
|
| 1143 |
+
logger.warning("Customer reference data not available.")
|
| 1144 |
+
except Exception as exc: # pragma: no cover - optional cache load
|
| 1145 |
+
logger.warning("Failed to load customer reference data: %s", exc)
|
| 1146 |
+
elif CUSTOMER_LOOKUP_ENABLED:
|
| 1147 |
+
logger.info("Customer lookup enabled without cache (on-demand load).")
|
| 1148 |
+
|
| 1149 |
|
| 1150 |
@app.get("/health")
|
| 1151 |
def health() -> dict[str, str]:
|
|
|
|
| 1226 |
return Response(content="".join(lines), media_type="application/x-ndjson")
|
| 1227 |
|
| 1228 |
|
| 1229 |
+
def _predict_records(records: list[dict[str, Any]], threshold: float | None) -> dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1230 |
model = app.state.model
|
| 1231 |
preprocessor: PreprocessorArtifacts = app.state.preprocessor
|
| 1232 |
request_id = str(uuid.uuid4())
|
| 1233 |
start_time = time.perf_counter()
|
|
|
|
| 1234 |
|
| 1235 |
if not records:
|
| 1236 |
raise HTTPException(status_code=422, detail={"message": "No input records provided."})
|
|
|
|
| 1326 |
error=str(exc),
|
| 1327 |
)
|
| 1328 |
raise
|
| 1329 |
+
|
| 1330 |
+
|
| 1331 |
+
@app.post("/predict")
|
| 1332 |
+
def predict(
|
| 1333 |
+
payload: PredictionRequest,
|
| 1334 |
+
threshold: float | None = Query(default=None, ge=0.0, le=1.0),
|
| 1335 |
+
) -> dict[str, Any]:
|
| 1336 |
+
records = payload.data if isinstance(payload.data, list) else [payload.data]
|
| 1337 |
+
return _predict_records(records, threshold)
|
| 1338 |
+
|
| 1339 |
+
|
| 1340 |
+
@app.post("/predict-minimal")
|
| 1341 |
+
def predict_minimal(
|
| 1342 |
+
payload: MinimalPredictionRequest,
|
| 1343 |
+
threshold: float | None = Query(default=None, ge=0.0, le=1.0),
|
| 1344 |
+
) -> dict[str, Any]:
|
| 1345 |
+
preprocessor: PreprocessorArtifacts = app.state.preprocessor
|
| 1346 |
+
record = _build_minimal_record(payload, preprocessor)
|
| 1347 |
+
return _predict_records([record], threshold)
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/gradio_app.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
from fastapi import HTTPException
|
| 7 |
+
|
| 8 |
+
from app.main import MinimalPredictionRequest, app, predict_minimal, startup_event
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _ensure_startup() -> None:
|
| 12 |
+
if not getattr(app.state, "preprocessor", None):
|
| 13 |
+
startup_event()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
|
| 17 |
+
reference = getattr(app.state, "customer_reference", None)
|
| 18 |
+
if reference is None or sk_id_curr not in reference.index:
|
| 19 |
+
return {}
|
| 20 |
+
row = reference.loc[sk_id_curr]
|
| 21 |
+
snapshot: dict[str, Any] = {"SK_ID_CURR": int(sk_id_curr)}
|
| 22 |
+
if "CODE_GENDER" in row:
|
| 23 |
+
snapshot["CODE_GENDER"] = row["CODE_GENDER"]
|
| 24 |
+
if "FLAG_OWN_CAR" in row:
|
| 25 |
+
snapshot["FLAG_OWN_CAR"] = row["FLAG_OWN_CAR"]
|
| 26 |
+
if "AMT_INCOME_TOTAL" in row:
|
| 27 |
+
snapshot["AMT_INCOME_TOTAL"] = float(row["AMT_INCOME_TOTAL"])
|
| 28 |
+
if "DAYS_BIRTH" in row:
|
| 29 |
+
snapshot["AGE_YEARS"] = round(abs(float(row["DAYS_BIRTH"])) / 365.25, 1)
|
| 30 |
+
return snapshot
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def score_minimal(
|
| 34 |
+
sk_id_curr: float,
|
| 35 |
+
amt_credit: float,
|
| 36 |
+
duration_months: float,
|
| 37 |
+
threshold: float,
|
| 38 |
+
) -> tuple[float | None, str, float | None, dict[str, Any]]:
|
| 39 |
+
_ensure_startup()
|
| 40 |
+
try:
|
| 41 |
+
payload = MinimalPredictionRequest(
|
| 42 |
+
sk_id_curr=int(sk_id_curr),
|
| 43 |
+
amt_credit=float(amt_credit),
|
| 44 |
+
duration_months=int(duration_months),
|
| 45 |
+
)
|
| 46 |
+
response = predict_minimal(payload, threshold=float(threshold))
|
| 47 |
+
result = response["predictions"][0]
|
| 48 |
+
probability = float(result.get("probability", 0.0))
|
| 49 |
+
pred_value = int(result.get("prediction", 0))
|
| 50 |
+
label = "Default (1)" if pred_value == 1 else "No default (0)"
|
| 51 |
+
snapshot = _customer_snapshot(int(sk_id_curr))
|
| 52 |
+
snapshot.update(
|
| 53 |
+
{
|
| 54 |
+
"AMT_CREDIT_REQUESTED": float(amt_credit),
|
| 55 |
+
"DURATION_MONTHS": int(duration_months),
|
| 56 |
+
}
|
| 57 |
+
)
|
| 58 |
+
return probability, label, float(response.get("threshold", 0.0)), snapshot
|
| 59 |
+
except HTTPException as exc:
|
| 60 |
+
return None, f"Erreur: {exc.detail}", None, {"error": exc.detail}
|
| 61 |
+
except Exception as exc: # pragma: no cover - UI fallback
|
| 62 |
+
return None, f"Erreur: {exc}", None, {"error": str(exc)}
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
with gr.Blocks(title="Credit Scoring - Minimal Inputs") as demo:
|
| 66 |
+
gr.Markdown("# Credit Scoring - Minimal Inputs")
|
| 67 |
+
gr.Markdown(
|
| 68 |
+
"Renseignez l'identifiant client, le montant du credit et la duree. "
|
| 69 |
+
"Les autres features proviennent des donnees clients reference."
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
with gr.Row():
|
| 73 |
+
sk_id_curr = gr.Number(label="SK_ID_CURR", precision=0, value=100001)
|
| 74 |
+
amt_credit = gr.Number(label="AMT_CREDIT", value=200000)
|
| 75 |
+
duration_months = gr.Number(label="Duree (mois)", precision=0, value=60)
|
| 76 |
+
threshold = gr.Slider(label="Seuil", minimum=0.0, maximum=1.0, value=0.5, step=0.01)
|
| 77 |
+
|
| 78 |
+
run_btn = gr.Button("Scorer")
|
| 79 |
+
|
| 80 |
+
with gr.Row():
|
| 81 |
+
probability = gr.Number(label="Probabilite de defaut")
|
| 82 |
+
prediction = gr.Textbox(label="Decision")
|
| 83 |
+
threshold_used = gr.Number(label="Seuil utilise")
|
| 84 |
+
|
| 85 |
+
snapshot = gr.JSON(label="Snapshot client (reference)")
|
| 86 |
+
|
| 87 |
+
run_btn.click(
|
| 88 |
+
score_minimal,
|
| 89 |
+
inputs=[sk_id_curr, amt_credit, duration_months, threshold],
|
| 90 |
+
outputs=[probability, prediction, threshold_used, snapshot],
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
if __name__ == "__main__":
|
| 95 |
+
_ensure_startup()
|
| 96 |
+
demo.launch()
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
emoji: 🤖
|
| 4 |
colorFrom: indigo
|
| 5 |
colorTo: green
|
|
@@ -8,7 +8,7 @@ app_port: 7860
|
|
| 8 |
pinned: false
|
| 9 |
---
|
| 10 |
|
| 11 |
-
#
|
| 12 |
|
| 13 |
[](https://github.com/stephmnt/credit-scoring-mlops/actions/workflows/deploy.yml)
|
| 14 |
[](https://github.com/stephmnt/credit-scoring-mlops/releases)
|
|
@@ -62,24 +62,33 @@ Parametres utiles (selection des features) :
|
|
| 62 |
- `FEATURE_SELECTION_TOP_N` (defaut: `8`)
|
| 63 |
- `FEATURE_SELECTION_MIN_CORR` (defaut: `0.02`)
|
| 64 |
|
| 65 |
-
### Environnement
|
| 66 |
|
| 67 |
-
Le
|
| 68 |
-
|
| 69 |
-
3.11.
|
| 70 |
|
| 71 |
```shell
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
poetry run pytest -q
|
| 75 |
poetry run uvicorn app.main:app --reload --port 7860
|
| 76 |
```
|
| 77 |
|
| 78 |
Important : le modele `HistGB_final_model.pkl` doit etre regenere avec la
|
| 79 |
-
|
| 80 |
-
`P6_MANET_Stephane_notebook_modélisation.ipynb`, cellule de
|
| 81 |
-
|
| 82 |
-
Note : `requirements.txt` est aligne sur `pyproject.toml` (meme versions).
|
| 83 |
|
| 84 |
### Exemple d'input (schema + valeurs)
|
| 85 |
|
|
@@ -123,9 +132,70 @@ Valeurs d'exemple :
|
|
| 123 |
}
|
| 124 |
```
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
Note : l'API valide strictement les champs requis (`/features`). Pour afficher
|
| 127 |
toutes les colonnes possibles : `/features?include_all=true`.
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
### Demo live (commandes cles en main)
|
| 130 |
|
| 131 |
Lancer l'API :
|
|
@@ -231,6 +301,10 @@ Variables utiles :
|
|
| 231 |
- `LOGS_ACCESS_TOKEN` pour proteger l'endpoint `/logs`
|
| 232 |
- `LOG_HASH_SK_ID=1` pour anonymiser `SK_ID_CURR`
|
| 233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
Exemple local :
|
| 235 |
|
| 236 |
```shell
|
|
@@ -251,27 +325,70 @@ Alternative :
|
|
| 251 |
curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
|
| 252 |
```
|
| 253 |
|
| 254 |
-
Apres quelques requêtes,
|
| 255 |
|
| 256 |
```shell
|
| 257 |
python monitoring/drift_report.py \
|
| 258 |
--logs logs/predictions.jsonl \
|
| 259 |
--reference data/data_final.parquet \
|
| 260 |
-
--output-dir reports
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
```
|
| 262 |
|
| 263 |
Le rapport HTML est généré dans `reports/drift_report.html` (avec des plots dans
|
| 264 |
`reports/plots/`). Sur Hugging Face, le disque est éphémère : télécharger les logs
|
| 265 |
avant d'analyser.
|
| 266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
Le rapport inclut aussi la distribution des scores predits et le taux de prediction
|
| 268 |
-
(option `--score-bins` pour ajuster le nombre de bins)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
Captures (snapshot local du reporting + stockage):
|
| 271 |
|
| 272 |
- Rapport: `docs/monitoring/drift_report.html` + `docs/monitoring/plots/`
|
| 273 |
- Stockage des logs: `docs/monitoring/logs_storage.png`
|
| 274 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
## Contenu de la release
|
| 276 |
|
| 277 |
- **Preparation + pipeline** : nettoyage / preparation, encodage, imputation et pipeline d'entrainement presentes.
|
|
@@ -282,8 +399,10 @@ Captures (snapshot local du reporting + stockage):
|
|
| 282 |
- **Score metier + seuil optimal** : le `custom_score` est la metrique principale des tableaux de comparaison et de la CV, avec un `best_threshold` calcule.
|
| 283 |
- **Explicabilite** : feature importance, SHAP et LIME sont inclus.
|
| 284 |
- **Selection de features par correlation** : top‑N numeriques + un petit set categoriel, expose via `/features`.
|
| 285 |
-
- **
|
| 286 |
-
|
|
|
|
|
|
|
| 287 |
- **CI/CD** : tests avec couverture (`pytest-cov`), build Docker et deploy vers Hugging Face Spaces.
|
| 288 |
|
| 289 |

|
|
@@ -304,5 +423,4 @@ Captures (snapshot local du reporting + stockage):
|
|
| 304 |
|
| 305 |
* Compléter les tests API: /logs (auth OK/KO), batch predict, param threshold, SK_ID_CURR manquant, outliers dans test_api.py.
|
| 306 |
* Simplifier le fallback ALLOW_MISSING_ARTIFACTS et DummyModel si les artefacts sont versionnés (nettoyer main.py et conftest.py).
|
| 307 |
-
* Unifier la gestion des dépendances (Poetry vs requirements.txt) et aligner pyproject.toml / requirements.txt.
|
| 308 |
* Si l’évaluateur attend une stratégie de branches, créer une branche feature et fusionner pour preuve.
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Credit scoring MLOps
|
| 3 |
emoji: 🤖
|
| 4 |
colorFrom: indigo
|
| 5 |
colorTo: green
|
|
|
|
| 8 |
pinned: false
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# Credit scoring MLOps
|
| 12 |
|
| 13 |
[](https://github.com/stephmnt/credit-scoring-mlops/actions/workflows/deploy.yml)
|
| 14 |
[](https://github.com/stephmnt/credit-scoring-mlops/releases)
|
|
|
|
| 62 |
- `FEATURE_SELECTION_TOP_N` (defaut: `8`)
|
| 63 |
- `FEATURE_SELECTION_MIN_CORR` (defaut: `0.02`)
|
| 64 |
|
| 65 |
+
### Environnement pip (dev)
|
| 66 |
|
| 67 |
+
Le developpement local utilise pip et `requirements.txt` (versions figees),
|
| 68 |
+
avec Python 3.11+.
|
|
|
|
| 69 |
|
| 70 |
```shell
|
| 71 |
+
python3 -m venv .venv
|
| 72 |
+
source .venv/bin/activate
|
| 73 |
+
python -m pip install -r requirements.txt
|
| 74 |
+
pytest -q
|
| 75 |
+
uvicorn app.main:app --reload --port 7860
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
### Environnement Poetry (livrable)
|
| 79 |
+
|
| 80 |
+
Le livrable inclut `pyproject.toml`, aligne sur `requirements.txt`. Si besoin :
|
| 81 |
+
|
| 82 |
+
```shell
|
| 83 |
+
poetry install --with dev
|
| 84 |
poetry run pytest -q
|
| 85 |
poetry run uvicorn app.main:app --reload --port 7860
|
| 86 |
```
|
| 87 |
|
| 88 |
Important : le modele `HistGB_final_model.pkl` doit etre regenere avec la
|
| 89 |
+
version de scikit-learn definie dans `requirements.txt` / `pyproject.toml`
|
| 90 |
+
(re-execution de `P6_MANET_Stephane_notebook_modélisation.ipynb`, cellule de
|
| 91 |
+
sauvegarde pickle).
|
|
|
|
| 92 |
|
| 93 |
### Exemple d'input (schema + valeurs)
|
| 94 |
|
|
|
|
| 132 |
}
|
| 133 |
```
|
| 134 |
|
| 135 |
+
### Prediction minimale (client existant)
|
| 136 |
+
|
| 137 |
+
Endpoint `POST /predict-minimal` : l'utilisateur fournit un identifiant client,
|
| 138 |
+
un montant de credit et une duree. Les autres features sont prises depuis la
|
| 139 |
+
reference clients (`CUSTOMER_DATA_PATH`, par defaut `data/data_final.parquet`).
|
| 140 |
+
Si la reference est absente, l'API renvoie 503.
|
| 141 |
+
|
| 142 |
+
```shell
|
| 143 |
+
curl -s -X POST "${BASE_URL}/predict-minimal" \
|
| 144 |
+
-H "Content-Type: application/json" \
|
| 145 |
+
-d '{
|
| 146 |
+
"sk_id_curr": 100001,
|
| 147 |
+
"amt_credit": 200000,
|
| 148 |
+
"duration_months": 60
|
| 149 |
+
}'
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
Variables utiles :
|
| 153 |
+
|
| 154 |
+
- `CUSTOMER_LOOKUP_ENABLED=1` active la recherche client (defaut: 1)
|
| 155 |
+
- `CUSTOMER_DATA_PATH=data/data_final.parquet`
|
| 156 |
+
- `CUSTOMER_LOOKUP_CACHE=1` garde la reference en memoire
|
| 157 |
+
|
| 158 |
+
### Data contract (validation)
|
| 159 |
+
|
| 160 |
+
- Types numeriques stricts (invalides -> 422).
|
| 161 |
+
- Ranges numeriques (min/max entrainement) controles.
|
| 162 |
+
- Categoriels normalises: `CODE_GENDER` -> {`F`, `M`}, `FLAG_OWN_CAR` -> {`Y`, `N`}.
|
| 163 |
+
- Sentinelle `DAYS_EMPLOYED=365243` remplacee par NaN.
|
| 164 |
+
- Logs enrichis via `data_quality` pour distinguer drift vs qualite de donnees.
|
| 165 |
+
|
| 166 |
+
### Interface Gradio (scoring)
|
| 167 |
+
|
| 168 |
+
```shell
|
| 169 |
+
python gradio_app.py
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
Sur Hugging Face Spaces, `app.py` lance l'UI Gradio automatiquement.
|
| 173 |
+
|
| 174 |
Note : l'API valide strictement les champs requis (`/features`). Pour afficher
|
| 175 |
toutes les colonnes possibles : `/features?include_all=true`.
|
| 176 |
|
| 177 |
+
### Hugging Face (assets lourds)
|
| 178 |
+
|
| 179 |
+
Les fichiers binaires (modele, preprocessor, data_final) ne sont pas pushes
|
| 180 |
+
dans le Space. Ils sont telecharges a l'execution via Hugging Face Hub si les
|
| 181 |
+
variables suivantes sont definies :
|
| 182 |
+
|
| 183 |
+
- `HF_MODEL_REPO_ID` + `HF_MODEL_FILENAME` + `HF_MODEL_REPO_TYPE`
|
| 184 |
+
- `HF_PREPROCESSOR_REPO_ID` + `HF_PREPROCESSOR_FILENAME` + `HF_PREPROCESSOR_REPO_TYPE`
|
| 185 |
+
- `HF_CUSTOMER_REPO_ID` + `HF_CUSTOMER_FILENAME` + `HF_CUSTOMER_REPO_TYPE`
|
| 186 |
+
|
| 187 |
+
Exemple (un seul repo dataset avec 3 fichiers) :
|
| 188 |
+
|
| 189 |
+
- `HF_MODEL_REPO_ID=stephmnt/credit-scoring-mlops-assets`
|
| 190 |
+
- `HF_MODEL_REPO_TYPE=dataset`
|
| 191 |
+
- `HF_MODEL_FILENAME=HistGB_final_model.pkl`
|
| 192 |
+
- `HF_PREPROCESSOR_REPO_ID=stephmnt/credit-scoring-mlops-assets`
|
| 193 |
+
- `HF_PREPROCESSOR_REPO_TYPE=dataset`
|
| 194 |
+
- `HF_PREPROCESSOR_FILENAME=preprocessor.joblib`
|
| 195 |
+
- `HF_CUSTOMER_REPO_ID=stephmnt/credit-scoring-mlops-assets`
|
| 196 |
+
- `HF_CUSTOMER_REPO_TYPE=dataset`
|
| 197 |
+
- `HF_CUSTOMER_FILENAME=data_final.parquet`
|
| 198 |
+
|
| 199 |
### Demo live (commandes cles en main)
|
| 200 |
|
| 201 |
Lancer l'API :
|
|
|
|
| 301 |
- `LOGS_ACCESS_TOKEN` pour proteger l'endpoint `/logs`
|
| 302 |
- `LOG_HASH_SK_ID=1` pour anonymiser `SK_ID_CURR`
|
| 303 |
|
| 304 |
+
Les logs incluent un bloc `data_quality` par requete (champs manquants,
|
| 305 |
+
types invalides, out-of-range, categories inconnues, sentinelle
|
| 306 |
+
`DAYS_EMPLOYED`).
|
| 307 |
+
|
| 308 |
Exemple local :
|
| 309 |
|
| 310 |
```shell
|
|
|
|
| 325 |
curl -s -H "Authorization: Bearer $LOGS_ACCESS_TOKEN" "${BASE_URL}/logs?tail=200"
|
| 326 |
```
|
| 327 |
|
| 328 |
+
Après quelques requêtes, générer le rapport de drift :
|
| 329 |
|
| 330 |
```shell
|
| 331 |
python monitoring/drift_report.py \
|
| 332 |
--logs logs/predictions.jsonl \
|
| 333 |
--reference data/data_final.parquet \
|
| 334 |
+
--output-dir reports \
|
| 335 |
+
--min-prod-samples 200 \
|
| 336 |
+
--fdr-alpha 0.05 \
|
| 337 |
+
--prod-since "2024-01-01T00:00:00Z" \
|
| 338 |
+
--prod-until "2024-01-31T23:59:59Z"
|
| 339 |
```
|
| 340 |
|
| 341 |
Le rapport HTML est généré dans `reports/drift_report.html` (avec des plots dans
|
| 342 |
`reports/plots/`). Sur Hugging Face, le disque est éphémère : télécharger les logs
|
| 343 |
avant d'analyser.
|
| 344 |
|
| 345 |
+
Le drift est calcule uniquement si `n_prod >= --min-prod-samples` (defaut 200).
|
| 346 |
+
Sinon, un badge "Sample insuffisant" est affiche et les alertes sont desactivees.
|
| 347 |
+
|
| 348 |
+
Robustesse integree:
|
| 349 |
+
|
| 350 |
+
- Categoriels: PSI avec lissage (`--psi-eps`) + categories rares regroupees (OTHER).
|
| 351 |
+
- Numeriques: KS corrige par FDR (Benjamini-Hochberg, `--fdr-alpha`).
|
| 352 |
+
- Sentinelle `DAYS_EMPLOYED`: convertie en NaN + taux suivi.
|
| 353 |
+
|
| 354 |
Le rapport inclut aussi la distribution des scores predits et le taux de prediction
|
| 355 |
+
(option `--score-bins` pour ajuster le nombre de bins), ainsi qu'une section
|
| 356 |
+
Data Quality si les logs contiennent `data_quality` (types, NaN, out-of-range,
|
| 357 |
+
categories inconnues).
|
| 358 |
+
|
| 359 |
+
Pour simuler des fenetres glissantes, utiliser `--prod-since` / `--prod-until`
|
| 360 |
+
avec les timestamps des logs.
|
| 361 |
+
|
| 362 |
+
Runbook drift: `docs/monitoring/runbook.md`.
|
| 363 |
|
| 364 |
Captures (snapshot local du reporting + stockage):
|
| 365 |
|
| 366 |
- Rapport: `docs/monitoring/drift_report.html` + `docs/monitoring/plots/`
|
| 367 |
- Stockage des logs: `docs/monitoring/logs_storage.png`
|
| 368 |
|
| 369 |
+
## Profiling & Optimisation (Etape 4)
|
| 370 |
+
|
| 371 |
+
Profiling et benchmark d'inference (cProfile + latence) :
|
| 372 |
+
|
| 373 |
+
```shell
|
| 374 |
+
python profiling/profile_inference.py \
|
| 375 |
+
--sample-size 2000 \
|
| 376 |
+
--batch-size 128 \
|
| 377 |
+
--runs 3
|
| 378 |
+
```
|
| 379 |
+
|
| 380 |
+
Sorties:
|
| 381 |
+
|
| 382 |
+
- `docs/performance/benchmark_results.json`
|
| 383 |
+
- `docs/performance/profile_summary.txt`
|
| 384 |
+
- Rapport detaille: `docs/performance/performance_report.md`
|
| 385 |
+
|
| 386 |
+
Dashboard local Streamlit (monitoring + drift):
|
| 387 |
+
|
| 388 |
+
```shell
|
| 389 |
+
python -m streamlit run monitoring/streamlit_app.py
|
| 390 |
+
```
|
| 391 |
+
|
| 392 |
## Contenu de la release
|
| 393 |
|
| 394 |
- **Preparation + pipeline** : nettoyage / preparation, encodage, imputation et pipeline d'entrainement presentes.
|
|
|
|
| 399 |
- **Score metier + seuil optimal** : le `custom_score` est la metrique principale des tableaux de comparaison et de la CV, avec un `best_threshold` calcule.
|
| 400 |
- **Explicabilite** : feature importance, SHAP et LIME sont inclus.
|
| 401 |
- **Selection de features par correlation** : top‑N numeriques + un petit set categoriel, expose via `/features`.
|
| 402 |
+
- **Interface Gradio** : formulaire minimal (id client + montant + duree) base sur la reference clients.
|
| 403 |
+
- **Monitoring & drift** : rapport HTML avec gating par volume, PSI robuste, KS + FDR, data quality et
|
| 404 |
+
distribution des scores (snapshots dans `docs/monitoring/`).
|
| 405 |
+
- **Profiling & optimisation** : benchmark d'inference + profil cProfile (dossier `docs/performance/`).
|
| 406 |
- **CI/CD** : tests avec couverture (`pytest-cov`), build Docker et deploy vers Hugging Face Spaces.
|
| 407 |
|
| 408 |

|
|
|
|
| 423 |
|
| 424 |
* Compléter les tests API: /logs (auth OK/KO), batch predict, param threshold, SK_ID_CURR manquant, outliers dans test_api.py.
|
| 425 |
* Simplifier le fallback ALLOW_MISSING_ARTIFACTS et DummyModel si les artefacts sont versionnés (nettoyer main.py et conftest.py).
|
|
|
|
| 426 |
* Si l’évaluateur attend une stratégie de branches, créer une branche feature et fusionner pour preuve.
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/README.md
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Monitoring Captures
|
| 2 |
+
|
| 3 |
+
These files are snapshot artifacts for the monitoring deliverable.
|
| 4 |
+
|
| 5 |
+
- drift_report.html: report generated by monitoring/drift_report.py (sample-size 5000).
|
| 6 |
+
- runbook.md: triage steps and actions to take when a drift alert appears.
|
| 7 |
+
- plots/: feature drift plots + score distribution + prediction rate.
|
| 8 |
+
- predictions_sample.jsonl: sanitized example of production logs.
|
| 9 |
+
- logs_storage.png: snapshot of the logging storage format.
|
| 10 |
+
|
| 11 |
+
Notes:
|
| 12 |
+
- Drift alerts are gated by minimum production volume (see report badge).
|
| 13 |
+
- Data quality metrics appear when logs include `data_quality`.
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/drift_report.html
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8" />
|
| 5 |
+
<title>Drift Report</title>
|
| 6 |
+
<style>
|
| 7 |
+
body { font-family: Arial, sans-serif; margin: 24px; }
|
| 8 |
+
table { border-collapse: collapse; width: 100%; }
|
| 9 |
+
th, td { border: 1px solid #ddd; padding: 8px; }
|
| 10 |
+
th { background: #f3f3f3; }
|
| 11 |
+
img { max-width: 720px; }
|
| 12 |
+
</style>
|
| 13 |
+
</head>
|
| 14 |
+
<body>
|
| 15 |
+
<h2>Production Monitoring Summary</h2>
|
| 16 |
+
<ul>
|
| 17 |
+
<li>Total calls: 1</li>
|
| 18 |
+
<li>Error rate: 0.00%</li>
|
| 19 |
+
<li>Latency p50: 82.04 ms</li>
|
| 20 |
+
<li>Latency p95: 82.04 ms</li>
|
| 21 |
+
</ul>
|
| 22 |
+
<h2>Score Monitoring</h2>
|
| 23 |
+
<ul>
|
| 24 |
+
<li>Score mean: 0.3755</li>
|
| 25 |
+
<li>Score p50: 0.3755</li>
|
| 26 |
+
<li>Score p95: 0.3755</li>
|
| 27 |
+
<li>Score min: 0.3755</li>
|
| 28 |
+
<li>Score max: 0.3755</li>
|
| 29 |
+
<li>Predicted default rate: 0.00%</li>
|
| 30 |
+
</ul>
|
| 31 |
+
<img src='plots/score_distribution.png' />
|
| 32 |
+
<img src='plots/prediction_rate.png' />
|
| 33 |
+
<h2>Data Drift Summary</h2>
|
| 34 |
+
<table border="1" class="dataframe">
|
| 35 |
+
<thead>
|
| 36 |
+
<tr style="text-align: right;">
|
| 37 |
+
<th>feature</th>
|
| 38 |
+
<th>type</th>
|
| 39 |
+
<th>ks_stat</th>
|
| 40 |
+
<th>p_value</th>
|
| 41 |
+
<th>drift_detected</th>
|
| 42 |
+
<th>psi</th>
|
| 43 |
+
</tr>
|
| 44 |
+
</thead>
|
| 45 |
+
<tbody>
|
| 46 |
+
<tr>
|
| 47 |
+
<td>EXT_SOURCE_2</td>
|
| 48 |
+
<td>numeric</td>
|
| 49 |
+
<td>0.5905</td>
|
| 50 |
+
<td>0.819238</td>
|
| 51 |
+
<td>False</td>
|
| 52 |
+
<td>NaN</td>
|
| 53 |
+
</tr>
|
| 54 |
+
<tr>
|
| 55 |
+
<td>EXT_SOURCE_3</td>
|
| 56 |
+
<td>numeric</td>
|
| 57 |
+
<td>0.9047</td>
|
| 58 |
+
<td>0.191111</td>
|
| 59 |
+
<td>False</td>
|
| 60 |
+
<td>NaN</td>
|
| 61 |
+
</tr>
|
| 62 |
+
<tr>
|
| 63 |
+
<td>AMT_ANNUITY</td>
|
| 64 |
+
<td>numeric</td>
|
| 65 |
+
<td>0.5184</td>
|
| 66 |
+
<td>0.963407</td>
|
| 67 |
+
<td>False</td>
|
| 68 |
+
<td>NaN</td>
|
| 69 |
+
</tr>
|
| 70 |
+
<tr>
|
| 71 |
+
<td>EXT_SOURCE_1</td>
|
| 72 |
+
<td>numeric</td>
|
| 73 |
+
<td>0.5822</td>
|
| 74 |
+
<td>0.836199</td>
|
| 75 |
+
<td>False</td>
|
| 76 |
+
<td>NaN</td>
|
| 77 |
+
</tr>
|
| 78 |
+
<tr>
|
| 79 |
+
<td>CODE_GENDER</td>
|
| 80 |
+
<td>categorical</td>
|
| 81 |
+
<td>NaN</td>
|
| 82 |
+
<td>NaN</td>
|
| 83 |
+
<td>True</td>
|
| 84 |
+
<td>9.6538</td>
|
| 85 |
+
</tr>
|
| 86 |
+
<tr>
|
| 87 |
+
<td>DAYS_EMPLOYED</td>
|
| 88 |
+
<td>numeric</td>
|
| 89 |
+
<td>0.6508</td>
|
| 90 |
+
<td>0.698660</td>
|
| 91 |
+
<td>False</td>
|
| 92 |
+
<td>NaN</td>
|
| 93 |
+
</tr>
|
| 94 |
+
<tr>
|
| 95 |
+
<td>AMT_CREDIT</td>
|
| 96 |
+
<td>numeric</td>
|
| 97 |
+
<td>0.5996</td>
|
| 98 |
+
<td>0.801040</td>
|
| 99 |
+
<td>False</td>
|
| 100 |
+
<td>NaN</td>
|
| 101 |
+
</tr>
|
| 102 |
+
<tr>
|
| 103 |
+
<td>AMT_GOODS_PRICE</td>
|
| 104 |
+
<td>numeric</td>
|
| 105 |
+
<td>0.6115</td>
|
| 106 |
+
<td>0.777177</td>
|
| 107 |
+
<td>False</td>
|
| 108 |
+
<td>NaN</td>
|
| 109 |
+
</tr>
|
| 110 |
+
<tr>
|
| 111 |
+
<td>DAYS_BIRTH</td>
|
| 112 |
+
<td>numeric</td>
|
| 113 |
+
<td>0.9474</td>
|
| 114 |
+
<td>0.105579</td>
|
| 115 |
+
<td>False</td>
|
| 116 |
+
<td>NaN</td>
|
| 117 |
+
</tr>
|
| 118 |
+
<tr>
|
| 119 |
+
<td>FLAG_OWN_CAR</td>
|
| 120 |
+
<td>categorical</td>
|
| 121 |
+
<td>NaN</td>
|
| 122 |
+
<td>NaN</td>
|
| 123 |
+
<td>True</td>
|
| 124 |
+
<td>4.3985</td>
|
| 125 |
+
</tr>
|
| 126 |
+
</tbody>
|
| 127 |
+
</table>
|
| 128 |
+
<h2>Feature Distributions</h2>
|
| 129 |
+
<h4>EXT_SOURCE_2</h4><img src='plots/EXT_SOURCE_2.png' />
|
| 130 |
+
<h4>EXT_SOURCE_3</h4><img src='plots/EXT_SOURCE_3.png' />
|
| 131 |
+
<h4>AMT_ANNUITY</h4><img src='plots/AMT_ANNUITY.png' />
|
| 132 |
+
<h4>EXT_SOURCE_1</h4><img src='plots/EXT_SOURCE_1.png' />
|
| 133 |
+
<h4>CODE_GENDER</h4><img src='plots/CODE_GENDER.png' />
|
| 134 |
+
<h4>DAYS_EMPLOYED</h4><img src='plots/DAYS_EMPLOYED.png' />
|
| 135 |
+
<h4>AMT_CREDIT</h4><img src='plots/AMT_CREDIT.png' />
|
| 136 |
+
<h4>AMT_GOODS_PRICE</h4><img src='plots/AMT_GOODS_PRICE.png' />
|
| 137 |
+
<h4>DAYS_BIRTH</h4><img src='plots/DAYS_BIRTH.png' />
|
| 138 |
+
<h4>FLAG_OWN_CAR</h4><img src='plots/FLAG_OWN_CAR.png' />
|
| 139 |
+
</body>
|
| 140 |
+
</html>
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/logs_storage.png
ADDED
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_ANNUITY.png
ADDED
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_CREDIT.png
ADDED
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/AMT_GOODS_PRICE.png
ADDED
|
hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/docs/monitoring/plots/CODE_GENDER.png
ADDED
|