# OC_P8_test / app.py
# GitHub Actions
# Sync to HF Spaces [no-ci]
# 42a08fb
"""Gradio app for Credit Scoring using an MLflow LightGBM model."""
import json
from typing import Any, Dict
# EXPLICATION : Imports nécessaires pour le logging structuré JSON
import logging
import time
from datetime import datetime
# EXPLICATION : Path pour gestion robuste des chemins de logs (multi-plateforme)
from pathlib import Path
# Compatibility shim: HF Spaces may install a `huggingface_hub` that no longer
# exports `HfFolder` (used by older Gradio 4.x oauth). Try to import and patch
# the real `huggingface_hub` when available; only create a minimal shim if the
# package is absent so we don't shadow the real implementation.
import os
try:
    import huggingface_hub as _hf  # prefer the real package when available
except Exception:
    _hf = None

if _hf is None:
    # The package is absent altogether: register a minimal stand-in module so
    # that a later `import huggingface_hub` still resolves.
    import sys
    import types

    class HfFolder:
        """Minimal stand-in that reads the token from the environment."""

        @staticmethod
        def get_token():
            return os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACE_HUB_TOKEN')

    def whoami(token=None):
        """Stand-in for `huggingface_hub.whoami`; always returns an empty dict."""
        return {}

    _mod = types.ModuleType('huggingface_hub')
    _mod.HfFolder = HfFolder
    _mod.whoami = whoami
    sys.modules['huggingface_hub'] = _mod
else:
    # The real package is importable: add only the symbols it lacks so its
    # genuine behaviour is preserved everywhere else.
    if not hasattr(_hf, 'HfFolder'):
        class HfFolder:
            """Minimal stand-in that reads the token from the environment."""

            @staticmethod
            def get_token():
                return os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACE_HUB_TOKEN')

        _hf.HfFolder = HfFolder
    if not hasattr(_hf, 'whoami'):
        def whoami(token=None):
            """Stand-in for `huggingface_hub.whoami`; always returns an empty dict."""
            return {}

        _hf.whoami = whoami
import re as _re
import gradio as gr
import mlflow
import mlflow.lightgbm
import pandas as pd
import numpy as np
from pathlib import Path
# joblib est requis pour charger le preprocessor vectorisé (etape 4 optimisée)
import joblib
# Lightweight transformer to accept "raw" payloads (categorical strings, booleans)
# === VERSION OPTIMISÉE 4.4 - Gain 15.7x : import VectorizedPreprocessor ===
from src.preprocessing import RawToModelTransformer, VectorizedPreprocessor
# Load the model once at startup for efficiency (lazy loading for tests).
MODEL = None


def _load_model():
    """Return the scoring model, loading it on the first call.

    The loaded object is cached in the module-level ``MODEL`` and reused by
    every later call.

    Lookup order:
      1. The path in the ``LOCAL_MODEL_PATH`` environment variable, if set.
      2. ``models/lightgbm.txt`` next to this file.
      3. ``models/lightgbm.txt`` under the current working directory.
      4. The MLflow registry URI ``models:/LightGBM/Production``.

    Raises:
        RuntimeError: If no local file could be loaded and the MLflow
            registry lookup failed as well.
    """
    global MODEL
    if MODEL is not None:
        return MODEL

    import lightgbm as lgb

    # 1) Local model file (primary: portable for Docker / HF Spaces)
    search_paths = []
    override = os.environ.get("LOCAL_MODEL_PATH")
    if override:
        search_paths.append(Path(override))
    search_paths.append(Path(__file__).resolve().parent / "models" / "lightgbm.txt")
    search_paths.append(Path.cwd() / "models" / "lightgbm.txt")

    for model_path in search_paths:
        if not model_path.exists():
            continue
        try:
            MODEL = lgb.Booster(model_file=str(model_path))
        except Exception as err:
            # An unreadable file is not fatal: try the next candidate.
            print(f"Warning: failed to load {model_path}: {err}")
            continue
        print(f"Loaded local LightGBM model from {model_path}")
        return MODEL

    # 2) Fallback: MLflow Model Registry (for local dev)
    try:
        MODEL = mlflow.lightgbm.load_model("models:/LightGBM/Production")
    except Exception as mlflow_err:
        raise RuntimeError(
            f"No local model found at {[str(p) for p in search_paths]} "
            f"and MLflow registry failed: {mlflow_err}. "
            "Place the model at `models/lightgbm.txt` or set LOCAL_MODEL_PATH."
        ) from mlflow_err
    print("Loaded model from MLflow registry")
    return MODEL
# Preprocessor (accept "raw" input and map to model features)
PREPROCESSOR = None


def _load_preprocessor():
    """Return the preprocessor, building or loading it on the first call.

    The result is cached in the module-level ``PREPROCESSOR``.

    Loading priority:
      1. ``models/preprocessor_vectorized.joblib`` (used as-is).
      2. ``models/preprocessor.joblib``: returned directly when it already
         holds a ``VectorizedPreprocessor``, wrapped when it holds a
         ``RawToModelTransformer``.
      3. A fresh ``RawToModelTransformer()`` wrapped in a
         ``VectorizedPreprocessor``.

    When the wrapper is created here (cases 2 and 3), it is saved to
    ``models/preprocessor_vectorized.joblib`` on a best-effort basis so the
    next start-up can take the first path.

    NOTE(review): both paths are relative to the current working directory,
    whereas the model and log paths in this file are anchored on ``__file__``.
    Confirm the app is always launched from its own directory.
    """
    global PREPROCESSOR
    if PREPROCESSOR is not None:
        return PREPROCESSOR

    # SECURITY: joblib.load unpickles its input and can execute arbitrary
    # code. Only artefacts produced by this project must live under models/.
    vectorized_path = Path("models") / "preprocessor_vectorized.joblib"
    if vectorized_path.exists():
        try:
            PREPROCESSOR = joblib.load(vectorized_path)
            print("✅ VectorizedPreprocessor chargé (étape 4 optimisée)")
            return PREPROCESSOR
        except Exception as e:
            print(f"⚠️ Chargement vectorized échoué, fallback : {e}")
            PREPROCESSOR = None

    # Otherwise wrap the legacy preprocessor, if one was saved.
    base_path = Path("models") / "preprocessor.joblib"
    base_transformer = None
    if base_path.exists():
        try:
            loaded = joblib.load(base_path)  # SECURITY: see note above.
        except Exception as e:
            # Previously swallowed silently; report it so a corrupt artefact
            # is visible in the logs, then fall back to a default transformer.
            print(f"⚠️ Chargement de {base_path} échoué, fallback : {e}")
        else:
            if isinstance(loaded, VectorizedPreprocessor):
                # Already the vectorized flavour: nothing to wrap.
                PREPROCESSOR = loaded
                print("✅ VectorizedPreprocessor chargé depuis preprocessor.joblib")
                return PREPROCESSOR
            if isinstance(loaded, RawToModelTransformer):
                base_transformer = loaded
            else:
                print(
                    f"⚠️ Type inattendu dans {base_path} "
                    f"({type(loaded).__name__}), ignoré"
                )

    if base_transformer is None:
        # No usable file: start from a default RawToModelTransformer.
        base_transformer = RawToModelTransformer()

    PREPROCESSOR = VectorizedPreprocessor(base_transformer)
    print("✅ VectorizedPreprocessor créé (étape 4 optimisée)")

    # Persist for the next start-up; a failure here must not block serving.
    try:
        vectorized_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(PREPROCESSOR, vectorized_path)
        print(f"✅ VectorizedPreprocessor sauvegardé → {vectorized_path}")
    except Exception as e:
        print(f"⚠️ Sauvegarde échouée (non bloquant) : {e}")
    return PREPROCESSOR
def _validate_payload(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Check that a decoded JSON payload is a non-empty, flat object.

    Args:
        payload: The value produced by ``json.loads``.

    Returns:
        The same ``payload`` object, unchanged, when it is valid.

    Raises:
        ValueError: If ``payload`` is not a dict, is empty, or holds a list
            or dict as one of its values.
    """
    if not isinstance(payload, dict):
        raise ValueError("Le JSON doit être un objet (clé/valeur).")
    if len(payload) == 0:
        raise ValueError("Le JSON est vide.")

    # None is accepted as a value; only nested containers are rejected.
    nested_types = (list, dict)
    no_offender = object()
    key = next(
        (name for name, item in payload.items() if isinstance(item, nested_types)),
        no_offender,
    )
    if key is not no_offender:
        raise ValueError(f"La valeur de '{key}' doit être scalaire.")
    return payload
def _parse_json_line(json_line: str) -> pd.DataFrame:
    """Parse a single JSON line into a one-row, all-numeric DataFrame.

    Steps:
      1. Decode and validate the JSON object.
      2. Replace ``""`` with NaN and the strings ``"True"``/``"False"`` with
         booleans.
      3. Sanitize column names (every character outside ``[a-zA-Z0-9_]``
         becomes ``_``) so they match the names used at training time.
      4. Convert to numeric only the columns where that loses nothing.
      5. Run the preprocessor, unless more than half of its expected
         features are already present (data assumed already processed).
      6. Force every remaining column to numeric; leftover strings become
         NaN.

    Args:
        json_line: A JSON object serialized as text.

    Returns:
        A one-row DataFrame whose columns are all numeric.

    Raises:
        ValueError: If the text is not valid JSON or the payload is rejected
            by ``_validate_payload``.
    """
    try:
        raw = json.loads(json_line)
    except json.JSONDecodeError as exc:
        raise ValueError("JSON invalide. Vérifie la syntaxe.") from exc
    payload = _validate_payload(raw)

    df = pd.DataFrame([payload])
    df = df.replace({"": np.nan, "True": True, "False": False})

    # The regex alone already maps spaces to "_", so no separate replace()
    # is needed. Example: "BURO_CREDIT_ACTIVE_Bad debt_MEAN" becomes
    # "BURO_CREDIT_ACTIVE_Bad_debt_MEAN".
    df.columns = [_re.sub(r'[^a-zA-Z0-9_]', '_', c) for c in df.columns]

    # Lossless numeric coercion only: a column is converted when doing so
    # introduces no new NaN. Genuine categorical strings are left untouched
    # so the preprocessor can still see them. (Coercing everything first, as
    # before, turned them into NaN before the transformer ran.)
    for col in df.columns:
        numeric = pd.to_numeric(df[col], errors='coerce')
        if numeric.notna().equals(df[col].notna()):
            df[col] = numeric

    # Apply the preprocessor on a best-effort basis. It is skipped when the
    # input already looks processed (>50% of expected features present),
    # because re-processing would overwrite NaN values the model relies on.
    try:
        pre = _load_preprocessor()
        if pre is not None:
            if hasattr(pre, 'get_feature_names_out'):
                expected_feats = set(pre.get_feature_names_out())
            else:
                expected_feats = set()
            overlap = len(set(df.columns) & expected_feats)
            already_processed = bool(expected_feats) and (
                overlap / len(expected_feats) > 0.5
            )
            if not already_processed:
                df = pre.transform(df)
    except Exception as exc:
        # Non-fatal: keep the current frame (the caller re-aligns columns),
        # but report the failure instead of hiding it.
        print(f"⚠️ Preprocessing ignoré (non bloquant) : {exc}")

    # Final guarantee: numeric dtypes only (LightGBM rejects object/str
    # columns). Any string still present becomes NaN.
    if isinstance(df, pd.DataFrame):
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df
def _get_model_feature_names(model) -> list | None:
"""Try to obtain the model's expected feature names.
Tries common LightGBM / sklearn attributes first, then falls back to
reading the header of `data/processed/features_train.csv`.
Returns a list of column names or None if not found.
"""
# 1) common LightGBM / sklearn attributes
try:
fn = getattr(model, "feature_name", None)
if callable(fn):
names = list(fn())
if names:
return names
except Exception:
pass
names = getattr(model, "feature_name_", None)
if isinstance(names, (list, tuple)):
return list(names)
# LightGBM scikit-learn wrapper exposes `booster_`
try:
if hasattr(model, "booster_") and getattr(model.booster_, "feature_name", None):
return list(model.booster_.feature_name())
except Exception:
pass
# 2) Fallback to header from the preprocessed training CSV
try:
header_path = Path("data/processed/features_train.csv")
if header_path.exists():
df_header = pd.read_csv(header_path, nrows=0)
cols = [c for c in df_header.columns if c not in ("SK_ID_CURR", "TARGET")]
# Apply same sanitization as training notebook (spaces → _, non-alnum → _)
cols = [_re.sub(r'[^a-zA-Z0-9_]', '_', c.replace(' ', '_')) for c in cols]
if cols:
return cols
except Exception:
pass
return None
# EXPLICATION : Fonction helper pour logger chaque prédiction avec tous les champs requis
# IMPORTANT : Écrit DIRECTEMENT dans le fichier (pas de FileHandler)
# pour éviter les problèmes d'interférence avec Gradio/autres loggers
def log_prediction(input_raw: str, input_features: dict, output_proba: float,
output_decision: str, execution_time_ms: float, error: str = None):
"""Log une prédiction au format JSON structuré dans logs/predictions.jsonl."""
try:
# Chemin absolu ancré sur app.py → fonctionne quel que soit le cwd de lancement
_log_dir = Path(__file__).resolve().parent / "logs"
_log_dir.mkdir(parents=True, exist_ok=True)
# EXPLICATION : Construit l'entrée JSON
log_entry = {
"timestamp": datetime.utcnow().isoformat() + "Z",
"input_raw": input_raw,
"input_features": input_features,
"output_proba": round(output_proba, 4) if output_proba is not None else None,
"output_decision": output_decision,
"execution_time_ms": round(execution_time_ms, 1),
"error": error,
"model_version": "models:/LightGBM/Production",
"threshold": 0.4
}
# EXPLICATION : Écrit DIRECTEMENT dans le fichier (robuste à Gradio)
# Mode "a" = append, newline assuré après chaque log
log_line = json.dumps(log_entry, ensure_ascii=False) + "\n"
log_file = _log_dir / "predictions.jsonl"
with open(log_file, "a", encoding="utf-8") as f:
f.write(log_line)
f.flush() # Force l'écriture immédiate (important pour le suivi en temps réel)
# EXPLICATION : Aussi afficher dans la console pour Docker/HF Spaces
print(f"[LOG] {log_line.strip()}")
except Exception as exc:
# EXPLICATION : N'échoue pas silencieusement si le logging échoue
print(f"[ERROR] Logging échoué : {exc}", flush=True)
# === VERSION OPTIMISÉE 4.4 - Gain 15.7x ===
# Remplace l'ancienne _predict (boucle ligne par ligne)
# par une version vectorisée pandas : prétraitement en une seule opération.
def _predict(json_line: str, threshold: float = 0.4) -> str:
"""Predict default probability and return a formatted response.
Version optimisée étape 4 (15.7x plus rapide - vectorisation pandas)
"""
# Capture du temps de début pour calculer execution_time_ms
start_time = time.perf_counter()
try:
# === ÉTAPE 1 : Validation JSON (fail-fast avant tout traitement) ===
try:
raw = json.loads(json_line)
except json.JSONDecodeError as exc:
raise ValueError("JSON invalide. Vérifie la syntaxe.") from exc
_validate_payload(raw)
# === ÉTAPE 2 : Preprocessing vectorisé (cœur du gain 15.7x) ===
# VectorizedPreprocessor.transform_one_sample construit le DataFrame
# depuis le dict en UNE seule opération pandas (pas de boucle).
prep = _load_preprocessor()
if prep is not None and isinstance(prep, VectorizedPreprocessor):
# Chemin optimisé : VectorizedPreprocessor (vectorisation pandas)
df = prep.transform_one_sample(json_line)
else:
# Fallback : ancien chemin (RawToModelTransformer ligne par ligne)
df = _parse_json_line(json_line)
# === ÉTAPE 3 : Alignement colonnes sur les features attendues du modèle ===
# fill_value=np.nan (pas 0) : LightGBM utilise ses splits natifs manquants
model = _load_model()
expected = _get_model_feature_names(model)
if expected:
df = df.reindex(columns=expected, fill_value=np.nan)
# Garantie finale : toutes les colonnes numériques (LightGBM requirement)
# NaN préservés — LightGBM les gère nativement.
for col in df.columns:
df[col] = pd.to_numeric(df[col], errors='coerce')
# === ÉTAPE 4 : Inférence LightGBM (predict_proba vectorisé) ===
try:
proba = float(model.predict_proba(df)[:, 1][0])
except AttributeError:
# Fallback pour les modèles exposant predict() retournant des probabilités
proba = float(model.predict(df)[0])
if not 0.0 <= proba <= 1.0:
raise ValueError("La probabilité prédite est hors de l'intervalle [0, 1].")
score = int(proba * 1000)
# Seuil de décision : < threshold = Accordé (risque faible)
decision = "Accordé" if proba < threshold else "Refusé"
# === ÉTAPE 5 : Log structuré de la prédiction réussie ===
execution_time_ms = (time.perf_counter() - start_time) * 1000
log_prediction(
input_raw=json_line,
input_features=raw,
output_proba=proba,
output_decision=decision,
execution_time_ms=execution_time_ms,
error=None
)
return (
f"Score: {score}\n"
f"Probabilité de défaut: {proba:.4f}\n"
f"Décision: {decision}"
)
except ValueError as exc:
# Log de l'erreur avec temps d'exécution et message d'erreur
execution_time_ms = (time.perf_counter() - start_time) * 1000
try:
input_features = json.loads(json_line)
except Exception:
input_features = {}
log_prediction(
input_raw=json_line,
input_features=input_features,
output_proba=None,
output_decision="Erreur",
execution_time_ms=execution_time_ms,
error=f"ValueError: {exc}"
)
return f"Erreur: {exc}"
except KeyError as exc:
execution_time_ms = (time.perf_counter() - start_time) * 1000
try:
input_features = json.loads(json_line)
except Exception:
input_features = {}
log_prediction(
input_raw=json_line,
input_features=input_features,
output_proba=None,
output_decision="Erreur",
execution_time_ms=execution_time_ms,
error=f"KeyError: {exc}"
)
return f"Erreur: colonne manquante ({exc})."
except TypeError as exc:
execution_time_ms = (time.perf_counter() - start_time) * 1000
try:
input_features = json.loads(json_line)
except Exception:
input_features = {}
log_prediction(
input_raw=json_line,
input_features=input_features,
output_proba=None,
output_decision="Erreur",
execution_time_ms=execution_time_ms,
error=f"TypeError: {exc}"
)
return f"Erreur: type invalide ({exc})."
except Exception as exc: # noqa: BLE001
execution_time_ms = (time.perf_counter() - start_time) * 1000
try:
input_features = json.loads(json_line)
except Exception:
input_features = {}
log_prediction(
input_raw=json_line,
input_features=input_features,
output_proba=None,
output_decision="Erreur",
execution_time_ms=execution_time_ms,
error=f"Exception: {exc}"
)
return f"Erreur inattendue: {exc}"
def build_demo() -> gr.Blocks:
    """Assemble the Gradio interface and return it without launching it.

    The page holds an introduction, a row with the JSON input box and the
    result box, a button wired to ``_predict``, and a note about the default
    decision threshold.
    """
    intro_text = (
        "# Credit Scoring API\n"
        "Saisis une seule ligne JSON avec les variables d'entrée.\n"
        "Le modèle LightGBM retourne une probabilité de défaut, un score, et une décision.\n"
        "*Version optimisée étape 4 (15.7x plus rapide - vectorisation pandas)*"
    )
    threshold_note = "**Note:** Le seuil de décision est fixé à 0.4 par défaut."

    with gr.Blocks(title="Credit Scoring API") as demo:
        gr.Markdown(intro_text)

        with gr.Row():
            payload_box = gr.Textbox(
                label="JSON (ligne unique)",
                lines=12,
                max_lines=30,
                placeholder='{"feature1": value1, "feature2": value2, ...}'
            )
            result_box = gr.Textbox(label="Résultat", lines=5)

        # Only the JSON text is passed, so _predict runs with its default
        # threshold.
        run_button = gr.Button("Prédire")
        run_button.click(fn=_predict, inputs=[payload_box], outputs=[result_box])

        gr.Markdown(threshold_note)
    return demo
# Built at import time so the interface is available as a module attribute.
demo = build_demo()

if __name__ == "__main__":
    # Listen on all interfaces; the port comes from PORT (default 7860).
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)