Spaces:
Sleeping
Sleeping
| """Gradio app for Credit Scoring using an MLflow LightGBM model.""" | |
| import json | |
| from typing import Any, Dict | |
| # EXPLICATION : Imports nécessaires pour le logging structuré JSON | |
| import logging | |
| import time | |
| from datetime import datetime | |
| # EXPLICATION : Path pour gestion robuste des chemins de logs (multi-plateforme) | |
| from pathlib import Path | |
# Compatibility shim: HF Spaces may install a `huggingface_hub` that no longer
# exports `HfFolder` (used by older Gradio 4.x oauth). When the real package is
# importable, only its missing symbols are patched in; a minimal stand-in module
# is registered only when the package cannot be imported at all, so the real
# implementation is never shadowed.
import os

try:
    import huggingface_hub as _hf  # prefer the real package when available
except Exception:
    # Any import-time failure is treated the same as "package not installed".
    _hf = None


class HfFolder:
    """Minimal stand-in for `huggingface_hub.HfFolder`, backed by env vars."""

    @staticmethod
    def get_token():
        """Return the token from HF_TOKEN or HUGGINGFACE_HUB_TOKEN, else None.

        Declared as a static method so that both `HfFolder.get_token()` and
        `HfFolder().get_token()` work; a plain function defined in the class
        body without `self` fails on the instance form.
        """
        return os.environ.get('HF_TOKEN') or os.environ.get('HUGGINGFACE_HUB_TOKEN')


def whoami(token=None):
    """Stand-in for `huggingface_hub.whoami`: always returns an empty dict."""
    return {}


if _hf is not None:
    # Patch only missing symbols to preserve the real package's behaviour.
    if not hasattr(_hf, 'HfFolder'):
        _hf.HfFolder = HfFolder
    if not hasattr(_hf, 'whoami'):
        _hf.whoami = whoami
else:
    import sys
    import types

    # Register a synthetic module so later `import huggingface_hub` succeeds.
    _mod = types.ModuleType('huggingface_hub')
    _mod.HfFolder = HfFolder
    _mod.whoami = whoami
    sys.modules['huggingface_hub'] = _mod
| import re as _re | |
| import gradio as gr | |
| import mlflow | |
| import mlflow.lightgbm | |
| import pandas as pd | |
| import numpy as np | |
| from pathlib import Path | |
| # joblib est requis pour charger le preprocessor vectorisé (etape 4 optimisée) | |
| import joblib | |
| # Lightweight transformer to accept "raw" payloads (categorical strings, booleans) | |
| # === VERSION OPTIMISÉE 4.4 - Gain 15.7x : import VectorizedPreprocessor === | |
| from src.preprocessing import RawToModelTransformer, VectorizedPreprocessor | |
# Module-level cache for the model; filled on first call to `_load_model`
# (lazy loading, so importing this module does not load the model).
MODEL = None


def _load_model():
    """Return the cached model, loading it on the first call.

    Lookup order:
      1. The path in the LOCAL_MODEL_PATH environment variable, when set.
      2. `models/lightgbm.txt` next to this file, then under the current
         working directory.
      3. The MLflow Model Registry entry `models:/LightGBM/Production`.

    A local file that exists but fails to load is reported on stdout and the
    next candidate is tried.

    Returns:
        The loaded model (a `lightgbm.Booster` for local files, or whatever
        `mlflow.lightgbm.load_model` returns for the registry fallback).

    Raises:
        RuntimeError: If no local file could be loaded and the MLflow
            registry lookup also failed.
    """
    global MODEL
    if MODEL is not None:
        return MODEL

    import lightgbm as lgb

    # 1) Local model file (primary: portable for Docker / HF Spaces)
    search_paths = [
        Path(__file__).resolve().parent / "models" / "lightgbm.txt",
        Path.cwd() / "models" / "lightgbm.txt",
    ]
    override = os.environ.get("LOCAL_MODEL_PATH")
    if override:
        # An explicit override is tried before the default locations.
        search_paths.insert(0, Path(override))

    for model_path in search_paths:
        if not model_path.exists():
            continue
        try:
            MODEL = lgb.Booster(model_file=str(model_path))
            print(f"Loaded local LightGBM model from {model_path}")
            return MODEL
        except Exception as err:
            print(f"Warning: failed to load {model_path}: {err}")

    # 2) Fallback: MLflow Model Registry (for local dev)
    try:
        MODEL = mlflow.lightgbm.load_model("models:/LightGBM/Production")
        print("Loaded model from MLflow registry")
        return MODEL
    except Exception as mlflow_err:
        tried = [str(p) for p in search_paths]
        raise RuntimeError(
            f"No local model found at {tried} "
            f"and MLflow registry failed: {mlflow_err}. "
            "Place the model at `models/lightgbm.txt` or set LOCAL_MODEL_PATH."
        ) from mlflow_err
# Module-level cache for the preprocessor (accepts "raw" input and maps it to
# model features); filled on first call to `_load_preprocessor`.
PREPROCESSOR = None


def _load_preprocessor():
    """Return the cached preprocessor, building it on the first call.

    Resolution order:
      1. `models/preprocessor_vectorized.joblib`: the stored object is
         returned as loaded, without a type check.
      2. `models/preprocessor.joblib`: returned directly when it already
         holds a `VectorizedPreprocessor`; wrapped in one when it holds a
         `RawToModelTransformer`; ignored otherwise.
      3. A freshly constructed `RawToModelTransformer`, wrapped in a
         `VectorizedPreprocessor`.

    When a new wrapper is built (cases 2 and 3), saving it to
    `models/preprocessor_vectorized.joblib` is attempted; a failed save is
    reported on stdout and does not prevent the preprocessor from being
    returned. Both paths are relative to the current working directory.

    Returns:
        The preprocessor instance, also stored in the module-level
        `PREPROCESSOR`.
    """
    global PREPROCESSOR
    if PREPROCESSOR is not None:
        return PREPROCESSOR

    # NOTE(security): joblib.load unpickles arbitrary Python objects. Only
    # artifacts shipped with the application should ever sit at these paths.
    vectorized_path = Path("models") / "preprocessor_vectorized.joblib"
    if vectorized_path.exists():
        try:
            PREPROCESSOR = joblib.load(vectorized_path)
            print("✅ VectorizedPreprocessor chargé (étape 4 optimisée)")
            return PREPROCESSOR
        except Exception as e:
            print(f"⚠️ Chargement vectorized échoué, fallback : {e}")
            PREPROCESSOR = None

    # Auto-creation: wrap the older preprocessor artifact when one is usable.
    base_path = Path("models") / "preprocessor.joblib"
    base_transformer = None
    if base_path.exists():
        try:
            loaded = joblib.load(base_path)
        except Exception as e:
            # Previously swallowed silently; report it and fall back.
            print(f"⚠️ Chargement de {base_path} échoué, fallback : {e}")
        else:
            if isinstance(loaded, VectorizedPreprocessor):
                # The file already holds a vectorized preprocessor: use it as is.
                PREPROCESSOR = loaded
                print("✅ VectorizedPreprocessor chargé depuis preprocessor.joblib")
                return PREPROCESSOR
            if isinstance(loaded, RawToModelTransformer):
                base_transformer = loaded
            else:
                # Unknown artifact type: do not wrap it, but say so.
                print(
                    f"⚠️ Type inattendu dans {base_path} "
                    f"({type(loaded).__name__}), fallback"
                )

    if base_transformer is None:
        # No usable file: start from a default RawToModelTransformer.
        base_transformer = RawToModelTransformer()

    PREPROCESSOR = VectorizedPreprocessor(base_transformer)
    print("✅ VectorizedPreprocessor créé (étape 4 optimisée)")

    # Persist for subsequent start-ups; failure here is non-blocking.
    try:
        vectorized_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(PREPROCESSOR, vectorized_path)
        print(f"✅ VectorizedPreprocessor sauvegardé → {vectorized_path}")
    except Exception as e:
        print(f"⚠️ Sauvegarde échouée (non bloquant) : {e}")
    return PREPROCESSOR
def _validate_payload(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Check that a decoded JSON payload is a non-empty, flat mapping.

    Args:
        payload: The object produced by decoding the user's JSON line.

    Returns:
        The same `payload` object, unchanged, when it is valid.

    Raises:
        ValueError: If `payload` is not a dict, is empty, or has a list or
            dict among its values.
    """
    if not isinstance(payload, dict):
        raise ValueError("Le JSON doit être un objet (clé/valeur).")
    if len(payload) == 0:
        raise ValueError("Le JSON est vide.")

    # Only lists and dicts are rejected; None passes through (the original
    # author notes that LightGBM handles missing values natively).
    nested_keys = [
        name for name, item in payload.items() if isinstance(item, (list, dict))
    ]
    if nested_keys:
        # Report the first offending key, in payload order.
        raise ValueError(f"La valeur de '{nested_keys[0]}' doit être scalaire.")
    return payload
def _parse_json_line(json_line: str) -> pd.DataFrame:
    """Parse a single JSON line into a one-row DataFrame.

    Steps:
      1. Decode the JSON and validate it with `_validate_payload`.
      2. Replace empty strings with NaN and the strings "True"/"False" with
         booleans.
      3. Sanitize column names (any character outside [a-zA-Z0-9_] becomes
         `_`).
      4. Coerce every column with `pd.to_numeric(errors='coerce')`.
      5. Apply the preprocessor, unless more than half of its expected
         feature names are already present as columns.

    Args:
        json_line: JSON text describing one sample as a flat object.

    Returns:
        A one-row DataFrame. If the preprocessor fails, the frame from
        step 4 is returned and the failure is reported on stdout.

    Raises:
        ValueError: If the text is not valid JSON or fails validation.
    """
    try:
        raw = json.loads(json_line)
    except json.JSONDecodeError as exc:
        raise ValueError("JSON invalide. Vérifie la syntaxe.") from exc
    payload = _validate_payload(raw)

    # Build a single-row DataFrame and sanitize common problematic inputs:
    # - empty strings become NaN so numeric coercion / imputation works
    # - string booleans ("True"/"False") become actual booleans
    df = pd.DataFrame([payload])
    df = df.replace({"": np.nan, "True": True, "False": False})

    # Sanitize column names so they match the names the model expects. Per
    # the original author's note, the model was trained on sanitized names,
    # so "BURO_CREDIT_ACTIVE_Bad debt_MEAN" must become
    # "BURO_CREDIT_ACTIVE_Bad_debt_MEAN" or the column is lost at alignment.
    df.columns = [_re.sub(r'[^a-zA-Z0-9_]', '_', c.replace(' ', '_')) for c in df.columns]

    # Force all columns to numeric dtypes; values that cannot be parsed as
    # numbers become NaN.
    # NOTE(review): this runs BEFORE the preprocessor below, so categorical
    # strings (e.g. NAME_CONTRACT_TYPE) are already NaN when the preprocessor
    # sees them. Confirm whether coercion should move after `pre.transform`.
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Apply the preprocessor on a best-effort basis. It is skipped when the
    # input already looks processed (more than 50% of the expected feature
    # names are present): per the original author's note, running it again
    # would replace NaN with imputed values that the model relies on.
    try:
        pre = _load_preprocessor()
        if pre is not None:
            expected_feats = (
                set(pre.get_feature_names_out())
                if hasattr(pre, 'get_feature_names_out')
                else set()
            )
            overlap = len(set(df.columns) & expected_feats)
            already_processed = bool(expected_feats) and overlap / len(expected_feats) > 0.5
            if not already_processed:
                df = pre.transform(df)
    except Exception as exc:
        # Non-fatal: keep the current df (the caller's alignment step fills
        # missing columns), but report the failure instead of hiding it.
        print(f"⚠️ Preprocessing ignoré (non bloquant) : {exc}")
    return df
| def _get_model_feature_names(model) -> list | None: | |
| """Try to obtain the model's expected feature names. | |
| Tries common LightGBM / sklearn attributes first, then falls back to | |
| reading the header of `data/processed/features_train.csv`. | |
| Returns a list of column names or None if not found. | |
| """ | |
| # 1) common LightGBM / sklearn attributes | |
| try: | |
| fn = getattr(model, "feature_name", None) | |
| if callable(fn): | |
| names = list(fn()) | |
| if names: | |
| return names | |
| except Exception: | |
| pass | |
| names = getattr(model, "feature_name_", None) | |
| if isinstance(names, (list, tuple)): | |
| return list(names) | |
| # LightGBM scikit-learn wrapper exposes `booster_` | |
| try: | |
| if hasattr(model, "booster_") and getattr(model.booster_, "feature_name", None): | |
| return list(model.booster_.feature_name()) | |
| except Exception: | |
| pass | |
| # 2) Fallback to header from the preprocessed training CSV | |
| try: | |
| header_path = Path("data/processed/features_train.csv") | |
| if header_path.exists(): | |
| df_header = pd.read_csv(header_path, nrows=0) | |
| cols = [c for c in df_header.columns if c not in ("SK_ID_CURR", "TARGET")] | |
| # Apply same sanitization as training notebook (spaces → _, non-alnum → _) | |
| cols = [_re.sub(r'[^a-zA-Z0-9_]', '_', c.replace(' ', '_')) for c in cols] | |
| if cols: | |
| return cols | |
| except Exception: | |
| pass | |
| return None | |
| # EXPLICATION : Fonction helper pour logger chaque prédiction avec tous les champs requis | |
| # IMPORTANT : Écrit DIRECTEMENT dans le fichier (pas de FileHandler) | |
| # pour éviter les problèmes d'interférence avec Gradio/autres loggers | |
| def log_prediction(input_raw: str, input_features: dict, output_proba: float, | |
| output_decision: str, execution_time_ms: float, error: str = None): | |
| """Log une prédiction au format JSON structuré dans logs/predictions.jsonl.""" | |
| try: | |
| # Chemin absolu ancré sur app.py → fonctionne quel que soit le cwd de lancement | |
| _log_dir = Path(__file__).resolve().parent / "logs" | |
| _log_dir.mkdir(parents=True, exist_ok=True) | |
| # EXPLICATION : Construit l'entrée JSON | |
| log_entry = { | |
| "timestamp": datetime.utcnow().isoformat() + "Z", | |
| "input_raw": input_raw, | |
| "input_features": input_features, | |
| "output_proba": round(output_proba, 4) if output_proba is not None else None, | |
| "output_decision": output_decision, | |
| "execution_time_ms": round(execution_time_ms, 1), | |
| "error": error, | |
| "model_version": "models:/LightGBM/Production", | |
| "threshold": 0.4 | |
| } | |
| # EXPLICATION : Écrit DIRECTEMENT dans le fichier (robuste à Gradio) | |
| # Mode "a" = append, newline assuré après chaque log | |
| log_line = json.dumps(log_entry, ensure_ascii=False) + "\n" | |
| log_file = _log_dir / "predictions.jsonl" | |
| with open(log_file, "a", encoding="utf-8") as f: | |
| f.write(log_line) | |
| f.flush() # Force l'écriture immédiate (important pour le suivi en temps réel) | |
| # EXPLICATION : Aussi afficher dans la console pour Docker/HF Spaces | |
| print(f"[LOG] {log_line.strip()}") | |
| except Exception as exc: | |
| # EXPLICATION : N'échoue pas silencieusement si le logging échoue | |
| print(f"[ERROR] Logging échoué : {exc}", flush=True) | |
def _log_failed_prediction(json_line: str, start_time: float, error: str) -> None:
    """Record a failed `_predict` call in the prediction log."""
    execution_time_ms = (time.perf_counter() - start_time) * 1000
    # Log the decoded payload when the input is decodable, else an empty dict.
    try:
        input_features = json.loads(json_line)
    except Exception:
        input_features = {}
    log_prediction(
        input_raw=json_line,
        input_features=input_features,
        output_proba=None,
        output_decision="Erreur",
        execution_time_ms=execution_time_ms,
        error=error,
    )


# Prediction entry point used by the Gradio UI. Preprocessing goes through
# VectorizedPreprocessor.transform_one_sample when that preprocessor is in use.
def _predict(json_line: str, threshold: float = 0.4) -> str:
    """Predict the default probability for one sample and format the result.

    Args:
        json_line: JSON text describing one sample as a flat object.
        threshold: Decision cut-off; a probability strictly below it yields
            "Accordé", otherwise "Refusé".

    Returns:
        A three-line text with the score (`int(proba * 1000)`), the
        probability and the decision; or a message starting with "Erreur"
        when anything fails. This function does not raise: every exception
        is logged through `log_prediction` and turned into a message.

    Note:
        `log_prediction` records its own fixed threshold value, so a
        non-default `threshold` is not reflected in the log entry.
    """
    start_time = time.perf_counter()
    try:
        # Step 1: JSON validation (fail fast before any processing).
        try:
            raw = json.loads(json_line)
        except json.JSONDecodeError as exc:
            raise ValueError("JSON invalide. Vérifie la syntaxe.") from exc
        _validate_payload(raw)

        # Step 2: preprocessing. The vectorized preprocessor consumes the raw
        # JSON text directly; any other preprocessor goes through the older
        # `_parse_json_line` path.
        prep = _load_preprocessor()
        if isinstance(prep, VectorizedPreprocessor):
            df = prep.transform_one_sample(json_line)
        else:
            df = _parse_json_line(json_line)

        # Step 3: align columns on the model's expected features. Missing
        # columns are filled with NaN (not 0) so they stay "missing".
        model = _load_model()
        expected = _get_model_feature_names(model)
        if expected:
            df = df.reindex(columns=expected, fill_value=np.nan)

        # Final guarantee: every column numeric; unparseable values become NaN.
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Step 4: inference. Choose the API by capability instead of catching
        # AttributeError, so an AttributeError raised INSIDE predict_proba is
        # not mistaken for "method missing" and silently routed to predict().
        if hasattr(model, "predict_proba"):
            proba = float(model.predict_proba(df)[:, 1][0])
        else:
            # Models exposing only predict() are assumed to return probabilities.
            proba = float(model.predict(df)[0])

        if not 0.0 <= proba <= 1.0:
            raise ValueError("La probabilité prédite est hors de l'intervalle [0, 1].")

        score = int(proba * 1000)
        # Decision rule: below the threshold means low risk, so granted.
        decision = "Accordé" if proba < threshold else "Refusé"

        # Step 5: structured log of the successful prediction.
        execution_time_ms = (time.perf_counter() - start_time) * 1000
        log_prediction(
            input_raw=json_line,
            input_features=raw,
            output_proba=proba,
            output_decision=decision,
            execution_time_ms=execution_time_ms,
            error=None,
        )
        return (
            f"Score: {score}\n"
            f"Probabilité de défaut: {proba:.4f}\n"
            f"Décision: {decision}"
        )
    except ValueError as exc:
        _log_failed_prediction(json_line, start_time, f"ValueError: {exc}")
        return f"Erreur: {exc}"
    except KeyError as exc:
        _log_failed_prediction(json_line, start_time, f"KeyError: {exc}")
        return f"Erreur: colonne manquante ({exc})."
    except TypeError as exc:
        _log_failed_prediction(json_line, start_time, f"TypeError: {exc}")
        return f"Erreur: type invalide ({exc})."
    except Exception as exc:  # noqa: BLE001
        # Top-level boundary for the UI callback: always return a message.
        _log_failed_prediction(json_line, start_time, f"Exception: {exc}")
        return f"Erreur inattendue: {exc}"
def build_demo() -> gr.Blocks:
    """Assemble the Gradio interface and return it without launching it.

    The page holds an introductory Markdown block, a row with the JSON input
    box and the result box, a button wired to `_predict`, and a closing note
    about the default decision threshold.
    """
    intro_text = (
        "# Credit Scoring API\n"
        "Saisis une seule ligne JSON avec les variables d'entrée.\n"
        "Le modèle LightGBM retourne une probabilité de défaut, un score, et une décision.\n"
        "*Version optimisée étape 4 (15.7x plus rapide - vectorisation pandas)*"
    )
    threshold_note = "**Note:** Le seuil de décision est fixé à 0.4 par défaut."

    with gr.Blocks(title="Credit Scoring API") as demo:
        gr.Markdown(intro_text)

        # Input and output sit side by side.
        with gr.Row():
            input_json = gr.Textbox(
                label="JSON (ligne unique)",
                lines=12,
                max_lines=30,
                placeholder='{"feature1": value1, "feature2": value2, ...}',
            )
            output_text = gr.Textbox(label="Résultat", lines=5)

        # Only the JSON text is passed, so `_predict` uses its default threshold.
        predict_btn = gr.Button("Prédire")
        predict_btn.click(fn=_predict, inputs=[input_json], outputs=[output_text])

        gr.Markdown(threshold_note)

    return demo
# The interface is built at import time and exposed as module-level `demo`.
demo = build_demo()

if __name__ == "__main__":
    # Listen on all interfaces; the port comes from PORT, defaulting to 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)