Spaces:

ASI-Engineer
/

OC_P8_prod

Sleeping

GitHub Actions

Sync to HF Spaces [no-ci]

178345a 7 days ago

17.5 kB

	"""Gradio app for Credit Scoring using an MLflow LightGBM model."""

	import json
	from typing import Any, Dict
	# EXPLICATION : Imports nécessaires pour le logging structuré JSON
	import logging
	import time
	from datetime import datetime
	# EXPLICATION : Path pour gestion robuste des chemins de logs (multi-plateforme)
	from pathlib import Path

	# Compatibility shim: HF Spaces may install a `huggingface_hub` that no longer
	# exports `HfFolder` (used by older Gradio 4.x oauth). Try to import and patch
	# the real `huggingface_hub` when available; only create a minimal shim if the
	# package is absent so we don't shadow the real implementation.
	import os
	try:
	    import huggingface_hub as _hf  # prefer the real package when available
	except Exception:
	    # Any import-time failure is handled like an absent package.
	    _hf = None

	if _hf is None:
	    # Package absent: register a minimal stand-in under the same module name
	    # so that later `import huggingface_hub` statements resolve to it.
	    import sys
	    import types

	    class HfFolder:
	        # Stand-in exposing only `get_token`, backed by environment variables.
	        @staticmethod
	        def get_token():
	            primary = os.environ.get('HF_TOKEN')
	            return primary or os.environ.get('HUGGINGFACE_HUB_TOKEN')

	    def whoami(token=None):
	        # Stand-in: always reports an empty identity.
	        return {}

	    _mod = types.ModuleType('huggingface_hub')
	    _mod.HfFolder = HfFolder
	    _mod.whoami = whoami
	    sys.modules['huggingface_hub'] = _mod
	else:
	    # Real package present: add only the symbols it no longer exports so
	    # its genuine behaviour is preserved everywhere else.
	    if not hasattr(_hf, 'HfFolder'):
	        class HfFolder:
	            # Replacement exposing only `get_token`, backed by environment variables.
	            @staticmethod
	            def get_token():
	                primary = os.environ.get('HF_TOKEN')
	                return primary or os.environ.get('HUGGINGFACE_HUB_TOKEN')

	        _hf.HfFolder = HfFolder

	    if not hasattr(_hf, 'whoami'):
	        def whoami(token=None):
	            # Replacement: always reports an empty identity.
	            return {}

	        _hf.whoami = whoami

	import re as _re

	import gradio as gr
	import mlflow
	import mlflow.lightgbm
	import pandas as pd
	import numpy as np
	from pathlib import Path

	# joblib is required to load the vectorized preprocessor (optimized step 4)
	import joblib

	# Lightweight transformer to accept "raw" payloads (categorical strings, booleans)
	# === VERSION OPTIMISÉE 4.4 - Gain 15.7x : import VectorizedPreprocessor ===
	from src.preprocessing import RawToModelTransformer, VectorizedPreprocessor


	# Process-wide cache for the model; it is filled lazily on first use so that
	# importing this module (e.g. in tests) does not require a model file.
	MODEL = None


	def _load_model():
	    """Return the cached model, loading it on the first call.

	    Lookup order:
	    1. A local LightGBM text model: the path in the ``LOCAL_MODEL_PATH``
	       environment variable (when set), then ``models/lightgbm.txt`` next to
	       this file, then ``models/lightgbm.txt`` under the current directory.
	    2. The MLflow registry entry ``models:/LightGBM/Production``.

	    Raises:
	        RuntimeError: If no local file could be loaded and the MLflow
	            registry lookup failed as well.
	    """
	    global MODEL
	    if MODEL is not None:
	        return MODEL

	    import lightgbm as lgb

	    # 1) Local model file (primary source)
	    search_paths = [
	        Path(__file__).resolve().parent / "models" / "lightgbm.txt",
	        Path.cwd() / "models" / "lightgbm.txt",
	    ]
	    override = os.environ.get("LOCAL_MODEL_PATH")
	    if override:
	        # An explicit override is tried before the default locations.
	        search_paths = [Path(override), *search_paths]

	    for model_path in search_paths:
	        if not model_path.exists():
	            continue
	        try:
	            MODEL = lgb.Booster(model_file=str(model_path))
	        except Exception as err:
	            # A file that exists but cannot be loaded is skipped, not fatal.
	            print(f"Warning: failed to load {model_path}: {err}")
	        else:
	            print(f"Loaded local LightGBM model from {model_path}")
	            return MODEL

	    # 2) Fallback: MLflow Model Registry
	    try:
	        MODEL = mlflow.lightgbm.load_model("models:/LightGBM/Production")
	        print("Loaded model from MLflow registry")
	    except Exception as mlflow_err:
	        raise RuntimeError(
	            f"No local model found at {[str(path) for path in search_paths]} "
	            f"and MLflow registry failed: {mlflow_err}. "
	            "Place the model at `models/lightgbm.txt` or set LOCAL_MODEL_PATH."
	        ) from mlflow_err
	    return MODEL


	# Process-wide cache for the preprocessor that maps "raw" input to model features.
	PREPROCESSOR = None


	def _load_preprocessor():
	    """Return the cached preprocessor, building it on the first call.

	    Resolution order:
	    1. ``models/preprocessor_vectorized.joblib`` is loaded and returned as is.
	    2. ``models/preprocessor.joblib`` is loaded; a ``VectorizedPreprocessor``
	       is returned as is, a ``RawToModelTransformer`` is wrapped in a
	       ``VectorizedPreprocessor``.
	    3. Otherwise a default ``RawToModelTransformer()`` is wrapped.

	    When a wrapper is created (cases 2 and 3) it is saved, best-effort, to
	    ``models/preprocessor_vectorized.joblib`` for later start-ups. Both
	    paths are relative to the current working directory.
	    """
	    global PREPROCESSOR
	    if PREPROCESSOR is not None:
	        return PREPROCESSOR

	    # NOTE(security): joblib.load unpickles arbitrary objects. These files
	    # must only ever come from trusted build artefacts, never from user input.
	    vectorized_path = Path("models") / "preprocessor_vectorized.joblib"
	    if vectorized_path.exists():
	        try:
	            PREPROCESSOR = joblib.load(vectorized_path)
	            print("✅ VectorizedPreprocessor chargé (étape 4 optimisée)")
	            return PREPROCESSOR
	        except Exception as e:
	            print(f"⚠️ Chargement vectorized échoué, fallback : {e}")
	            PREPROCESSOR = None

	    # Build the wrapper from the older, non-vectorized preprocessor file.
	    base_path = Path("models") / "preprocessor.joblib"
	    base_transformer = None
	    if base_path.exists():
	        try:
	            base_transformer = joblib.load(base_path)
	        except Exception as e:
	            # Report the failure instead of hiding it, then use the default.
	            print(f"⚠️ Chargement de {base_path} échoué, fallback : {e}")
	            base_transformer = None
	        else:
	            # The file may already contain a VectorizedPreprocessor.
	            if isinstance(base_transformer, VectorizedPreprocessor):
	                PREPROCESSOR = base_transformer
	                print("✅ VectorizedPreprocessor chargé depuis preprocessor.joblib")
	                return PREPROCESSOR
	            if not isinstance(base_transformer, RawToModelTransformer):
	                print(
	                    f"⚠️ Type inattendu dans {base_path} "
	                    f"({type(base_transformer).__name__}), fallback"
	                )
	                base_transformer = None

	    if base_transformer is None:
	        # Nothing usable on disk: start from a default transformer.
	        base_transformer = RawToModelTransformer()

	    PREPROCESSOR = VectorizedPreprocessor(base_transformer)
	    print("✅ VectorizedPreprocessor créé (étape 4 optimisée)")

	    # Persist for the next start-up; a failure here must not block serving.
	    try:
	        vectorized_path.parent.mkdir(parents=True, exist_ok=True)
	        joblib.dump(PREPROCESSOR, vectorized_path)
	        print(f"✅ VectorizedPreprocessor sauvegardé → {vectorized_path}")
	    except Exception as e:
	        print(f"⚠️ Sauvegarde échouée (non bloquant) : {e}")

	    return PREPROCESSOR

	def _validate_payload(payload: Dict[str, Any]) -> Dict[str, Any]:
	    """Check that the decoded JSON is a non-empty, flat object.

	    Args:
	        payload: The value obtained from decoding the input JSON.

	    Returns:
	        The same ``payload`` object, unchanged.

	    Raises:
	        ValueError: If ``payload`` is not a dict, is empty, or holds a list
	            or dict as one of its values.
	    """
	    if not isinstance(payload, dict):
	        raise ValueError("Le JSON doit être un objet (clé/valeur).")
	    if not payload:
	        raise ValueError("Le JSON est vide.")

	    # Only nested containers are rejected; None and every other scalar pass.
	    for name in payload:
	        if isinstance(payload[name], (list, dict)):
	            raise ValueError(f"La valeur de '{name}' doit être scalaire.")

	    return payload


	def _parse_json_line(json_line: str) -> pd.DataFrame:
	    """Parse a single JSON line into a one-row, numeric DataFrame.

	    Steps: decode and validate the JSON, build a one-row frame, normalise
	    empty strings and string booleans, sanitise column names, run the
	    preprocessor (unless the input already looks processed), and finally
	    coerce every column to a numeric dtype.

	    Args:
	        json_line: A JSON object encoded as text.

	    Returns:
	        A one-row DataFrame whose columns are numeric; values that cannot
	        be converted become NaN.

	    Raises:
	        ValueError: If the text is not valid JSON or fails payload validation.
	    """
	    try:
	        raw = json.loads(json_line)
	    except json.JSONDecodeError as exc:
	        raise ValueError("JSON invalide. Vérifie la syntaxe.") from exc

	    payload = _validate_payload(raw)

	    # Empty strings become NaN and "True"/"False" become real booleans.
	    df = pd.DataFrame([payload])
	    df = df.replace({"": np.nan, "True": True, "False": False})

	    # Sanitise column names the same way as the model's feature names
	    # (anything outside [a-zA-Z0-9_] becomes "_"), so that a column such as
	    # "BURO_CREDIT_ACTIVE_Bad debt_MEAN" lines up with
	    # "BURO_CREDIT_ACTIVE_Bad_debt_MEAN" during column alignment.
	    df.columns = [_re.sub(r'[^a-zA-Z0-9_]', '_', c.replace(' ', '_')) for c in df.columns]

	    # Run the preprocessor BEFORE numeric coercion: coercing first would
	    # turn categorical strings into NaN, leaving the preprocessor nothing
	    # to map onto the model's one-hot columns.
	    #
	    # The preprocessor is skipped when more than half of its expected
	    # output features are already present in the input, i.e. the data is
	    # already processed and must not be processed twice.
	    try:
	        pre = _load_preprocessor()
	        if pre is not None:
	            if hasattr(pre, 'get_feature_names_out'):
	                expected_feats = set(pre.get_feature_names_out())
	            else:
	                expected_feats = set()
	            overlap = len(expected_feats.intersection(df.columns))
	            already_processed = bool(expected_feats) and overlap / len(expected_feats) > 0.5
	            if not already_processed:
	                df = pre.transform(df)
	    except Exception as exc:
	        # Non-fatal: continue with the frame as it is (the caller aligns
	        # columns afterwards), but report the failure instead of hiding it.
	        print(f"⚠️ Preprocessing ignoré (non bloquant) : {exc}")

	    # Final guarantee: numeric dtypes only. Booleans stay usable as 1/0 and
	    # any remaining string becomes NaN.
	    if isinstance(df, pd.DataFrame):
	        df = df.apply(pd.to_numeric, errors='coerce')

	    return df


	def _get_model_feature_names(model) -> list \| None:
	"""Try to obtain the model's expected feature names.

	Tries common LightGBM / sklearn attributes first, then falls back to
	reading the header of `data/processed/features_train.csv`.
	Returns a list of column names or None if not found.
	"""
	# 1) common LightGBM / sklearn attributes
	try:
	fn = getattr(model, "feature_name", None)
	if callable(fn):
	names = list(fn())
	if names:
	return names
	except Exception:
	pass

	names = getattr(model, "feature_name_", None)
	if isinstance(names, (list, tuple)):
	return list(names)

	# LightGBM scikit-learn wrapper exposes `booster_`
	try:
	if hasattr(model, "booster_") and getattr(model.booster_, "feature_name", None):
	return list(model.booster_.feature_name())
	except Exception:
	pass

	# 2) Fallback to header from the preprocessed training CSV
	try:
	header_path = Path("data/processed/features_train.csv")
	if header_path.exists():
	df_header = pd.read_csv(header_path, nrows=0)
	cols = [c for c in df_header.columns if c not in ("SK_ID_CURR", "TARGET")]
	# Apply same sanitization as training notebook (spaces → _, non-alnum → _)
	cols = [_re.sub(r'[^a-zA-Z0-9_]', '_', c.replace(' ', '_')) for c in cols]
	if cols:
	return cols
	except Exception:
	pass

	return None


	# EXPLICATION : Fonction helper pour logger chaque prédiction avec tous les champs requis
	# IMPORTANT : Écrit DIRECTEMENT dans le fichier (pas de FileHandler)
	# pour éviter les problèmes d'interférence avec Gradio/autres loggers
	def log_prediction(input_raw: str, input_features: dict, output_proba: float,
	output_decision: str, execution_time_ms: float, error: str = None):
	"""Log une prédiction au format JSON structuré dans logs/predictions.jsonl."""
	try:
	# Chemin absolu ancré sur app.py → fonctionne quel que soit le cwd de lancement
	_log_dir = Path(__file__).resolve().parent / "logs"
	_log_dir.mkdir(parents=True, exist_ok=True)

	# EXPLICATION : Construit l'entrée JSON
	log_entry = {
	"timestamp": datetime.utcnow().isoformat() + "Z",
	"input_raw": input_raw,
	"input_features": input_features,
	"output_proba": round(output_proba, 4) if output_proba is not None else None,
	"output_decision": output_decision,
	"execution_time_ms": round(execution_time_ms, 1),
	"error": error,
	"model_version": "models:/LightGBM/Production",
	"threshold": 0.4
	}

	# EXPLICATION : Écrit DIRECTEMENT dans le fichier (robuste à Gradio)
	# Mode "a" = append, newline assuré après chaque log
	log_line = json.dumps(log_entry, ensure_ascii=False) + "\n"
	log_file = _log_dir / "predictions.jsonl"

	with open(log_file, "a", encoding="utf-8") as f:
	f.write(log_line)
	f.flush() # Force l'écriture immédiate (important pour le suivi en temps réel)

	# EXPLICATION : Aussi afficher dans la console pour Docker/HF Spaces
	print(f"[LOG] {log_line.strip()}")

	except Exception as exc:
	# EXPLICATION : N'échoue pas silencieusement si le logging échoue
	print(f"[ERROR] Logging échoué : {exc}", flush=True)


	# === VERSION OPTIMISÉE 4.4 - Gain 15.7x ===
	# Remplace l'ancienne _predict (boucle ligne par ligne)
	# par une version vectorisée pandas : prétraitement en une seule opération.
	def _predict(json_line: str, threshold: float = 0.4) -> str:
	"""Predict default probability and return a formatted response.

	Version optimisée étape 4 (15.7x plus rapide - vectorisation pandas)
	"""
	# Capture du temps de début pour calculer execution_time_ms
	start_time = time.perf_counter()

	try:
	# === ÉTAPE 1 : Validation JSON (fail-fast avant tout traitement) ===
	try:
	raw = json.loads(json_line)
	except json.JSONDecodeError as exc:
	raise ValueError("JSON invalide. Vérifie la syntaxe.") from exc
	_validate_payload(raw)

	# === ÉTAPE 2 : Preprocessing vectorisé (cœur du gain 15.7x) ===
	# VectorizedPreprocessor.transform_one_sample construit le DataFrame
	# depuis le dict en UNE seule opération pandas (pas de boucle).
	prep = _load_preprocessor()
	if prep is not None and isinstance(prep, VectorizedPreprocessor):
	# Chemin optimisé : VectorizedPreprocessor (vectorisation pandas)
	df = prep.transform_one_sample(json_line)
	else:
	# Fallback : ancien chemin (RawToModelTransformer ligne par ligne)
	df = _parse_json_line(json_line)

	# === ÉTAPE 3 : Alignement colonnes sur les features attendues du modèle ===
	# fill_value=np.nan (pas 0) : LightGBM utilise ses splits natifs manquants
	model = _load_model()
	expected = _get_model_feature_names(model)
	if expected:
	df = df.reindex(columns=expected, fill_value=np.nan)

	# Garantie finale : toutes les colonnes numériques (LightGBM requirement)
	# NaN préservés — LightGBM les gère nativement.
	for col in df.columns:
	df[col] = pd.to_numeric(df[col], errors='coerce')

	# === ÉTAPE 4 : Inférence LightGBM (predict_proba vectorisé) ===
	try:
	proba = float(model.predict_proba(df)[:, 1][0])
	except AttributeError:
	# Fallback pour les modèles exposant predict() retournant des probabilités
	proba = float(model.predict(df)[0])

	if not 0.0 <= proba <= 1.0:
	raise ValueError("La probabilité prédite est hors de l'intervalle [0, 1].")

	score = int(proba * 1000)
	# Seuil de décision : < threshold = Accordé (risque faible)
	decision = "Accordé" if proba < threshold else "Refusé"

	# === ÉTAPE 5 : Log structuré de la prédiction réussie ===
	execution_time_ms = (time.perf_counter() - start_time) * 1000
	log_prediction(
	input_raw=json_line,
	input_features=raw,
	output_proba=proba,
	output_decision=decision,
	execution_time_ms=execution_time_ms,
	error=None
	)

	return (
	f"Score: {score}\n"
	f"Probabilité de défaut: {proba:.4f}\n"
	f"Décision: {decision}"
	)

	except ValueError as exc:
	# Log de l'erreur avec temps d'exécution et message d'erreur
	execution_time_ms = (time.perf_counter() - start_time) * 1000
	try:
	input_features = json.loads(json_line)
	except Exception:
	input_features = {}
	log_prediction(
	input_raw=json_line,
	input_features=input_features,
	output_proba=None,
	output_decision="Erreur",
	execution_time_ms=execution_time_ms,
	error=f"ValueError: {exc}"
	)
	return f"Erreur: {exc}"
	except KeyError as exc:
	execution_time_ms = (time.perf_counter() - start_time) * 1000
	try:
	input_features = json.loads(json_line)
	except Exception:
	input_features = {}
	log_prediction(
	input_raw=json_line,
	input_features=input_features,
	output_proba=None,
	output_decision="Erreur",
	execution_time_ms=execution_time_ms,
	error=f"KeyError: {exc}"
	)
	return f"Erreur: colonne manquante ({exc})."
	except TypeError as exc:
	execution_time_ms = (time.perf_counter() - start_time) * 1000
	try:
	input_features = json.loads(json_line)
	except Exception:
	input_features = {}
	log_prediction(
	input_raw=json_line,
	input_features=input_features,
	output_proba=None,
	output_decision="Erreur",
	execution_time_ms=execution_time_ms,
	error=f"TypeError: {exc}"
	)
	return f"Erreur: type invalide ({exc})."
	except Exception as exc: # noqa: BLE001
	execution_time_ms = (time.perf_counter() - start_time) * 1000
	try:
	input_features = json.loads(json_line)
	except Exception:
	input_features = {}
	log_prediction(
	input_raw=json_line,
	input_features=input_features,
	output_proba=None,
	output_decision="Erreur",
	execution_time_ms=execution_time_ms,
	error=f"Exception: {exc}"
	)
	return f"Erreur inattendue: {exc}"


	def build_demo() -> gr.Blocks:
	"""Build and return the Gradio Blocks demo.

	The interface consists of an introductory Markdown text, a multi-line
	textbox for the input JSON, a textbox for the result, a "Prédire"
	button and a closing Markdown note.

	Returns:
	The assembled ``gr.Blocks`` object (not launched).
	"""
	with gr.Blocks(title="Credit Scoring API") as demo:
	# Introductory text shown at the top of the page.
	gr.Markdown(
	"# Credit Scoring API\n"
	"Saisis une seule ligne JSON avec les variables d'entrée.\n"
	"Le modèle LightGBM retourne une probabilité de défaut, un score, et une décision.\n"
	"Version optimisée étape 4 (15.7x plus rapide - vectorisation pandas)"
	)

	with gr.Row():
	# Input: the JSON text that is handed to `_predict`.
	input_json = gr.Textbox(
	label="JSON (ligne unique)",
	lines=12,
	max_lines=30,
	placeholder='{"feature1": value1, "feature2": value2, ...}'
	)

	# Output: receives the text returned by `_predict`.
	output_text = gr.Textbox(
	label="Résultat",
	lines=5,
	)

	# Clicking the button calls `_predict` with the input textbox as its only
	# argument, so `_predict` runs with its default threshold.
	predict_btn = gr.Button("Prédire")
	predict_btn.click(
	fn=_predict,
	inputs=[input_json],
	outputs=[output_text],
	)

	gr.Markdown(
	"Note: Le seuil de décision est fixé à 0.4 par défaut."
	)

	return demo


	# Module-level interface object, built when this module is imported.
	demo = build_demo()

	if __name__ == "__main__":
	    # Listen on all interfaces; the port comes from PORT (default 7860).
	    listen_port = int(os.environ.get("PORT", 7860))
	    demo.launch(server_name="0.0.0.0", server_port=listen_port)