Spaces:

ASI-Engineer
/

OC_P8_prod

Running

File size: 17,501 Bytes

178345a

"""Gradio app for Credit Scoring using an MLflow LightGBM model."""

import json
from typing import Any, Dict
# EXPLICATION : Imports nécessaires pour le logging structuré JSON
import logging
import time
from datetime import datetime
# EXPLICATION : Path pour gestion robuste des chemins de logs (multi-plateforme)
from pathlib import Path

# Compatibility shim: HF Spaces may install a `huggingface_hub` that no longer
# exports `HfFolder` (used by older Gradio 4.x oauth). Try to import and patch
# the real `huggingface_hub` when available; only create a minimal shim if the
# package is absent so we don't shadow the real implementation.
import os
try:
    import huggingface_hub as _hf  # the real package wins whenever it imports
except Exception:
    _hf = None

if _hf is None:
    # The package could not be imported: register a minimal stand-in module
    # exposing only the two symbols patched below.
    import sys, types

    class HfFolder:
        @staticmethod
        def get_token():
            # Token is read from the environment; HF_TOKEN takes precedence.
            return os.environ.get('HF_TOKEN') or os.environ.get('HUGGINGFACE_HUB_TOKEN')

    def whoami(token=None):
        # Stub: always reports an empty identity.
        return {}

    _mod = types.ModuleType('huggingface_hub')
    setattr(_mod, 'HfFolder', HfFolder)
    setattr(_mod, 'whoami', whoami)
    sys.modules['huggingface_hub'] = _mod
else:
    # The real package is present: add only the symbols it no longer exports,
    # leaving everything else untouched.
    if not hasattr(_hf, 'HfFolder'):
        class HfFolder:
            @staticmethod
            def get_token():
                # Token is read from the environment; HF_TOKEN takes precedence.
                return os.environ.get('HF_TOKEN') or os.environ.get('HUGGINGFACE_HUB_TOKEN')
        setattr(_hf, 'HfFolder', HfFolder)
    if not hasattr(_hf, 'whoami'):
        def whoami(token=None):
            # Stub: always reports an empty identity.
            return {}
        setattr(_hf, 'whoami', whoami)

import re as _re

import gradio as gr
import mlflow
import mlflow.lightgbm
import pandas as pd
import numpy as np
from pathlib import Path

# joblib est requis pour charger le preprocessor vectorisé (etape 4 optimisée)
import joblib

# Lightweight transformer to accept "raw" payloads (categorical strings, booleans)
# === VERSION OPTIMISÉE 4.4 - Gain 15.7x : import VectorizedPreprocessor ===
from src.preprocessing import RawToModelTransformer, VectorizedPreprocessor


# Process-wide model cache; filled on the first call to `_load_model` so that
# importing this module (e.g. in tests) does not require a model file.
MODEL = None

def _load_model():
	"""Return the scoring model, loading it on the first call.

	Lookup order:
	1. Local LightGBM text model: `LOCAL_MODEL_PATH` (if set), then
	   `models/lightgbm.txt` next to this file, then `models/lightgbm.txt`
	   under the current working directory. The first existing file that
	   loads successfully is used.
	2. MLflow Model Registry entry `models:/LightGBM/Production`.

	Returns:
		The cached model object.

	Raises:
		RuntimeError: If no local file could be loaded and the MLflow
			registry lookup fails as well.
	"""
	global MODEL
	if MODEL is not None:
		return MODEL

	import lightgbm as lgb

	# 1) Local model file — an explicit LOCAL_MODEL_PATH is tried first.
	override = os.environ.get("LOCAL_MODEL_PATH")
	candidate_paths = [Path(override)] if override else []
	candidate_paths.append(Path(__file__).resolve().parent / "models" / "lightgbm.txt")
	candidate_paths.append(Path.cwd() / "models" / "lightgbm.txt")

	for model_path in candidate_paths:
		if not model_path.exists():
			continue
		try:
			MODEL = lgb.Booster(model_file=str(model_path))
		except Exception as err:
			# A broken file is reported, then the next candidate is tried.
			print(f"Warning: failed to load {model_path}: {err}")
			continue
		print(f"Loaded local LightGBM model from {model_path}")
		return MODEL

	# 2) Fallback: MLflow Model Registry
	try:
		MODEL = mlflow.lightgbm.load_model("models:/LightGBM/Production")
	except Exception as mlflow_err:
		tried = [str(model_path) for model_path in candidate_paths]
		raise RuntimeError(
			f"No local model found at {tried} "
			f"and MLflow registry failed: {mlflow_err}. "
			"Place the model at `models/lightgbm.txt` or set LOCAL_MODEL_PATH."
		) from mlflow_err
	print("Loaded model from MLflow registry")
	return MODEL


# Preprocessor cache (accepts "raw" input and maps it to model features)
PREPROCESSOR = None

def _load_preprocessor():
	"""Return the cached preprocessor, building it on the first call.

	Loading priority:
	1. `models/preprocessor_vectorized.joblib` — used as-is when it loads.
	2. `models/preprocessor.joblib` — returned directly if it already holds a
	   `VectorizedPreprocessor`; wrapped in one if it holds a
	   `RawToModelTransformer`; ignored (with a warning) otherwise.
	3. A default-constructed `RawToModelTransformer`, wrapped in a
	   `VectorizedPreprocessor`.

	When the wrapper is built here (cases 2-wrap and 3) it is saved, best
	effort, to `models/preprocessor_vectorized.joblib` for later startups.
	Both paths are relative to the current working directory.

	Returns:
		The preprocessor instance, also stored in the module-level
		`PREPROCESSOR` cache.
	"""
	global PREPROCESSOR
	if PREPROCESSOR is not None:
		return PREPROCESSOR

	# NOTE(review): joblib.load unpickles arbitrary Python objects; only
	# trusted files should ever be placed under models/.

	# Try the previously saved vectorized preprocessor first.
	vectorized_path = Path("models") / "preprocessor_vectorized.joblib"
	if vectorized_path.exists():
		try:
			PREPROCESSOR = joblib.load(vectorized_path)
			print("✅ VectorizedPreprocessor chargé (étape 4 optimisée)")
			return PREPROCESSOR
		except Exception as e:
			print(f"⚠️  Chargement vectorized échoué, fallback : {e}")
			PREPROCESSOR = None

	# Otherwise look for the legacy preprocessor to wrap.
	base_path = Path("models") / "preprocessor.joblib"
	base_transformer = None
	if base_path.exists():
		try:
			base_transformer = joblib.load(base_path)
			# The file may already contain a VectorizedPreprocessor.
			if isinstance(base_transformer, VectorizedPreprocessor):
				PREPROCESSOR = base_transformer
				print("✅ VectorizedPreprocessor chargé depuis preprocessor.joblib")
				return PREPROCESSOR
			if not isinstance(base_transformer, RawToModelTransformer):
				# Previously discarded silently; report it so a wrong artifact
				# in models/ does not go unnoticed.
				print(
					f"⚠️  Type inattendu dans {base_path} "
					f"({type(base_transformer).__name__}), ignoré"
				)
				base_transformer = None
		except Exception as e:
			# Previously swallowed without a trace; mirror the warning printed
			# for the vectorized file above.
			print(f"⚠️  Chargement de {base_path} échoué, fallback : {e}")
			base_transformer = None

	if base_transformer is None:
		# Nothing usable on disk: fall back to a default transformer.
		base_transformer = RawToModelTransformer()

	PREPROCESSOR = VectorizedPreprocessor(base_transformer)
	print("✅ VectorizedPreprocessor créé (étape 4 optimisée)")

	# Persist for the next startup; a failure here is reported but not fatal.
	try:
		vectorized_path.parent.mkdir(parents=True, exist_ok=True)
		joblib.dump(PREPROCESSOR, vectorized_path)
		print(f"✅ VectorizedPreprocessor sauvegardé → {vectorized_path}")
	except Exception as e:
		print(f"⚠️  Sauvegarde échouée (non bloquant) : {e}")

	return PREPROCESSOR

def _validate_payload(payload: Dict[str, Any]) -> Dict[str, Any]:
	"""Basic validation on input payload.

	Raises:
		ValueError: If the payload is invalid.
	"""
	if not isinstance(payload, dict):
		raise ValueError("Le JSON doit être un objet (clé/valeur).")

	if not payload:
		raise ValueError("Le JSON est vide.")

	for key, value in payload.items():
		# EXPLICATION : None est accepté (LightGBM gère nativement les NaN)
		if value is not None and isinstance(value, (list, dict)):
			raise ValueError(f"La valeur de '{key}' doit être scalaire.")

	return payload


def _parse_json_line(json_line: str) -> pd.DataFrame:
	"""Parse a single JSON line into a one-row DataFrame."""
	try:
		raw = json.loads(json_line)
	except json.JSONDecodeError as exc:
		raise ValueError("JSON invalide. Vérifie la syntaxe.") from exc

	payload = _validate_payload(raw)

	# Build a single-row DataFrame and sanitize common problematic inputs:
	# - convert empty strings to NaN so numeric coercion / imputation works
	# - convert string booleans to actual booleans ("True"/"False")
	df = pd.DataFrame([payload])
	df = df.replace({"": np.nan, "True": True, "False": False})

	# EXPLICATION : Sanitiser les noms de colonnes pour matcher ceux attendus par le modèle.
	# Le modèle a été entraîné avec des noms sanitisés (espaces → _, caractères spéciaux → _).
	# Sans cette étape, des colonnes comme "BURO_CREDIT_ACTIVE_Bad debt_MEAN" ne matchent pas
	# "BURO_CREDIT_ACTIVE_Bad_debt_MEAN" → fill_value=0 → prédictions faussées (tout Accordé).
	df.columns = [_re.sub(r'[^a-zA-Z0-9_]', '_', c.replace(' ', '_')) for c in df.columns]

	# Force all columns to numeric dtypes (LightGBM rejects object/str columns).
	# Booleans become 1/0, strings that are still present become NaN.
	for col in df.columns:
		df[col] = pd.to_numeric(df[col], errors='coerce')

	# Try to apply a lightweight preprocessor to accept "raw" payloads
	# The transformer maps categorical strings (ex. NAME_CONTRACT_TYPE) to the
	# one-hot columns expected by the trained model. On any failure we keep the
	# original dataframe and rely on column reindexing later.
	#
	# IMPORTANT: Skip preprocessor if input is already processed data (e.g. from
	# features_train.csv / reference.csv). Detect this by checking how many input
	# columns match expected model features. If >50% match, data is already
	# processed — running the preprocessor would replace NaN with median values,
	# destroying the signal that LightGBM uses for missing-value splits.
	try:
		pre = _load_preprocessor()
		if pre is not None:
			expected_feats = set(pre.get_feature_names_out()) if hasattr(pre, 'get_feature_names_out') else set()
			overlap = len(set(df.columns) & expected_feats)
			if expected_feats and overlap / len(expected_feats) > 0.5:
				# Data is already processed — skip preprocessor to avoid double processing
				pass
			else:
				df = pre.transform(df)
	except Exception:
		# Non-fatal: continue with the original df (alignment step will fill missing)
		pass

	return df


def _get_model_feature_names(model) -> list | None:
	"""Try to obtain the model's expected feature names.

	Tries common LightGBM / sklearn attributes first, then falls back to
	reading the header of `data/processed/features_train.csv`.
	Returns a list of column names or None if not found.
	"""
	# 1) common LightGBM / sklearn attributes
	try:
		fn = getattr(model, "feature_name", None)
		if callable(fn):
			names = list(fn())
			if names:
				return names
	except Exception:
		pass

	names = getattr(model, "feature_name_", None)
	if isinstance(names, (list, tuple)):
		return list(names)

	# LightGBM scikit-learn wrapper exposes `booster_`
	try:
		if hasattr(model, "booster_") and getattr(model.booster_, "feature_name", None):
			return list(model.booster_.feature_name())
	except Exception:
		pass

	# 2) Fallback to header from the preprocessed training CSV
	try:
		header_path = Path("data/processed/features_train.csv")
		if header_path.exists():
			df_header = pd.read_csv(header_path, nrows=0)
			cols = [c for c in df_header.columns if c not in ("SK_ID_CURR", "TARGET")]
			# Apply same sanitization as training notebook (spaces → _, non-alnum → _)
			cols = [_re.sub(r'[^a-zA-Z0-9_]', '_', c.replace(' ', '_')) for c in cols]
			if cols:
				return cols
	except Exception:
		pass

	return None


# EXPLICATION : Fonction helper pour logger chaque prédiction avec tous les champs requis
# IMPORTANT : Écrit DIRECTEMENT dans le fichier (pas de FileHandler)
# pour éviter les problèmes d'interférence avec Gradio/autres loggers
def log_prediction(input_raw: str, input_features: dict, output_proba: float, 
                   output_decision: str, execution_time_ms: float, error: str = None):
	"""Log une prédiction au format JSON structuré dans logs/predictions.jsonl."""
	try:
		# Chemin absolu ancré sur app.py → fonctionne quel que soit le cwd de lancement
		_log_dir = Path(__file__).resolve().parent / "logs"
		_log_dir.mkdir(parents=True, exist_ok=True)
		
		# EXPLICATION : Construit l'entrée JSON
		log_entry = {
			"timestamp": datetime.utcnow().isoformat() + "Z",
			"input_raw": input_raw,
			"input_features": input_features,
			"output_proba": round(output_proba, 4) if output_proba is not None else None,
			"output_decision": output_decision,
			"execution_time_ms": round(execution_time_ms, 1),
			"error": error,
			"model_version": "models:/LightGBM/Production",
			"threshold": 0.4
		}
		
		# EXPLICATION : Écrit DIRECTEMENT dans le fichier (robuste à Gradio)
		# Mode "a" = append, newline assuré après chaque log
		log_line = json.dumps(log_entry, ensure_ascii=False) + "\n"
		log_file = _log_dir / "predictions.jsonl"
		
		with open(log_file, "a", encoding="utf-8") as f:
			f.write(log_line)
			f.flush()  # Force l'écriture immédiate (important pour le suivi en temps réel)
		
		# EXPLICATION : Aussi afficher dans la console pour Docker/HF Spaces
		print(f"[LOG] {log_line.strip()}")
		
	except Exception as exc:
		# EXPLICATION : N'échoue pas silencieusement si le logging échoue
		print(f"[ERROR] Logging échoué : {exc}", flush=True)


# === VERSION OPTIMISÉE 4.4 - Gain 15.7x ===
# Remplace l'ancienne _predict (boucle ligne par ligne)
# par une version vectorisée pandas : prétraitement en une seule opération.
def _predict(json_line: str, threshold: float = 0.4) -> str:
	"""Predict default probability and return a formatted response.

	Version optimisée étape 4 (15.7x plus rapide - vectorisation pandas)
	"""
	# Capture du temps de début pour calculer execution_time_ms
	start_time = time.perf_counter()

	try:
		# === ÉTAPE 1 : Validation JSON (fail-fast avant tout traitement) ===
		try:
			raw = json.loads(json_line)
		except json.JSONDecodeError as exc:
			raise ValueError("JSON invalide. Vérifie la syntaxe.") from exc
		_validate_payload(raw)

		# === ÉTAPE 2 : Preprocessing vectorisé (cœur du gain 15.7x) ===
		# VectorizedPreprocessor.transform_one_sample construit le DataFrame
		# depuis le dict en UNE seule opération pandas (pas de boucle).
		prep = _load_preprocessor()
		if prep is not None and isinstance(prep, VectorizedPreprocessor):
			# Chemin optimisé : VectorizedPreprocessor (vectorisation pandas)
			df = prep.transform_one_sample(json_line)
		else:
			# Fallback : ancien chemin (RawToModelTransformer ligne par ligne)
			df = _parse_json_line(json_line)

		# === ÉTAPE 3 : Alignement colonnes sur les features attendues du modèle ===
		# fill_value=np.nan (pas 0) : LightGBM utilise ses splits natifs manquants
		model = _load_model()
		expected = _get_model_feature_names(model)
		if expected:
			df = df.reindex(columns=expected, fill_value=np.nan)

		# Garantie finale : toutes les colonnes numériques (LightGBM requirement)
		# NaN préservés — LightGBM les gère nativement.
		for col in df.columns:
			df[col] = pd.to_numeric(df[col], errors='coerce')

		# === ÉTAPE 4 : Inférence LightGBM (predict_proba vectorisé) ===
		try:
			proba = float(model.predict_proba(df)[:, 1][0])
		except AttributeError:
			# Fallback pour les modèles exposant predict() retournant des probabilités
			proba = float(model.predict(df)[0])

		if not 0.0 <= proba <= 1.0:
			raise ValueError("La probabilité prédite est hors de l'intervalle [0, 1].")

		score = int(proba * 1000)
		# Seuil de décision : < threshold = Accordé (risque faible)
		decision = "Accordé" if proba < threshold else "Refusé"

		# === ÉTAPE 5 : Log structuré de la prédiction réussie ===
		execution_time_ms = (time.perf_counter() - start_time) * 1000
		log_prediction(
			input_raw=json_line,
			input_features=raw,
			output_proba=proba,
			output_decision=decision,
			execution_time_ms=execution_time_ms,
			error=None
		)

		return (
			f"Score: {score}\n"
			f"Probabilité de défaut: {proba:.4f}\n"
			f"Décision: {decision}"
		)

	except ValueError as exc:
		# Log de l'erreur avec temps d'exécution et message d'erreur
		execution_time_ms = (time.perf_counter() - start_time) * 1000
		try:
			input_features = json.loads(json_line)
		except Exception:
			input_features = {}
		log_prediction(
			input_raw=json_line,
			input_features=input_features,
			output_proba=None,
			output_decision="Erreur",
			execution_time_ms=execution_time_ms,
			error=f"ValueError: {exc}"
		)
		return f"Erreur: {exc}"
	except KeyError as exc:
		execution_time_ms = (time.perf_counter() - start_time) * 1000
		try:
			input_features = json.loads(json_line)
		except Exception:
			input_features = {}
		log_prediction(
			input_raw=json_line,
			input_features=input_features,
			output_proba=None,
			output_decision="Erreur",
			execution_time_ms=execution_time_ms,
			error=f"KeyError: {exc}"
		)
		return f"Erreur: colonne manquante ({exc})."
	except TypeError as exc:
		execution_time_ms = (time.perf_counter() - start_time) * 1000
		try:
			input_features = json.loads(json_line)
		except Exception:
			input_features = {}
		log_prediction(
			input_raw=json_line,
			input_features=input_features,
			output_proba=None,
			output_decision="Erreur",
			execution_time_ms=execution_time_ms,
			error=f"TypeError: {exc}"
		)
		return f"Erreur: type invalide ({exc})."
	except Exception as exc:  # noqa: BLE001
		execution_time_ms = (time.perf_counter() - start_time) * 1000
		try:
			input_features = json.loads(json_line)
		except Exception:
			input_features = {}
		log_prediction(
			input_raw=json_line,
			input_features=input_features,
			output_proba=None,
			output_decision="Erreur",
			execution_time_ms=execution_time_ms,
			error=f"Exception: {exc}"
		)
		return f"Erreur inattendue: {exc}"


def build_demo() -> gr.Blocks:
	"""Assemble the Gradio Blocks interface and return it.

	Layout, top to bottom: an introductory Markdown header, a multi-line JSON
	input box, a result box, a button wired to `_predict`, and a note about
	the default decision threshold.
	"""
	intro_md = (
		"# Credit Scoring API\n"
		"Saisis une seule ligne JSON avec les variables d'entrée.\n"
		"Le modèle LightGBM retourne une probabilité de défaut, un score, et une décision.\n"
		"*Version optimisée étape 4 (15.7x plus rapide - vectorisation pandas)*"
	)
	threshold_note_md = "**Note:** Le seuil de décision est fixé à 0.4 par défaut."

	with gr.Blocks(title="Credit Scoring API") as demo:
		gr.Markdown(intro_md)

		with gr.Row():
			input_json = gr.Textbox(
				label="JSON (ligne unique)",
				lines=12,
				max_lines=30,
				placeholder='{"feature1": value1, "feature2": value2, ...}',
			)

		output_text = gr.Textbox(label="Résultat", lines=5)

		# Only the JSON text is passed to `_predict`, so its `threshold`
		# argument keeps its default value.
		gr.Button("Prédire").click(
			fn=_predict,
			inputs=[input_json],
			outputs=[output_text],
		)

		gr.Markdown(threshold_note_md)

	return demo


# Built at import time so that `demo` is available as a module attribute.
demo = build_demo()

if __name__ == "__main__":
	# Listen on all interfaces; the port comes from $PORT and defaults to 7860.
	demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))