# Hugging Face Space entry point (Gradio UI for the credit-scoring API).
| from __future__ import annotations | |
| # | |
| from pathlib import Path | |
| from typing import Any | |
| import gradio as gr | |
| import numpy as np | |
| import pandas as pd | |
| from fastapi import HTTPException | |
| from app.main import ( | |
| DAYS_EMPLOYED_SENTINEL, | |
| ENGINEERED_SOURCES, | |
| MODEL_VERSION, | |
| MinimalPredictionRequest, | |
| app, | |
| new_features_creation, | |
| prepare_inference_features, | |
| predict_minimal, | |
| startup_event, | |
| _build_minimal_record, | |
| _normalize_inputs, | |
| ) | |
| _DESCRIPTION_PATH = Path("data/HomeCredit_columns_description.csv") | |
| _FEATURE_DESC_CACHE: dict[str, str] | None = None | |
| _FEATURE_DESC_KEYS: list[str] | None = None | |
| def _load_feature_descriptions() -> dict[str, str]: | |
| global _FEATURE_DESC_CACHE, _FEATURE_DESC_KEYS | |
| if _FEATURE_DESC_CACHE is not None: | |
| return _FEATURE_DESC_CACHE | |
| if not _DESCRIPTION_PATH.exists(): | |
| _FEATURE_DESC_CACHE = {} | |
| _FEATURE_DESC_KEYS = [] | |
| return _FEATURE_DESC_CACHE | |
| try: | |
| df = pd.read_csv(_DESCRIPTION_PATH, encoding="latin1") | |
| except Exception: | |
| try: | |
| df = pd.read_csv(_DESCRIPTION_PATH) | |
| except Exception: | |
| _FEATURE_DESC_CACHE = {} | |
| _FEATURE_DESC_KEYS = [] | |
| return _FEATURE_DESC_CACHE | |
| if "Row" not in df.columns or "Description" not in df.columns: | |
| _FEATURE_DESC_CACHE = {} | |
| _FEATURE_DESC_KEYS = [] | |
| return _FEATURE_DESC_CACHE | |
| mapping: dict[str, str] = {} | |
| for row, desc in df[["Row", "Description"]].dropna().itertuples(index=False): | |
| key = str(row).strip() | |
| if not key or key in mapping: | |
| continue | |
| mapping[key] = str(desc).strip() | |
| _FEATURE_DESC_CACHE = mapping | |
| _FEATURE_DESC_KEYS = sorted(mapping.keys(), key=len, reverse=True) | |
| return _FEATURE_DESC_CACHE | |
def _describe_feature_name(feature_name: str, desc_map: dict[str, str]) -> str:
    """Return a human-readable description for a (possibly prefixed) feature name.

    Resolution order:
      1. direct lookup of the name with its transformer prefix stripped;
      2. ``is_missing_`` / ``is_outlier_`` indicators, described via their base
         column;
      3. the longest ``__``-separated part found in *desc_map*, with the
         remaining parts appended as a parenthesised suffix;
      4. longest-first prefix match against known column names;
      5. the cleaned name itself when nothing matches.
    """
    if not desc_map:
        return feature_name
    cleaned = _strip_feature_prefix(feature_name)
    direct = desc_map.get(cleaned)
    if direct:
        return direct
    for prefix, label in (
        ("is_missing_", "Missing indicator for"),
        ("is_outlier_", "Outlier indicator for"),
    ):
        if cleaned.startswith(prefix):
            base = cleaned[len(prefix):]
            return f"{label} {desc_map.get(base, base)}"
    parts = cleaned.split("__")
    base_key = None
    for part in parts:
        if part in desc_map and (base_key is None or len(part) > len(base_key)):
            base_key = part
    if base_key:
        desc = desc_map[base_key]
        # base_key was taken from parts, so index() cannot raise here
        # (the original try/except ValueError was unreachable).
        suffix_parts = parts[parts.index(base_key) + 1:]
        if suffix_parts:
            return f"{desc} ({' '.join(suffix_parts)})"
        return desc
    # Fall back to longest-first prefix matching against known column names.
    keys = _FEATURE_DESC_KEYS or list(desc_map.keys())
    for key in keys:
        if cleaned.startswith(key + "_"):
            suffix = cleaned[len(key) + 1:]
            desc = desc_map[key]
            return f"{desc} ({suffix})" if suffix else desc
    return cleaned
def _ensure_startup() -> None:
    """Run the FastAPI startup hook once, if the model state is not loaded yet."""
    preprocessor = getattr(app.state, "preprocessor", None)
    if not preprocessor:
        startup_event()
def _customer_snapshot(sk_id_curr: int) -> dict[str, Any]:
    """Return a few reference fields for a known customer, or {} when unknown."""
    reference = getattr(app.state, "customer_reference", None)
    if reference is None or sk_id_curr not in reference.index:
        return {}
    row = reference.loc[sk_id_curr]
    snapshot: dict[str, Any] = {"SK_ID_CURR": int(sk_id_curr)}
    # Copy categorical flags through unchanged when present.
    for passthrough in ("CODE_GENDER", "FLAG_OWN_CAR"):
        if passthrough in row:
            snapshot[passthrough] = row[passthrough]
    if "AMT_INCOME_TOTAL" in row:
        snapshot["AMT_INCOME_TOTAL"] = float(row["AMT_INCOME_TOTAL"])
    if "DAYS_BIRTH" in row:
        # abs() tolerates signed day counts; convert days to years.
        snapshot["AGE_YEARS"] = round(abs(float(row["DAYS_BIRTH"])) / 365.25, 1)
    return snapshot
| def _shap_error_table(message: str) -> pd.DataFrame: | |
| return pd.DataFrame( | |
| [ | |
| { | |
| "Nom": message, | |
| "Description": "", | |
| "Valeur": np.nan, | |
| "Impact sur la prédiction (SHAP)": np.nan, | |
| } | |
| ] | |
| ) | |
| def _extract_shap_values(shap_values: Any) -> np.ndarray: | |
| if isinstance(shap_values, list): | |
| shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0] | |
| values = np.asarray(shap_values) | |
| if values.ndim == 3: | |
| values = values[:, :, 1] | |
| if values.ndim == 1: | |
| values = values.reshape(1, -1) | |
| return values | |
| def _clean_raw_value(value: Any) -> Any: | |
| if value is None or pd.isna(value): | |
| return None | |
| if isinstance(value, np.integer): | |
| return value.item() | |
| if isinstance(value, (np.floating, float)): | |
| return round(float(value), 2) | |
| return value | |
def _format_raw_value(value: Any) -> str | None:
    """Render a raw feature value as a compact string; None for missing values."""
    cleaned = _clean_raw_value(value)
    if cleaned is None:
        return None
    if not isinstance(cleaned, float):
        return str(cleaned)
    # Two decimals, then drop trailing zeros and a dangling decimal point.
    return f"{cleaned:.2f}".rstrip("0").rstrip(".")
| def _format_probability_percent(probability: float | None) -> str | None: | |
| if probability is None: | |
| return None | |
| percent = round(probability * 100, 2) | |
| text = f"{percent:.2f}".rstrip("0").rstrip(".") | |
| return f"{text} %" | |
| def _strip_feature_prefix(feature_name: str) -> str: | |
| return feature_name.split("__", 1)[1] if "__" in feature_name else feature_name | |
def _lookup_raw_value(feature_name: str, raw_df: pd.DataFrame, preprocessor) -> Any:
    """Find the raw (pre-encoding) value behind a transformed feature name.

    Tries, in order: the column itself, the base column of an
    is_missing_/is_outlier_ indicator, then the source column of a
    one-hot-encoded categorical. Returns None when nothing matches.
    """
    name = _strip_feature_prefix(feature_name)
    if name in raw_df.columns:
        return raw_df.at[0, name]
    for indicator in ("is_missing_", "is_outlier_"):
        if name.startswith(indicator):
            base_column = name[len(indicator):]
            if base_column in raw_df.columns:
                return raw_df.at[0, base_column]
    # One-hot columns are named "<source_column>_<category>".
    for column in getattr(preprocessor, "categorical_columns", []):
        if name.startswith(f"{column}_") and column in raw_df.columns:
            return raw_df.at[0, column]
    return None
| def _align_features_to_model(X: Any, model: Any) -> Any: | |
| expected = getattr(model, "feature_names_in_", None) | |
| if expected is None: | |
| return X | |
| if isinstance(X, pd.DataFrame): | |
| return X.reindex(columns=list(expected), fill_value=0) | |
| return X | |
| def _model_family(model: Any) -> str: | |
| name = type(model).__name__.lower() | |
| if "xgb" in name: | |
| return "xgb" | |
| if "lgbm" in name or "lightgbm" in name: | |
| return "lgbm" | |
| if "histgradientboosting" in name: | |
| return "histgb" | |
| return "unknown" | |
def _xgb_pred_contribs(estimator: Any, X: Any) -> np.ndarray:
    """Per-feature XGBoost contributions for X, with the bias column dropped."""
    import xgboost as xgb

    if isinstance(X, pd.DataFrame):
        dmatrix = xgb.DMatrix(X, feature_names=list(X.columns))
    else:
        dmatrix = xgb.DMatrix(np.asarray(X))
    # Sklearn wrappers expose the low-level Booster via get_booster().
    booster = estimator.get_booster() if hasattr(estimator, "get_booster") else estimator
    contributions = np.asarray(booster.predict(dmatrix, pred_contribs=True))
    # pred_contribs appends a bias term as the last column; keep features only.
    return contributions[:, :-1]
| def _lgbm_pred_contribs(estimator: Any, X: Any) -> np.ndarray: | |
| contrib = estimator.predict(X, pred_contrib=True) | |
| return np.asarray(contrib)[:, :-1] | |
def _compute_shap_top_features(record: dict[str, Any], top_k: int = 10) -> pd.DataFrame:
    """Compute the top-*top_k* local feature attributions for one applicant record.

    The record goes through the same normalization / feature-engineering path
    as API inference, then attributions come either from the estimator's
    native contribution output (XGBoost / LightGBM) or from a cached SHAP
    explainer as a fallback. Returns a DataFrame with columns
    Nom / Description / Valeur / "Impact sur la prédiction (SHAP)", or a
    one-row error table when any step fails.
    """
    preprocessor = app.state.preprocessor
    model = app.state.model
    df_raw = pd.DataFrame.from_records([record])
    df_norm, _, _ = _normalize_inputs(df_raw, preprocessor)
    # Keep the engineered-but-unencoded frame so raw values can be displayed.
    raw_reference = new_features_creation(
        df_norm,
        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
        engineered_sources=ENGINEERED_SOURCES,
    )
    features = prepare_inference_features(df_norm, preprocessor, model)
    features = _align_features_to_model(features, model)
    try:
        import shap
    except ImportError:
        return _shap_error_table("SHAP not installed.")
    estimator = model
    X_shap = features
    # If the model is a Pipeline, explain the bare estimator on the
    # already-transformed matrix so feature names line up with attributions.
    if hasattr(model, "named_steps") and model.named_steps.get("preprocessing") is not None:
        estimator = model.named_steps.get("estimator", model)
        pipeline_preprocessor = model.named_steps["preprocessing"]
        try:
            X_shap = pipeline_preprocessor.transform(features)
        except Exception as exc:
            return _shap_error_table(f"SHAP preprocessing failed: {exc}")
        try:
            import scipy.sparse as sp
            # Densify sparse transformer output; SHAP and DataFrame need dense.
            if sp.issparse(X_shap):
                X_shap = X_shap.toarray()
        except Exception:
            pass
        try:
            feature_names = pipeline_preprocessor.get_feature_names_out()
        except Exception:
            feature_names = None
        if feature_names is not None:
            X_shap = pd.DataFrame(X_shap, columns=feature_names)
    family = _model_family(estimator)
    values: np.ndarray | None = None
    # 1) Native contributions (best choice for XGB/LGBM)
    try:
        if family == "xgb":
            values = _xgb_pred_contribs(estimator, X_shap)
        elif family == "lgbm":
            values = _lgbm_pred_contribs(estimator, X_shap)
    except Exception:
        values = None
    # 2) SHAP fallback (mostly useful for HistGB / unknown estimators)
    if values is None:
        cache = getattr(app.state, "shap_explainer_cache", {})
        # Cache per model version and estimator class to avoid rebuilding.
        key = f"{MODEL_VERSION}:{type(estimator).__name__}"
        explainer = cache.get(key)
        if explainer is None:
            try:
                import shap
                predict_fn = (
                    (lambda X: estimator.predict_proba(X)[:, 1])
                    if hasattr(estimator, "predict_proba")
                    else (lambda X: estimator.predict(X))
                )
                # Avoid a degenerate single-row background dataset
                if isinstance(X_shap, pd.DataFrame):
                    bg = pd.concat([X_shap] * 50, ignore_index=True)
                else:
                    bg = np.repeat(np.asarray(X_shap), repeats=50, axis=0)
                explainer = shap.Explainer(predict_fn, bg)
            except Exception as exc:
                return _shap_error_table(f"SHAP explainer init failed: {exc}")
            cache[key] = explainer
            app.state.shap_explainer_cache = cache
        try:
            import shap
            explanation = explainer(X_shap)
            values = _extract_shap_values(explanation.values)
        except Exception as exc:
            return _shap_error_table(f"SHAP failed: {exc}")
    shap_row = values[0]
    if isinstance(X_shap, pd.DataFrame):
        feature_values = X_shap.iloc[0].to_numpy()
        feature_names = X_shap.columns
    else:
        feature_values = np.asarray(X_shap)[0]
        feature_names = [f"feature_{idx}" for idx in range(len(feature_values))]
    desc_map = _load_feature_descriptions()
    # Rank features by absolute contribution and keep the strongest top_k.
    top_idx = np.argsort(np.abs(shap_row))[::-1][:top_k]
    rows = []
    for idx in top_idx:
        raw_name = str(feature_names[idx])
        rows.append(
            {
                "Nom": _strip_feature_prefix(raw_name),
                "Description": _describe_feature_name(raw_name, desc_map),
                "Valeur": _format_raw_value(
                    _lookup_raw_value(raw_name, raw_reference, preprocessor)
                ),
                "Impact sur la prédiction (SHAP)": float(np.round(shap_row[idx], 2)),
            }
        )
    return pd.DataFrame(rows)
def score_minimal(
    sk_id_curr: float,
    amt_credit: float,
    duration_months: float,
) -> tuple[str | None, str, pd.DataFrame, dict[str, Any]]:
    """Gradio callback: score one applicant and build the four UI outputs.

    Returns (probability text, prediction label, SHAP top-10 table, customer
    snapshot dict). Errors are reported inline in the outputs instead of being
    raised, so the UI always renders something.
    """
    _ensure_startup()
    try:
        payload = MinimalPredictionRequest(
            sk_id_curr=int(sk_id_curr),
            amt_credit=float(amt_credit),
            duration_months=int(duration_months),
        )
        record = _build_minimal_record(payload, app.state.preprocessor)
        # Call the FastAPI endpoint function directly (no HTTP round-trip).
        response = predict_minimal(payload, threshold=None, x_client_source="gradio")
        result = response["predictions"][0]
        probability = float(np.round(result.get("probability", 0.0), 2))
        probability_text = _format_probability_percent(probability)
        pred_value = int(result.get("prediction", 0))
        label = "Default (1)" if pred_value == 1 else "No default (0)"
        shap_table = _compute_shap_top_features(record, top_k=10)
        snapshot = _customer_snapshot(int(sk_id_curr))
        # Echo the requested loan parameters back into the snapshot.
        snapshot.update(
            {
                "AMT_CREDIT_REQUESTED": float(amt_credit),
                "DURATION_MONTHS": int(duration_months),
            }
        )
        return probability_text, label, shap_table, snapshot
    except HTTPException as exc:
        # API-level validation errors carry a user-facing detail message.
        return None, f"Erreur: {exc.detail}", _shap_error_table("No SHAP available."), {"error": exc.detail}
    except Exception as exc:  # pragma: no cover - UI fallback
        return None, f"Erreur: {exc}", _shap_error_table("No SHAP available."), {"error": str(exc)}
# ---------------------------------------------------------------------------
# Gradio UI: inputs (client id, credit amount, duration) -> score_minimal.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Credit scoring MLOps") as demo:
    gr.Markdown("# Credit scoring MLOps")
    # Badges linking to the GitHub releases and the deploy workflow status.
    gr.HTML("""
<div style="display:flex; gap:0.5rem; flex-wrap:wrap;">
<a href="https://github.com/stephmnt/credit-scoring-mlops/releases" target="_blank" rel="noreferrer">
<img src="https://img.shields.io/github/v/release/stephmnt/credit-scoring-mlops" alt="GitHub Release" />
</a>
<a href="https://github.com/stephmnt/credit-scoring-mlops/actions/workflows/deploy.yml" target="_blank" rel="noreferrer">
<img src="https://img.shields.io/github/actions/workflow/status/stephmnt/credit-scoring-mlops/deploy.yml" alt="GitHub Actions Workflow Status" />
</a>
</div>
""")
    # Usage instructions (French), shown above the input row.
    gr.HTML(
        """
<p>Renseignez l'identifiant client, le montant du crédit et la durée.</p>
<p>Le modèle prédit la probabilité de défaut de paiement ainsi que la prédiction binaire associée. Le tableau SHAP affiche les 10 features les plus influentes pour cette prédiction. Le snapshot client affiche quelques informations de référence sur le client.</p>
<p>Pour accéder au data drift monitoring et aux rapports, rendez-vous sur l'application Streamlit dédiée. Le dataset est disponible sur <a href="https://huggingface.co/datasets/stephmnt/assets-credit-scoring-mlops" rel="noreferrer">Hugging Face</a>.</p>
"""
    )
    with gr.Row():
        sk_id_curr = gr.Number(label="Identifiant client", precision=0, value=100001)
        amt_credit = gr.Number(label="Montant du crédit", value=200000)
        duration_months = gr.Number(label="Durée (mois)", precision=0, value=60)
    run_btn = gr.Button("Scorer")
    with gr.Row():
        probability = gr.Textbox(label="Probabilité de défaut")
        prediction = gr.Textbox(label="Prédiction")
    # Output order matches score_minimal's return tuple.
    shap_table = gr.Dataframe(
        headers=["Nom", "Description", "Valeur", "Impact sur la prédiction (SHAP)"],
        label="Top 10 SHAP (local)",
        datatype=["str", "str", "str", "number"],
        interactive=False,
    )
    snapshot = gr.JSON(label="Snapshot client (référence)")
    run_btn.click(
        score_minimal,
        inputs=[sk_id_curr, amt_credit, duration_months],
        outputs=[probability, prediction, shap_table, snapshot],
    )
if __name__ == "__main__":
    # Warm up the model state before serving the UI locally.
    _ensure_startup()
    demo.launch()