Spaces:

GDMProjects
/

Insulin

Sleeping

App Files Files Community

GDMProjects committed on Sep 1, 2025

Commit

3fc7211

verified ·

1 Parent(s): 991eadf

Update app.py

Browse files

Files changed (1) hide show

app.py +176 -31

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
 HOST, PORT, SHARE = "0.0.0.0", 7860, True
 # ---------- Env hygiene ----------
@@ -10,6 +10,11 @@ for _k in ("HTTP_PROXY","http_proxy","HTTPS_PROXY","https_proxy"):
 os.environ.setdefault("GRADIO_OPEN_BROWSER", "false")
 os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 # ---------- Imports ----------
 from typing import Any, Dict, Optional, Tuple, List
 import re
@@ -17,16 +22,22 @@ import numpy as np
 import pandas as pd
 import gradio as gr
 from pathlib import Path
 from pycaret.classification import load_model, predict_model
 from huggingface_hub import hf_hub_download
-REPO = os.getenv("MODEL_REPO", "GDMProjects/my-private-model")
 FNAME = os.getenv("MODEL_FILE", "best_insulin_model.pkl")
 TOKEN = os.getenv("HF_TOKEN")
-SAMPLE_FILE  = "INS.xlsx"
-TARGET_NAME  = "insulin"
-POS_CLASS    = 1
 FEATURES = [
     "age",
     "BMI",
@@ -40,13 +51,11 @@ FEATURES = [
     "Previos_Obsteric_History_AB",
     "infertility",
 ]
 NUMERIC_INPUTS = {"age", "BMI", "Previos_Obsteric_History_AB"}
-BOOL_FEATURES  = [f for f in FEATURES if f not in NUMERIC_INPUTS]  # 8 flags
 # ---------- Utilities ----------
-def strip_pkl(x: str) -> str:
-    return x[:-4] if x.lower().endswith(".pkl") else x
 def normalize(s: str) -> str:
     return re.sub(r"[^a-z0-9]+", "", str(s).lower())
@@ -58,23 +67,25 @@ def coerce_numeric(val: Any) -> Optional[float]:
 def truthy(val: Any) -> bool:
     if pd.isna(val): return False
     s = str(val).strip().lower()
-    return s in {"1","true","yes","y","t"} or val is True or val == 1
 def extract_probability_for_positive(preds: pd.DataFrame, positive_label=1) -> Optional[float]:
     str_pos = str(positive_label)
     if str_pos in preds.columns:
         return float(preds.iloc[0][str_pos])
     for c in preds.columns:
         if str_pos == str(c) or str(c).endswith("_"+str_pos):
             try: return float(preds.iloc[0][c])
             except: pass
-    for cname in ("prediction_score","Score"):
         if cname in preds.columns:
             try: return float(preds.iloc[0][cname])
             except: pass
     return None
 def get_global_importance_table(model) -> Optional[pd.DataFrame]:
     try:
         if hasattr(model, "named_steps"):
             est = model.named_steps.get("trained_model", list(model.named_steps.values())[-1])
@@ -84,6 +95,7 @@ def get_global_importance_table(model) -> Optional[pd.DataFrame]:
             est = model
     except Exception:
         est = model
     X_cols = getattr(model, "feature_names_in_", None)
     if hasattr(est, "feature_importances_"):
         vals = np.asarray(est.feature_importances_)
@@ -92,6 +104,7 @@ def get_global_importance_table(model) -> Optional[pd.DataFrame]:
         else:
             df_imp = pd.DataFrame({"feature": [f"f{i}" for i in range(len(vals))], "importance": vals})
         return df_imp.sort_values("importance", ascending=False).reset_index(drop=True)
     if hasattr(est, "coef_"):
         coef = np.array(est.coef_)
         if coef.ndim > 1: coef = coef[0]
@@ -100,14 +113,35 @@ def get_global_importance_table(model) -> Optional[pd.DataFrame]:
             df_coef = pd.DataFrame({"feature": list(X_cols), "coefficient": coef})
         else:
             df_coef = pd.DataFrame({"feature": [f"f{i}" for i in range(len(coef))], "coefficient": coef})
-        return df_coef.reindex(df_coef.iloc[:, -1].abs().sort_values(ascending=False).index).reset_index(drop=True)
     return None
-# ---------- Load model ----------
 local_path = hf_hub_download(repo_id=REPO, filename=FNAME, token=TOKEN)
 MODEL = load_model(str(Path(local_path).with_suffix("")))
-# ---------- Load fixed sample file ----------
 def load_sample_dataframe(path: str) -> Tuple[pd.DataFrame, str]:
     if not os.path.exists(path):
         raise FileNotFoundError(f"Sample file not found: {path}")
@@ -139,13 +173,11 @@ def load_sample_dataframe(path: str) -> Tuple[pd.DataFrame, str]:
 try:
     SAMPLE_DF, SAMPLE_TARGET = load_sample_dataframe(SAMPLE_FILE)
 except Exception as e:
-    # Fall back to empty DF but keep the app alive with a warning in UI
     SAMPLE_DF, SAMPLE_TARGET = pd.DataFrame(columns=FEATURES+[TARGET_NAME]), TARGET_NAME
     SAMPLE_ERROR = f"⚠️ Could not load sample file: {e}"
 else:
     SAMPLE_ERROR = ""
-# Build initial dropdown choices
 def build_sample_choices(df: pd.DataFrame, tgt: str, flt: str = "All") -> List[str]:
     if df.empty: return []
     if flt == "All":
@@ -155,6 +187,76 @@ def build_sample_choices(df: pd.DataFrame, tgt: str, flt: str = "All") -> List[s
         idxs = [i for i in range(len(df)) if str(df.iloc[i][tgt]) == str(want)]
     return [f"{i}: y={df.iloc[i][tgt]}" for i in idxs]
 # ---------- Gradio UI ----------
 with gr.Blocks(theme=gr.themes.Soft(), css="""
 * { font-family: Inter, ui-sans-serif, system-ui, -apple-system, Segoe UI; }
@@ -188,23 +290,27 @@ hr.sep { border: none; border-top: 1px solid #e5e7eb; margin: 8px 0 14px; }
                 checkbox_map[feat] = gr.Checkbox(label=feat, value=False)
             gr.Markdown("<hr class='sep'/>")
-            thr = gr.Slider(0.05, 0.95, value=0.50, step=0.01, label="Decision threshold for class '1'")
-            run_btn = gr.Button("🚀 Predict (manual)", variant="primary")
             # -------- Sample picker (fixed file) --------
             gr.Markdown("<hr class='sep'/>")
             gr.Markdown("### 2) Sample picker (from fixed file)")
-            grp_dd   = gr.Dropdown(label="Filter by target", choices=["All","0","1"], value="All")
-            choices0 = build_sample_choices(SAMPLE_DF, SAMPLE_TARGET, "All")
-            sample_dd= gr.Dropdown(label="Choose sample row", choices=choices0, value=(choices0[0] if choices0 else None))
-            pred_btn = gr.Button("🎯 Predict & compare (sample)", variant="primary")
         # -------- Right: Results --------
         with gr.Column(scale=1):
             gr.Markdown("### 3) Results")
             pred_label = gr.Textbox(label="Predicted label (with threshold decision)", interactive=False)
             with gr.Row():
-                prob_out = gr.Number(label="P(class==1)", interactive=False, precision=6)
                 decision = gr.Textbox(label="Decision @ threshold", interactive=False)
             with gr.Row():
                 gt_out   = gr.Textbox(label="Ground truth (sample)", interactive=False)
@@ -212,12 +318,14 @@ hr.sep { border: none; border-top: 1px solid #e5e7eb; margin: 8px 0 14px; }
             with gr.Accordion("Echoed input (row sent to model)", open=False):
                 echoed = gr.Dataframe(wrap=True)
-            GI = get_global_importance_table(MODEL)
-            if GI is not None and not GI.empty:
-                with gr.Accordion("Global feature importance / coefficients", open=False):
-                    gr.Dataframe(value=GI, interactive=False, wrap=True)
-            else:
-                gr.Markdown("<div class='card small'>No native importances/coefficients available for this estimator.</div>")
     # -------- Manual predict --------
     def do_predict_manual(age, bmi, prev_ab_cnt, threshold, *flag_values):
@@ -246,6 +354,23 @@ hr.sep { border: none; border-top: 1px solid #e5e7eb; margin: 8px 0 14px; }
         outputs=[pred_label, prob_out, decision, gt_out, match_out, echoed],
     )
     # -------- Update sample choices on filter change --------
     def update_choices(group_value):
         ch = build_sample_choices(SAMPLE_DF, SAMPLE_TARGET, group_value)
@@ -253,6 +378,27 @@ hr.sep { border: none; border-top: 1px solid #e5e7eb; margin: 8px 0 14px; }
     grp_dd.change(update_choices, inputs=[grp_dd], outputs=[sample_dd])
     # -------- Predict & compare for selected sample --------
     def predict_sample(sample_choice, threshold):
         if SAMPLE_DF.empty or sample_choice is None or str(sample_choice).strip() == "":
@@ -274,7 +420,6 @@ hr.sep { border: none; border-top: 1px solid #e5e7eb; margin: 8px 0 14px; }
         label = preds.iloc[0][label_col] if label_col else None
         p = extract_probability_for_positive(preds, positive_label=POS_CLASS)
-        # Decision & compare
         if p is not None:
             dec = 1 if float(p) >= float(threshold) else 0
             pretty = f"{label} (threshold {threshold:.2f} ⇒ decision={dec})"
@@ -294,4 +439,4 @@ hr.sep { border: none; border-top: 1px solid #e5e7eb; margin: 8px 0 14px; }
 # ---------- Launch ----------
 if __name__ == "__main__":
-    demo.launch()

+# ---------- Host/port ----------
 HOST, PORT, SHARE = "0.0.0.0", 7860, True
 # ---------- Env hygiene ----------
 os.environ.setdefault("GRADIO_OPEN_BROWSER", "false")
 os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
+# --- FORCE NON-INTERACTIVE MATPLOTLIB BACKEND (for SHAP plots) ---
+os.environ["MPLBACKEND"] = "Agg"
+import matplotlib
+matplotlib.use("Agg", force=True)
 # ---------- Imports ----------
 from typing import Any, Dict, Optional, Tuple, List
 import re
 import pandas as pd
 import gradio as gr
 from pathlib import Path
+import matplotlib.pyplot as plt
+import shap
 from pycaret.classification import load_model, predict_model
 from huggingface_hub import hf_hub_download
+# ---------- Hub model ----------
+REPO  = os.getenv("MODEL_REPO", "GDMProjects/my-private-model")
 FNAME = os.getenv("MODEL_FILE", "best_insulin_model.pkl")
 TOKEN = os.getenv("HF_TOKEN")
+# ---------- Data / schema ----------
+SAMPLE_FILE  = "INS.xlsx"
+TARGET_NAME  = "insulin"
+POS_CLASS    = 1
 FEATURES = [
     "age",
     "BMI",
     "Previos_Obsteric_History_AB",
     "infertility",
 ]
 NUMERIC_INPUTS = {"age", "BMI", "Previos_Obsteric_History_AB"}
+BOOL_FEATURES  = [f for f in FEATURES if f not in NUMERIC_INPUTS]  # flags
 # ---------- Utilities ----------
 def normalize(s: str) -> str:
+    """Lower-case ``str(s)`` and delete every run of characters outside ``[a-z0-9]``."""
     return re.sub(r"[^a-z0-9]+", "", str(s).lower())
 def truthy(val: Any) -> bool:
+    """Return True when ``val`` reads as an affirmative flag.
+
+    Missing values (per ``pd.isna``) are False. Otherwise ``val`` is truthy
+    when its stripped, lower-cased string form is one of
+    1/true/yes/y/t/on, or when it is the object ``True`` or compares equal
+    to 1.
+    """
     if pd.isna(val): return False
     s = str(val).strip().lower()
+    return s in {"1","true","yes","y","t","on"} or val is True or val == 1
 def extract_probability_for_positive(preds: pd.DataFrame, positive_label=1) -> Optional[float]:
+    """Pull a probability for ``positive_label`` out of the first row of ``preds``.
+
+    Lookup order: a column named exactly ``str(positive_label)``; then any
+    column whose string form equals that label or ends with ``_<label>``;
+    then the generic score columns. Returns None when nothing usable is found.
+    """
     str_pos = str(positive_label)
+    # PyCaret predict_model often outputs per-class columns named as labels
     if str_pos in preds.columns:
         return float(preds.iloc[0][str_pos])
+    # NOTE(review): the bare ``except`` clauses below skip any column whose
+    # value cannot be converted, but they also swallow KeyboardInterrupt and
+    # SystemExit; consider narrowing to (TypeError, ValueError).
     for c in preds.columns:
         if str_pos == str(c) or str(c).endswith("_"+str_pos):
             try: return float(preds.iloc[0][c])
             except: pass
+    # NOTE(review): presumably these columns hold the score of the predicted
+    # label rather than of the positive class specifically - confirm.
+    for cname in ("prediction_score","Score","score"):
         if cname in preds.columns:
             try: return float(preds.iloc[0][cname])
             except: pass
     return None
 def get_global_importance_table(model) -> Optional[pd.DataFrame]:
+    """Fallback (non-SHAP) importances/coefficients from the final estimator."""
     try:
         if hasattr(model, "named_steps"):
             est = model.named_steps.get("trained_model", list(model.named_steps.values())[-1])
             est = model
     except Exception:
         est = model
     X_cols = getattr(model, "feature_names_in_", None)
     if hasattr(est, "feature_importances_"):
         vals = np.asarray(est.feature_importances_)
         else:
             df_imp = pd.DataFrame({"feature": [f"f{i}" for i in range(len(vals))], "importance": vals})
         return df_imp.sort_values("importance", ascending=False).reset_index(drop=True)
     if hasattr(est, "coef_"):
         coef = np.array(est.coef_)
         if coef.ndim > 1: coef = coef[0]
             df_coef = pd.DataFrame({"feature": list(X_cols), "coefficient": coef})
         else:
             df_coef = pd.DataFrame({"feature": [f"f{i}" for i in range(len(coef))], "coefficient": coef})
+        order = df_coef.iloc[:, -1].abs().sort_values(ascending=False).index
+        return df_coef.reindex(order).reset_index(drop=True)
     return None
+# ---------- Load model (strip .pkl because PyCaret appends) ----------
 local_path = hf_hub_download(repo_id=REPO, filename=FNAME, token=TOKEN)
 MODEL = load_model(str(Path(local_path).with_suffix("")))
+# ---------- Helpers to find positive-class index for predict_proba ----------
+def _get_pos_index_and_classes(pipe, pos_label=1):
+    """Locate the ``predict_proba`` column that belongs to ``pos_label``.
+
+    Args:
+        pipe: Fitted estimator or pipeline. When it exposes ``named_steps``
+            with a ``"trained_model"`` entry, that step is inspected;
+            otherwise ``pipe`` itself is.
+        pos_label: Label of the positive class.
+
+    Returns:
+        ``(index, classes)``. ``classes`` is ``list(est.classes_)``, or None
+        when the estimator has no ``classes_``. ``index`` is the position of
+        ``pos_label`` in ``classes`` (exact match first, then by string
+        form), 1 for a two-class model whose labels match neither way, and
+        -1 when it cannot be determined.
+    """
+    est = None
+    try:
+        est = getattr(pipe, "named_steps", {}).get("trained_model", None)
+    except Exception:
+        # best effort: an unusual ``named_steps`` object just means "inspect pipe itself"
+        est = None
+    if est is None:
+        est = pipe
+    raw_classes = getattr(est, "classes_", None)
+    if raw_classes is None:
+        return -1, None
+    classes = list(raw_classes)
+    if pos_label in classes:
+        return classes.index(pos_label), classes
+    # Labels may be stored with a different dtype than pos_label (e.g. "1" vs 1),
+    # so retry the lookup on string forms before guessing.
+    as_text = [str(c) for c in classes]
+    if str(pos_label) in as_text:
+        return as_text.index(str(pos_label)), classes
+    # fallback: assume the second column is positive for a 2-class model
+    if len(classes) == 2:
+        return 1, classes
+    return -1, classes
+POS_IDX, _CLASSES = _get_pos_index_and_classes(MODEL, POS_CLASS)
+# ---------- Load fixed sample file (+ normalizer) ----------
 def load_sample_dataframe(path: str) -> Tuple[pd.DataFrame, str]:
     if not os.path.exists(path):
         raise FileNotFoundError(f"Sample file not found: {path}")
 try:
     SAMPLE_DF, SAMPLE_TARGET = load_sample_dataframe(SAMPLE_FILE)
 except Exception as e:
     SAMPLE_DF, SAMPLE_TARGET = pd.DataFrame(columns=FEATURES+[TARGET_NAME]), TARGET_NAME
     SAMPLE_ERROR = f"⚠️ Could not load sample file: {e}"
 else:
     SAMPLE_ERROR = ""
 def build_sample_choices(df: pd.DataFrame, tgt: str, flt: str = "All") -> List[str]:
     if df.empty: return []
     if flt == "All":
         idxs = [i for i in range(len(df)) if str(df.iloc[i][tgt]) == str(want)]
     return [f"{i}: y={df.iloc[i][tgt]}" for i in idxs]
+# ---------- SHAP background / explainer ----------
+def _prepare_background(df_samples: Optional[pd.DataFrame], max_rows: int = 200) -> pd.DataFrame:
+    """Build the all-numeric background table used to construct the SHAP explainer.
+
+    Args:
+        df_samples: Sample rows, or None/empty to fall back to a synthetic
+            background of 50 all-zero rows.
+        max_rows: Upper bound on returned rows; larger inputs are
+            down-sampled with a fixed random seed.
+
+    Returns:
+        A DataFrame with exactly the ``FEATURES`` columns, a fresh 0..n-1
+        index and no NaN: numeric inputs are coerced with ``pd.to_numeric``,
+        every other feature is mapped to 1.0/0.0 via ``truthy``, and gaps are
+        filled with the column median (0.0 when a column has no valid value).
+    """
+    if df_samples is None or df_samples.empty:
+        # tiny synthetic background of zeros
+        bg = pd.DataFrame([{k: 0.0 for k in FEATURES} for _ in range(50)])
+    else:
+        # reindex instead of df[FEATURES]: a feature column missing from the
+        # sample file becomes an all-NaN column rather than raising KeyError
+        bg = df_samples.reindex(columns=FEATURES).copy()
+    # numeric coercion + boolean to {0,1}
+    for c in FEATURES:
+        if c in NUMERIC_INPUTS:
+            bg[c] = pd.to_numeric(bg[c], errors="coerce")
+        else:
+            bg[c] = bg[c].apply(lambda v: 1.0 if truthy(v) else 0.0)
+    # median impute; an all-NaN column has a NaN median, so finish with 0.0
+    bg = bg.fillna(bg.median(numeric_only=True)).fillna(0.0)
+    if len(bg) > max_rows:
+        bg = bg.sample(max_rows, random_state=42)
+    return bg.reset_index(drop=True)
+BACKGROUND = _prepare_background(SAMPLE_DF)
+def _f_proba_pos(X_np: np.ndarray) -> np.ndarray:
+    """Score a raw feature matrix and return one probability per row.
+
+    The array is wrapped in a DataFrame labelled with ``FEATURES`` and passed
+    to ``MODEL.predict_proba``. The returned column is ``POS_IDX`` when that
+    index is valid for the output, otherwise column 1 when there are at
+    least two columns, otherwise column 0.
+    """
+    frame = pd.DataFrame(X_np, columns=FEATURES)
+    probs = MODEL.predict_proba(frame)
+    n_cols = probs.shape[1]
+    if 0 <= POS_IDX < n_cols:
+        col = POS_IDX
+    elif n_cols >= 2:
+        # no usable positive index: take the second column
+        col = 1
+    else:
+        col = 0
+    return probs[:, col]
+# Build the SHAP explainer once at import time from the positive-class
+# probability function and the background rows. On any failure a warning is
+# printed and EXPLAINER stays None, which the plotting helpers check for.
+try:
+    EXPLAINER = shap.Explainer(_f_proba_pos, BACKGROUND.values)
+except Exception as e:
+    print("[WARN] SHAP explainer init failed:", e)
+    EXPLAINER = None
+def _plot_local_shap(row_dict: dict):
+    """Draw a horizontal bar chart of the SHAP values for one input row.
+
+    Args:
+        row_dict: Mapping from feature name to value; only ``FEATURES`` keys
+            are used, in ``FEATURES`` order. ``None`` entries are treated as
+            NaN.
+
+    Returns:
+        The matplotlib Figure, with bars ordered by absolute SHAP value, or
+        None when no explainer is available.
+    """
+    if EXPLAINER is None:
+        return None
+    # Cast to float so a None entry becomes NaN; without the cast a single
+    # None turns the whole row into an object-dtype array.
+    X = pd.DataFrame([row_dict], columns=FEATURES).astype(float)
+    exp = EXPLAINER(X.values)  # (1, n_features)
+    vals = exp.values[0]
+    order = np.argsort(np.abs(vals))
+    fig, ax = plt.subplots(figsize=(7, 4.5))
+    ax.barh(np.array(FEATURES)[order], vals[order])
+    ax.axvline(0, linewidth=1)
+    ax.set_title("Local SHAP values (current input)")
+    ax.set_xlabel(f"Impact on P(class=={POS_CLASS})")
+    fig.tight_layout()
+    # Deregister from pyplot so repeated calls do not pile up open figures;
+    # the Figure object itself remains usable for rendering.
+    plt.close(fig)
+    return fig
+def _plot_global_shap():
+    """Draw mean absolute SHAP value per feature over the background rows.
+
+    Returns the matplotlib Figure (bars ordered by importance), or None when
+    no explainer is available.
+    """
+    if EXPLAINER is None:
+        return None
+    explanation = EXPLAINER(BACKGROUND.values)
+    importance = np.mean(np.abs(explanation.values), axis=0)
+    ranking = np.argsort(importance)
+    names = np.array(FEATURES)[ranking]
+    fig, ax = plt.subplots(figsize=(7, 4.5))
+    ax.barh(names, importance[ranking])
+    ax.set_title("Global feature importance (mean |SHAP|)")
+    ax.set_xlabel(f"Mean |impact on P(class=={POS_CLASS})|")
+    fig.tight_layout()
+    return fig
+# Evaluate the global SHAP plot once at import time. Same best-effort policy
+# as the explainer construction: warn and carry on without a plot.
+try:
+    GLOBAL_FIG = _plot_global_shap()
+except Exception as e:
+    print("[WARN] Global SHAP plot failed:", e)
+    GLOBAL_FIG = None
+# `table or pd.DataFrame()` raises ValueError for any DataFrame (its truth
+# value is ambiguous), so test for None explicitly.
+_native_importance = get_global_importance_table(MODEL)
+GLOBAL_FI_TEXT = _native_importance if _native_importance is not None else pd.DataFrame()
 # ---------- Gradio UI ----------
 with gr.Blocks(theme=gr.themes.Soft(), css="""
 * { font-family: Inter, ui-sans-serif, system-ui, -apple-system, Segoe UI; }
                 checkbox_map[feat] = gr.Checkbox(label=feat, value=False)
             gr.Markdown("<hr class='sep'/>")
+            thr = gr.Slider(0.05, 0.95, value=0.50, step=0.01, label=f"Decision threshold for class '{POS_CLASS}'")
+            with gr.Row():
+                run_btn   = gr.Button("🚀 Predict (manual)", variant="primary")
+                explain_btn = gr.Button("🧠 Explain (SHAP for current input)")
             # -------- Sample picker (fixed file) --------
             gr.Markdown("<hr class='sep'/>")
             gr.Markdown("### 2) Sample picker (from fixed file)")
+            grp_dd    = gr.Dropdown(label="Filter by target", choices=["All","0","1"], value="All")
+            choices0  = build_sample_choices(SAMPLE_DF, SAMPLE_TARGET, "All")
+            sample_dd = gr.Dropdown(label="Choose sample row", choices=choices0, value=(choices0[0] if choices0 else None))
+            with gr.Row():
+                load_btn = gr.Button("📥 Load sample into manual inputs", variant="secondary")
+                pred_btn = gr.Button("🎯 Predict & compare (sample)", variant="primary")
         # -------- Right: Results --------
         with gr.Column(scale=1):
             gr.Markdown("### 3) Results")
             pred_label = gr.Textbox(label="Predicted label (with threshold decision)", interactive=False)
             with gr.Row():
+                prob_out = gr.Number(label=f"P(class=={POS_CLASS})", interactive=False, precision=6)
                 decision = gr.Textbox(label="Decision @ threshold", interactive=False)
             with gr.Row():
                 gt_out   = gr.Textbox(label="Ground truth (sample)", interactive=False)
             with gr.Accordion("Echoed input (row sent to model)", open=False):
                 echoed = gr.Dataframe(wrap=True)
+            with gr.Accordion("Global feature importance (SHAP)", open=False):
+                gr.Plot(value=GLOBAL_FIG)
+                if isinstance(GLOBAL_FI_TEXT, pd.DataFrame) and not GLOBAL_FI_TEXT.empty:
+                    gr.Markdown("> Text fallback (native model importances/coefficients):")
+                    gr.Dataframe(value=GLOBAL_FI_TEXT, interactive=False, wrap=True)
+            with gr.Accordion("Local explanation (SHAP) for current input", open=False):
+                local_plot = gr.Plot()
     # -------- Manual predict --------
     def do_predict_manual(age, bmi, prev_ab_cnt, threshold, *flag_values):
         outputs=[pred_label, prob_out, decision, gt_out, match_out, echoed],
     )
+    # -------- Local SHAP for current manual input --------
+    def do_explain_local(age, bmi, prev_ab_cnt, *flag_values):
+        """Assemble the manual form values into one feature row and plot its SHAP values.
+
+        The three numeric inputs go through ``coerce_numeric``; each checkbox
+        value is paired with ``BOOL_FEATURES`` in order and stored as 1.0/0.0.
+        Features that receive no value stay None.
+        """
+        numeric_part = {
+            "age": coerce_numeric(age),
+            "BMI": coerce_numeric(bmi),
+            "Previos_Obsteric_History_AB": coerce_numeric(prev_ab_cnt),
+        }
+        flag_part = {
+            name: (1.0 if flag else 0.0)
+            for name, flag in zip(BOOL_FEATURES, flag_values)
+        }
+        row = dict.fromkeys(FEATURES)
+        row.update(numeric_part)
+        row.update(flag_part)
+        return _plot_local_shap(row)
+    explain_btn.click(
+        do_explain_local,
+        inputs=[age_in, bmi_in, prev_ab] + [checkbox_map[f] for f in BOOL_FEATURES],
+        outputs=[local_plot],
+    )
     # -------- Update sample choices on filter change --------
     def update_choices(group_value):
         ch = build_sample_choices(SAMPLE_DF, SAMPLE_TARGET, group_value)
     grp_dd.change(update_choices, inputs=[grp_dd], outputs=[sample_dd])
+    # -------- Load selected sample INTO manual inputs --------
+    def load_into_manual(sample_choice):
+        """Copy the chosen sample row into the manual input widgets.
+
+        ``sample_choice`` is a dropdown label whose text before the first
+        ``:`` is the row position in ``SAMPLE_DF``. Returns one ``gr.update``
+        per output: the three numeric inputs, one per ``BOOL_FEATURES``
+        checkbox, then the ground-truth text. Raises ``gr.Error`` when the
+        sample table is empty or nothing is selected.
+        """
+        nothing_selected = sample_choice is None or str(sample_choice).strip() == ""
+        if SAMPLE_DF.empty or nothing_selected:
+            raise gr.Error("Sample file is empty or no row selected. Check SAMPLE_FILE path.")
+        row_label = str(sample_choice).partition(":")[0]
+        picked = SAMPLE_DF.iloc[int(row_label)]
+        numeric_cols = ("age", "BMI", "Previos_Obsteric_History_AB")
+        updates = [gr.update(value=coerce_numeric(picked[col])) for col in numeric_cols]
+        updates.extend(
+            gr.update(value=bool(truthy(picked[feat]))) for feat in BOOL_FEATURES
+        )
+        # the last slot carries the ground truth for the Results panel
+        updates.append(gr.update(value=str(picked[SAMPLE_TARGET])))
+        return updates
+    load_into_outputs = [age_in, bmi_in, prev_ab] + [checkbox_map[f] for f in BOOL_FEATURES] + [gt_out]
+    load_btn.click(load_into_manual, inputs=[sample_dd], outputs=load_into_outputs)
     # -------- Predict & compare for selected sample --------
     def predict_sample(sample_choice, threshold):
         if SAMPLE_DF.empty or sample_choice is None or str(sample_choice).strip() == "":
         label = preds.iloc[0][label_col] if label_col else None
         p = extract_probability_for_positive(preds, positive_label=POS_CLASS)
         if p is not None:
             dec = 1 if float(p) >= float(threshold) else 0
             pretty = f"{label} (threshold {threshold:.2f} ⇒ decision={dec})"
 # ---------- Launch ----------
 if __name__ == "__main__":
+    demo.launch(server_name=HOST, server_port=PORT, share=SHARE)