Spaces:

Synav
/

Explainable-Acute-Leukemia-Mortality-Predictor

Running

App Files Files Community

Synav committed on Jan 19

Commit

d1146a6

verified ·

1 Parent(s): c73f26c

Create app.py

Browse files

Files changed (1) hide show

app.py +281 -0

app.py ADDED Viewed

	@@ -0,0 +1,281 @@

+import json
+from datetime import datetime
+import numpy as np
+import pandas as pd
+import streamlit as st
+import joblib
+import shap
+import matplotlib.pyplot as plt
+from sklearn.pipeline import Pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.impute import SimpleImputer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_auc_score, accuracy_score
+# ============================================================
+# Fixed schema definition (PLACEHOLDER FRAMEWORK)
+# ============================================================
# Placeholder schema: 26 single-letter feature columns plus one label column.
FEATURE_COLS = [chr(code) for code in range(ord("A"), ord("Z") + 1)]  # "A".."Z"
# First 13 letters (A–M) are treated as numeric, the remaining 13 (N–Z) as categorical.
NUM_COLS, CAT_COLS = FEATURE_COLS[:13], FEATURE_COLS[13:]
# Name of the binary outcome column.
LABEL_COL = "AA"
+# ============================================================
+# Model pipeline
+# ============================================================
def build_pipeline():
    """Assemble the unfitted preprocessing + logistic-regression pipeline.

    Numeric columns (``NUM_COLS``) are median-imputed and standardized;
    categorical columns (``CAT_COLS``) are imputed with the most frequent
    value and one-hot encoded (unknown categories are ignored at transform
    time). Any other column is dropped.

    Returns:
        A scikit-learn ``Pipeline`` with steps ``"preprocess"`` and ``"clf"``.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
    ]

    column_router = ColumnTransformer(
        transformers=[
            ("num", Pipeline(numeric_steps), NUM_COLS),
            ("cat", Pipeline(categorical_steps), CAT_COLS),
        ],
        remainder="drop",
        # Keep output names free of the "num__"/"cat__" prefixes.
        verbose_feature_names_out=False,
    )

    classifier = LogisticRegression(max_iter=2000, solver="lbfgs")
    return Pipeline([("preprocess", column_router), ("clf", classifier)])
+# ============================================================
+# Validation utilities
+# ============================================================
def validate_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Check that *df* has every feature column and the label column.

    Args:
        df: Raw frame as read from the uploaded spreadsheet.

    Returns:
        A copy of *df* restricted to ``FEATURE_COLS`` followed by
        ``LABEL_COL``, in that order.

    Raises:
        ValueError: If any required column is absent.
    """
    required = [*FEATURE_COLS, LABEL_COL]

    absent = []
    for name in required:
        if name not in df.columns:
            absent.append(name)

    if not absent:
        return df[required].copy()

    raise ValueError(
        f"Missing required columns: {absent}. "
        f"Excel must contain columns A..Z and AA exactly."
    )
def coerce_binary_label(y: pd.Series):
    """Map a two-valued label series to a 0/1 integer array.

    The positive class (encoded as 1) is ``True`` for boolean labels, the
    larger value for numeric labels, and the lexicographically last string
    representation otherwise.

    Missing entries are ignored when determining the two classes. In the
    numeric and string branches they compare unequal to the positive class
    and are therefore encoded as 0; callers that must not treat unknown
    outcomes as negatives should drop those rows beforehand.

    Args:
        y: Label column; must contain exactly two distinct non-missing values.

    Returns:
        ``(labels, positive_class)`` where ``labels`` is a NumPy int array
        aligned with *y* and ``positive_class`` is the value encoded as 1.

    Raises:
        ValueError: If the number of distinct non-missing values is not 2.
    """
    observed = y.dropna()
    classes = list(pd.unique(observed))
    if len(classes) != 2:
        raise ValueError(f"AA must be binary (2 unique values). Found: {classes}")

    # Booleans are tested before numerics: pandas reports bool dtype as
    # numeric, so with the opposite order this branch could never run.
    if observed.dtype == bool:
        return y.astype(int).to_numpy(), True

    if pd.api.types.is_numeric_dtype(observed):
        positive = max(classes)
        return (y == positive).astype(int).to_numpy(), positive

    # Mixed/object labels: compare on string form so ordering is well defined.
    positive = max(str(value) for value in classes)
    return y.astype(str).eq(positive).astype(int).to_numpy(), positive
+# ============================================================
+# Training + persistence
+# ============================================================
def train_and_save(df: pd.DataFrame):
    """Train the pipeline on *df*, evaluate it, and persist model + metadata.

    The frame is validated against the fixed schema, rows without a label
    are discarded, features are coerced to their declared kinds, and a
    stratified 80/20 split is used to fit and score the pipeline. The fitted
    pipeline is written to ``model.joblib`` and a description of the run to
    ``meta.json`` in the current working directory.

    Args:
        df: Training data containing columns A..Z and AA.

    Returns:
        ``(pipe, meta, X)``: the fitted pipeline, the metadata dict that was
        written to ``meta.json``, and the coerced feature frame (all labelled
        rows, train and test) for use as SHAP background data.

    Raises:
        ValueError: If required columns are missing or the label is not binary.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from datetime import timezone

    df = validate_schema(df)
    # A row with no recorded outcome carries no supervision signal; leaving it
    # in would make the label coercion silently encode it as the negative class.
    df = df.dropna(subset=[LABEL_COL])

    X = df[FEATURE_COLS].copy()
    y_raw = df[LABEL_COL].copy()

    # Unparseable numeric cells become NaN and are handled by the imputer.
    for c in NUM_COLS:
        X[c] = pd.to_numeric(X[c], errors="coerce")
    for c in CAT_COLS:
        X[c] = X[c].astype("string")

    y01, pos_class = coerce_binary_label(y_raw)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y01, test_size=0.2, random_state=42, stratify=y01
    )

    pipe = build_pipeline()
    pipe.fit(X_train, y_train)

    proba = pipe.predict_proba(X_test)[:, 1]
    pred = (proba >= 0.5).astype(int)
    metrics = {
        "roc_auc": float(roc_auc_score(y_test, proba)),
        "accuracy@0.5": float(accuracy_score(y_test, pred)),
        "n_train": int(len(X_train)),
        "n_test": int(len(X_test)),
    }

    joblib.dump(pipe, "model.joblib")

    # datetime.utcnow() is deprecated; take an aware UTC "now" and strip the
    # tzinfo so the stored string keeps the same naive ISO format as before.
    created_at = datetime.now(timezone.utc).replace(tzinfo=None)
    meta = {
        "framework": "LogiSHAP Studio",
        "model": "Logistic Regression",
        "created_at_utc": created_at.isoformat(),
        "schema": {
            "features": FEATURE_COLS,
            "numeric": NUM_COLS,
            "categorical": CAT_COLS,
            "label": LABEL_COL,
        },
        "positive_class": str(pos_class),
        "metrics": metrics,
    }
    with open("meta.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    return pipe, meta, X
+# ============================================================
+# SHAP
+# ============================================================
def build_shap_explainer(pipe, X_bg, max_bg=200):
    """Create a SHAP linear explainer for the fitted pipeline's classifier.

    Args:
        pipe: Fitted pipeline with ``"preprocess"`` and ``"clf"`` steps.
        X_bg: Raw (untransformed) feature frame used as background data.
        max_bg: Upper bound on background rows; larger frames are
            down-sampled with a fixed seed.

    Returns:
        A ``shap.LinearExplainer`` built on the preprocessed background.
    """
    background = (
        X_bg.sample(max_bg, random_state=42) if len(X_bg) > max_bg else X_bg
    )

    steps = pipe.named_steps
    # The explainer works in the classifier's input space, so the background
    # has to go through the same preprocessing as prediction inputs.
    background_t = steps["preprocess"].transform(background)

    return shap.LinearExplainer(
        steps["clf"], background_t, feature_perturbation="interventional"
    )
+# ============================================================
+# Streamlit UI
+# ============================================================
# Page chrome: title, caption and a short reminder of the expected file layout.
st.set_page_config(page_title="LogiSHAP Studio", layout="wide")
st.title("LogiSHAP Studio")
st.caption("Logistic Regression framework with SHAP explainability (A–Z features, AA label)")

_format_help = """
- **A–M** → Numeric variables
- **N–Z** → Categorical variables
- **AA** → Binary label (0/1, Yes/No, True/False)
Column names **must be exactly A..Z and AA**
"""
with st.expander("Required Excel format", expanded=True):
    st.markdown(_format_help)

tab_train, tab_predict = st.tabs(["1️⃣ Train", "2️⃣ Predict + SHAP"])

# Seed the session slots read by the tabs below; existing values are kept so
# a trained model survives script reruns.
for _slot in ("pipe", "explainer"):
    if _slot not in st.session_state:
        st.session_state[_slot] = None
+# ---------------- TRAIN ----------------
# Train tab: upload a labelled spreadsheet, fit the model, show hold-out metrics.
with tab_train:
    train_file = st.file_uploader("Upload training Excel (.xlsx)", type=["xlsx"])
    if train_file:
        df = pd.read_excel(train_file, engine="openpyxl")
        st.dataframe(df.head())
        if st.button("Train model"):
            try:
                with st.spinner("Training model..."):
                    pipe, meta, X_bg = train_and_save(df)
                    explainer = build_shap_explainer(pipe, X_bg)
            except ValueError as err:
                # Schema/label problems are user-input errors: show the
                # message instead of a raw traceback.
                st.error(f"Training failed: {err}")
            else:
                # Publish both objects together so the predict tab never
                # sees a pipeline paired with a stale explainer.
                st.session_state.pipe = pipe
                st.session_state.explainer = explainer
                st.success("Training complete. model.joblib and meta.json created.")
                m = meta["metrics"]
                c1, c2, c3, c4 = st.columns(4)
                c1.metric("ROC AUC", f"{m['roc_auc']:.3f}")
                c2.metric("Accuracy", f"{m['accuracy@0.5']:.3f}")
                c3.metric("Train N", m["n_train"])
                c4.metric("Test N", m["n_test"])
+# ---------------- PREDICT ----------------
# Predict tab: score an uploaded spreadsheet with the session's model and
# explain one selected row with SHAP.
with tab_predict:
    if st.session_state.pipe is None:
        st.warning("Train a model first.")
    else:
        infer_file = st.file_uploader("Upload inference Excel (.xlsx)", type=["xlsx"])
        if infer_file:
            df_inf = pd.read_excel(infer_file, engine="openpyxl")
            missing_cols = [c for c in FEATURE_COLS if c not in df_inf.columns]
            if missing_cols:
                # Without this check the column selection below raises KeyError.
                st.error(
                    f"Missing required columns: {missing_cols}. "
                    f"Inference Excel must contain columns A..Z."
                )
            elif df_inf.empty:
                # An empty frame would give the row selector a max of -1.
                st.error("The uploaded file contains no rows to score.")
            else:
                # Apply the same type coercion that was used for training.
                X_inf = df_inf[FEATURE_COLS].copy()
                for c in NUM_COLS:
                    X_inf[c] = pd.to_numeric(X_inf[c], errors="coerce")
                for c in CAT_COLS:
                    X_inf[c] = X_inf[c].astype("string")

                pipe = st.session_state.pipe
                proba = pipe.predict_proba(X_inf)[:, 1]
                df_out = df_inf.copy()
                df_out["predicted_probability"] = proba
                st.dataframe(df_out.head())
                st.download_button(
                    "Download predictions",
                    df_out.to_csv(index=False).encode(),
                    "predictions.csv",
                    "text/csv",
                )

                st.divider()
                st.subheader("SHAP explanation")
                row = int(st.number_input("Row index", 0, len(X_inf) - 1, 0))
                X_one = X_inf.iloc[[row]]

                pre = pipe.named_steps["preprocess"]
                X_one_t = pre.transform(X_one)
                explainer = st.session_state.explainer
                shap_vals = explainer.shap_values(X_one_t)
                if isinstance(shap_vals, list):
                    # Per-class output: keep the positive-class attributions.
                    shap_vals = shap_vals[1]
                # expected_value may be a scalar or a length-1 array.
                base = float(np.ravel(explainer.expected_value)[0])

                try:
                    names = list(pre.get_feature_names_out())
                except Exception:
                    # Best effort: fall back to positional names.
                    names = [f"f{i}" for i in range(len(shap_vals[0]))]

                # The preprocessor may emit a sparse matrix; SHAP plots need a
                # dense 1-D row.
                if hasattr(X_one_t, "toarray"):
                    x_dense = X_one_t.toarray()[0]
                else:
                    x_dense = np.asarray(X_one_t)[0]

                exp = shap.Explanation(
                    values=shap_vals[0],
                    base_values=base,
                    data=x_dense,
                    feature_names=names,
                )

                c1, c2 = st.columns(2)
                with c1:
                    fig = plt.figure()
                    shap.plots.waterfall(exp, show=False)
                    st.pyplot(fig)
                    # Close explicitly: the script reruns on every widget
                    # interaction and open figures would otherwise pile up.
                    plt.close(fig)
                with c2:
                    fig2 = plt.figure()
                    shap.plots.bar(exp, show=False)
                    st.pyplot(fig2)
                    plt.close(fig2)