Spaces:

Synav
/

Explainable-Acute-Leukemia-Mortality-Predictor

Running

App Files Files Community

Synav committed on Jan 27

Commit

a4c4923

verified ·

1 Parent(s): 6d61737

Update app.py

Browse files

Files changed (1) hide show

app.py +129 -82

app.py CHANGED Viewed

@@ -32,7 +32,7 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
 #Figures setting block
-import io
@@ -47,6 +47,7 @@ def make_fig(figsize=(5.5, 3.6), dpi=120):
 def fig_to_png_bytes(fig, dpi=600):
     buf = io.BytesIO()
     fig.savefig(buf, format="png", dpi=int(dpi), bbox_inches="tight")
     buf.seek(0)
@@ -62,6 +63,7 @@ def render_plot_with_download(
     export_dpi: int = 600,
     key: Optional[str] = None
 ):
     png_bytes = fig_to_png_bytes(fig, dpi=export_dpi)
     st.pyplot(fig, clear_figure=False)
     st.download_button(
@@ -76,6 +78,7 @@ def render_plot_with_download(
 # ============================================================
 # Fixed schema definition (PLACEHOLDER FRAMEWORK)
 # ============================================================
@@ -191,6 +194,82 @@ def find_col(df: pd.DataFrame, candidates: list[str]) -> str | None:
             return lookup[k]
     return None
 # ============================================================
 # Model pipeline
@@ -684,82 +763,33 @@ def train_and_save(
     survival_trained = False
     surv_notes = None
     if time_days is not None and event01 is not None:
         try:
-            df_surv = df[feature_cols].copy().replace({pd.NA: np.nan})
-            # coerce numeric/cat like you already do
-            for c in num_cols:
-                if c in df_surv.columns:
-                    df_surv[c] = pd.to_numeric(df_surv[c], errors="coerce")
-            for c in cat_cols:
-                if c in df_surv.columns:
-                    df_surv[c] = df_surv[c].astype("object")
-                    df_surv.loc[df_surv[c].isna(), c] = np.nan
-                    df_surv[c] = df_surv[c].map(lambda v: v if pd.isna(v) else str(v))
-            df_surv["time_days"] = time_days
-            df_surv["event"] = event01
-            df_surv = df_surv.dropna(subset=["time_days", "event"])
-            # one-hot
-            # one-hot (THIS LINE IS MISSING IN YOUR CODE)
-            df_surv_oh = pd.get_dummies(df_surv, columns=cat_cols, drop_first=True)
-            # Remove duplicate columns (good safety)
-            df_surv_oh = df_surv_oh.loc[:, ~df_surv_oh.columns.duplicated()].copy()
-            duration_col = "time_days"
-            event_col = "event"
-            X_cols = [c for c in df_surv_oh.columns if c not in (duration_col, event_col)]
-            # Force numeric predictors
-            df_surv_oh[X_cols] = df_surv_oh[X_cols].apply(pd.to_numeric, errors="coerce")
-            # Impute (training-time)
-            imp = SimpleImputer(strategy="median")
-            # DEBUG
-            st.write("X_cols count:", len(X_cols))
-            st.write("df_surv_oh[X_cols] shape:", df_surv_oh[X_cols].shape)
-            X_imp = imp.fit_transform(df_surv_oh[X_cols])
-            st.write("imputer output shape:", X_imp.shape)
-            # Assign back (preserve index/columns)
-            df_surv_oh.loc[:, X_cols] = pd.DataFrame(X_imp, columns=X_cols, index=df_surv_oh.index)
-            # fit Cox
-            cph = CoxPHFitter(penalizer=0.1)
-            cph.fit(df_surv_oh[[*X_cols, duration_col, event_col]], duration_col=duration_col, event_col=event_col)
-            bundle = {
-                "model": cph,
-                "columns": X_cols,          # predictors only
-                "imputer": imp,
-                "cat_cols": cat_cols,
-                "num_cols": num_cols,
-                "feature_cols": feature_cols,
-                "duration_col": duration_col,
-                "event_col": event_col,
-                "version": 1
-            }
             joblib.dump(bundle, "survival_bundle.joblib", compress=3)
             survival_trained = True
-            surv_notes = "Survival model trained successfully."
         except Exception as e:
             survival_trained = False
             surv_notes = f"Survival model training failed: {e}"
     else:
         surv_notes = "Survival columns missing or could not be parsed; survival model not trained."
@@ -833,18 +863,38 @@ def train_and_save(
 # SHAP
 # ============================================================
 def build_shap_explainer(pipe, X_bg, max_bg=200):
-    import shap
-    if len(X_bg) > max_bg:
-        X_bg = X_bg.sample(max_bg, random_state=42)
     clf = pipe.named_steps["clf"]
     Xt_bg = transform_before_clf(pipe, X_bg)
     explainer = shap.LinearExplainer(
-        clf, Xt_bg, feature_perturbation="interventional"
     )
     return explainer
 def ensure_model_repo_exists(model_repo_id: str, token: str):
     """
     Optional helper: create the model repo if it doesn't exist.
@@ -1324,32 +1374,27 @@ def normalize_country_name(x: str) -> str | None:
 from typing import Optional
-def country_to_region(country: Optional[str]) -> str:
     """
-    Map a country name to a broad region for analytics.
-    Returns one of: Africa, Americas, Asia, Europe, Oceania, Unknown.
-    Lazy-imports country_converter to reduce startup memory.
     """
-    if not country or pd.isna(country):
         return REGION_UNKNOWN
-    country = str(country).strip()
-    import country_converter as coco  # lazy import
     r = coco.convert(names=country, to="continent")
     if not r or str(r).lower() in ("not found", "nan", "none"):
         return REGION_UNKNOWN
     if r == "America":
         return "Americas"
     return str(r)
 def add_ethnicity_region(df: pd.DataFrame, eth_col: str = "Ethnicity", out_col: str = "Ethnicity_Region") -> pd.DataFrame:
-    """Adds an analytics-only region column derived from the Ethnicity/nationality column."""
     if eth_col not in df.columns:
         df[out_col] = REGION_UNKNOWN
         return df
@@ -1359,6 +1404,7 @@ def add_ethnicity_region(df: pd.DataFrame, eth_col: str = "Ethnicity", out_col:
     return df
 # ============================================================
 # Streamlit UI
 # ============================================================
@@ -3120,7 +3166,8 @@ with tab_predict:
                 # Dense conversion once (used for summary + waterfalls)
                 try:
-                    X_dense = X_batch_t.toarray()
                 except Exception:
                     X_dense = np.array(X_batch_t)

 from sklearn.model_selection import train_test_split
 #Figures setting block
 def fig_to_png_bytes(fig, dpi=600):
+    import io
     buf = io.BytesIO()
     fig.savefig(buf, format="png", dpi=int(dpi), bbox_inches="tight")
     buf.seek(0)
     export_dpi: int = 600,
     key: Optional[str] = None
 ):
+    import matplotlib.pyplot as plt  # lazy
     png_bytes = fig_to_png_bytes(fig, dpi=export_dpi)
     st.pyplot(fig, clear_figure=False)
     st.download_button(
 # ============================================================
 # Fixed schema definition (PLACEHOLDER FRAMEWORK)
 # ============================================================
             return lookup[k]
     return None
def train_survival_bundle(
    df: pd.DataFrame,
    feature_cols: list[str],
    num_cols: list[str],
    cat_cols: list[str],
    time_days: np.ndarray,
    event01: np.ndarray,
    *,
    penalizer: float = 0.1
):
    """
    Fit a penalized Cox proportional-hazards model and package it for reuse.

    Parameters:
        df: source table; only ``feature_cols`` are read from it.
        feature_cols: predictor columns to take from ``df``.
        num_cols: columns coerced to numeric (unparseable values become NaN).
        cat_cols: columns stringified and one-hot encoded (first level dropped).
            Names that are not present in ``feature_cols`` are ignored.
        time_days: follow-up duration per row of ``df``.
        event01: event indicator per row of ``df``.
        penalizer: penalizer strength passed to ``CoxPHFitter``.

    Returns:
        (bundle_dict, notes). ``bundle_dict["columns"]`` lists exactly the
        predictor columns the imputer and the Cox model were fitted on, in order.

    Raises:
        ValueError: if no row has both a duration and an event, or if no
            predictor column contains any observed value.
        Any exception raised by lifelines / scikit-learn while fitting.

    Lazy-imports lifelines to keep app start-up light.
    """
    from lifelines import CoxPHFitter  # lazy
    from sklearn.impute import SimpleImputer  # light, ok here too

    duration_col = "time_days"
    event_col = "event"

    # build survival DF
    df_surv = df[feature_cols].copy().replace({pd.NA: np.nan})

    # coerce numeric predictors
    for c in num_cols:
        if c in df_surv.columns:
            df_surv[c] = pd.to_numeric(df_surv[c], errors="coerce")

    # Only encode categorical columns that actually exist: get_dummies raises
    # KeyError on unknown names, while the coercion step tolerates them.
    present_cat_cols = [c for c in dict.fromkeys(cat_cols) if c in df_surv.columns]
    for c in present_cat_cols:
        # keep missing values as NaN, turn everything else into a string label
        df_surv[c] = df_surv[c].astype("object").map(
            lambda v: np.nan if pd.isna(v) else str(v)
        )

    df_surv[duration_col] = time_days
    df_surv[event_col] = event01
    df_surv = df_surv.dropna(subset=[duration_col, event_col])
    if df_surv.empty:
        raise ValueError("No rows with both a survival time and an event indicator.")

    # one-hot
    df_surv_oh = pd.get_dummies(df_surv, columns=present_cat_cols, drop_first=True)
    # remove duplicate columns if any messy headers caused duplicates
    df_surv_oh = df_surv_oh.loc[:, ~df_surv_oh.columns.duplicated()].copy()

    # predictor columns, forced to float (dummy columns may come back as bool)
    X_cols = [c for c in df_surv_oh.columns if c not in (duration_col, event_col)]
    X_num = df_surv_oh[X_cols].apply(pd.to_numeric, errors="coerce").astype(float)

    # SimpleImputer(strategy="median") drops columns with no observed value,
    # which would make its output narrower than X_cols. Drop them up front so
    # the imputer, the model and bundle["columns"] all agree.
    X_cols = [c for c in X_cols if X_num[c].notna().any()]
    if not X_cols:
        raise ValueError("No usable predictor columns for the survival model.")
    X_num = X_num[X_cols]

    # impute predictors
    imp = SimpleImputer(strategy="median")
    X_imp = imp.fit_transform(X_num)

    # Build the design frame from scratch instead of assigning floats back into
    # the one-hot frame, which would mix dtypes in place.
    design = pd.DataFrame(X_imp, columns=X_cols, index=df_surv_oh.index)
    design[duration_col] = df_surv_oh[duration_col]
    design[event_col] = df_surv_oh[event_col]

    # fit Cox
    cph = CoxPHFitter(penalizer=float(penalizer))
    cph.fit(design, duration_col=duration_col, event_col=event_col)

    bundle = {
        "model": cph,
        "columns": X_cols,             # predictors only, in fitted order
        "imputer": imp,                # fitted imputer
        "cat_cols": cat_cols,
        "num_cols": num_cols,
        "feature_cols": feature_cols,
        "duration_col": duration_col,
        "event_col": event_col,
        "version": 1
    }
    return bundle, "Survival model trained successfully."
 # ============================================================
 # Model pipeline
     survival_trained = False
     surv_notes = None
+    surv_used_cols = None
+    try:
+        time_days, event01, surv_used_cols = build_survival_targets(df)
+    except Exception:
+        time_days, event01, surv_used_cols = None, None, None
     if time_days is not None and event01 is not None:
         try:
+            bundle, surv_notes = train_survival_bundle(
+                df=df,
+                feature_cols=feature_cols,
+                num_cols=num_cols,
+                cat_cols=cat_cols,
+                time_days=time_days,
+                event01=event01,
+                penalizer=0.1
+            )
             joblib.dump(bundle, "survival_bundle.joblib", compress=3)
             survival_trained = True
         except Exception as e:
             survival_trained = False
             surv_notes = f"Survival model training failed: {e}"
     else:
         surv_notes = "Survival columns missing or could not be parsed; survival model not trained."
 # SHAP
 # ============================================================
 def build_shap_explainer(pipe, X_bg, max_bg=200):
+    import shap  # lazy
+    if X_bg is None or len(X_bg) == 0:
+        raise ValueError("SHAP background is empty.")
+    if len(X_bg) > int(max_bg):
+        X_bg = X_bg.sample(int(max_bg), random_state=42)
     clf = pipe.named_steps["clf"]
     Xt_bg = transform_before_clf(pipe, X_bg)
     explainer = shap.LinearExplainer(
+        clf,
+        Xt_bg,
+        feature_perturbation="interventional"
     )
     return explainer
def safe_dense(Xt, max_rows: int = 2000):
    """
    Convert sparse->dense carefully. Avoid converting huge matrices to dense.

    At most the first max_rows rows are returned, whatever the input type
    (sparse matrix, ndarray, DataFrame, nested list).

    Objects exposing toarray() are densified with it, and any error it raises
    propagates: falling back to np.array() on a sparse matrix would only wrap
    it in a 0-d object array. Everything else goes through np.array().
    """
    # Truncate first when the object knows its shape, so we never densify more
    # rows than needed. An empty shape tuple (0-d input) is falsy and skipped.
    shape = getattr(Xt, "shape", None)
    if shape and shape[0] > max_rows:
        Xt = Xt[:max_rows]

    if callable(getattr(Xt, "toarray", None)):
        return Xt.toarray()

    dense = np.array(Xt)
    # Inputs without a .shape (e.g. lists) can only be capped after conversion.
    if dense.ndim and dense.shape[0] > max_rows:
        dense = dense[:max_rows]
    return dense
 def ensure_model_repo_exists(model_repo_id: str, token: str):
     """
     Optional helper: create the model repo if it doesn't exist.
 from typing import Optional
def country_to_region(country: str | None) -> str:
    """
    Map a country name to a broad region for analytics.

    Returns one of: Africa, Americas, Asia, Europe, Oceania, Unknown
    (REGION_UNKNOWN is returned for missing, blank or unrecognised input).
    Lazy-imports country_converter to reduce startup memory.
    """
    # Check missing values before any truthiness test: float NaN is truthy and
    # pd.NA raises on `not`, so `if not country` alone lets both through.
    if country is None or pd.isna(country):
        return REGION_UNKNOWN

    name = str(country).strip()
    if not name:
        return REGION_UNKNOWN

    import country_converter as coco  # lazy

    r = coco.convert(names=name, to="continent")
    if not r or str(r).lower() in ("not found", "nan", "none"):
        return REGION_UNKNOWN
    # country_converter labels the continent "America"; the app uses "Americas"
    if r == "America":
        return "Americas"
    return str(r)
 def add_ethnicity_region(df: pd.DataFrame, eth_col: str = "Ethnicity", out_col: str = "Ethnicity_Region") -> pd.DataFrame:
     if eth_col not in df.columns:
         df[out_col] = REGION_UNKNOWN
         return df
     return df
 # ============================================================
 # Streamlit UI
 # ============================================================
                 # Dense conversion once (used for summary + waterfalls)
                 try:
+                    X_dense = safe_dense(X_batch_t, max_rows=200)
                 except Exception:
                     X_dense = np.array(X_batch_t)