Spaces:

Synav
/

Explainable-Acute-Leukemia-Mortality-Predictor

Running

App Files Community

Synav committed on Jan 21

Commit

66e1586

verified ·

1 Parent(s): 1668c7e

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -52

app.py CHANGED Viewed

@@ -915,6 +915,7 @@ with tab_predict:
         )
         #SHAP BLOCK
         st.divider()
         st.subheader("Batch SHAP (first 200 rows)")
@@ -922,61 +923,116 @@ with tab_predict:
         n_rows = len(X_inf)
         batch_n = min(MAX_BATCH, n_rows)
-        cA, cB = st.columns([1, 1])
         with cA:
             do_batch = st.button(f"Compute batch SHAP for first {batch_n} rows", key="batch_shap_btn")
         with cB:
             max_display = st.slider("Top features to display", 5, 40, 20, 1, key="batch_max_display")
         if do_batch:
             with st.spinner("Computing batch SHAP..."):
                 pre = pipe.named_steps["preprocess"]
-                # Use first N rows (fast + predictable memory)
                 X_batch = X_inf.iloc[:batch_n].copy()
                 X_batch_t = pre.transform(X_batch)
-                # Build explainer once (cached)
-                if st.session_state.get("explainer") is None:
                     st.session_state.explainer = build_shap_explainer(pipe, X_inf)
-                explainer = st.session_state.explainer
-                shap_vals = explainer.shap_values(X_batch_t)
-                if isinstance(shap_vals, list):
-                    shap_vals = shap_vals[1]  # positive class
-                # Cache batch results
-                st.session_state.shap_batch_vals = shap_vals
-                st.session_state.shap_batch_Xt = X_batch_t
-                st.session_state.shap_batch_n = batch_n
                 try:
-                    st.session_state.shap_batch_feature_names = list(pre.get_feature_names_out())
                 except Exception:
-                    st.session_state.shap_batch_feature_names = [f"f{i}" for i in range(shap_vals.shape[1])]
             st.success(f"Batch SHAP computed for first {batch_n} rows.")
         if "shap_batch_vals" in st.session_state:
-            shap_vals = st.session_state.shap_batch_vals
-            X_batch_t = st.session_state.shap_batch_Xt
             batch_n = st.session_state.shap_batch_n
             names = st.session_state.shap_batch_feature_names
-            st.markdown("### Global SHAP summary (first {} rows)".format(batch_n))
-            # Convert X to dense only once if needed (beeswarm often needs dense)
-            try:
-                X_dense = X_batch_t.toarray()
-            except Exception:
-                X_dense = np.array(X_batch_t)
             # BAR SUMMARY
             fig_bar = plt.figure()
             shap.summary_plot(
-                shap_vals,
                 features=X_dense,
                 feature_names=names,
                 plot_type="bar",
@@ -985,22 +1041,17 @@ with tab_predict:
             )
             st.pyplot(fig_bar, clear_figure=True)
-            # BEESWARM SUMMARY
-            fig_swarm = plt.figure()
-            shap.summary_plot(
-                shap_vals,
-                features=X_dense,
-                feature_names=names,
-                max_display=max_display,
-                show=False,
-            )
-            st.pyplot(fig_swarm, clear_figure=True)
-        if "shap_batch_vals" in st.session_state:
-            shap_vals = st.session_state.shap_batch_vals
-            X_batch_t = st.session_state.shap_batch_Xt
-            batch_n = st.session_state.shap_batch_n
-            names = st.session_state.shap_batch_feature_names
             st.markdown("### Waterfall plots (batch)")
@@ -1014,20 +1065,15 @@ with tab_predict:
             max_waterfalls = st.slider("Max waterfall plots to render", 1, 10, 3, 1, key="max_waterfalls")
             rows_to_plot = rows_to_plot[:max_waterfalls]
-            base = st.session_state.explainer.expected_value
             if not np.isscalar(base):
                 base = float(np.array(base).reshape(-1)[0])
-            # dense only if needed for data in Explanation
-            try:
-                X_dense = X_batch_t.toarray()
-            except Exception:
-                X_dense = np.array(X_batch_t)
             for r in rows_to_plot:
                 st.markdown(f"**Row {r} (within first {batch_n})**")
                 exp = shap.Explanation(
-                    values=shap_vals[r],
                     base_values=float(base),
                     data=X_dense[r],
                     feature_names=names,
@@ -1036,7 +1082,8 @@ with tab_predict:
                 shap.plots.waterfall(exp, show=False, max_display=max_display)
                 st.pyplot(fig_w, clear_figure=True)
         st.subheader("SHAP explanation")
         with st.form("shap_form"):

         )
         #SHAP BLOCK
         st.divider()
         st.subheader("Batch SHAP (first 200 rows)")
         n_rows = len(X_inf)
         batch_n = min(MAX_BATCH, n_rows)
+        cA, cB, cC = st.columns([1, 1, 1])
         with cA:
             do_batch = st.button(f"Compute batch SHAP for first {batch_n} rows", key="batch_shap_btn")
         with cB:
             max_display = st.slider("Top features to display", 5, 40, 20, 1, key="batch_max_display")
+        with cC:
+            show_beeswarm = st.checkbox("Show beeswarm (slower)", value=True, key="batch_beeswarm")
         if do_batch:
             with st.spinner("Computing batch SHAP..."):
                 pre = pipe.named_steps["preprocess"]
                 X_batch = X_inf.iloc[:batch_n].copy()
                 X_batch_t = pre.transform(X_batch)
+                # Ensure explainer exists
+                explainer = st.session_state.get("explainer")
+                if explainer is None:
                     st.session_state.explainer = build_shap_explainer(pipe, X_inf)
+                    explainer = st.session_state.explainer
+                shap_vals_batch = explainer.shap_values(X_batch_t)
+                if isinstance(shap_vals_batch, list):
+                    shap_vals_batch = shap_vals_batch[1]  # positive class
+                try:
+                    names = list(pre.get_feature_names_out())
+                except Exception:
+                    names = [f"f{i}" for i in range(shap_vals_batch.shape[1])]
+                # Dense conversion once (used for summary + waterfalls)
                 try:
+                    X_dense = X_batch_t.toarray()
                 except Exception:
+                    X_dense = np.array(X_batch_t)
+                # Cache batch results
+                st.session_state.shap_batch_vals = shap_vals_batch
+                st.session_state.shap_batch_X_dense = X_dense
+                st.session_state.shap_batch_n = batch_n
+                st.session_state.shap_batch_feature_names = names
             st.success(f"Batch SHAP computed for first {batch_n} rows.")
         if "shap_batch_vals" in st.session_state:
+            shap_vals_batch = st.session_state.shap_batch_vals
+            X_dense = st.session_state.shap_batch_X_dense
             batch_n = st.session_state.shap_batch_n
             names = st.session_state.shap_batch_feature_names
+            st.divider()
+            st.subheader("Export: Top SHAP features per row (batch)")
+            top_k = st.slider("Top-K features per row", 3, 30, 10, 1, key="topk_export")
+            # Optional: include predicted probabilities for the same batch rows
+            # (Assumes you already computed proba for all X_inf earlier)
+            include_proba = st.checkbox("Include predicted probability", value=True, key="include_proba_export")
+            if st.button("Generate Top-K SHAP table", key="gen_topk_shap"):
+                shap_vals_batch = st.session_state.shap_batch_vals          # shape: (batch_n, n_features)
+                names = st.session_state.shap_batch_feature_names
+                batch_n = st.session_state.shap_batch_n
+                rows = []
+                for i in range(batch_n):
+                    sv = shap_vals_batch[i]
+                    idx = np.argsort(np.abs(sv))[::-1][:top_k]  # top-k by absolute SHAP
+                    for j in idx:
+                        val = float(sv[j])
+                        rows.append({
+                            "row_in_batch": int(i),
+                            "feature": str(names[j]),
+                            "shap_value": val,
+                            "abs_shap_value": abs(val),
+                            "direction": "↑" if val > 0 else ("↓" if val < 0 else "0"),
+                        })
+                df_topk = pd.DataFrame(rows)
+                if include_proba:
+                    # Use the same batch rows from the previously computed proba vector
+                    # If you want absolute Excel row index, add + df_inf.index[0] logic as needed
+                    proba_batch = proba[:batch_n]
+                    df_proba = pd.DataFrame({"row_in_batch": list(range(batch_n)), "predicted_probability": proba_batch})
+                    df_topk = df_topk.merge(df_proba, on="row_in_batch", how="left")
+                # Sort nicely: each row block by importance
+                df_topk = df_topk.sort_values(["row_in_batch", "abs_shap_value"], ascending=[True, False])
+                st.dataframe(df_topk, use_container_width=True)
+                st.download_button(
+                    "Download Top-K SHAP per row (CSV)",
+                    df_topk.to_csv(index=False).encode("utf-8"),
+                    file_name=f"shap_top{top_k}_per_row_first{batch_n}.csv",
+                    mime="text/csv",
+                    key="dl_topk_shap_csv"
+                )
+            st.markdown(f"### Global SHAP summary (first {batch_n} rows)")
             # BAR SUMMARY
             fig_bar = plt.figure()
             shap.summary_plot(
+                shap_vals_batch,
                 features=X_dense,
                 feature_names=names,
                 plot_type="bar",
             )
             st.pyplot(fig_bar, clear_figure=True)
+            # BEESWARM SUMMARY (optional)
+            if show_beeswarm:
+                fig_swarm = plt.figure()
+                shap.summary_plot(
+                    shap_vals_batch,
+                    features=X_dense,
+                    feature_names=names,
+                    max_display=max_display,
+                    show=False,
+                )
+                st.pyplot(fig_swarm, clear_figure=True)
             st.markdown("### Waterfall plots (batch)")
             max_waterfalls = st.slider("Max waterfall plots to render", 1, 10, 3, 1, key="max_waterfalls")
             rows_to_plot = rows_to_plot[:max_waterfalls]
+            explainer = st.session_state.get("explainer")
+            base = explainer.expected_value
             if not np.isscalar(base):
                 base = float(np.array(base).reshape(-1)[0])
             for r in rows_to_plot:
                 st.markdown(f"**Row {r} (within first {batch_n})**")
                 exp = shap.Explanation(
+                    values=shap_vals_batch[r],
                     base_values=float(base),
                     data=X_dense[r],
                     feature_names=names,
                 shap.plots.waterfall(exp, show=False, max_display=max_display)
                 st.pyplot(fig_w, clear_figure=True)
+        #Single row SHAP block
         st.subheader("SHAP explanation")
         with st.form("shap_form"):