Spaces:

Synav
/

Explainable-Acute-Leukemia-Mortality-Predictor

Running

App Files Community

Synav committed on Jan 24

Commit

529c5f8

verified ·

1 Parent(s): fc931be

Update app.py

Browse files

Files changed (1) hide show

app.py +220 -221

app.py CHANGED Viewed

@@ -30,7 +30,7 @@ from sklearn.model_selection import train_test_split
 #Figures setting block
 import io
-import matplotlib.pyplot as plt
 # REPLACE make_fig with this (or add this and stop using plt.plot directly)
 def make_fig(figsize=(5.5, 3.6), dpi=120):
@@ -931,6 +931,8 @@ if "pipe" not in st.session_state:
 if "explainer" not in st.session_state:
     st.session_state.explainer = None
 with tab_train:
     st.subheader("Train model")
@@ -962,250 +964,243 @@ with tab_train:
         st.divider()
         # then keep your file uploader + training button + publish block here
-        ...
-# ---------------- TRAIN ----------------
-    with tab_train:
-        st.subheader("Train model")
-        if not is_admin():
-            st.info("Training and publishing are restricted. Use Predict + SHAP for inference.")
-        else:
-            train_file = st.file_uploader("Upload training Excel (.xlsx)", type=["xlsx"])
-            if train_file is not None:
-                df = pd.read_excel(train_file, engine="openpyxl")
-                feature_cols = get_feature_cols_from_df(df)
-                st.dataframe(df.head())
-                feature_cols = get_feature_cols_from_df(df)
-                st.markdown("### Choose variable types (saved into the model)")
-                default_numeric = feature_cols[:13]  # initial suggestion
-                num_cols = st.multiselect(
-                    "Numeric variables (will be median-imputed + scaled)",
-                    options=feature_cols,
-                    default=default_numeric
-                )
-                # Everything not selected as numeric becomes categorical
-                cat_cols = [c for c in feature_cols if c not in num_cols]
-                st.write(f"Categorical variables (will be most-frequent-imputed + one-hot): {len(cat_cols)}")
-                st.caption("Note: The selected schema is stored with the trained model and must match inference files.")
-                st.markdown("### Evaluation settings")
-                n_bins = st.slider("Calibration bins", 5, 20, 10, 1)
-                cal_strategy = st.selectbox("Calibration binning strategy", ["uniform", "quantile"], index=0)
-                dca_points = st.slider("Decision curve points", 25, 200, 99, 1)
-                if st.button("Train model"):
-                    with st.spinner("Training model..."):
-                        pipe, meta, X_bg, y_test, proba = train_and_save(
-                            df, feature_cols, num_cols, cat_cols,
-                            n_bins=n_bins, cal_strategy=cal_strategy, dca_points=dca_points,
-                            use_feature_selection=use_feature_selection,
-                            l1_C=l1_C,
-                            use_dimred=use_dimred,
-                            svd_components=svd_components
-                        )
-                        explainer = build_shap_explainer(pipe, X_bg)
-                        st.session_state.pipe = pipe
-                        st.session_state.explainer = explainer
-                        st.session_state.meta = meta
-                    st.success("Training complete. model.joblib and meta.json created.")
-                    st.divider()
-                    st.subheader("Training performance (test split)")
-                    m = meta["metrics"]
-                    # Show key metrics at threshold 0.5
-                    c1, c2, c3, c4 = st.columns(4)
-                    c1.metric("ROC AUC", f"{m['roc_auc']:.3f}")
-                    c2.metric("Sensitivity (best F1 thr)", f"{m['sensitivity@best']:.3f}")
-                    c3.metric("Specificity (best F1 thr)", f"{m['specificity@best']:.3f}")
-                    c4.metric("F1 (best)", f"{m['f1@best']:.3f}")
-                    st.caption(f"Best threshold (max F1): {m['best_threshold']:.2f}")
-                    c5, c6, c7, c8 = st.columns(4)
-                    c5.metric("Precision", f"{m['precision@0.5']:.3f}")
-                    c6.metric("Accuracy", f"{m['accuracy@0.5']:.3f}")
-                    c7.metric("Balanced Acc", f"{m['balanced_accuracy@0.5']:.3f}")
-                    c8.metric("Test N", m["n_test"])
-                    # Confusion matrix display
-                    cm = m["confusion_matrix@0.5"]
-                    cm_df = pd.DataFrame(
-                        [[cm["tn"], cm["fp"]], [cm["fn"], cm["tp"]]],
-                        index=["Actual 0", "Actual 1"],
-                        columns=["Pred 0", "Pred 1"]
-                    )
-                    st.markdown("**Confusion Matrix (threshold = 0.5)**")
-                    st.dataframe(cm_df)
-                    # TRAINING: ROC curve plot
-                    # =========================
-                    roc = m["roc_curve"]
-                    fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
-                    ax.plot(roc["fpr"], roc["tpr"])
-                    ax.plot([0, 1], [0, 1])
-                    ax.set_xlabel("False Positive Rate (1 - Specificity)")
-                    ax.set_ylabel("True Positive Rate (Sensitivity)")
-                    ax.set_title(f"ROC Curve (AUC = {m['roc_auc']:.3f})")
-                    render_plot_with_download(
-                        fig,
-                        title="ROC curve",
-                        filename="roc_curve.png",
-                        export_dpi=export_dpi,
-                        key="dl_train_roc"
-                    )
-                    #Precision recall curve
-                    # =========================
-                    # TRAINING: PR curve plot
-                    # =========================
-                    pr = m["pr_curve"]
-                    fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
-                    ax.plot(pr["recall"], pr["precision"])
-                    ax.set_xlabel("Recall")
-                    ax.set_ylabel("Precision")
-                    ax.set_title(f"PR Curve (AP = {pr['average_precision']:.3f})")
-                    render_plot_with_download(
-                        fig,
-                        title="PR curve",
-                        filename="pr_curve.png",
-                        export_dpi=export_dpi,
-                        key="dl_train_pr"
-                    )
-                    #Calibration plot
-                    # =========================
-                    # TRAINING: Calibration plot
-                    # =========================
-                    cal = m["calibration"]
-                    fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
-                    ax.plot(cal["prob_pred"], cal["prob_true"])
-                    ax.plot([0, 1], [0, 1])
-                    ax.set_xlabel("Mean predicted probability")
-                    ax.set_ylabel("Observed event rate")
-                    ax.set_title("Calibration curve")
-                    render_plot_with_download(
-                        fig,
-                        title="Calibration curve",
-                        filename="calibration_curve.png",
-                        export_dpi=export_dpi,
-                        key="dl_train_cal"
-                    )
-                    #Decision curve
-                    # =========================
-                    # TRAINING: Decision curve analysis plot
-                    # =========================
-                    dca = m["decision_curve"]
-                    fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
-                    ax.plot(dca["thresholds"], dca["net_benefit_model"], label="Model")
-                    ax.plot(dca["thresholds"], dca["net_benefit_all"], label="Treat all")
-                    ax.plot(dca["thresholds"], dca["net_benefit_none"], label="Treat none")
-                    ax.set_xlabel("Threshold probability")
-                    ax.set_ylabel("Net benefit")
-                    ax.set_title("Decision curve analysis")
-                    ax.legend()
-                    render_plot_with_download(
-                        fig,
-                        title="Decision curve",
-                        filename="decision_curve.png",
-                        export_dpi=export_dpi,
-                        key="dl_train_dca"
-                    )
-                    st.caption(
-                        "If the model curve is above Treat-all and Treat-none across a threshold range, "
-                        "the model provides net clinical benefit in that range."
-                    )
-                    st.divider()
-                    st.subheader("Threshold analysis")
-                    thr = st.slider("Decision threshold", 0.0, 1.0, 0.5, 0.01)
-                    # Recompute threshold-based metrics quickly using stored probabilities
-                    # You need y_test and proba in scope. Easiest is to store them in session_state during training.
-                    st.session_state.y_test_last = y_test
-                    st.session_state.proba_last = proba
-                    if "y_test_last" in st.session_state and "proba_last" in st.session_state:
-                        cls = compute_classification_metrics(st.session_state.y_test_last, st.session_state.proba_last, threshold=thr)
-                        st.write({
-                            "Sensitivity": cls["sensitivity"],
-                            "Specificity": cls["specificity"],
-                            "Precision": cls["precision"],
-                            "Recall": cls["recall"],
-                            "F1": cls["f1"],
-                            "Accuracy": cls["accuracy"],
-                            "Balanced Accuracy": cls["balanced_accuracy"],
-                        })
-    # ---------------- PUBLISH (only after training) ----------------
-            if st.session_state.get("pipe") is not None:
-                st.divider()
-                st.subheader("Publish trained model to Hugging Face Hub")
-                default_version = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
-                version_tag = st.text_input(
-                    "Version tag",
-                    value=default_version,
-                    help="Used as releases/<version>/ in the model repository",
-                )
-                if st.button("Publish model.joblib + meta.json to Model Repo"):
-                        try:
-                            with st.spinner("Uploading to Hugging Face Model repo..."):
-                                paths = publish_to_hub(MODEL_REPO_ID, version_tag)
-                            st.success("Uploaded successfully to your model repository.")
-                            st.json(paths)
-                        except Exception as e:
-                            st.error(f"Upload failed: {e}")
 # ---------------- PREDICT ----------------
@@ -1490,8 +1485,7 @@ with tab_predict:
         TAB_MANAGED_FIELDS.add(NGS_COUNT_COL)
     # Age + key dates are handled by DOB/Dx inputs, not generic UI
-    TAB_MANAGED_FIELDS.add(AGE_FEATURE)
-    TAB_MANAGED_FIELDS.add(DX_DATE_FEATURE)
     if "Date of 1st CR" in feature_cols:
         TAB_MANAGED_FIELDS.add("Date of 1st CR")
@@ -1613,7 +1607,8 @@ with tab_predict:
                 continue
             # Age auto-calc (display integer, store float)
-            if f == AGE_FEATURE:
                 if np.isnan(derived_age):
                     st.number_input(
                         f"{f} (auto from DOB & Dx date)",
@@ -1635,28 +1630,32 @@ with tab_predict:
                     )
                     values_by_index[i] = float(derived_age)
                 continue
-            if f.strip() == "Date of 1st Bone Marrow biopsy (Date of Diagnosis)".strip():
                 values_by_index[i] = np.nan if dx_date is None else dx_date.isoformat()
-                st.text_input(f, value="" if dx_date is None else dx_date.isoformat(), disabled=True, key=f"sp_{i}_dx_show")
                 continue
             if f.strip() == "Date of 1st CR".strip():
                 values_by_index[i] = np.nan if cr1_date is None else cr1_date.isoformat()
-                st.text_input(f, value="" if cr1_date is None else cr1_date.isoformat(), disabled=True, key=f"sp_{i}_cr_show")
-                continue
-            # Dx date text (store string)
-            if f.strip() == DX_DATE_FEATURE.strip():
                 st.text_input(
                     f"{f} (auto)",
-                    value="" if dx_date is None else str(dx_date),
-                    key=f"sp_{i}_dx",
                     disabled=True,
                 )
-                values_by_index[i] = np.nan if dx_date is None else str(dx_date)
                 continue
             # ECOG mapped to int
             if f.strip() == "ECOG":
                 values_by_index[i] = int(ecog)

 #Figures setting block
 import io
 # REPLACE make_fig with this (or add this and stop using plt.plot directly)
 def make_fig(figsize=(5.5, 3.6), dpi=120):
 if "explainer" not in st.session_state:
     st.session_state.explainer = None
+# ---------------- TRAIN ----------------
 with tab_train:
     st.subheader("Train model")
         st.divider()
         # then keep your file uploader + training button + publish block here
+        train_file = st.file_uploader("Upload training Excel (.xlsx)", type=["xlsx"])
+        if train_file is None:
+           st.info("Upload a training Excel file to enable training.")
+        else:
+        df = pd.read_excel(train_file, engine="openpyxl")
+        feature_cols = get_feature_cols_from_df(df)
+        st.dataframe(df.head(), use_container_width=True)
+        feature_cols = get_feature_cols_from_df(df)
+        st.markdown("### Choose variable types (saved into the model)")
+        default_numeric = feature_cols[:13]  # initial suggestion
+        num_cols = st.multiselect(
+        "Numeric variables (will be median-imputed + scaled)",
+        options=feature_cols,
+        default=default_numeric
+        )
+        # Everything not selected as numeric becomes categorical
+        cat_cols = [c for c in feature_cols if c not in num_cols]
+        st.write(f"Categorical variables (will be most-frequent-imputed + one-hot): {len(cat_cols)}")
+        st.caption("Note: The selected schema is stored with the trained model and must match inference files.")
+        st.markdown("### Evaluation settings")
+        n_bins = st.slider("Calibration bins", 5, 20, 10, 1)
+        cal_strategy = st.selectbox("Calibration binning strategy", ["uniform", "quantile"], index=0)
+        dca_points = st.slider("Decision curve points", 25, 200, 99, 1)
+        if st.button("Train model"):
+            with st.spinner("Training model..."):
+                pipe, meta, X_bg, y_test, proba = train_and_save(
+                df, feature_cols, num_cols, cat_cols,
+                n_bins=n_bins, cal_strategy=cal_strategy, dca_points=dca_points,
+                use_feature_selection=use_feature_selection,
+                l1_C=l1_C,
+                use_dimred=use_dimred,
+                svd_components=svd_components
+                )
+                explainer = build_shap_explainer(pipe, X_bg)
+                st.session_state.pipe = pipe
+                st.session_state.explainer = explainer
+                st.session_state.meta = meta
+                st.success("Training complete. model.joblib and meta.json created.")
+                st.divider()
+                st.subheader("Training performance (test split)")
+                m = meta["metrics"]
+                # Show key metrics at threshold 0.5
+                c1, c2, c3, c4 = st.columns(4)
+                c1.metric("ROC AUC", f"{m['roc_auc']:.3f}")
+                c2.metric("Sensitivity (best F1 thr)", f"{m['sensitivity@best']:.3f}")
+                c3.metric("Specificity (best F1 thr)", f"{m['specificity@best']:.3f}")
+                c4.metric("F1 (best)", f"{m['f1@best']:.3f}")
+                st.caption(f"Best threshold (max F1): {m['best_threshold']:.2f}")
+                c5, c6, c7, c8 = st.columns(4)
+                c5.metric("Precision", f"{m['precision@0.5']:.3f}")
+                c6.metric("Accuracy", f"{m['accuracy@0.5']:.3f}")
+                c7.metric("Balanced Acc", f"{m['balanced_accuracy@0.5']:.3f}")
+                c8.metric("Test N", m["n_test"])
+                # Confusion matrix display
+                cm = m["confusion_matrix@0.5"]
+                cm_df = pd.DataFrame(
+                    [[cm["tn"], cm["fp"]], [cm["fn"], cm["tp"]]],
+                    index=["Actual 0", "Actual 1"],
+                    columns=["Pred 0", "Pred 1"]
+                )
+                st.markdown("**Confusion Matrix (threshold = 0.5)**")
+                st.dataframe(cm_df)
+                # TRAINING: ROC curve plot
+                # =========================
+                roc = m["roc_curve"]
+                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
+                ax.plot(roc["fpr"], roc["tpr"])
+                ax.plot([0, 1], [0, 1])
+                ax.set_xlabel("False Positive Rate (1 - Specificity)")
+                ax.set_ylabel("True Positive Rate (Sensitivity)")
+                ax.set_title(f"ROC Curve (AUC = {m['roc_auc']:.3f})")
+                render_plot_with_download(
+                    fig,
+                    title="ROC curve",
+                    filename="roc_curve.png",
+                    export_dpi=export_dpi,
+                    key="dl_train_roc"
+                )
+                #Precision recall curve
+                # =========================
+                # TRAINING: PR curve plot
+                # =========================
+                pr = m["pr_curve"]
+                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
+                ax.plot(pr["recall"], pr["precision"])
+                ax.set_xlabel("Recall")
+                ax.set_ylabel("Precision")
+                ax.set_title(f"PR Curve (AP = {pr['average_precision']:.3f})")
+                render_plot_with_download(
+                    fig,
+                    title="PR curve",
+                    filename="pr_curve.png",
+                    export_dpi=export_dpi,
+                    key="dl_train_pr"
+                )
+                #Calibration plot
+                # =========================
+                # TRAINING: Calibration plot
+                # =========================
+                cal = m["calibration"]
+                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
+                ax.plot(cal["prob_pred"], cal["prob_true"])
+                ax.plot([0, 1], [0, 1])
+                ax.set_xlabel("Mean predicted probability")
+                ax.set_ylabel("Observed event rate")
+                ax.set_title("Calibration curve")
+                render_plot_with_download(
+                    fig,
+                    title="Calibration curve",
+                    filename="calibration_curve.png",
+                    export_dpi=export_dpi,
+                    key="dl_train_cal"
+                )
+                #Decision curve
+                # =========================
+                # TRAINING: Decision curve analysis plot
+                # =========================
+                dca = m["decision_curve"]
+                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
+                ax.plot(dca["thresholds"], dca["net_benefit_model"], label="Model")
+                ax.plot(dca["thresholds"], dca["net_benefit_all"], label="Treat all")
+                ax.plot(dca["thresholds"], dca["net_benefit_none"], label="Treat none")
+                ax.set_xlabel("Threshold probability")
+                ax.set_ylabel("Net benefit")
+                ax.set_title("Decision curve analysis")
+                ax.legend()
+                render_plot_with_download(
+                    fig,
+                    title="Decision curve",
+                    filename="decision_curve.png",
+                    export_dpi=export_dpi,
+                    key="dl_train_dca"
+                )
+                st.caption(
+                    "If the model curve is above Treat-all and Treat-none across a threshold range, "
+                    "the model provides net clinical benefit in that range."
+                )
+                st.divider()
+                st.subheader("Threshold analysis")
+                thr = st.slider("Decision threshold", 0.0, 1.0, 0.5, 0.01)
+                # Recompute threshold-based metrics quickly using stored probabilities
+                # You need y_test and proba in scope. Easiest is to store them in session_state during training.
+                st.session_state.y_test_last = y_test
+                st.session_state.proba_last = proba
+                if "y_test_last" in st.session_state and "proba_last" in st.session_state:
+                    cls = compute_classification_metrics(st.session_state.y_test_last, st.session_state.proba_last, threshold=thr)
+                    st.write({
+                        "Sensitivity": cls["sensitivity"],
+                        "Specificity": cls["specificity"],
+                        "Precision": cls["precision"],
+                        "Recall": cls["recall"],
+                        "F1": cls["f1"],
+                        "Accuracy": cls["accuracy"],
+                        "Balanced Accuracy": cls["balanced_accuracy"],
+                    })
+# ---------------- PUBLISH (only after training) ----------------
+        if st.session_state.get("pipe") is not None:
+            st.divider()
+            st.subheader("Publish trained model to Hugging Face Hub")
+            default_version = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
+            version_tag = st.text_input(
+                "Version tag",
+                value=default_version,
+                help="Used as releases/<version>/ in the model repository",
+            )
+            if st.button("Publish model.joblib + meta.json to Model Repo"):
+                try:
+                    with st.spinner("Uploading to Hugging Face Model repo..."):
+                        paths = publish_to_hub(MODEL_REPO_ID, version_tag)
+                    st.success("Uploaded successfully to your model repository.")
+                    st.json(paths)
+                except Exception as e:
+                    st.error(f"Upload failed: {e}")
 # ---------------- PREDICT ----------------
         TAB_MANAGED_FIELDS.add(NGS_COUNT_COL)
     # Age + key dates are handled by DOB/Dx inputs, not generic UI
     if "Date of 1st CR" in feature_cols:
         TAB_MANAGED_FIELDS.add("Date of 1st CR")
                 continue
             # Age auto-calc (display integer, store float)
+            # --- Age (auto from DOB & Dx date) ---
+            if f.strip() == AGE_FEATURE.strip():
                 if np.isnan(derived_age):
                     st.number_input(
                         f"{f} (auto from DOB & Dx date)",
                     )
                     values_by_index[i] = float(derived_age)
                 continue
+            # --- Diagnosis date (auto from dx_date input) ---
+            if f.strip() == DX_DATE_FEATURE.strip():
                 values_by_index[i] = np.nan if dx_date is None else dx_date.isoformat()
+                st.text_input(
+                    f"{f} (auto)",
+                    value="" if dx_date is None else dx_date.isoformat(),
+                    disabled=True,
+                    key=f"sp_{i}_dx_show"
+                )
                 continue
             if f.strip() == "Date of 1st CR".strip():
                 values_by_index[i] = np.nan if cr1_date is None else cr1_date.isoformat()
                 st.text_input(
                     f"{f} (auto)",
+                    value="" if cr1_date is None else cr1_date.isoformat(),
                     disabled=True,
+                    key=f"sp_{i}_cr_show"
                 )
                 continue
             # ECOG mapped to int
             if f.strip() == "ECOG":
                 values_by_index[i] = int(ecog)