Spaces:

Synav
/

Explainable-Acute-Leukemia-Mortality-Predictor

Running

Explainable-Acute-Leukemia-Mortality-Predictor

File size: 125,602 Bytes

import json
from datetime import datetime
import numpy as np
import pandas as pd
import streamlit as st
import joblib


import os
from huggingface_hub import hf_hub_download, HfApi
import hmac
from sklearn.metrics import (
    roc_auc_score, accuracy_score,
    roc_curve, confusion_matrix,
    precision_score, recall_score, f1_score,
    balanced_accuracy_score,
    precision_recall_curve, average_precision_score,
    brier_score_loss
)
import shap
import matplotlib.pyplot as plt


from sklearn.calibration import calibration_curve
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.decomposition import TruncatedSVD

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#Figures setting block





# REPLACE make_fig with this (or add this and stop using plt.plot directly)
def make_fig(figsize=(5.5, 3.6), dpi=120):
    
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    return fig, ax




def fig_to_png_bytes(fig, dpi=600):
    import io
    buf = io.BytesIO()
    fig.savefig(buf, format="png", dpi=int(dpi), bbox_inches="tight")
    buf.seek(0)
    return buf.getvalue()

from typing import Optional

def render_plot_with_download(
    fig,
    *,
    title: str,
    filename: str,
    export_dpi: int = 600,
    key: Optional[str] = None
):
    
    png_bytes = fig_to_png_bytes(fig, dpi=export_dpi)
    st.pyplot(fig, clear_figure=False)
    st.download_button(
        label=f"Download {title} (PNG {export_dpi} dpi)",
        data=png_bytes,
        file_name=filename,
        mime="image/png",
        key=key or f"dl_{filename}"
    )
    plt.close(fig)





# ============================================================
# Fixed schema definition (PLACEHOLDER FRAMEWORK)
# ============================================================
# =========================
# 1) No feature limit
# =========================
LABEL_COL = "Outcome Event"   # keep label column exactly as header "Outcome Event" in your Excel
# =========================
# Survival columns (Excel headers)
# =========================
SURV_START_COL_CANDS  = ["Date of 1st Bone Marrow biopsy (Date of Diagnosis)", "Diagnosis Date", "Dx Date"]
SURV_DEATH_COL_CANDS  = ["death_date", "Death Date", "Date of Death"]
SURV_CENSOR_COL_CANDS = ["last_followup_date", "Last Follow up date", "Last Follow-up Date", "Last Seen Date"]
SURV_EVENT_COL_CANDS  = ["Outcome Event", "Death", "Event"]



def get_feature_cols_from_df(df: pd.DataFrame):
    """
    Features = ALL columns except Outcome Event (no limit), preserving original order.
    """
    if LABEL_COL not in df.columns:
        raise ValueError(f"Missing required label column '{LABEL_COL}'. "
                         f"Ensure your Excel header row contains a column named '{LABEL_COL}'.")
    return [c for c in df.columns if c != LABEL_COL]

from datetime import date

def to_date_safe(x):
    """Accepts date/datetime/str; returns python date or None."""
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return None
    if isinstance(x, datetime):
        return x.date()
    if isinstance(x, date):
        return x
    # string
    try:
        return pd.to_datetime(x).date()
    except Exception:
        return None

def age_years_at(dob, ref_date):
    """Age in years at ref_date. Returns float or np.nan."""
    dob = to_date_safe(dob)
    ref_date = to_date_safe(ref_date)
    if dob is None or ref_date is None:
        return np.nan
    if ref_date < dob:
        return np.nan
    return (ref_date - dob).days / 365.25

def build_survival_targets(df: pd.DataFrame):
    start_col  = find_col(df, SURV_START_COL_CANDS)
    death_col  = find_col(df, SURV_DEATH_COL_CANDS)
    censor_col = find_col(df, SURV_CENSOR_COL_CANDS)
    event_col  = find_col(df, SURV_EVENT_COL_CANDS)

    if start_col is None or event_col is None:
        raise ValueError("Survival requires at minimum Diagnosis date and Outcome/Event column.")

    start = pd.to_datetime(df[start_col], errors="coerce")

    # event indicator (treat YES/1/TRUE as event)
    ev_raw = df[event_col]
    if pd.api.types.is_numeric_dtype(ev_raw):
        ev = ev_raw.fillna(0).astype(int).eq(1)
    else:
        ev = (ev_raw.astype(str).str.strip().str.upper().isin(["YES","Y","1","TRUE","T","DEAD","DIED"]))


    death = pd.to_datetime(df[death_col], errors="coerce") if death_col else pd.Series([pd.NaT]*len(df))
    last  = pd.to_datetime(df[censor_col], errors="coerce") if censor_col else pd.Series([pd.NaT]*len(df))

    end = death.where(ev, last)
    time_days = (end - start).dt.days.astype("float")
    time_days = time_days.where(time_days >= 0, np.nan)

    used = {
        "start": start_col,
        "event": event_col,
        "death": death_col,
        "censor": censor_col,
    }

    return time_days.to_numpy(), ev.astype(int).to_numpy(), used




import re
def align_columns_to_schema(df: pd.DataFrame, required_cols: list[str]) -> pd.DataFrame:
    """
    Rename df columns to match required_cols using normalized matching.
    This fixes newline/space/underscore/case differences.
    """
    # normalize inference columns
    df_cols = list(df.columns)
    norm_to_actual = {norm_col(c): c for c in df_cols}

    rename_map = {}
    for req in required_cols:
        k = norm_col(req)
        if k in norm_to_actual:
            src = norm_to_actual[k]
            if src != req:
                rename_map[src] = req

    if rename_map:
        df = df.rename(columns=rename_map)

    return df


def norm_col(s: str) -> str:
    """
    Normalize column names for robust matching:
    - strip leading/trailing whitespace
    - collapse multiple spaces
    - remove underscores
    - lower-case
    """
    s = "" if s is None else str(s)
    s = s.replace("\u00A0", " ")          # non-breaking space -> normal space
    s = re.sub(r"\s+", " ", s).strip()    # collapse spaces
    s = s.replace("_", "")                # ignore underscores
    return s.lower()

def find_col(df: pd.DataFrame, candidates: list[str]) -> str | None:
    """Return the first column in df that matches any candidate (normalized), else None."""
    lookup = {norm_col(c): c for c in df.columns}
    for cand in candidates:
        k = norm_col(cand)
        if k in lookup:
            return lookup[k]
    return None

def train_survival_bundle(
    df: pd.DataFrame,
    feature_cols: list[str],
    num_cols: list[str],
    cat_cols: list[str],
    time_days: np.ndarray,
    event01: np.ndarray,
    *,
    penalizer: float = 0.1
):
    """
    Returns (bundle_dict, notes). Raises exceptions if hard-fail.
    Lazy-imports lifelines.
    """
    from lifelines import CoxPHFitter  # lazy
    from sklearn.impute import SimpleImputer  # light, ok here too

    # build survival DF
    df_surv = df[feature_cols].copy().replace({pd.NA: np.nan})

    # coerce numeric/cat
    for c in num_cols:
        if c in df_surv.columns:
            df_surv[c] = pd.to_numeric(df_surv[c], errors="coerce")

    for c in cat_cols:
        if c in df_surv.columns:
            df_surv[c] = df_surv[c].astype("object")
            df_surv.loc[df_surv[c].isna(), c] = np.nan
            df_surv[c] = df_surv[c].map(lambda v: v if pd.isna(v) else str(v))

    df_surv["time_days"] = time_days
    df_surv["event"] = event01
    df_surv = df_surv.dropna(subset=["time_days", "event"])

    duration_col = "time_days"
    event_col = "event"

    # one-hot
    df_surv_oh = pd.get_dummies(df_surv, columns=cat_cols, drop_first=True)
    if duration_col not in df_surv_oh.columns or event_col not in df_surv_oh.columns:
        raise ValueError("Survival DF missing duration/event columns after one-hot encoding.")


    # remove duplicate columns if any messy headers caused duplicates
    df_surv_oh = df_surv_oh.loc[:, ~df_surv_oh.columns.duplicated()].copy()

        # predictor columns
    X_cols = [c for c in df_surv_oh.columns if c not in (duration_col, event_col)]

    # force numeric for Cox predictors
    df_surv_oh[X_cols] = df_surv_oh[X_cols].apply(pd.to_numeric, errors="coerce")

    # ---- Drop zero-variance columns (CRITICAL) ----
    # Do this before imputation to remove constant 0/1 one-hot columns
    var0 = df_surv_oh[X_cols].var(skipna=True)
    X_cols = [c for c in X_cols if pd.notna(var0.get(c, np.nan)) and var0[c] > 0]

    if len(X_cols) == 0:
        raise ValueError("All Cox predictors are zero-variance after one-hot; cannot fit survival model.")

    # ---- Impute predictors ----
    # ---- Phase 1: temporary impute only to compute variance reliably ----
    imp_tmp = SimpleImputer(strategy="median")
    X_tmp = imp_tmp.fit_transform(df_surv_oh[X_cols])
    df_surv_oh.loc[:, X_cols] = pd.DataFrame(X_tmp, columns=X_cols, index=df_surv_oh.index)
    
    # ---- Drop any columns that became constant after imputation ----
    var1 = df_surv_oh[X_cols].var(skipna=True)
    X_cols = [c for c in X_cols if pd.notna(var1.get(c, np.nan)) and var1[c] > 0]
    if len(X_cols) == 0:
        raise ValueError("All Cox predictors became zero-variance after imputation; cannot fit survival model.")
    
    # ---- Basic sanity: events vs predictors ----
    n_events = int(np.sum(df_surv_oh[event_col].astype(int).to_numpy()))
    if n_events < 5:
        raise ValueError(f"Too few events for Cox model (events={n_events}).")
    
    # ---- Optional stabilizer: top-k most variable predictors ----
    if len(X_cols) > max(10, 2 * n_events):
        k = max(10, 2 * n_events)
        vars_sorted = var1[X_cols].sort_values(ascending=False)
        X_cols = list(vars_sorted.head(k).index)
    
    # ---- Phase 2 (CRITICAL): fit FINAL imputer on FINAL X_cols ----
    imp = SimpleImputer(strategy="median")
    X_final = imp.fit_transform(df_surv_oh[X_cols])
    df_surv_oh.loc[:, X_cols] = pd.DataFrame(X_final, columns=X_cols, index=df_surv_oh.index)
    
    # ---- Fit Cox ----
    cph = CoxPHFitter(penalizer=float(max(penalizer, 1.0)), l1_ratio=0.0)
    cph.fit(df_surv_oh[[*X_cols, duration_col, event_col]],
            duration_col=duration_col,
            event_col=event_col)


    bundle = {
        "model": cph,
        "columns": X_cols,
        "imputer": imp,
        "cat_cols": cat_cols,
        "num_cols": num_cols,
        "feature_cols": feature_cols,
        "duration_col": duration_col,
        "event_col": event_col,
        "version": 2
    }
    return bundle, "Survival model trained successfully."



# ============================================================
# Model pipeline
# ============================================================
# =========================
# 3) UPDATED pipeline builder
# =========================
def build_pipeline(
    num_cols,
    cat_cols,
    *,
    use_feature_selection: bool = True,
    l1_C: float = 1.0,
    selection_max_features: int | None = None,
    use_dimred: bool = False,
    svd_components: int = 50,
    svd_random_state: int = 42,
):
    """
    - No limit on raw variables.
    - Drops useless columns:
        (a) zero-variance after preprocessing
        (b) L1-based selection (optional)
    - Optional dimensionality reduction via TruncatedSVD (sparse-friendly).
    """

    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True, drop="first"))

    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols)
        ],
        remainder="drop",
        verbose_feature_names_out=False
    )

    steps = []
    steps.append(("preprocess", preprocessor))

    # Drop columns that never vary (common in one-hot)
    steps.append(("vt", VarianceThreshold(threshold=0.0)))

    # Optional: dimensionality reduction (works with sparse)
    # WARNING: makes SHAP less interpretable (components instead of original features)
    if use_dimred:
        steps.append(("svd", TruncatedSVD(
            n_components=int(svd_components),
            random_state=int(svd_random_state)
        )))

    # Optional: feature selection using L1 logistic (sparse-friendly if SVD is OFF;
    # if SVD is ON, selection happens on components)
    if use_feature_selection:
        selector_est = LogisticRegression(
            penalty="l1", solver="saga", C=float(l1_C),
            max_iter=5000, n_jobs=-1,
            class_weight="balanced"
        )


        # If you want to cap features: set max_features and use threshold that keeps top coefficients.
        # SelectFromModel doesn't have direct "max_features" — simplest safe approach is threshold-based.
        # Keep threshold='median' as default; adjust if you want more aggressive pruning.
        selector = SelectFromModel(selector_est, threshold="median")
        steps.append(("select", selector))

    # Final classifier (keep stable, probability-calibratable)
    clf = LogisticRegression(max_iter=5000, solver="lbfgs", class_weight="balanced")

    steps.append(("clf", clf))

    return Pipeline(steps)


# ============================================================
# Validation utilities
# ============================================================


def coerce_binary_label(y: pd.Series):
    y_clean = y.dropna()
    uniq = list(pd.unique(y_clean))
    if len(uniq) != 2:
        raise ValueError(f"Outcome Event must be binary (2 unique values). Found: {uniq}")

    if pd.api.types.is_numeric_dtype(y_clean):
        pos = sorted(uniq)[-1]
        return (y == pos).astype(int).to_numpy(), pos

    if y_clean.dtype == bool:
        return y.astype(int).to_numpy(), True

    uniq_str = sorted([str(u) for u in uniq])
    pos = uniq_str[-1]
    return y.astype(str).eq(pos).astype(int).to_numpy(), pos




# ============================================================
# Training + persistence
# ============================================================

def compute_classification_metrics(y_true, y_proba, threshold: float = 0.5):
    y_pred = (y_proba >= threshold).astype(int)

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    sensitivity = tp / (tp + fn) if (tp + fn) else 0.0  # recall, TPR
    specificity = tn / (tn + fp) if (tn + fp) else 0.0  # TNR
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    acc = accuracy_score(y_true, y_pred)
    bacc = balanced_accuracy_score(y_true, y_pred)

    return {
        "threshold": float(threshold),
        "tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp),
        "sensitivity": float(sensitivity),
        "specificity": float(specificity),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "accuracy": float(acc),
        "balanced_accuracy": float(bacc),
    }

def find_best_threshold(y_true, y_proba, metric="f1"):
    thresholds = np.linspace(0.05, 0.95, 181)  # step ~0.005
    best_t, best_val, best_cls = 0.5, -1, None
    for t in thresholds:
        cls = compute_classification_metrics(y_true, y_proba, threshold=float(t))
        val = cls.get(metric, 0.0)
        if val > best_val:
            best_val, best_t, best_cls = val, float(t), cls
    return best_t, best_val, best_cls

def find_best_threshold_f1(y_true, y_proba, t_min=0.01, t_max=0.99, n=199):
    """
    Returns threshold that maximizes F1 on (y_true, y_proba).
    """
    thresholds = np.linspace(float(t_min), float(t_max), int(n))
    best = {"threshold": 0.5, "f1": -1.0, "cls": None}

    for t in thresholds:
        cls = compute_classification_metrics(y_true, y_proba, threshold=float(t))
        if cls["f1"] > best["f1"]:
            best = {"threshold": float(t), "f1": float(cls["f1"]), "cls": cls}

    return best["threshold"], best["cls"]


#BOOTSTRAPPING UTILITIES
def bootstrap_internal_validation(
    df: pd.DataFrame,
    feature_cols: list[str],
    num_cols: list[str],
    cat_cols: list[str],
    *,
    n_boot: int = 1000,
    threshold: float = 0.5,
    use_feature_selection: bool = True,
    l1_C: float = 1.0,
    use_dimred: bool = False,
    svd_components: int = 50,
    random_state: int = 42,
):
    """
    Bootstrap internal validation using OOB evaluation.
    For each bootstrap replicate:
      - sample N rows with replacement
      - fit pipeline on bootstrap sample
      - evaluate on OOB rows (not sampled)
    Returns:
      - df_boot: per-iteration metrics
      - summary: mean, median, 2.5% and 97.5% CI for key metrics
      - n_skipped: count of iterations skipped due to empty OOB or single-class OOB
    """

    rng = np.random.default_rng(int(random_state))
    n = len(df)
    idx_all = np.arange(n)

    rows = []
    n_skipped = 0

    for b in range(int(n_boot)):
        boot_idx = rng.integers(0, n, size=n)  # sample N rows with replacement
        oob_mask = np.ones(n, dtype=bool)
        oob_mask[boot_idx] = False
        oob_idx = idx_all[oob_mask]

        # If OOB empty, skip (rare but possible with small N)
        if len(oob_idx) == 0:
            n_skipped += 1
            continue

        df_boot = df.iloc[boot_idx].copy()
        df_oob = df.iloc[oob_idx].copy()

        # Prepare X/y for boot and OOB
        Xb = df_boot[feature_cols].copy().replace({pd.NA: np.nan})
        yb_raw = df_boot[LABEL_COL].copy()

        Xo = df_oob[feature_cols].copy().replace({pd.NA: np.nan})
        yo_raw = df_oob[LABEL_COL].copy()

        # Numeric coercion
        for c in num_cols:
            Xb[c] = pd.to_numeric(Xb[c], errors="coerce")
            Xo[c] = pd.to_numeric(Xo[c], errors="coerce")

        # Categorical coercion
        for c in cat_cols:
            Xb[c] = Xb[c].astype("object")
            Xb.loc[Xb[c].isna(), c] = np.nan
            Xb[c] = Xb[c].map(lambda v: v if pd.isna(v) else str(v))

            Xo[c] = Xo[c].astype("object")
            Xo.loc[Xo[c].isna(), c] = np.nan
            Xo[c] = Xo[c].map(lambda v: v if pd.isna(v) else str(v))

        # Coerce labels
        try:
            yb01, _ = coerce_binary_label(yb_raw)
            yo01, _ = coerce_binary_label(yo_raw)
        except Exception:
            n_skipped += 1
            continue

        # If OOB has only one class, AUC/PR-AUC invalid -> skip
        if len(np.unique(yo01)) < 2:
            n_skipped += 1
            continue


        pipe_b = build_pipeline(
            num_cols, cat_cols,
            use_feature_selection=use_feature_selection,
            l1_C=l1_C,
            use_dimred=use_dimred,
            svd_components=svd_components,
        )

        try:
            pipe_b.fit(Xb, yb01)
            proba_oob = pipe_b.predict_proba(Xo)[:, 1]
        except Exception:
            n_skipped += 1
            continue

        # Metrics on OOB
        try:
            auc = float(roc_auc_score(yo01, proba_oob))
        except Exception:
            auc = np.nan

        pr = compute_pr_curve(yo01, proba_oob)  # returns dict with average_precision
        cal = compute_calibration(yo01, proba_oob, n_bins=10, strategy="uniform")  # has brier
        cls = compute_classification_metrics(yo01, proba_oob, threshold=float(threshold))

        rows.append({
            "iter": int(b),
            "n_oob": int(len(oob_idx)),
            "roc_auc": auc,
            "avg_precision": float(pr["average_precision"]),
            "brier": float(cal["brier"]),
            "sensitivity": float(cls["sensitivity"]),
            "specificity": float(cls["specificity"]),
            "precision": float(cls["precision"]),
            "recall": float(cls["recall"]),
            "f1": float(cls["f1"]),
            "accuracy": float(cls["accuracy"]),
            "balanced_accuracy": float(cls["balanced_accuracy"]),
        })

    df_boot = pd.DataFrame(rows)

    def summarise(col: str):
        s = df_boot[col].dropna()
        if len(s) == 0:
            return {"mean": np.nan, "median": np.nan, "ci2.5": np.nan, "ci97.5": np.nan, "n": 0}
        return {
            "mean": float(s.mean()),
            "median": float(s.median()),
            "ci2.5": float(np.quantile(s, 0.025)),
            "ci97.5": float(np.quantile(s, 0.975)),
            "n": int(len(s))
        }

    summary = {
        "roc_auc": summarise("roc_auc"),
        "avg_precision": summarise("avg_precision"),
        "brier": summarise("brier"),
        "sensitivity": summarise("sensitivity"),
        "specificity": summarise("specificity"),
        "f1": summarise("f1"),
        "balanced_accuracy": summarise("balanced_accuracy"),
    }

    return df_boot, summary, n_skipped


#PLOT CURVE UTILITIES

def compute_pr_curve(y_true, y_proba):
    precision, recall, pr_thresholds = precision_recall_curve(y_true, y_proba)
    ap = average_precision_score(y_true, y_proba)
    return {
        "average_precision": float(ap),
        "precision": [float(x) for x in precision],
        "recall": [float(x) for x in recall],
        "thresholds": [float(x) for x in pr_thresholds],
    }


def compute_calibration(y_true, y_proba, n_bins: int = 10, strategy: str = "uniform"):
    prob_true, prob_pred = calibration_curve(
        y_true, y_proba, n_bins=n_bins, strategy=strategy
    )
    brier = brier_score_loss(y_true, y_proba)
    return {
        "n_bins": int(n_bins),
        "strategy": str(strategy),
        "prob_true": [float(x) for x in prob_true],
        "prob_pred": [float(x) for x in prob_pred],
        "brier": float(brier),
    }


def decision_curve_analysis(y_true, y_proba, thresholds=None):
    y_true = np.asarray(y_true).astype(int)
    y_proba = np.asarray(y_proba).astype(float)

    if thresholds is None:
        thresholds = np.linspace(0.01, 0.99, 99)

    n = len(y_true)
    prevalence = float(np.mean(y_true))

    nb_model = []
    nb_all = []
    nb_none = []

    for pt in thresholds:
        y_pred = (y_proba >= pt).astype(int)

        tp = np.sum((y_pred == 1) & (y_true == 1))
        fp = np.sum((y_pred == 1) & (y_true == 0))

        w = pt / (1.0 - pt)

        nb_m = (tp / n) - (fp / n) * w
        nb_a = prevalence - (1.0 - prevalence) * w
        nb_n = 0.0

        nb_model.append(float(nb_m))
        nb_all.append(float(nb_a))
        nb_none.append(float(nb_n))

    return {
        "thresholds": [float(x) for x in thresholds],
        "net_benefit_model": nb_model,
        "net_benefit_all": nb_all,
        "net_benefit_none": nb_none,
        "prevalence": prevalence,
    }


def train_and_save(
    df: pd.DataFrame,
    feature_cols,
    num_cols,
    cat_cols,
    n_bins: int,
    cal_strategy: str,
    dca_points: int,
    *,
    use_feature_selection: bool,
    l1_C: float,
    use_dimred: bool,
    svd_components: int,):
    
    X = df[feature_cols].copy()
    y_raw = df[LABEL_COL].copy()

    X = X.replace({pd.NA: np.nan})

    for c in num_cols:
        X[c] = pd.to_numeric(X[c], errors="coerce")

    for c in cat_cols:
        X[c] = X[c].astype("object")
        X.loc[X[c].isna(), c] = np.nan
        X[c] = X[c].map(lambda v: v if pd.isna(v) else str(v))

    y01, pos_class = coerce_binary_label(y_raw)
    # ---- Survival targets (time-to-event) ----
    # Requires these columns to exist in the training Excel:
    # SURV_START_COL, SURV_DEATH_COL, SURV_CENSOR_COL
    surv_used_cols = None
    try:
        time_days, event01, surv_used_cols = build_survival_targets(df)
    except Exception:
        time_days, event01, surv_used_cols = None, None, None



    X_train, X_test, y_train, y_test = train_test_split(
        X, y01, test_size=0.2, random_state=42, stratify=y01
    )

    pipe = build_pipeline(
        num_cols, cat_cols,
        use_feature_selection=use_feature_selection,
        l1_C=l1_C,
        use_dimred=use_dimred,
        svd_components=svd_components
    )

    pipe.fit(X_train, y_train)
    proba = pipe.predict_proba(X_test)[:, 1]

    # (metrics code unchanged...)

    # ----- METRICS BLOCK (MISSING) -----
    roc_auc = float(roc_auc_score(y_test, proba))
    fpr, tpr, roc_thresholds = roc_curve(y_test, proba)
    cls_05 = compute_classification_metrics(y_test, proba, threshold=0.5)
    best_thr, best_val, cls_best = find_best_threshold(y_test, proba, metric="f1")
    
    metrics = {
        "roc_auc": roc_auc,
        "n_train": int(len(X_train)),
        "n_test": int(len(X_test)),
    
        # reference @0.5
        "threshold@0.5": 0.5,
        "accuracy@0.5": cls_05["accuracy"],
        "balanced_accuracy@0.5": cls_05["balanced_accuracy"],
        "precision@0.5": cls_05["precision"],
        "recall@0.5": cls_05["recall"],
        "f1@0.5": cls_05["f1"],
        "sensitivity@0.5": cls_05["sensitivity"],
        "specificity@0.5": cls_05["specificity"],
        "confusion_matrix@0.5": {
            "tn": cls_05["tn"], "fp": cls_05["fp"],
            "fn": cls_05["fn"], "tp": cls_05["tp"],
        },
    
        # primary: best F1 threshold
        "best_threshold_by": "f1",
        "best_threshold": float(best_thr),
        "best_f1": float(cls_best["f1"]),
        "accuracy@best": cls_best["accuracy"],
        "balanced_accuracy@best": cls_best["balanced_accuracy"],
        "precision@best": cls_best["precision"],
        "recall@best": cls_best["recall"],
        "f1@best": cls_best["f1"],
        "sensitivity@best": cls_best["sensitivity"],
        "specificity@best": cls_best["specificity"],
        "confusion_matrix@best": {
            "tn": cls_best["tn"], "fp": cls_best["fp"],
            "fn": cls_best["fn"], "tp": cls_best["tp"],
        },
    
        "roc_curve": {
            "fpr": [float(x) for x in fpr],
            "tpr": [float(x) for x in tpr],
            "thresholds": [float(x) for x in roc_thresholds],
        },
        "pr_curve": compute_pr_curve(y_test, proba),
        "calibration": compute_calibration(y_test, proba, n_bins, cal_strategy),
        "decision_curve": decision_curve_analysis(y_test, proba, np.linspace(0.01, 0.99, dca_points)),
    }


    # ---- Train survival model (CoxPH) ----
    from sklearn.impute import SimpleImputer

    survival_trained = False
    surv_notes = None
    surv_used_cols = None
    
    try:
        time_days, event01, surv_used_cols = build_survival_targets(df)
    except Exception:
        time_days, event01, surv_used_cols = None, None, None
    
    if time_days is not None and event01 is not None:
        try:
            bundle, surv_notes = train_survival_bundle(
                df=df,
                feature_cols=feature_cols,
                num_cols=num_cols,
                cat_cols=cat_cols,
                time_days=time_days,
                event01=event01,
                penalizer=0.1
            )
            joblib.dump(bundle, "survival_bundle.joblib", compress=3)
            survival_trained = True
        except Exception as e:
            survival_trained = False
            surv_notes = f"Survival model training failed: {e}"
    else:
        surv_notes = "Survival columns missing or could not be parsed; survival model not trained."


    


          
    
    joblib.dump(pipe, "model.joblib",compress=3)

    meta = {
        "framework": "Explainable-Acute-Leukemia-Mortality-Predictor",
        "model": "Logistic Regression",
        "created_at_utc": datetime.utcnow().isoformat(),
        "schema": {
            "features": feature_cols,   # now unlimited
            "numeric": num_cols,
            "categorical": cat_cols,
            "label": LABEL_COL,
        },
        "feature_reduction": {
            "variance_threshold": 0.0,
            "use_dimred": bool(use_dimred),
            "svd_components": int(svd_components) if use_dimred else None,
            "use_feature_selection": bool(use_feature_selection),
            "l1_C": float(l1_C) if use_feature_selection else None,
            "selection_method": "SelectFromModel(L1 saga, threshold=median)" if use_feature_selection else None,
            "note": "If SVD is enabled, SHAP becomes component-level (less interpretable)."
        },
        "shap_background": {
            "file": "background.csv",
            "max_rows": 200,
            "note": "Raw (pre-transform) background sample for SHAP LinearExplainer."
        },
        "survival": {
            "enabled": bool(survival_trained),
            "duration_units": "days",
            "model": "CoxPHFitter (lifelines) with one-hot for categoricals",
            "required_columns_any_of": {
                "start": SURV_START_COL_CANDS,
                "event": SURV_EVENT_COL_CANDS,
                "death": SURV_DEATH_COL_CANDS,
                "censor": SURV_CENSOR_COL_CANDS,
            },
            "used_columns": surv_used_cols,
            "notes": surv_notes
        },

        
        "positive_class": str(pos_class),
        "metrics": metrics,
    }
    

    
    with open("meta.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    st.write("Local survival bundle exists:", os.path.exists("survival_bundle.joblib"))
    if os.path.exists("survival_bundle.joblib"):
        st.write("Local survival bundle size (bytes):", os.path.getsize("survival_bundle.joblib"))
    st.write("Local survival bundle exists:", os.path.exists("survival_bundle.joblib"))
    st.json(meta.get("survival", {}))

    return pipe, meta, X_train, y_test, proba

    





# ============================================================
# SHAP
# ============================================================
def build_shap_explainer(pipe, X_bg, max_bg=200):
    

    if X_bg is None or len(X_bg) == 0:
        raise ValueError("SHAP background is empty.")

    if len(X_bg) > int(max_bg):
        X_bg = X_bg.sample(int(max_bg), random_state=42)

    clf = pipe.named_steps["clf"]
    Xt_bg = transform_before_clf(pipe, X_bg)

    explainer = shap.LinearExplainer(
        clf,
        Xt_bg,
        feature_perturbation="interventional"
    )
    return explainer


def safe_dense(Xt, max_rows: int = 200):
    """
    Convert sparse->dense carefully. Avoid converting huge matrices to dense.
    """
    if hasattr(Xt, "shape") and Xt.shape[0] > max_rows:
        Xt = Xt[:max_rows]
    try:
        return Xt.toarray()
    except Exception:
        return np.array(Xt)



def ensure_model_repo_exists(model_repo_id: str, token: str):
    """
    Optional helper: create the model repo if it doesn't exist.
    Safe to call; if it exists, it will error -> you can ignore.
    """
    
    # Increase Hub HTTP timeouts (seconds)
    os.environ["HF_HUB_HTTP_TIMEOUT"] = "300"   # 5 minutes
    os.environ["HF_HUB_ETAG_TIMEOUT"] = "300"

    api = HfApi(token=token)
    try:
        api.create_repo(repo_id=model_repo_id, repo_type="model", private=False, exist_ok=True)
    except Exception:
        pass

def coerce_X_like_schema(X: pd.DataFrame, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame:
    """
    Robustly align an inference dataframe to the training schema:
    - normalizes column names (spaces/newlines/underscores/case)
    - renames matching columns to the model's exact feature names
    - creates missing columns as NaN (so prediction can still run)
    """
    X = X.copy()

    # Build normalized lookup from inference file columns -> actual column name in inference
    inf_lookup = {norm_col(c): c for c in X.columns}

    # Build output with exact training columns
    X_out = pd.DataFrame(index=X.index)

    missing = []
    for col in feature_cols:
        k = norm_col(col)
        if k in inf_lookup:
            X_out[col] = X[inf_lookup[k]]
        else:
            X_out[col] = np.nan
            missing.append(col)

    # Optional: show missing columns (don’t hard fail)
    if missing:
        st.warning(
            "Inference file is missing some training columns (filled as blank/NaN): "
            + ", ".join(missing[:12]) + (" ..." if len(missing) > 12 else "")
        )

    # Coercions (same as your existing logic)
    X_out = X_out.replace({pd.NA: np.nan})

    for c in num_cols:
        if c in X_out.columns:
            X_out[c] = pd.to_numeric(X_out[c], errors="coerce")

    for c in cat_cols:
        if c in X_out.columns:
            X_out[c] = X_out[c].astype("object")
            X_out.loc[X_out[c].isna(), c] = np.nan
            X_out[c] = X_out[c].map(lambda v: v if pd.isna(v) else str(v))

    return X_out



def get_shap_background_auto(model_repo_id: str, feature_cols: list[str], num_cols: list[str], cat_cols: list[str]) -> pd.DataFrame | None:
    """
    Attempts to load SHAP background from HF repo. Returns coerced background or None.
    """
    df_bg = load_latest_background(model_repo_id)
    if df_bg is None:
        return None

    # Ensure required columns exist
    missing = [c for c in feature_cols if c not in df_bg.columns]
    if missing:
        return None

    return coerce_X_like_schema(df_bg, feature_cols, num_cols, cat_cols)



# ============================================================
# SHAP background persistence (best practice)
# ============================================================

def save_background_sample_csv(X_bg: pd.DataFrame, feature_cols: list[str], max_rows: int = 200, out_path: str = "background.csv"):
    """
    Saves a small *raw* background dataset (pre-transform) for SHAP explainer.
    Must contain columns exactly matching feature_cols.
    """
    if X_bg is None or len(X_bg) == 0:
        raise ValueError("X_bg is empty; cannot save background sample.")

    X_bg = X_bg[feature_cols].copy()

    if len(X_bg) > int(max_rows):
        X_bg = X_bg.sample(int(max_rows), random_state=42)

    # Preserve exact columns for future loading
    X_bg.to_csv(out_path, index=False, encoding="utf-8")
    return out_path


def publish_background_to_hub(model_repo_id: str, version_tag: str, background_path: str = "background.csv"):
    """
    Uploads background.csv to both versioned and latest paths.
    Requires HF_TOKEN with write permissions.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError("HF_TOKEN not found. Add it in Space Settings → Secrets.")
    api = HfApi(token=token)

    version_bg_path = f"releases/{version_tag}/background.csv"

    # Versioned
    api.upload_file(
        path_or_fileobj=background_path,
        path_in_repo=version_bg_path,
        repo_id=model_repo_id,
        repo_type="model",
        commit_message=f"Upload SHAP background ({version_tag})"
    )

    # Latest
    api.upload_file(
        path_or_fileobj=background_path,
        path_in_repo="latest/background.csv",
        repo_id=model_repo_id,
        repo_type="model",
        commit_message=f"Update latest SHAP background ({version_tag})"
    )

    return {
        "version_bg_path": version_bg_path,
        "latest_bg_path": "latest/background.csv",
    }


def load_latest_background(model_repo_id: str) -> pd.DataFrame | None:
    """
    Loads latest/background.csv if present. Returns None if not found / cannot load.
    """
    try:
        bg_file = hf_hub_download(
            repo_id=model_repo_id,
            repo_type="model",
            filename="latest/background.csv",
        )
        df_bg = pd.read_csv(bg_file)
        return df_bg
    except Exception:
        return None


def load_background_by_version(model_repo_id: str, version_tag: str) -> pd.DataFrame | None:
    """
    Loads releases/<version>/background.csv if present.
    """
    try:
        bg_file = hf_hub_download(
            repo_id=model_repo_id,
            repo_type="model",
            filename=f"releases/{version_tag}/background.csv",
        )
        df_bg = pd.read_csv(bg_file)
        return df_bg
    except Exception:
        return None


def publish_to_hub(model_repo_id: str, version_tag: str):
    """
    Uploads model.joblib and meta.json to a Hugging Face *model* repository
    under a versioned folder and also updates a 'latest/' copy.
    Requires Space Secret: HF_TOKEN.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError("HF_TOKEN not found. Add it in Space Settings → Secrets.")

    api = HfApi(token=token)

    # Versioned paths
    version_model_path = f"releases/{version_tag}/model.joblib"
    version_meta_path  = f"releases/{version_tag}/meta.json"

    # Upload versioned artifacts
    api.upload_file(
        path_or_fileobj="model.joblib",
        path_in_repo=version_model_path,
        repo_id=model_repo_id,
        repo_type="model",
        commit_message=f"Upload model artifacts ({version_tag})"
    )
    api.upload_file(
        path_or_fileobj="meta.json",
        path_in_repo=version_meta_path,
        repo_id=model_repo_id,
        repo_type="model",
        commit_message=f"Upload metadata ({version_tag})"
    )

    # Also maintain a 'latest/' copy for easy loading
    api.upload_file(
        path_or_fileobj="model.joblib",
        path_in_repo="latest/model.joblib",
        repo_id=model_repo_id,
        repo_type="model",
        commit_message=f"Update latest model ({version_tag})"
    )
    api.upload_file(
        path_or_fileobj="meta.json",
        path_in_repo="latest/meta.json",
        repo_id=model_repo_id,
        repo_type="model",
        commit_message=f"Update latest metadata ({version_tag})"
    )

    # Optional: upload survival model if present
    if os.path.exists("survival_bundle.joblib"):
        api.upload_file(
            path_or_fileobj="survival_bundle.joblib",
            path_in_repo=f"releases/{version_tag}/survival_bundle.joblib",
            repo_id=model_repo_id,
            repo_type="model",
            commit_message=f"Upload survival model ({version_tag})"
        )
        api.upload_file(
            path_or_fileobj="survival_bundle.joblib",
            path_in_repo="latest/survival_bundle.joblib",
            repo_id=model_repo_id,
            repo_type="model",
            commit_message=f"Update latest survival model ({version_tag})"
        )

    files = api.list_repo_files(repo_id=model_repo_id, repo_type="model")

    st.write([f for f in files if "survival" in f])

    return {
        "version_model_path": version_model_path,
        "version_meta_path": version_meta_path,
        "latest_model_path": "latest/model.joblib",
        "latest_meta_path": "latest/meta.json",
    }

MODEL_REPO_ID = "Synav/Explainable-Acute-Leukemia-Mortality-Predictor"

def list_release_versions(model_repo_id: str):
    """
    Returns sorted version tags found under releases/<version>/model.joblib in the model repo.
    """
    api = HfApi(token=os.environ.get("HF_TOKEN") or None)
    files = api.list_repo_files(repo_id=model_repo_id, repo_type="model")

    versions = set()
    for f in files:
        # We only care about releases/<version>/model.joblib
        if f.startswith("releases/") and f.endswith("/model.joblib"):
            parts = f.split("/")
            if len(parts) >= 3:
                versions.add(parts[1])

    # Most users want newest first (timestamp tags sort lexicographically)
    return sorted(versions, reverse=True)


def load_model_by_version(model_repo_id: str, version_tag: str):
    """
    Loads a specific version from releases/<version_tag>/model.joblib and meta.json
    """
    model_file = hf_hub_download(
        repo_id=model_repo_id,
        repo_type="model",
        filename=f"releases/{version_tag}/model.joblib",
    )
    meta_file = hf_hub_download(
        repo_id=model_repo_id,
        repo_type="model",
        filename=f"releases/{version_tag}/meta.json",
    )

    pipe = joblib.load(model_file)
    with open(meta_file, "r", encoding="utf-8") as f:
        meta = json.load(f)

    return pipe, meta
    
def load_latest_survival_model(model_repo_id: str):
    f = hf_hub_download(
        repo_id=model_repo_id,
        repo_type="model",
        filename="latest/survival_bundle.joblib",
    )
    return joblib.load(f)

def load_survival_model_by_version(model_repo_id: str, version_tag: str):
    f = hf_hub_download(
        repo_id=model_repo_id,
        repo_type="model",
        filename=f"releases/{version_tag}/survival_bundle.joblib",
    )
    return joblib.load(f)

def load_latest_survival_bundle(model_repo_id: str):
    f = hf_hub_download(repo_id=model_repo_id, repo_type="model", filename="latest/survival_bundle.joblib")
    return joblib.load(f)


def load_latest_model(model_repo_id: str):
    """
    Loads latest/model.joblib and latest/meta.json
    """
    model_file = hf_hub_download(
        repo_id=model_repo_id,
        repo_type="model",
        filename="latest/model.joblib",
    )
    meta_file = hf_hub_download(
        repo_id=model_repo_id,
        repo_type="model",
        filename="latest/meta.json",
    )

    pipe = joblib.load(model_file)
    with open(meta_file, "r", encoding="utf-8") as f:
        meta = json.load(f)

    return pipe, meta

def get_post_selection_feature_names(pipe) -> list[str]:
    """
    Returns feature names aligned to the exact columns seen by the final classifier.
    Works when SVD is OFF.
    Handles:
      preprocess -> VarianceThreshold -> SelectFromModel -> clf

    If some steps are missing, it degrades gracefully.
    """
    pre = pipe.named_steps.get("preprocess", None)
    if pre is None:
        raise ValueError("Pipeline missing 'preprocess' step.")

    # Start with preprocessed (post-onehot) names
    try:
        names = list(pre.get_feature_names_out())
    except Exception:
        # Fallback, but you should almost never hit this
        names = [f"f{i}" for i in range(pipe.named_steps["clf"].coef_.shape[1])]

    # Apply VarianceThreshold support mask (if present)
    vt = pipe.named_steps.get("vt", None)
    if vt is not None and hasattr(vt, "get_support"):
        vt_mask = vt.get_support()
        names = [n for n, keep in zip(names, vt_mask) if keep]

    # If SVD exists, we cannot map back to original one-hot features
    # because the feature space becomes components.
    if "svd" in pipe.named_steps:
        # Return component names for correctness
        svd = pipe.named_steps["svd"]
        k = getattr(svd, "n_components", None) or len(names)
        return [f"SVD_component_{i+1}" for i in range(int(k))]

    # Apply SelectFromModel mask (if present)
    sel = pipe.named_steps.get("select", None)
    if sel is not None and hasattr(sel, "get_support"):
        sel_mask = sel.get_support()
        names = [n for n, keep in zip(names, sel_mask) if keep]

    return names

def transform_before_clf(pipe, X):
    """
    Transform X through all pipeline steps BEFORE the classifier.
    Ensures SHAP sees the exact columns the clf sees.
    """
    Xt = X
    for name, step in pipe.steps:
        if name == "clf":
            break
        if hasattr(step, "transform"):
            Xt = step.transform(Xt)
    return Xt

def get_final_feature_names(pipe) -> list[str]:
    """
    Return feature names aligned with the exact columns seen by the classifier.
    Handles:
      preprocess -> vt -> (svd?) -> (select?) -> clf
    """
    pre = pipe.named_steps.get("preprocess", None)
    if pre is None:
        raise ValueError("Pipeline missing 'preprocess' step.")

    # Start: post-onehot names
    try:
        names = list(pre.get_feature_names_out())
    except Exception:
        names = None

    # Apply vt mask if we have names
    vt = pipe.named_steps.get("vt", None)
    if names is not None and vt is not None and hasattr(vt, "get_support"):
        vt_mask = vt.get_support()
        names = [n for n, keep in zip(names, vt_mask) if keep]

    # If SVD exists: feature space becomes components; names must be components
    if "svd" in pipe.named_steps:
        svd = pipe.named_steps["svd"]
        k = int(getattr(svd, "n_components", 0) or 0)
        if k <= 0:
            # fallback if unknown
            k = 0
        return [f"SVD_component_{i+1}" for i in range(k)]

    # Apply select mask (post-vt) if we have names
    sel = pipe.named_steps.get("select", None)
    if names is not None and sel is not None and hasattr(sel, "get_support"):
        sel_mask = sel.get_support()
        names = [n for n, keep in zip(names, sel_mask) if keep]

    # Fallback: if we couldn't get names, return generic based on clf coef size
    if names is None:
        n = pipe.named_steps["clf"].coef_.shape[1]
        names = [f"f{i}" for i in range(n)]

    return names
def options_for(col: str, df: pd.DataFrame | None):
    base = DEFAULT_OPTIONS.get(col, [])
    excel = []
    if df is not None and col in df.columns:
        excel = (
            df[col].dropna().astype(str).map(lambda x: x.strip())
            .loc[lambda s: s != ""].unique().tolist()
        )
    # keep order: defaults first, then any extra from excel
    out = []
    for v in base + sorted(set(excel)):
        if v not in out:
            out.append(v)
    return [""] + out

import re


# Canonical region labels you can use for analysis
# (UN-style: Africa, Americas, Asia, Europe, Oceania; you can later refine into subregions)
REGION_UNKNOWN = "Unknown"

# Normalization: your Excel has values like "Emarati", "FILIPINO", "Indian ", etc.
NATIONALITY_ALIASES = {
    # Nationality adjectives / common variants -> canonical country name
    "EMARATI": "United Arab Emirates",
    "EMIRATI": "United Arab Emirates",
    "UAE": "United Arab Emirates",

    "FILIPINO": "Philippines",
    "PALESTINIAN": "Palestine",
    "SYRIAN": "Syria",
    "LEBANESE": "Lebanon",
    "JORDANIAN": "Jordan",
    "YEMENI": "Yemen",
    "EGYPTIAN": "Egypt",
    "SUDANESE": "Sudan",
    "ETHIOPIAN": "Ethiopia",
    "ERITREAN": "Eritrea",
    "SOMALI": "Somalia",
    "KENYAN": "Kenya",
    "UGANDAN": "Uganda",
    "GUINEAN": "Guinea",
    "MOROCCAN": "Morocco",
    "COMORAN": "Comoros",

    "INDIAN": "India",
    "PAKISTANI": "Pakistan",
    "BANGLADESH": "Bangladesh",
    "NEPALESE": "Nepal",
    "INDONESIAN": "Indonesia",
    "MALAYSIAN": "Malaysia",

    "AMERICAN": "United States",
    "USA": "United States",
    "U.S.A": "United States",
    "UNITED STATES OF AMERICA": "United States",
}

def normalize_country_name(x: str) -> str | None:
    """Convert nationality-like strings to a canonical country name when possible."""
    if x is None:
        return None
    s = str(x).strip()
    if not s:
        return None

    s_up = re.sub(r"\s+", " ", s).strip().upper()

    # map adjectives/variants
    if s_up in NATIONALITY_ALIASES:
        return NATIONALITY_ALIASES[s_up]

    # If someone already entered a country name, keep it
    # country_converter can handle many variants; pass through as-is
    return s.strip()

from typing import Optional

def country_to_region(country: str | None) -> str:
    """
    Lazy-import country_converter to reduce startup memory.
    Returns one of: Africa, Americas, Asia, Europe, Oceania, Unknown
    """
    if not country:
        return REGION_UNKNOWN

    import country_converter as coco  # lazy

    r = coco.convert(names=country, to="continent")
    if not r or str(r).lower() in ("not found", "nan", "none"):
        return REGION_UNKNOWN

    if r == "America":
        return "Americas"
    return str(r)



def add_ethnicity_region(df: pd.DataFrame, eth_col: str = "Ethnicity", out_col: str = "Ethnicity_Region") -> pd.DataFrame:
    if eth_col not in df.columns:
        df[out_col] = REGION_UNKNOWN
        return df

    norm = df[eth_col].map(normalize_country_name)
    df[out_col] = norm.map(country_to_region)
    return df



# ============================================================
# Streamlit UI
# ============================================================
def is_admin() -> bool:
    """
    Admin gating for sensitive actions (Train + Publish).
    Requires Space secret ADMIN_KEY.
    """
    secret = os.environ.get("ADMIN_KEY", "")
    if not secret:
        return False
    entered = st.session_state.get("admin_key", "")
    return hmac.compare_digest(str(entered), str(secret))
st.set_page_config(page_title="Explainable-Acute-Leukemia-Mortality-Predictor", layout="wide")
# ---------------- Plot settings (global: Train + Predict) ----------------
with st.sidebar:
    st.markdown("### Plot settings")
    plot_width = st.slider("Plot width (inches)", 4.0, 10.0, 5.5, 0.1)
    plot_height = st.slider("Plot height (inches)", 2.5, 6.0, 3.6, 0.1)
    plot_dpi_screen = st.slider("Screen DPI", 80, 200, 120, 10)
    export_dpi = st.selectbox("Export DPI (PNG)", [300, 600, 900, 1200], index=1)

FIGSIZE = (plot_width, plot_height)


st.title("Explainable-Acute-Leukemia-Mortality-Predictor")
st.caption("Explainable clinical AI for mortality and outcome prediction in acute leukemia using SHAP-interpretable models")

with st.expander("About this AI model, who can use it, and required Excel format", expanded=True):
    st.markdown("""
    ### Quick Start (Single Patient)
    
    1. Open **Predict + SHAP**  
    2. **Load model** → select *latest* (default) 
    3. Enter patient details (Core → Clinical → FISH → NGS)  
    4. Click **Predict single patient**
    
    You will get:
    • Mortality (os) probability  
    • Risk category  
    • 6-month, 1-year, 3-year survival with PLOT
    • SHAP explanation
    
    Edit values to instantly test different scenarios.
    
    *For research/decision-support only — not a substitute for clinical judgment.*
    """)


st.warning(
    "Prediction will fail if feature names or variable types "
    "do not exactly match the trained model schema."
)


# 🔐 Admin controls come AFTER schema explanation
with st.expander("Admin controls", expanded=False):
    st.text_input("Admin key", type="password", key="admin_key")
    st.caption("Training and publishing are enabled only after admin authentication.")
    


tab_train, tab_predict = st.tabs(["1️⃣ Train", "2️⃣ Predict + SHAP"])

if "pipe" not in st.session_state:
    st.session_state.pipe = None
if "explainer" not in st.session_state:
    st.session_state.explainer = None

    
# ---------------- TRAIN ----------------
with tab_train:
    st.subheader("Train model")

    if not is_admin():
        st.info("Training and publishing are restricted. Use Predict + SHAP for inference.")
    else:
        st.markdown("### Feature reduction options")

        use_feature_selection = st.checkbox(
            "Drop columns that do not affect prediction (L1 feature selection)",
            value=True,
            key="train_use_feature_selection"
        )
        l1_C = st.slider(
            "L1 selection strength (lower = fewer features)",
            0.01, 10.0, 1.0, 0.01
        ) if use_feature_selection else 1.0

        use_dimred = st.checkbox(
            "Dimensionality reduction (TruncatedSVD) — reduces interpretability",
            value=False
        )

        svd_components = st.slider(
            "SVD components (only used if enabled)",
            5, 300, 50, 5
        ) if use_dimred else 50

        st.divider()

        # then keep your file uploader + training button + publish block here
        
        st.markdown("### Bootstrap internal validation (optional)")

        do_bootstrap = st.checkbox(
            "Enable bootstrapping (OOB internal validation)",
            value=False,
            key="train_bootstrap_enable"
        )
        
        n_boot = st.slider(
            "Bootstrap iterations",
            50, 2000, 1000, 50,
            key="train_bootstrap_n"
        ) if do_bootstrap else 0
        
        boot_thr = st.slider(
            "Bootstrap classification threshold",
            0.0, 1.0, 0.5, 0.01,
            key="train_bootstrap_thr"
        ) if do_bootstrap else 0.5

        #---------------------
    
        train_file = st.file_uploader("Upload training Excel (.xlsx)", type=["xlsx"])
    
        if train_file is None:
           st.info("Upload a training Excel file to enable training.")
        else: 
    
            df = pd.read_excel(train_file, engine="openpyxl")
            df.columns = [c.strip() for c in df.columns]
            feature_cols = get_feature_cols_from_df(df)
        
            st.dataframe(df.head(), use_container_width=True)
            feature_cols = get_feature_cols_from_df(df)
        
            st.markdown("### Choose variable types (saved into the model)")
            default_numeric = feature_cols[:13]  # initial suggestion
            num_cols = st.multiselect(
            "Numeric variables (will be median-imputed + scaled)",
            options=feature_cols,
            default=default_numeric
            )       
                                    
            # Everything not selected as numeric becomes categorical
            cat_cols = [c for c in feature_cols if c not in num_cols]
                    
            st.write(f"Categorical variables (will be most-frequent-imputed + one-hot): {len(cat_cols)}")
            st.caption("Note: The selected schema is stored with the trained model and must match inference files.")
        
            st.markdown("### Evaluation settings")
            n_bins = st.slider("Calibration bins", 5, 20, 10, 1)
            cal_strategy = st.selectbox("Calibration binning strategy", ["uniform", "quantile"], index=0)
                    
            dca_points = st.slider("Decision curve points", 25, 200, 99, 1)           
                
    
    
            if st.button("Train model"):
                with st.spinner("Training model..."):
                    pipe, meta, X_train, y_test, proba = train_and_save(
                    df, feature_cols, num_cols, cat_cols,
                    n_bins=n_bins, cal_strategy=cal_strategy, dca_points=dca_points,
                    use_feature_selection=use_feature_selection,
                    l1_C=l1_C,
                    use_dimred=use_dimred,
                    svd_components=svd_components
                    )
                # --- Save background sample for SHAP (raw X_train) ---
                try:
                    save_background_sample_csv(
                        X_bg=X_train,
                        feature_cols=feature_cols,
                        max_rows=200,
                        out_path="background.csv"
                    )
                    st.success("Saved SHAP background sample (background.csv).")
                except Exception as e:
                    st.warning(f"Could not save SHAP background sample: {e}")
                st.write("Local survival bundle exists:", os.path.exists("survival_bundle.joblib"))
                if os.path.exists("survival_bundle.joblib"):
                    st.write("Local survival bundle size (bytes):", os.path.getsize("survival_bundle.joblib"))  
                    
                explainer = build_shap_explainer(pipe, X_train)

                st.session_state.pipe = pipe
                st.session_state.explainer = explainer
                st.session_state.meta = meta

                st.success("Training complete. model.joblib and meta.json created.")

                
                st.divider()
                st.subheader("Training performance (test split)")
                
                m = meta["metrics"]
                
                # Show key metrics at threshold 0.5
                c1, c2, c3, c4 = st.columns(4)
                c1.metric("ROC AUC", f"{m['roc_auc']:.3f}")
                c2.metric("Sensitivity (best F1 thr)", f"{m['sensitivity@best']:.3f}")
                c3.metric("Specificity (best F1 thr)", f"{m['specificity@best']:.3f}")
                c4.metric("F1 (best)", f"{m['f1@best']:.3f}")
                st.caption(f"Best threshold (max F1): {m['best_threshold']:.2f}")
    
                    
                c5, c6, c7, c8 = st.columns(4)
                c5.metric("Precision", f"{m['precision@0.5']:.3f}")
                c6.metric("Accuracy", f"{m['accuracy@0.5']:.3f}")
                c7.metric("Balanced Acc", f"{m['balanced_accuracy@0.5']:.3f}")
                c8.metric("Test N", m["n_test"])


                if do_bootstrap:
                    st.divider()
                    st.subheader(f"Bootstrap internal validation (n={n_boot}, OOB)")
                
                    with st.spinner("Running bootstrap..."):
                        df_boot, boot_summary, n_skipped = bootstrap_internal_validation(
                            df=df,
                            feature_cols=feature_cols,
                            num_cols=num_cols,
                            cat_cols=cat_cols,
                            n_boot=int(n_boot),
                            threshold=float(boot_thr),
                            use_feature_selection=use_feature_selection,
                            l1_C=l1_C,
                            use_dimred=use_dimred,
                            svd_components=svd_components,
                            random_state=42,
                        )

                    st.caption(f"Completed: {len(df_boot)} iterations. Skipped: {n_skipped} (empty/single-class OOB or fitting issues).")
                
                    # Display summary
                    st.json(boot_summary)
                
                    # Show per-iteration table (optional)
                    st.dataframe(df_boot.head(30), use_container_width=True)
                
                    # Download full bootstrap results
                    st.download_button(
                        "Download bootstrap results (CSV)",
                        df_boot.to_csv(index=False).encode("utf-8"),
                        file_name=f"bootstrap_oob_{n_boot}.csv",
                        mime="text/csv",
                        key="dl_bootstrap_csv"
                    )
                
                    # Save into meta.json so it persists with the model
                    st.session_state.meta.setdefault("bootstrap", {})
                    st.session_state.meta["bootstrap"] = {
                        "method": "bootstrap_oob",
                        "n_boot": int(n_boot),
                        "threshold": float(boot_thr),
                        "skipped": int(n_skipped),
                        "summary": boot_summary,
                    }

                    meta = st.session_state.meta
                    with open("meta.json", "w", encoding="utf-8") as f:
                        json.dump(meta, f, indent=2)

                
                # Confusion matrix display
                cm = m["confusion_matrix@0.5"]
                cm_df = pd.DataFrame(
                    [[cm["tn"], cm["fp"]], [cm["fn"], cm["tp"]]],
                    index=["Actual 0", "Actual 1"],
                    columns=["Pred 0", "Pred 1"]
                )
                st.markdown("**Confusion Matrix (threshold = 0.5)**")
                st.dataframe(cm_df)
        
                st.json(meta["survival"])


                
                
                # TRAINING: ROC curve plot
                # =========================
                roc = m["roc_curve"]
                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
                ax.plot(roc["fpr"], roc["tpr"])
                ax.plot([0, 1], [0, 1])
                ax.set_xlabel("False Positive Rate (1 - Specificity)")
                ax.set_ylabel("True Positive Rate (Sensitivity)")
                ax.set_title(f"ROC Curve (AUC = {m['roc_auc']:.3f})")
                    
                render_plot_with_download(
                    fig,
                    title="ROC curve",
                    filename="roc_curve.png",
                    export_dpi=export_dpi,
                    key="dl_train_roc"
                )


                #Precision recall curve
                # =========================
                # TRAINING: PR curve plot
                # =========================
                pr = m["pr_curve"]
                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
                ax.plot(pr["recall"], pr["precision"])
                ax.set_xlabel("Recall")
                ax.set_ylabel("Precision")
                ax.set_title(f"PR Curve (AP = {pr['average_precision']:.3f})")
            
                render_plot_with_download(
                    fig,
                    title="PR curve",
                    filename="pr_curve.png",
                    export_dpi=export_dpi,
                    key="dl_train_pr"
                )
        
    
                    
                #Calibration plot
                # =========================
                # TRAINING: Calibration plot
                # =========================
                cal = m["calibration"]
                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
                ax.plot(cal["prob_pred"], cal["prob_true"])
                ax.plot([0, 1], [0, 1])
                ax.set_xlabel("Mean predicted probability")
                ax.set_ylabel("Observed event rate")
                ax.set_title("Calibration curve")
                
                render_plot_with_download(
                    fig,
                    title="Calibration curve",
                    filename="calibration_curve.png",
                    export_dpi=export_dpi,
                    key="dl_train_cal"
                )



                #Decision curve
                # =========================
                # TRAINING: Decision curve analysis plot
                # =========================
                dca = m["decision_curve"]
                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
                ax.plot(dca["thresholds"], dca["net_benefit_model"], label="Model")
                ax.plot(dca["thresholds"], dca["net_benefit_all"], label="Treat all")
                ax.plot(dca["thresholds"], dca["net_benefit_none"], label="Treat none")
                ax.set_xlabel("Threshold probability")
                ax.set_ylabel("Net benefit")
                ax.set_title("Decision curve analysis")
                ax.legend()
                    
                render_plot_with_download(
                    fig,
                    title="Decision curve",
                    filename="decision_curve.png",
                    export_dpi=export_dpi,
                    key="dl_train_dca"
                )

                
                st.caption(
                    "If the model curve is above Treat-all and Treat-none across a threshold range, "
                    "the model provides net clinical benefit in that range."
                )



        
                st.divider()
                st.subheader("Threshold analysis")
                
                thr = st.slider("Decision threshold", 0.0, 1.0, 0.5, 0.01)
                    
                # Recompute threshold-based metrics quickly using stored probabilities
                # You need y_test and proba in scope. Easiest is to store them in session_state during training.
                st.session_state.y_test_last = y_test
                st.session_state.proba_last = proba
                if "y_test_last" in st.session_state and "proba_last" in st.session_state:
                    cls = compute_classification_metrics(st.session_state.y_test_last, st.session_state.proba_last, threshold=thr)
                    st.write({
                        "Sensitivity": cls["sensitivity"],
                        "Specificity": cls["specificity"],
                        "Precision": cls["precision"],
                        "Recall": cls["recall"],
                        "F1": cls["f1"],
                        "Accuracy": cls["accuracy"],
                        "Balanced Accuracy": cls["balanced_accuracy"],
                    })





# ---------------- PUBLISH (only after training) ----------------

    
            # ---------------- PUBLISH (only after training) ----------------
            if st.session_state.get("pipe") is not None:
                st.divider()
                st.subheader("Publish trained model to Hugging Face Hub")
            
                default_version = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
                version_tag = st.text_input(
                    "Version tag",
                    value=default_version,
                    help="Used as releases/<version>/ in the model repository",
                )
            
                if st.button("Publish model.joblib + meta.json to Model Repo", key="publish_btn"):
                    try:
                        with st.spinner("Uploading to Hugging Face Model repo..."):
                            paths = publish_to_hub(MODEL_REPO_ID, version_tag)
                            # Upload background.csv if it exists
                            if os.path.exists("background.csv"):
                                bg_paths = publish_background_to_hub(MODEL_REPO_ID, version_tag, background_path="background.csv")
                                paths.update(bg_paths)
                            else:
                                st.warning("background.csv not found; SHAP background will not be uploaded.")
            
                        st.success("Uploaded successfully to your model repository.")
                        st.json(paths)
                    except Exception as e:
                        st.error(f"Upload failed: {e}")




# ---------------- PREDICT ----------------
PRED_N_BINS = 10
PRED_CAL_STRATEGY = "uniform"



with tab_predict:
    st.subheader("Select a trained model (no retraining required)")

    MODEL_REPO_ID = "Synav/Explainable-Acute-Leukemia-Mortality-Predictor"

    # Ensure session state keys exist
    if "pipe" not in st.session_state:
        st.session_state.pipe = None
    if "meta" not in st.session_state:
        st.session_state.meta = None
    if "explainer" not in st.session_state:
        st.session_state.explainer = None

    # List available releases
    try:
        versions = list_release_versions(MODEL_REPO_ID)
    except Exception as e:
        versions = []
        st.error(f"Could not list model versions: {e}")

    choices = ["latest"] + versions if versions else ["latest"]
    selected = st.selectbox("Choose model version", choices, index=0)

    if st.button("Load selected model"):
        try:
            with st.spinner("Loading model from Hugging Face Hub..."):
                if selected == "latest":
                    pipe, meta = load_latest_model(MODEL_REPO_ID)
                else:
                    pipe, meta = load_model_by_version(MODEL_REPO_ID, selected)

            st.session_state.pipe = pipe
            st.session_state.meta = meta
            st.session_state.explainer = None  # rebuild later with inference data
            st.success(f"Loaded model: {selected}")
        except Exception as e:
            st.error(f"Load failed: {e}")

    st.divider()
    if st.session_state.get("pipe") is None or st.session_state.get("meta") is None:
        st.warning("Load a model version above (it must include meta.json), then continue.")
        st.stop()
        

    pipe = st.session_state.pipe
    meta = st.session_state.meta

    # Try load survival model (optional)
    try:
        if selected == "latest":
            st.session_state.surv_model = load_latest_survival_model(MODEL_REPO_ID)
        else:
            st.session_state.surv_model = load_survival_model_by_version(MODEL_REPO_ID, selected)
    except Exception as e:
        st.session_state.surv_model = None
        st.warning(f"Survival bundle load failed: {e}")
    # bundle = st.session_state.get("surv_model", None)
    # bundle = st.session_state.get("surv_model", None)

    # st.write("surv_model type:", type(bundle))
    # st.write("surv_model keys:", list(bundle.keys()) if isinstance(bundle, dict) else None)
    
    # if isinstance(bundle, dict) and bundle.get("model") is not None:
    #     st.write("Cox predictors:", len(bundle.get("columns", [])))
    #     st.write("Cox cat cols:", len(bundle.get("cat_cols", [])))
    #     st.write("Cox num cols:", len(bundle.get("num_cols", [])))
    #     st.write("Bundle version:", bundle.get("version"))
    #     cph = bundle["model"]
    #     st.write("Cox coef shape:", getattr(cph, "params_", None).shape if hasattr(cph, "params_") else None)
    # else:
    #     cph = None
    DEBUG_SURV = False
    
    if DEBUG_SURV:
        st.write("Survival bundle loaded:", isinstance(bundle, dict))



    # 1) MUST come first: schema from meta
    feature_cols = meta["schema"]["features"]
    num_cols = meta["schema"]["numeric"]
    cat_cols = meta["schema"]["categorical"]

    # ------------------------------------------------------------
    # SHAP background: prefer inference file, else HF background.csv
    # ------------------------------------------------------------
    df_inf = st.session_state.get("df_inf")
    
    if df_inf is not None:
        # use user cohort as background (optional)
        X_bg = coerce_X_like_schema(df_inf, feature_cols, num_cols, cat_cols)
    else:
        # fall back to published background
        X_bg = get_shap_background_auto(MODEL_REPO_ID, feature_cols, num_cols, cat_cols)
    
    st.session_state.X_bg_for_shap = X_bg


    # 2) Now we can build lookup
    FEATURE_LOOKUP = {norm_col(c): c for c in feature_cols}

    

    from datetime import date

    MIN_DOB = date(1900, 1, 1)

    ALL_COUNTRIES = [
        "Afghanistan","Albania","Algeria","Andorra","Angola","Antigua and Barbuda",
        "Argentina","Armenia","Australia","Austria","Azerbaijan","Bahamas","Bahrain",
        "Bangladesh","Barbados","Belarus","Belgium","Belize","Benin","Bhutan","Bolivia",
        "Bosnia and Herzegovina","Botswana","Brazil","Brunei","Bulgaria","Burkina Faso",
        "Burundi","Cabo Verde","Cambodia","Cameroon","Canada","Central African Republic",
        "Chad","Chile","China","Colombia","Comoros","Congo (Congo-Brazzaville)",
        "Costa Rica","Cote d’Ivoire","Croatia","Cuba","Cyprus","Czechia",
        "Democratic Republic of the Congo","Denmark","Djibouti","Dominica",
        "Dominican Republic","Ecuador","Egypt","El Salvador","Equatorial Guinea",
        "Eritrea","Estonia","Eswatini","Ethiopia","Fiji","Finland","France","Gabon",
        "Gambia","Georgia","Germany","Ghana","Greece","Grenada","Guatemala","Guinea",
        "Guinea-Bissau","Guyana","Haiti","Honduras","Hungary","Iceland","India",
        "Indonesia","Iran","Iraq","Ireland","Israel","Italy","Jamaica","Japan","Jordan",
        "Kazakhstan","Kenya","Kiribati","Kuwait","Kyrgyzstan","Laos","Latvia","Lebanon",
        "Lesotho","Liberia","Libya","Liechtenstein","Lithuania","Luxembourg",
        "Madagascar","Malawi","Malaysia","Maldives","Mali","Malta","Marshall Islands",
        "Mauritania","Mauritius","Mexico","Micronesia","Moldova","Monaco","Mongolia",
        "Montenegro","Morocco","Mozambique","Myanmar","Namibia","Nauru","Nepal",
        "Netherlands","New Zealand","Nicaragua","Niger","Nigeria","North Korea",
        "North Macedonia","Norway","Oman","Pakistan","Palau","Palestine","Panama",
        "Papua New Guinea","Paraguay","Peru","Philippines","Poland","Portugal","Qatar",
        "Romania","Russia","Rwanda","Saint Kitts and Nevis","Saint Lucia",
        "Saint Vincent and the Grenadines","Samoa","San Marino","Sao Tome and Principe",
        "Saudi Arabia","Senegal","Serbia","Seychelles","Sierra Leone","Singapore",
        "Slovakia","Slovenia","Solomon Islands","Somalia","South Africa","South Korea",
        "South Sudan","Spain","Sri Lanka","Sudan","Suriname","Sweden","Switzerland",
        "Syria","Taiwan","Tajikistan","Tanzania","Thailand","Timor-Leste","Togo","Tonga",
        "Trinidad and Tobago","Tunisia","Turkey","Turkmenistan","Tuvalu","Uganda",
        "Ukraine","United Arab Emirates","United Kingdom","United States","Uruguay",
        "Uzbekistan","Vanuatu","Vatican City","Venezuela","Vietnam","Yemen","Zambia",
        "Zimbabwe",
        "Other","Unknown"
    ]

    
    
    
    DEFAULT_OPTIONS = {
    
        # ---------------- Demographics ----------------
        "Gender": ["Male", "Female"],
    
        "Ethnicity": ALL_COUNTRIES,

        # ---------------- Disease ----------------
        "Type of Leukemia": [
            "ALL",
            "AML",
            "APL",
            "CML",
            "MPAL",
            "Secondary AML",
            "Other",
            "Unknown"
        ],
    
        "Risk Assesment": [
            "Favorable",
            "Intermediate",
            "Adverse",
            "Unknown"
        ],
    
        "ECOG": [0, 1, 2, 3, 4],
    
        # ---------------- Molecular / Diagnostic ----------------
        "MDx Leukemia Screen 1": [
            "Negative",
            "t(15;17) / PML-RARA",
            "t(9;22) / BCR-ABL1",
            "RUNX1-RUNX1T1",
            "inv(16) / CBFB-MYH11",
            "KMT2A-AFF1",
            "t(1;19) / TCF3-PBX1",
            "t(10;11)",
            "Other",
            "Not done",
            "Unknown"
        ],
    
        "Post-Induction MRD": [
            "Negative",
            "Positive",
            "Pending",
            "Not done",
            "Unknown"
        ],
    
        # ---------------- Treatment ----------------
        "First_Induction": [
            "3+7",
            "3+7+GO",
            "3+7+TKI",
            "3+7+Midostaurin",
            "ATO+ATRA",
            "Aza+Ven",
            "CALGB",
            "CPX-351",
            "Dexa+TKI",
            "FLAG-IDA",
            "HCVAD",
            "HCVAD+TKI",
            "HMA",
            "Hyper-CVAD",
            "Not fit",
            "Pediatric protocol",
            "Other",
            "Unknown"
        ],

        # ---------------- Clinical Yes/No ----------------
        # (stored as 0/1 in your code)
        "Thrombocytopenia": ["No", "Yes"],
        "Bleeding": ["No", "Yes"],
        "B Symptoms": ["No", "Yes"],
        "Lymphadenopathy": ["No", "Yes"],
        "CNS Involvement": ["No", "Yes"],
        "Extramedullary Involvement": ["No", "Yes"],
    }
    
    # Canonical dropdown fields (use your intended names)
    CANON_DROPDOWNS = [
        "Ethnicity",
        "Type of Leukemia",
        "Post-Induction MRD",
        "First_Induction",
        "Risk Assesment",
        "MDx Leukemia Screen 1",
        "ECOG",
        "Gender",
    ]
    
    # Convert canonical -> actual column names present in this loaded model
    DROPDOWN_FIELDS_ACTUAL = []
    for canon in CANON_DROPDOWNS:
        key = norm_col(canon)
        if key in FEATURE_LOOKUP:
            DROPDOWN_FIELDS_ACTUAL.append(FEATURE_LOOKUP[key])
    def alias_default_options(canon_name: str):
        k = norm_col(canon_name)
        if k in FEATURE_LOOKUP:
            actual = FEATURE_LOOKUP[k]
            if canon_name in DEFAULT_OPTIONS and actual not in DEFAULT_OPTIONS:
                DEFAULT_OPTIONS[actual] = DEFAULT_OPTIONS[canon_name]
    
    alias_default_options("Ethnicity")
    alias_default_options("Type of Leukemia")
    alias_default_options("Post-Induction MRD")
    alias_default_options("First_Induction")
    alias_default_options("Risk Assesment")
    alias_default_options("MDx Leukemia Screen 1")

    
    # Fields you want as controlled dropdowns (rather than free text)
    DROPDOWN_FIELDS = [
        "Gender",
        "Ethnicity",
        "Type of Leukemia",
        "ECOG",
        "Risk Assesment",
        "MDx Leukemia Screen 1",
        "Post-Induction MRD",
        "First_Induction",
    ]
    
    # Yes/No (stored as 1/0) fields
    YESNO_FIELDS = [
        "Thrombocytopenia",
        "Bleeding",
        "B Symptoms",
        "Lymphadenopathy",
        "CNS Involvement",
        "Extramedullary Involvement",
    ]
    
    

    
    # Numeric fields with units (show units in the label)
    UNITS = {
        'WBCs on Admission\n ( x10^9/L) ': "x10^9/L",
        "Hb on Admission  (g/L)": "g/L",
        "LDH on Admission (IU/L)": "IU/L",
        "Pre-Induction bone marrow biopsy blasts %": "%",
        "Age (years)": "years",
    }

    # FISH / NGS marker groups (multi-select -> sets multiple columns 1/0)
    FISH_MARKERS = [
        "fish_inv16/cbfb_myh11",
        "fish_t8;21/runx1_runx1t1",
        "fish_t15;17/PML-RARA",
        "fish_KMT2A-MLL/11q23",
        "fish_BCR-ABL1/t9;22",
        "fish_ETV6-RUNX1/t12;21",
        "fish_TCF3_PBX1/t1;19",
        "fish_IGH/14q32rearr",
        "fish_CRLF2 rearr",
        "fish_NUP214/9q34",
        "fish_Other",
        "fish_del7q/monosomy7",
        "fish_TP53/del17p",
        "fish_del13q",
        "fish_del11q",
    ]
    FISH_COUNT_COL = "Number of FISH alteration"
    
    NGS_MARKERS = [
        "ngs_FLT3",
        "ngs_NPM1",
        "ngs_CEBPA",
        "ngs_DNMT3A",
        "ngs_IDH1",
        "ngs_IDH2",
        "ngs_TET2",
        "ngs_TP53",
        "ngs_RUNX1",
        "ngs_Other",
        "ngs_spliceosome",
    ]
    NGS_COUNT_COL = "No. of ngs_mutation"


    AGE_FEATURE = "Age (years)"
    DX_DATE_FEATURE = "Date of 1st Bone Marrow biopsy (Date of Diagnosis)"  # keep exact if your schema has trailing space

    
    # =========================
    # Fields managed by dedicated tabs (do NOT render duplicates in Core tab)
    # =========================
    

    
    
    


    def get_options_from_df(df: pd.DataFrame, col: str) -> list[str]:
        """Dropdown options from uploaded Excel (inference file)."""
        if df is None or col not in df.columns:
            return []
        vals = (
            df[col]
            .dropna()
            .astype(str)
            .map(lambda x: x.strip())
            .loc[lambda s: s != ""]
            .unique()
            .tolist()
        )
        vals = sorted(vals)
        return vals

    # =========================
    # After model is loaded
    # =========================
    pipe = st.session_state.pipe
    meta = st.session_state.meta
    

    

    # Map normalized name -> actual model column name
    FEATURE_LOOKUP = {norm_col(c): c for c in feature_cols}


    TAB_MANAGED_FIELDS = set()

    # Derived/auto fields (managed by DOB/Dx/CR widgets)
    if AGE_FEATURE in feature_cols:
        TAB_MANAGED_FIELDS.add(AGE_FEATURE)
    if DX_DATE_FEATURE in feature_cols:
        TAB_MANAGED_FIELDS.add(DX_DATE_FEATURE)
    if "Date of 1st CR" in feature_cols:
        TAB_MANAGED_FIELDS.add("Date of 1st CR")

    
    # Clinical yes/no handled in Clinical tab
    TAB_MANAGED_FIELDS.update([f for f in YESNO_FIELDS if f in feature_cols])
    
    # FISH handled in FISH tab
    TAB_MANAGED_FIELDS.update([f for f in FISH_MARKERS if f in feature_cols])
    
    # NGS handled in NGS tab
    TAB_MANAGED_FIELDS.update([f for f in NGS_MARKERS if f in feature_cols])

    DROPDOWN_FIELDS = [f for f in DROPDOWN_FIELDS if f not in TAB_MANAGED_FIELDS]
    

    # Optional marker counts: handled automatically
    if FISH_COUNT_COL in feature_cols:
        TAB_MANAGED_FIELDS.add(FISH_COUNT_COL)
    if NGS_COUNT_COL in feature_cols:
        TAB_MANAGED_FIELDS.add(NGS_COUNT_COL)
    
    
    
    
     # ---------------------------
    # Inference Excel (optional) - store in session_state for dropdown options
    # ---------------------------
    infer_file = st.file_uploader("Upload inference Excel (.xlsx) (optional, for dropdown options + batch prediction)", type=["xlsx"], key="infer_xlsx")
    if infer_file:
        st.session_state.df_inf = pd.read_excel(infer_file, engine="openpyxl")
    
        # 🔴 PLACE IT HERE — normalize headers immediately after load
        st.session_state.df_inf.columns = [
            " ".join(str(c).replace("\u00A0"," ").split())
            for c in st.session_state.df_inf.columns
        ]

        # ✅ ALIGN TO TRAINED MODEL SCHEMA (THIS FIXES YOUR ERROR)
        st.session_state.df_inf = align_columns_to_schema(
            st.session_state.df_inf,
            meta["schema"]["features"])
            
        st.session_state.df_inf = add_ethnicity_region(
            st.session_state.df_inf,
            "Ethnicity",
            "Ethnicity_Region"
        )
    else:
        st.session_state.df_inf = None

    
    df_for_options = st.session_state.df_inf
    
    st.divider()
    st.subheader("Enter patient details to get prediction (Example: DOB → Age, Dx date → prediction)")
    
    
    
    from datetime import date

    MIN_DATE = date(1900, 1, 1)
    
    c1, c2 = st.columns(2)
    with c1:
        dob_unknown = st.checkbox("DOB unknown", value=False, key="dob_unknown_chk")
        dob = None
        if not dob_unknown:
            dob = st.date_input("Date of birth (DOB)", min_value=MIN_DATE, max_value=date.today(), key="dob_date")
    
    with c2:
        dx_unknown = st.checkbox("Diagnosis date unknown", value=False, key="dx_unknown_chk")
        dx_date = None
        if not dx_unknown:
            dx_date = st.date_input("Date of 1st Bone Marrow biopsy (Diagnosis)", min_value=MIN_DATE, max_value=date.today(), key="dx_date")
    
    # Optional additional date input for CR1 if present in feature_cols
    cr1_date = None
    if "Date of 1st CR" in feature_cols:
        cr1_unknown = st.checkbox("1st CR date unknown", value=False, key="cr1_unknown")
        if not cr1_unknown:
            cr1_date = st.date_input("Date of 1st CR", min_value=MIN_DATE, max_value=date.today(), key="cr1_date")
    
    derived_age = age_years_at(dob, dx_date)


    
    def yesno_to_01(v: str):
        if v == "Yes":
            return 1
        if v == "No":
            return 0
        return np.nan
    
    # Create tabs FIRST (outside any form)
    tab_core, tab_clin, tab_fish, tab_ngs = st.tabs(["Core", "Clinical (Yes/No)", "FISH", "NGS"])
    # Storage for selections
    values_by_index = [np.nan] * len(feature_cols)
    
    
    with tab_fish:
        st.caption("Select one or many FISH alterations. Selected = 1, not selected = 0.")
        fish_selected = st.multiselect(
            "FISH alterations",
            options=[m for m in FISH_MARKERS if m in feature_cols],
            default=[],
            key="sp_fish_selected",
        )
    
    with tab_ngs:
        st.caption("Select one or many NGS mutations. Selected = 1, not selected = 0.")
        ngs_selected = st.multiselect(
            "NGS mutations",
            options=[m for m in NGS_MARKERS if m in feature_cols],
            default=[],
            key="sp_ngs_selected",
        )
    
    with tab_clin:
        st.caption("Clinical flags: Yes=1, No=0")
        for i, f in enumerate(feature_cols):
            if f in YESNO_FIELDS:
                v = st.selectbox(f, options=["", "No", "Yes"], index=0, key=f"sp_{i}_yn")
                values_by_index[i] = yesno_to_01(v)
    
    with tab_core:
        ecog = st.selectbox("ECOG", options=[0, 1, 2, 3, 4], index=0, key="sp_ecog")

        for i, f in enumerate(feature_cols):
            # Skip fields handled in other tabs / derived inputs
            if f in TAB_MANAGED_FIELDS:
                continue

            # Age auto-calc (display integer, store float)
            # --- Age (auto from DOB & Dx date) ---
            if f.strip() == AGE_FEATURE.strip():
                # If DOB or Dx date unknown, allow manual entry
                if dob_unknown or dx_unknown or np.isnan(derived_age):
                    v_age = st.number_input(
                        f"{f} (enter if DOB/Dx unknown)",
                        value=None,
                        step=1,
                        format="%d",
                        key=f"sp_{i}_age_manual",
                    )
                    values_by_index[i] = v_age
                else:
                    st.number_input(
                        f"{f} (auto from DOB & Dx date)",
                        value=int(round(derived_age)),
                        step=1,
                        format="%d",
                        key=f"sp_{i}_age_auto",
                        disabled=True,
                    )
                    values_by_index[i] = float(derived_age)
                continue



            # --- Diagnosis date (auto from dx_date input) ---
            if f.strip() == DX_DATE_FEATURE.strip():
                values_by_index[i] = np.nan if dx_date is None else dx_date.isoformat()
                st.text_input(
                    f"{f} (auto)",
                    value="" if dx_date is None else dx_date.isoformat(),
                    disabled=True,
                    key=f"sp_{i}_dx_show"
                )
                continue

            
            if f.strip() == "Date of 1st CR".strip():
                values_by_index[i] = np.nan if cr1_date is None else cr1_date.isoformat()
                st.text_input(
                    f"{f} (auto)",
                    value="" if cr1_date is None else cr1_date.isoformat(),
                    disabled=True,
                    key=f"sp_{i}_cr_show"
                )
                continue
        
           
            
        
            # ECOG mapped to int
            if f.strip() == "ECOG":
                values_by_index[i] = int(ecog)
                continue
    
            # Gender dropdown
            if f == "Gender":
                v = st.selectbox("Gender", options=["", "Male", "Female"], index=0, key=f"sp_{i}_gender")
                values_by_index[i] = np.nan if v == "" else v
                continue
    
            # Other dropdown fields (from inference df options if available)
            # Dropdown fields (robust)
            if f in DROPDOWN_FIELDS_ACTUAL and f not in ("Gender", "ECOG"):
                opts = options_for(f, df_for_options)  # defaults + excel uniques
                v = st.selectbox(f, options=opts, index=0, key=f"sp_{i}_dd")
                values_by_index[i] = np.nan if v == "" else v
                continue


            
            # Numeric
            if f in num_cols:
                unit = UNITS.get(f, None)
                label = f"{f} ({unit})" if unit else f
                if f == "Age (years)":
                    v = st.number_input(label, value=None, step=1, format="%d", key=f"sp_{i}_num")
                else:
                    v = st.number_input(label, value=None, format="%.2f", key=f"sp_{i}_num")
                values_by_index[i] = v
                continue
            
            # Categorical fallback
            if f in cat_cols:
                v = st.text_input(f, value="", key=f"sp_{i}_cat")
                values_by_index[i] = np.nan if v.strip() == "" else v
                continue
    
            # Other fallback
            v = st.text_input(f, value="", key=f"sp_{i}_other")
            values_by_index[i] = np.nan if v.strip() == "" else v

                
    # Apply FISH/NGS selections to row
    fish_set = set(fish_selected)
    ngs_set = set(ngs_selected)
    
    for i, f in enumerate(feature_cols):
        if f in FISH_MARKERS:
            values_by_index[i] = 1 if f in fish_set else 0
        if f in NGS_MARKERS:
            values_by_index[i] = 1 if f in ngs_set else 0
        
    # Auto-fill marker counts if present
    if FISH_COUNT_COL in feature_cols:
        values_by_index[feature_cols.index(FISH_COUNT_COL)] = int(len(fish_selected))
    if NGS_COUNT_COL in feature_cols:
        values_by_index[feature_cols.index(NGS_COUNT_COL)] = int(len(ngs_selected))
    st.divider()
    st.subheader("Predict single patient")

    m = meta.get("metrics", {})
    default_thr = float(m.get("best_threshold", 0.5))
    
    thr_single = st.slider(
        "Classification threshold",
        0.0, 1.0, default_thr, 0.01,
        key="sp_thr"
    )
    
    # External validation threshold
    thr_ext = st.slider(
        "External validation threshold",
        0.0, 1.0, default_thr, 0.01,
        key="thr_ext"
    )

    low_cut_s, high_cut_s = st.slider(
        "Risk band cutoffs (low, high)",
        0.0, 1.0, (0.2, 0.8), 0.01,
        key="sp_risk_cuts"
    )
    
    # Ensure low <= high
    if low_cut_s > high_cut_s:
        low_cut_s, high_cut_s = high_cut_s, low_cut_s


    def band_one(p: float) -> str:
        if p < low_cut_s:
            return "Low"
        if p >= high_cut_s:
            return "High"
        return "Intermediate"
    # Submit button (no form needed; simpler + fewer state surprises)
    if st.button("Predict single patient", key="sp_predict_btn"):
        X_one = pd.DataFrame([values_by_index], columns=feature_cols).replace({pd.NA: np.nan})
    
        for c in num_cols:
            if c in X_one.columns:
                X_one[c] = pd.to_numeric(X_one[c], errors="coerce")
    
        for c in cat_cols:
            if c in X_one.columns:
                X_one[c] = X_one[c].astype("object")
                X_one.loc[X_one[c].isna(), c] = np.nan
                X_one[c] = X_one[c].map(lambda v: v if pd.isna(v) else str(v))

    
        proba_one = float(pipe.predict_proba(X_one)[:, 1][0])
        st.success("Prediction generated.")
        st.metric("Predicted probability", f"{proba_one:.4f}") 
   

        # ---- Survival prediction for this patient (if survival model loaded) ----
        # ---- Survival prediction for this patient (if survival model loaded) ----
        bundle = st.session_state.get("surv_model", None)
        
        if isinstance(bundle, dict) and bundle.get("model") is not None:
            try:
                cph = bundle["model"]
                surv_cols = bundle.get("columns", [])



        
                # Build Cox input row (same preprocessing as Cox training)
                cox_feature_cols = bundle.get("feature_cols", feature_cols)
                cox_num_cols = bundle.get("num_cols", num_cols)
                cox_cat_cols = bundle.get("cat_cols", cat_cols)
                df_one_surv = X_one[cox_feature_cols].copy()


        
                for c in cox_num_cols:
                    if c in df_one_surv.columns:
                        df_one_surv[c] = pd.to_numeric(df_one_surv[c], errors="coerce")
        
                for c in cox_cat_cols:
                    if c in df_one_surv.columns:
                        df_one_surv[c] = df_one_surv[c].astype("object").map(
                            lambda v: v if pd.isna(v) else str(v)
                        )
        
                df_one_surv_oh = pd.get_dummies(df_one_surv, columns=cox_cat_cols, drop_first=True)
                df_one_surv_oh = df_one_surv_oh.loc[:, ~df_one_surv_oh.columns.duplicated()].copy()


                
        
                # Align to training predictor columns
                for col in surv_cols:
                    if col not in df_one_surv_oh.columns:
                        df_one_surv_oh[col] = 0
                df_one_surv_oh = df_one_surv_oh[surv_cols]

                imp = bundle.get("imputer", None)
                if imp is not None:
                    X_imp = imp.transform(df_one_surv_oh)
                    df_one_surv_oh = pd.DataFrame(X_imp, columns=surv_cols, index=df_one_surv_oh.index)
                else:
                    df_one_surv_oh = df_one_surv_oh.fillna(0)
        
                # Predict survival function
                surv_fn = cph.predict_survival_function(df_one_surv_oh)
        
                def surv_at(days: int) -> float:
                    idx = surv_fn.index.values
                    j = int(np.argmin(np.abs(idx - days)))
                    return float(surv_fn.iloc[j, 0])
        
                s6m = surv_at(180)
                s1y = surv_at(365)
                s2y = surv_at(730)
                s3y = surv_at(1095)
        
                st.subheader("Predicted survival probability")
                a, b, c, d = st.columns(4)
                a.metric("6 months", f"{s6m*100:.1f}%")
                b.metric("1 year",   f"{s1y*100:.1f}%")
                c.metric("2 years",  f"{s2y*100:.1f}%")
                d.metric("3 years",  f"{s3y*100:.1f}%")
        
                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
                ax.plot(surv_fn.index, surv_fn.values)
                ax.set_xlabel("Days from diagnosis")
                ax.set_ylabel("Survival probability")
                ax.set_ylim(0, 1)
                ax.set_title("Predicted survival curve (patient)")
        
                render_plot_with_download(
                    fig,
                    title="Patient survival curve",
                    filename="patient_survival_curve.png",
                    export_dpi=export_dpi,
                    key="dl_patient_surv_curve"
                )
        
            except Exception as e:
                st.warning(f"Survival prediction could not be computed: {e}")
        
        else:
            st.info("Survival model not loaded/published yet (survival bundle missing).")


        
        out = X_one.copy()
        out["predicted_probability"] = proba_one
        pred_class = int(proba_one >= thr_single)
        out["predicted_class"] = pred_class
        
        out["risk_band"] = band_one(proba_one)

        

        # ---- SHAP compute only (cache) ----
        X_one_t = transform_before_clf(pipe, X_one)
        
        explainer = st.session_state.get("explainer")
        explainer_sig = st.session_state.get("explainer_sig")
        
        current_sig = (
            selected,
            None if st.session_state.get("X_bg_for_shap") is None else int(len(st.session_state["X_bg_for_shap"]))
        )
        
        if explainer is None or explainer_sig != current_sig:
            X_bg = st.session_state.get("X_bg_for_shap")
            if X_bg is None:
                st.error("SHAP background not available. Admin must publish latest/background.csv.")
                st.stop()
        
            st.session_state.explainer = build_shap_explainer(pipe, X_bg)
            st.session_state.explainer_sig = current_sig
            explainer = st.session_state.explainer
        
        shap_vals = explainer.shap_values(X_one_t)
        if isinstance(shap_vals, list):
            shap_vals = shap_vals[1]
        
        names = get_final_feature_names(pipe)
        
        try:
            x_dense = X_one_t.toarray()[0]
        except Exception:
            x_dense = np.array(X_one_t)[0]
        
        base = explainer.expected_value
        if not np.isscalar(base):
            base = float(np.array(base).reshape(-1)[0])
        
        exp = shap.Explanation(
            values=shap_vals[0],
            base_values=float(base),
            data=x_dense,
            feature_names=names,
        )
        
        # CACHE ONLY
        st.session_state.shap_single_exp = exp

    
        st.dataframe(out, use_container_width=True)
    
        st.download_button(
            "Download single patient result (CSV)",
            out.to_csv(index=False).encode("utf-8"),
            file_name="single_patient_prediction.csv",
            mime="text/csv",
            key="dl_sp_csv",
        )

        # ---- Always render cached SHAP ----
        if "shap_single_exp" in st.session_state:
            exp = st.session_state.shap_single_exp
        
            max_display_single = st.slider(
                "Top features to display (single patient)",
                5, 40, 20, 1,
                key="sp_single_max_display"
            )
        
            c1, c2 = st.columns(2)
        
            with c1:
                plt.figure(figsize=FIGSIZE, dpi=plot_dpi_screen)
                shap.plots.waterfall(exp, show=False, max_display=max_display_single)
                fig_w = plt.gcf()
                render_plot_with_download(
                    fig_w,
                    title="Single-patient SHAP waterfall",
                    filename="single_patient_shap_waterfall.png",
                    export_dpi=export_dpi,
                    key="dl_sp_wf"
                )
        
            with c2:
                plt.figure(figsize=FIGSIZE, dpi=plot_dpi_screen)
                shap.plots.bar(exp, show=False, max_display=max_display_single)
                fig_b = plt.gcf()
                render_plot_with_download(
                    fig_b,
                    title="Single-patient SHAP bar",
                    filename="single_patient_shap_bar.png",
                    export_dpi=export_dpi,
                    key="dl_sp_bar"
                )

            # --- After SHAP plots are displayed ---
            with st.expander("How to interpret the SHAP explanation plots", expanded=True):
                st.markdown(r"""
            **What is SHAP?**  
            SHAP (SHapley Additive exPlanations) decomposes a model prediction into **feature-wise contributions**.  
            Each feature pushes the prediction **toward higher risk** or **toward lower risk**, relative to the model’s baseline.
            
            **What is \(E[f(X)]\)? (baseline / expected model output)**  
            - \(E[f(X)]\) is the model’s **average output** over the reference population used by the explainer (typically the training set).  
            - It is the **starting point** of the explanation before any patient-specific information is applied.
            
            **What is \(f(x)\)? (this patient’s model output)**  
            - \(f(x)\) is the model’s **final output for this patient**, after adding all feature contributions to the baseline.  
            - For many classifiers (including logistic regression), SHAP waterfall plots often use the **log-odds (logit) scale** rather than raw probability.
            
            **How the waterfall plot should be read**  
            - The plot starts at **\(E[f(X)]\)** and then adds/subtracts feature effects to arrive at **\(f(x)\)**.  
            - **Red bars** push the prediction **toward higher risk** (increase the model output).  
            - **Blue bars** push the prediction **toward lower risk / protective direction** (decrease the model output).  
            - The **bar length** indicates the **strength** of that feature’s influence for this patient.  
            - “**Other features**” is the combined net effect of features not shown individually.
            
            **How the bar plot should be read (Top contributors)**  
            - Features are ranked by **absolute SHAP value** (largest impact first).  
            - **Positive SHAP** (right) increases predicted risk; **negative SHAP** (left) decreases predicted risk.
            
            **Clinical cautions**  
            - SHAP explains the model’s behavior; it does **not** prove causality.  
            - Ensure variable definitions and patient population match the model’s intended use.
            """)


    # -----------------------------
    # Batch prediction / validation
    # -----------------------------
    st.divider()
    st.subheader("Batch prediction / External validation")

    df_inf = st.session_state.df_inf
    if df_inf is None:
        st.info("Upload an inference Excel to run batch prediction / external validation.")
    else:        
        meta = st.session_state.get("meta")
        pipe = st.session_state.get("pipe")
        if meta is None or pipe is None:
            st.error("Model not loaded. Please load a model version first.")
            st.stop()
        
        feature_cols = meta["schema"]["features"]
        num_cols = meta["schema"]["numeric"]
        cat_cols = meta["schema"]["categorical"]
        
        missing = [c for c in feature_cols if c not in df_inf.columns]
        if missing:
            st.error(f"Inference file is missing required feature columns: {missing}")
            st.stop()
        meta = st.session_state.get("meta")
        if not meta:
            st.error("Model metadata not loaded. Please load a model version.")
            st.stop()
        
                
        X_inf = coerce_X_like_schema(df_inf, feature_cols, num_cols, cat_cols)

        X_inf = X_inf.replace({pd.NA: np.nan})
        
        for c in num_cols:
            X_inf[c] = pd.to_numeric(X_inf[c], errors="coerce")
        
        for c in cat_cols:
            X_inf[c] = X_inf[c].astype("object")
            X_inf.loc[X_inf[c].isna(), c] = np.nan
            X_inf[c] = X_inf[c].map(lambda v: v if pd.isna(v) else str(v))
      

        proba = pipe.predict_proba(X_inf)[:, 1]
        
        st.divider()
        st.subheader("External validation (if Outcome Event label is present)")
        
        if LABEL_COL in df_inf.columns:
            try:
                y_ext_raw = df_inf[LABEL_COL].copy()
                y_ext01, _ = coerce_binary_label(y_ext_raw)
        
                # Core metrics
                roc_auc_ext = float(roc_auc_score(y_ext01, proba))
                fpr, tpr, roc_thresholds = roc_curve(y_ext01, proba)
        
                # Threshold metrics (user-controlled)
                
                cls_ext = compute_classification_metrics(y_ext01, proba, threshold=float(thr_ext))
        
                pr_ext = compute_pr_curve(y_ext01, proba)
                cal_ext = compute_calibration(y_ext01, proba, n_bins=PRED_N_BINS, strategy=PRED_CAL_STRATEGY)

                dca_ext = decision_curve_analysis(y_ext01, proba)
        
                # Display headline metrics
                c1, c2, c3, c4 = st.columns(4)
                c1.metric("ROC AUC (external)", f"{roc_auc_ext:.3f}")
                c2.metric("Sensitivity", f"{cls_ext['sensitivity']:.3f}")
                c3.metric("Specificity", f"{cls_ext['specificity']:.3f}")
                c4.metric("F1", f"{cls_ext['f1']:.3f}")
        
                # Confusion matrix
                cm_df = pd.DataFrame(
                    [[cls_ext["tn"], cls_ext["fp"]], [cls_ext["fn"], cls_ext["tp"]]],
                    index=["Actual 0", "Actual 1"],
                    columns=["Pred 0", "Pred 1"],
                )
                st.markdown("**Confusion Matrix (external)**")
                st.dataframe(cm_df)
        
                # ROC plot
                # =========================
                # EXTERNAL: ROC curve plot
                # (replace your current plt.plot(fpr, tpr) block)
                # =========================
                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
                ax.plot(fpr, tpr)
                ax.plot([0, 1], [0, 1])
                ax.set_xlabel("False Positive Rate (1 - Specificity)")
                ax.set_ylabel("True Positive Rate (Sensitivity)")
                ax.set_title(f"External ROC Curve (AUC = {roc_auc_ext:.3f})")
                
                render_plot_with_download(
                    fig,
                    title="External ROC curve",
                    filename="external_roc_curve.png",
                    export_dpi=export_dpi,
                    key="dl_ext_roc"
                )


                # -------------------------
                # EXTERNAL: PR curve
                # -------------------------
                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
                ax.plot(pr_ext["recall"], pr_ext["precision"])
                ax.set_xlabel("Recall")
                ax.set_ylabel("Precision")
                ax.set_title(f"External PR Curve (AP = {pr_ext['average_precision']:.3f})")
                
                render_plot_with_download(
                    fig,
                    title="External PR curve",
                    filename="external_pr_curve.png",
                    export_dpi=export_dpi,
                    key="dl_ext_pr"
                )

                # -------------------------
                # EXTERNAL: Calibration
                # -------------------------
                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
                ax.plot(cal_ext["prob_pred"], cal_ext["prob_true"])
                ax.plot([0, 1], [0, 1])
                ax.set_xlabel("Mean predicted probability")
                ax.set_ylabel("Observed event rate")
                ax.set_title("External calibration curve")
                
                render_plot_with_download(
                    fig,
                    title="External calibration curve",
                    filename="external_calibration_curve.png",
                    export_dpi=export_dpi,
                    key="dl_ext_cal"
                )

                # -------------------------
                # EXTERNAL: DCA
                # -------------------------
                fig, ax = make_fig(figsize=FIGSIZE, dpi=plot_dpi_screen)
                ax.plot(dca_ext["thresholds"], dca_ext["net_benefit_model"], label="Model")
                ax.plot(dca_ext["thresholds"], dca_ext["net_benefit_all"], label="Treat all")
                ax.plot(dca_ext["thresholds"], dca_ext["net_benefit_none"], label="Treat none")
                ax.set_xlabel("Threshold probability")
                ax.set_ylabel("Net benefit")
                ax.set_title("External decision curve analysis")
                ax.legend()
                
                render_plot_with_download(
                    fig,
                    title="External decision curve",
                    filename="external_decision_curve.png",
                    export_dpi=export_dpi,
                    key="dl_ext_dca"
                )




        
            except Exception as e:
                st.error(f"Could not compute external validation metrics: {e}")
        else:
            st.info("No Outcome Event column found in the inference Excel, so external validation metrics cannot be computed.")
        
        # Predict probabilities
        proba = pipe.predict_proba(X_inf)[:, 1]
        
        df_out = df_inf.copy()
        df_out["predicted_probability"] = proba

        # ---- Batch survival probabilities (if survival model loaded) ----
        # ---- Batch survival probabilities (if survival model loaded) ----
        bundle = st.session_state.get("surv_model", None)

        if isinstance(bundle, dict) and bundle.get("model") is not None:
            try:
                cph = bundle["model"]
                surv_cols = bundle["columns"]
                imp = bundle.get("imputer", None)
        
                cox_feature_cols = bundle.get("feature_cols", feature_cols)
                cox_num_cols = bundle.get("num_cols", num_cols)
                cox_cat_cols = bundle.get("cat_cols", cat_cols)
                df_surv_in = X_inf[cox_feature_cols].copy()


                for c in cox_num_cols:
                    if c in df_surv_in.columns:
                        df_surv_in[c] = pd.to_numeric(df_surv_in[c], errors="coerce")
                
                for c in cox_cat_cols:
                    if c in df_surv_in.columns:
                        df_surv_in[c] = df_surv_in[c].astype("object")
                        df_surv_in.loc[df_surv_in[c].isna(), c] = np.nan
                        df_surv_in[c] = df_surv_in[c].map(lambda v: v if pd.isna(v) else str(v))

                
                df_surv_in_oh = pd.get_dummies(df_surv_in, columns=cox_cat_cols, drop_first=True)
                df_surv_in_oh = df_surv_in_oh.loc[:, ~df_surv_in_oh.columns.duplicated()].copy()

        
                # align columns
                for col in surv_cols:
                    if col not in df_surv_in_oh.columns:
                        df_surv_in_oh[col] = 0
                df_surv_in_oh = df_surv_in_oh[surv_cols]
        
                # impute
                if imp is not None:
                    X_imp = imp.transform(df_surv_in_oh)
                    df_surv_in_oh = pd.DataFrame(X_imp, columns=surv_cols, index=df_surv_in_oh.index)
                else:
                    df_surv_in_oh = df_surv_in_oh.fillna(0)
        
                surv_fn_all = cph.predict_survival_function(df_surv_in_oh)
        
                def surv_vec_at(days: int):
                    idx = surv_fn_all.index.values
                    j = int(np.argmin(np.abs(idx - days)))
                    return surv_fn_all.iloc[j, :].values.astype(float)
        
                df_out["survival_6m"] = surv_vec_at(180)
                df_out["survival_1y"] = surv_vec_at(365)
                df_out["survival_2y"] = surv_vec_at(730)
                df_out["survival_3y"] = surv_vec_at(1095)
        
            except Exception as e:
                st.warning(f"Batch survival probabilities could not be computed: {e}")
        
        else:
            st.info("Survival model not loaded/published yet (survival bundle missing).")




        
        # ==========================================================
        # EXPORT: CSV needed for NEJM-style ROC + Calibration + DCA
        # (label + predicted_probability)
        # ==========================================================
        if LABEL_COL in df_out.columns:
            curves_df = df_out[[LABEL_COL, "predicted_probability"]].copy()
        
            st.download_button(
                "Download CSV for ROC/Calibration/DCA panel",
                curves_df.to_csv(index=False).encode("utf-8"),
                file_name="predictions_for_curves.csv",
                mime="text/csv",
                key="dl_predictions_for_curves"
            )
        else:
            st.info("Outcome Event column not present — panel CSV export requires true labels.")
        
        df_out = add_ethnicity_region(df_out, eth_col="Ethnicity", out_col="Ethnicity_Region")
        
        st.subheader("Grouped outcomes by region (analytics)")
        if "Ethnicity_Region" in df_out.columns:
            grp = df_out.groupby("Ethnicity_Region")["predicted_probability"].agg(["count","mean","median"]).reset_index()
            st.dataframe(grp, use_container_width=True)

        # --- classification + risk bands ---
        st.divider()
        st.subheader("Risk stratification")
        
        # Classification threshold
        thr = st.slider(
            "Decision threshold for classification",
            0.0, 1.0, 0.5, 0.01,
            key="pred_thr"
        )
        
        df_out["predicted_class"] = (df_out["predicted_probability"] >= thr).astype(int)
        
        
        # ✅ Batch risk band slider (MISSING in your code)
        low_cut, high_cut = st.slider(
            "Risk band cutoffs (low, high)",
            0.0, 1.0, (0.2, 0.8), 0.01,
            key="batch_risk_cuts"
        )
        
        # safety (optional but good practice)
        if low_cut > high_cut:
            low_cut, high_cut = high_cut, low_cut
        
        
        def band(p: float) -> str:
            if p < low_cut:
                return "Low"
            if p >= high_cut:
                return "High"
            return "Intermediate"
        
        
        df_out["risk_band"] = df_out["predicted_probability"].map(band)

        # --- END ADD ---
        
        st.dataframe(df_out.head())
        
        st.download_button(
            "Download predictions",
            df_out.to_csv(index=False).encode(),
            "predictions.csv",
            "text/csv"
        )

        #Batch SHAP for whole BLOCK
        
        st.divider()
        st.subheader("Batch SHAP (first 200 rows)")
        
        MAX_BATCH = 200
        n_rows = len(X_inf)
        batch_n = min(MAX_BATCH, n_rows)
        
        cA, cB, cC = st.columns([1, 1, 1])
        with cA:
            do_batch = st.button(f"Compute batch SHAP for first {batch_n} rows", key="batch_shap_btn")
        with cB:
            max_display_batch = st.slider("Top features to display (batch)",5, 40, 20, 1,key="batch_max_display")

        with cC:
            show_beeswarm = st.checkbox("Show beeswarm (slower)", value=True, key="batch_beeswarm")
        
        if do_batch:
            with st.spinner("Computing batch SHAP..."):
                X_batch = X_inf.iloc[:batch_n].copy()

                X_batch_t = transform_before_clf(pipe, X_batch)
                
                explainer = st.session_state.get("explainer")
                explainer_sig = st.session_state.get("explainer_sig")
                
                # Create a simple signature that changes if model changes or background changes
                # (using version + number of background rows is usually enough)
                current_sig = (
                    selected,  # or meta.get("created_at_utc") or meta.get("metrics", {}).get("roc_auc")
                    None if st.session_state.get("X_bg_for_shap") is None else int(len(st.session_state["X_bg_for_shap"]))
                )
                
                if explainer is None or explainer_sig != current_sig:
                    X_bg = st.session_state.get("X_bg_for_shap")
                    if X_bg is None:
                        st.error("SHAP background not available. Admin must publish latest/background.csv.")
                        st.stop()
                
                    st.session_state.explainer = build_shap_explainer(pipe, X_bg)
                    st.session_state.explainer_sig = current_sig
                    explainer = st.session_state.explainer

                
                shap_vals_batch = explainer.shap_values(X_batch_t)
                if isinstance(shap_vals_batch, list):
                    shap_vals_batch = shap_vals_batch[1]
                
                names = get_final_feature_names(pipe)
                
                # Absolute sanity check: align names with SHAP columns
                if len(names) != shap_vals_batch.shape[1]:
                    st.warning(
                        f"Feature name mismatch: names={len(names)} vs shap_cols={shap_vals_batch.shape[1]}. "
                        "Using generic names."
                    )
                    names = [f"f{i}" for i in range(shap_vals_batch.shape[1])]



        
                # Dense conversion once (used for summary + waterfalls)
                try:
                    X_dense = safe_dense(X_batch_t, max_rows=200)

                except Exception:
                    X_dense = np.array(X_batch_t)
        
                # Cache batch results
                st.session_state.shap_batch_vals = shap_vals_batch
                st.session_state.shap_batch_X_dense = X_dense
                st.session_state.shap_batch_n = batch_n
                st.session_state.shap_batch_feature_names = names
        
            st.success(f"Batch SHAP computed for first {batch_n} rows.")
        
        if "shap_batch_vals" in st.session_state:
            shap_vals_batch = st.session_state.shap_batch_vals
            X_dense = st.session_state.shap_batch_X_dense
            batch_n = st.session_state.shap_batch_n
            names = st.session_state.shap_batch_feature_names
            
            st.divider()
            st.subheader("Export: Top SHAP features per row (batch)")
            
            top_k = st.slider("Top-K features per row", 3, 30, 10, 1, key="topk_export")
            
            # Optional: include predicted probabilities for the same batch rows
            # (Assumes you already computed proba for all X_inf earlier)
            include_proba = st.checkbox("Include predicted probability", value=True, key="include_proba_export")
            
            if st.button("Generate Top-K SHAP table", key="gen_topk_shap"):
                shap_vals_batch = st.session_state.shap_batch_vals          # shape: (batch_n, n_features)
                names = st.session_state.shap_batch_feature_names
                batch_n = st.session_state.shap_batch_n
            
                rows = []
                for i in range(batch_n):
                    sv = shap_vals_batch[i]
                    idx = np.argsort(np.abs(sv))[::-1][:top_k]  # top-k by absolute SHAP
            
                    for j in idx:
                        val = float(sv[j])
                        rows.append({
                            "row_in_batch": int(i),
                            "feature": str(names[j]),
                            "shap_value": val,
                            "abs_shap_value": abs(val),
                            "direction": "↑" if val > 0 else ("↓" if val < 0 else "0"),
                        })
            
                df_topk = pd.DataFrame(rows)
            
                if include_proba:
                    # Use the same batch rows from the previously computed proba vector
                    # If you want absolute Excel row index, add + df_inf.index[0] logic as needed
                    proba_batch = proba[:batch_n]
                    df_proba = pd.DataFrame({"row_in_batch": list(range(batch_n)), "predicted_probability": proba_batch})
                    df_topk = df_topk.merge(df_proba, on="row_in_batch", how="left")
            
                # Sort nicely: each row block by importance
                df_topk = df_topk.sort_values(["row_in_batch", "abs_shap_value"], ascending=[True, False])
            
                st.dataframe(df_topk, use_container_width=True)
            
                st.download_button(
                    "Download Top-K SHAP per row (CSV)",
                    df_topk.to_csv(index=False).encode("utf-8"),
                    file_name=f"shap_top{top_k}_per_row_first{batch_n}.csv",
                    mime="text/csv",
                    key="dl_topk_shap_csv"
                )

            
            
        
            st.markdown(f"### Global SHAP summary (first {batch_n} rows)")
        
            # BAR SUMMARY
            plt.figure(figsize=FIGSIZE, dpi=plot_dpi_screen)
            shap.summary_plot(
                shap_vals_batch,
                features=X_dense,
                feature_names=names,
                plot_type="bar",
                max_display=max_display_batch,
                show=False
            )
            fig_bar = plt.gcf()
            render_plot_with_download(fig_bar, title="SHAP bar summary", filename="shap_summary_bar.png", export_dpi=export_dpi)


        
            # BEESWARM SUMMARY (optional)
            if show_beeswarm:
                plt.figure(figsize=FIGSIZE, dpi=plot_dpi_screen)
                shap.summary_plot(
                    shap_vals_batch,
                    features=X_dense,
                    feature_names=names,
                    max_display=max_display_batch,
                    show=False
                )
                fig_swarm = plt.gcf()
                render_plot_with_download(fig_swarm, title="SHAP beeswarm", filename="shap_beeswarm.png", export_dpi=export_dpi)


        
            st.markdown("### Waterfall plots (batch)")
        
            rows_to_plot = st.multiselect(
                "Select rows (within the first batch) to plot waterfalls",
                options=list(range(batch_n)),
                default=[0],
                key="batch_rows_to_plot"
            )
            max_display_single = st.slider("Top features to display (single-row SHAP)",5, 40, 20, 1,key="single_max_display")

            max_waterfalls = st.slider("Max waterfall plots to render", 1, 10, 3, 1, key="max_waterfalls")
            rows_to_plot = rows_to_plot[:max_waterfalls]
        
            explainer = st.session_state.get("explainer")
            base = explainer.expected_value
            if not np.isscalar(base):
                base = float(np.array(base).reshape(-1)[0])
        
            for r in rows_to_plot:
                st.markdown(f"**Row {r} (within first {batch_n})**")
            
                exp = shap.Explanation(
                    values=shap_vals_batch[r],
                    base_values=float(base),
                    data=X_dense[r],
                    feature_names=names,
                )
            
                # Waterfall
                plt.figure(figsize=FIGSIZE, dpi=plot_dpi_screen)
                shap.plots.waterfall(exp, show=False, max_display=max_display_single)
                fig_w = plt.gcf()
                render_plot_with_download(
                    fig_w,
                    title=f"Batch SHAP waterfall (row {r})",
                    filename=f"shap_waterfall_batch_row_{r}.png",
                    export_dpi=export_dpi,
                    key=f"dl_wf_batch_{r}"
                )
            
                # Bar
                plt.figure(figsize=FIGSIZE, dpi=plot_dpi_screen)
                shap.plots.bar(exp, show=False, max_display=max_display_single)
                fig_b = plt.gcf()
                render_plot_with_download(
                    fig_b,
                    title=f"Batch SHAP bar (row {r})",
                    filename=f"shap_bar_batch_row_{r}.png",
                    export_dpi=export_dpi,
                    key=f"dl_bar_batch_{r}"
                )




        
        # Single row SHAP block
        st.subheader("SHAP explanation")

        # 1) Form only computes / updates cached explanation
        with st.form("shap_form"):
            row = st.number_input("Row index", 0, len(X_inf) - 1, int(st.session_state.get("shap_row", 0)))
            explain_btn = st.form_submit_button("Generate SHAP explanation")
        
        if explain_btn:
            st.session_state.shap_row = int(row)
        
            X_one = X_inf.iloc[[int(row)]]
            X_one_t = transform_before_clf(pipe, X_one)
        
            explainer = st.session_state.get("explainer")
            explainer_sig = st.session_state.get("explainer_sig")
            
            # Create a simple signature that changes if model changes or background changes
            # (using version + number of background rows is usually enough)
            current_sig = (
                selected,  # or meta.get("created_at_utc") or meta.get("metrics", {}).get("roc_auc")
                None if st.session_state.get("X_bg_for_shap") is None else int(len(st.session_state["X_bg_for_shap"]))
            )
            
            if explainer is None or explainer_sig != current_sig:
                X_bg = st.session_state.get("X_bg_for_shap")
                if X_bg is None:
                    st.error("SHAP background not available. Admin must publish latest/background.csv.")
                    st.stop()
            
                st.session_state.explainer = build_shap_explainer(pipe, X_bg)
                st.session_state.explainer_sig = current_sig
                explainer = st.session_state.explainer

        
            shap_vals = explainer.shap_values(X_one_t)
            if isinstance(shap_vals, list):
                shap_vals = shap_vals[1]  # positive class
        
            names = get_final_feature_names(pipe)
            if len(names) != shap_vals.shape[1]:
                names = [f"f{i}" for i in range(shap_vals.shape[1])]
        
            try:
                x_dense = X_one_t.toarray()[0]
            except Exception:
                x_dense = np.array(X_one_t)[0]

            base = explainer.expected_value
            if not np.isscalar(base):
                base = float(np.array(base).reshape(-1)[0])
        
            exp = shap.Explanation(
                values=shap_vals[0],
                base_values=float(base),
                data=x_dense,
                feature_names=names,
            )
        
            # Cache for re-plotting when sliders change
            st.session_state.shap_single_exp = exp
        
        # 2) Plot section OUTSIDE the form (will rerun on slider changes)
        if "shap_single_exp" in st.session_state:
            exp = st.session_state.shap_single_exp
        
            max_display_single_row = st.slider(
                "Top features to display (single row)",
                5, 40, int(st.session_state.get("shap_single_max", 20)), 1,
                key="shap_single_max"
            )

            c1, c2 = st.columns(2)
        
            with c1:
                st.markdown("**Waterfall**")
                plt.figure(figsize=FIGSIZE, dpi=plot_dpi_screen)
                shap.plots.waterfall(exp, show=False, max_display=max_display_single_row)
                fig_w = plt.gcf()
                render_plot_with_download(
                    fig_w,
                    title="SHAP waterfall",
                    filename="shap_waterfall_row.png",
                    export_dpi=export_dpi,
                    key="dl_shap_wf_single"
                )
        
            with c2:
                st.markdown("**Top features**")
                plt.figure(figsize=FIGSIZE, dpi=plot_dpi_screen)
                shap.plots.bar(exp, show=False, max_display=max_display_single_row)
                fig_b = plt.gcf()
                render_plot_with_download(
                    fig_b,
                    title="SHAP bar",
                    filename="shap_bar_row.png",
                    export_dpi=export_dpi,
                    key="dl_shap_bar_single"
                )
        else:
            st.info("Submit a row index to generate the SHAP explanation.")