Spaces:

GS123
/

Missing_values_app

Sleeping

File size: 46,774 Bytes

"""
Missing Value Analyzer — Statistically Rigorous Pipeline
=========================================================
Phases:
  1  Upload CSV & Train/Test Split
  2  Missing Value Overview (train set only)
  3  Per-Column Diagnostics (Tables for all tests)
  4  Imputation Feasibility Gate (KDE plots, Variance %, New Outliers)
  5  Final Report & Recommendations
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, norm, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import warnings
warnings.filterwarnings("ignore")

# ─────────────────────────── Page config ────────────────────────────
# Configure the Streamlit page: browser-tab title and icon, wide layout,
# and a sidebar that starts expanded.
st.set_page_config(
    page_title="Missing Value Analyzer",
    page_icon="🔬",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ─────────────────────────── CSS ────────────────────────────────────
# Inject the app-wide stylesheet as raw HTML (hence unsafe_allow_html=True).
# It loads the Inter font, restyles the sidebar, and defines the utility
# classes referenced by the HTML snippets the step renderers emit, such as
# .main-title, .card-*, .verdict-*, .theory-box and .test-header.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
html,body,[class*="css"]{font-family:'Inter',sans-serif;}

section[data-testid="stSidebar"]{background:#17172b;}
section[data-testid="stSidebar"] *{color:#ffffff !important;}
section[data-testid="stSidebar"] hr{border-color:#ffffff33 !important;}

.main-title{font-size:2rem;font-weight:700;color:#17172b;margin-bottom:.2rem;}
.main-sub{font-size:1rem;color:#6060a0;margin-bottom:1.5rem;}

.metric-box{background:#f5f3ee;border-radius:8px;padding:12px 16px;text-align:center;margin-bottom:8px;}
.metric-val{font-size:1.4rem;font-weight:700;color:#17172b !important;}
.metric-lbl{font-size:.78rem;color:#6060a0 !important;margin-top:2px;}

.big-stat-box{border-radius:12px;padding:20px 24px;text-align:center;margin-bottom:8px;min-height:110px;}
.big-stat-val{font-size:2.0rem;font-weight:800;margin-bottom:4px;line-height:1.1;}
.big-stat-lbl{font-size:.80rem;font-weight:600;opacity:0.85;text-transform:uppercase;letter-spacing:.05em;}
.big-stat-sub{font-size:.76rem;opacity:0.65;margin-top:6px;}

.stat-ok{background:#edfaf3;border:2px solid #89d9ac;}
.stat-ok .big-stat-val,.stat-ok .big-stat-lbl{color:#0a5c30 !important;}
.stat-warn{background:#fffaeb;border:2px solid #f0cc7a;}
.stat-warn .big-stat-val,.stat-warn .big-stat-lbl{color:#7a4f00 !important;}
.stat-fail{background:#fff0ed;border:2px solid #f5a898;}
.stat-fail .big-stat-val,.stat-fail .big-stat-lbl{color:#900000 !important;}

.card-mcar{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-mar {background:#fffaeb;border:2px solid #f0cc7a;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-mnar{background:#fff0ed;border:2px solid #f5a898;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-info{background:#eef2ff;border:2px solid #bdc8f5;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-warn{background:#fff8e1;border:2px solid #ffe082;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-danger{background:#fde8e8;border:2px solid #f5a8a8;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-ok{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}

.card-mcar *, .card-mar *, .card-mnar *, .card-info *, .card-warn *, .card-danger *, .card-ok * {color: #1a1a2e !important;}

.verdict-label{font-size:1.1rem;font-weight:700;margin-bottom:4px;}
.verdict-desc{font-size:.88rem;color:#333 !important;}

code{background:#e8e8eb;padding:2px 6px;border-radius:4px;font-size:.85rem; color:#d6336c !important;}
hr.divider{border:none;border-top:2px solid #e0ddd8;margin:1.5rem 0;}

.theory-box {background:#fafafa; border-left:4px solid #4f8ef7; border-radius:4px; padding:12px 18px; margin-bottom:16px;}
.theory-box h4 {color:#17172b; margin-bottom:6px; font-size:1.05rem;}
.theory-box p {color:#444; font-size:0.92rem; line-height:1.5;}

.stat-highlight { font-size: 1.2rem; font-weight: bold; color: #d6336c; background: #ffe4e1; padding: 2px 8px; border-radius: 4px;}

.test-header{font-size:1.05rem;font-weight:700;color:#17172b;margin:18px 0 8px;}
</style>
""", unsafe_allow_html=True)


# ════════════════════════════════════════════════════════════════════
#  SESSION STATE INIT
# ════════════════════════════════════════════════════════════════════
# Default values for every session key the app reads. A key is only seeded
# when it is not already present, so values written earlier in the session
# (uploaded data, split, per-column diagnostics) survive script re-runs.
defaults = {"df_full": None, "df_train": None, "df_test": None, "target_col": None, "split_ratio": 0.8, "col_diagnostics": {}}
for k, v in defaults.items():
    if k not in st.session_state: st.session_state[k] = v


# ════════════════════════════════════════════════════════════════════
#  STATISTICAL TEST HELPERS
# ════════════════════════════════════════════════════════════════════
def littles_mcar_test(df: pd.DataFrame, cols_with_missing: list) -> dict:
    """Pool a chi-square style mean-shift statistic as an MCAR check.

    For each numeric column in ``cols_with_missing`` the rows are split into
    "missing" and "observed" groups. For every *other* numeric column, the
    size-weighted squared distance of the two group means from that column's
    overall mean, divided by its overall variance, is added to a running
    statistic; each contributing pair adds one degree of freedom.

    NOTE(review): this looks like a simplified mean-comparison statistic
    rather than the full covariance-based Little's test — confirm that the
    approximation is intended.

    Args:
        df: Data to analyse.
        cols_with_missing: Column names whose missingness is examined;
            non-numeric names are ignored.

    Returns:
        ``{"chi2", "df", "p_value", "verdict", "reject_mcar"}`` when at least
        one comparison was possible, otherwise
        ``{"chi2": None, "p_value": None, "verdict": "Insufficient numeric data"}``.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    statistic = 0.0
    dof = 0

    for col in cols_with_missing:
        if col not in numeric_cols:
            continue

        is_missing = df[col].isnull()
        # Both sides of the split need at least five rows.
        if is_missing.sum() < 5 or (~is_missing).sum() < 5:
            continue

        for other in numeric_cols:
            if other == col:
                continue

            vals_when_missing = df.loc[is_missing, other].dropna()
            vals_when_observed = df.loc[~is_missing, other].dropna()
            if len(vals_when_missing) < 3 or len(vals_when_observed) < 3:
                continue

            grand_mean = df[other].mean()
            grand_var = df[other].var()
            # A (near-)constant column cannot scale the statistic.
            if grand_var < 1e-12:
                continue

            between = (
                len(vals_when_missing) * (vals_when_missing.mean() - grand_mean) ** 2
                + len(vals_when_observed) * (vals_when_observed.mean() - grand_mean) ** 2
            )
            statistic += between / grand_var
            dof += 1

    if dof == 0:
        return {"chi2": None, "p_value": None, "verdict": "Insufficient numeric data"}

    p_val = 1 - chi2.cdf(statistic, dof)
    if p_val >= 0.05:
        verdict = "Fail to reject MCAR"
    else:
        verdict = "Reject MCAR"

    return {
        "chi2": round(statistic, 4),
        "df": dof,
        "p_value": round(p_val, 4),
        "verdict": verdict,
        "reject_mcar": p_val < 0.05,
    }

def feature_dependency_tests(df: pd.DataFrame, col: str) -> dict:
    """Test whether other columns differ between rows where ``col`` is missing vs. observed.

    Numeric features use a two-sample z-test when both groups have at least
    30 values, otherwise Welch's t-test. Non-numeric features use a
    chi-square test on the (missing-flag x category) contingency table.

    Features whose test is undefined (NaN p-value, e.g. a constant column) or
    raises are skipped, so they do not dilute the share of significant
    features.

    Args:
        df: Data to analyse.
        col: Column whose missingness pattern is examined.

    Returns:
        Dict with ``results`` (per-feature test, statistic, p-value,
        significance flag and type), ``n_significant``, ``total_tested``,
        ``sig_pct`` and a human-readable ``signal``.
    """
    missing_mask = df[col].isnull()
    if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
        return {"results": {}, "n_significant": 0, "total_tested": 0, "sig_pct": 0.0, "signal": "Insufficient data"}

    results = {}
    for other in df.columns:
        if other == col:
            continue
        g_miss = df.loc[missing_mask, other].dropna()
        g_obs = df.loc[~missing_mask, other].dropna()
        if len(g_miss) < 3 or len(g_obs) < 3:
            continue

        try:
            if pd.api.types.is_numeric_dtype(df[other]):
                kind = "numeric"
                n1, n2 = len(g_miss), len(g_obs)
                if min(n1, n2) >= 30:
                    se = np.sqrt(g_miss.var() / n1 + g_obs.var() / n2)
                    if se < 1e-12:
                        continue
                    stat = (g_miss.mean() - g_obs.mean()) / se
                    # sf() keeps precision for tiny tail probabilities, unlike 1 - cdf().
                    p_val = 2 * norm.sf(abs(stat))
                    test_name = "z-test"
                else:
                    stat, p_val = ttest_ind(g_miss, g_obs, equal_var=False)
                    test_name = "Welch t-test"
            else:
                kind = "categorical"
                ct = pd.crosstab(missing_mask.astype(int), df[other])
                if ct.shape[0] < 2 or ct.shape[1] < 2:
                    continue
                stat, p_val, _, _ = chi2_contingency(ct)
                test_name = "chi²"
        except Exception:
            # Best-effort scan: one un-testable feature must not abort the rest.
            continue

        # A NaN p-value means the test was degenerate; it is neither
        # "significant" nor "not significant", so leave it out entirely.
        if np.isnan(p_val):
            continue

        results[other] = {
            "test": test_name,
            "stat": round(float(stat), 4),
            "p_value": round(float(p_val), 4),
            "significant": bool(p_val < 0.05),
            "type": kind,
        }

    n_sig = sum(1 for r in results.values() if r["significant"])
    sig_pct = n_sig / max(len(results), 1) * 100
    if sig_pct == 0:
        signal = "No features differ significantly"
    else:
        signal = f"{n_sig}/{len(results)} features differ (p<0.05)"
    return {
        "results": results,
        "n_significant": n_sig,
        "total_tested": len(results),
        "sig_pct": round(sig_pct, 1),
        "signal": signal,
    }

def target_dependency_test(df: pd.DataFrame, col: str, target_col: str) -> dict:
    """Test whether the target differs between rows where ``col`` is missing vs. observed.

    Numeric targets use a z-test (both groups >= 30 values) or Welch's t-test,
    and ``diff_pct`` is the absolute mean difference relative to the observed
    group's mean. Categorical targets use a chi-square test, and ``diff_pct``
    is the largest absolute gap (in percentage points) between the two groups'
    class shares, taken over every class.

    Args:
        df: Data to analyse.
        col: Column whose missingness pattern is examined.
        target_col: Name of the target column.

    Returns:
        On success ``{"p_value", "significant", "diff_pct", "signal"}``.
        When the test cannot be run, ``{"p_value": None, "signal": <reason>,
        "significant": False}`` (no ``diff_pct`` key).
    """
    missing_mask = df[col].isnull()
    if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
        return {"p_value": None, "signal": "Insufficient data", "significant": False}

    try:
        g_miss = df.loc[missing_mask, target_col].dropna()
        g_obs = df.loc[~missing_mask, target_col].dropna()
        # Same minimum group size as the feature tests; also protects the
        # share computation below from empty groups.
        if len(g_miss) < 3 or len(g_obs) < 3:
            return {"p_value": None, "signal": "Insufficient data", "significant": False}

        if pd.api.types.is_numeric_dtype(df[target_col]):
            n1, n2 = len(g_miss), len(g_obs)
            if min(n1, n2) >= 30:
                se = np.sqrt(g_miss.var() / n1 + g_obs.var() / n2)
                if se < 1e-12:
                    return {"p_value": None, "signal": "Zero variance", "significant": False}
                z_stat = (g_miss.mean() - g_obs.mean()) / se
                # sf() keeps precision for tiny tail probabilities, unlike 1 - cdf().
                p_val = 2 * norm.sf(abs(z_stat))
            else:
                _, p_val = ttest_ind(g_miss, g_obs, equal_var=False)
            diff_pct = abs(g_miss.mean() - g_obs.mean()) / max(abs(g_obs.mean()), 1e-9) * 100
        else:
            ct = pd.crosstab(missing_mask.astype(int), df[target_col])
            _, p_val, _, _ = chi2_contingency(ct)
            # Compare shares class by class. Comparing each group's own most
            # frequent class would report 0% when the groups have different
            # modal classes of equal share.
            share_miss = g_miss.value_counts(normalize=True)
            share_obs = g_obs.value_counts(normalize=True)
            diff_pct = share_miss.subtract(share_obs, fill_value=0).abs().max() * 100

        if np.isnan(p_val):
            return {"p_value": None, "signal": "Test statistic undefined", "significant": False}

        sig = bool(p_val < 0.05)
        if sig:
            signal = f"Significant — target differs by {diff_pct:.1f}%"
        else:
            signal = f"Not significant (p={p_val:.4f})"
        return {
            "p_value": round(float(p_val), 4),
            "significant": sig,
            "diff_pct": round(float(diff_pct), 2),
            "signal": signal,
        }
    except Exception as e:
        return {"p_value": None, "signal": f"Error: {e}", "significant": False}

def classify_mechanism(t_feat, t_target, little):
    """Map the three diagnostic results to a missingness mechanism.

    Rules are checked in priority order: target dependency first, then the
    share of significantly differing features, then the MCAR test.

    Args:
        t_feat: Feature-dependency result; ``sig_pct`` is read (default 0).
        t_target: Target-dependency result; ``significant`` (default False)
            and ``diff_pct`` (default 0) are read.
        little: MCAR test result; ``reject_mcar`` is read.

    Returns:
        Tuple ``(mechanism, confidence, explanation)``.
    """
    feature_share = t_feat.get("sig_pct", 0)
    target_diff = t_target.get("diff_pct", 0)

    # Target dependency outranks everything else.
    if t_target.get("significant", False):
        if target_diff >= 10:
            return "MNAR", "High", "Missingness strongly correlates with the outcome."
        if target_diff >= 5:
            return "MNAR", "Moderate", "Moderate dependency on target. Treat conservatively as MNAR."

    if feature_share > 30:
        return "MAR", "High", "Strong dependency on observed features detected."
    if feature_share > 0:
        return "MAR", "Moderate", "Weak but present dependency on observed features."
    if little.get("reject_mcar"):
        return "MAR", "Low", "Little's test rejects MCAR, but feature tests show weak dependency."
    return "MCAR", "High", "No statistical evidence of systematic missingness."

def run_single_diagnostic(df, col, target_col):
    """Run all diagnostics for ``col`` and cache the outcome in session state.

    Runs the MCAR test, the feature-dependency tests and (unless ``col`` is
    the target itself) the target-dependency test, classifies the mechanism,
    and stores everything under ``st.session_state["col_diagnostics"][col]``.
    """
    little = littles_mcar_test(df, [col])
    t_feat = feature_dependency_tests(df, col)

    if col == target_col:
        # The target cannot be tested against itself.
        t_target = {"p_value": None, "significant": False, "signal": "Skipped (Is Target)", "diff_pct": 0}
    else:
        t_target = target_dependency_test(df, col, target_col)

    mechanism, confidence, explanation = classify_mechanism(t_feat, t_target, little)

    record = {
        "mechanism": mechanism,
        "confidence": confidence,
        "explanation": explanation,
        "miss_pct": round(df[col].isnull().mean() * 100, 2),
        "dtype": str(df[col].dtype),
        "little": little,
        "t_feat": t_feat,
        "t_target": t_target,
    }
    st.session_state["col_diagnostics"][col] = record


# ════════════════════════════════════════════════════════════════════
#  IMPUTATION SIMULATION HELPERS
# ════════════════════════════════════════════════════════════════════
def _model_based_impute(df: pd.DataFrame, col: str, target_col: str, impute_method: str) -> pd.Series:
    """Impute ``col`` with KNN (``"KNN"``) or MICE (anything else) over standardised numeric features."""
    # The target is excluded as a predictor, but ``col`` itself must always be
    # part of the matrix, even when the column being imputed is the target.
    numeric_cols = [
        c for c in df.select_dtypes(include=[np.number]).columns
        if c != target_col or c == col
    ]
    features = df[numeric_cols]
    # The imputers discard columns that are entirely NaN, which would shift
    # the output columns; drop them up front so positions stay aligned.
    features = features.loc[:, features.notna().any()]

    scaler = StandardScaler()
    scaled = scaler.fit_transform(features)
    if impute_method == "KNN":
        imputer = KNNImputer(n_neighbors=5)
    else:
        imputer = IterativeImputer(random_state=42, max_iter=10)
    restored = scaler.inverse_transform(imputer.fit_transform(scaled))

    # Re-attach the caller's index so later comparisons align row by row.
    return pd.Series(restored[:, features.columns.get_loc(col)], index=df.index, name=col)


def feasibility_checks(df: pd.DataFrame, col: str, target_col: str, impute_method: str) -> dict:
    """Simulate an imputation of ``col`` and measure how much it distorts the data.

    Args:
        df: Data containing ``col``.
        col: Numeric column to impute.
        target_col: Target column; excluded from predictors and from the
            correlation check.
        impute_method: ``"Mean"``, ``"Median"``, ``"KNN"``; any other value
            selects MICE (``IterativeImputer``).

    Returns:
        ``{"applicable": False}`` when ``col`` is non-numeric or has fewer
        than five observed values. Otherwise a dict with ``imputed_series``
        (indexed like ``df``) and the sub-reports ``skewness``, ``outliers``,
        ``variance`` and ``correlation``, each carrying a ``verdict`` of
        ``"ok"``, ``"warn"`` or ``"fail"``. ``reasons`` lists any fallback
        that happened (e.g. a model-based imputer failing and Median being
        used instead).
    """
    series = df[col].dropna()
    if len(series) < 5 or not pd.api.types.is_numeric_dtype(df[col]):
        return {"applicable": False}

    results = {"applicable": True, "escalate_to_knn": False, "reasons": []}

    # ── 1. Impute ──
    if impute_method == "Mean":
        imputed_series = df[col].fillna(series.mean())
    elif impute_method == "Median":
        imputed_series = df[col].fillna(series.median())
    else:
        try:
            imputed_series = _model_based_impute(df, col, target_col, impute_method)
        except Exception as exc:
            # Keep the simulation usable, but say that the numbers below
            # describe a Median fill rather than the requested method.
            results["reasons"].append(
                f"{impute_method} imputation failed ({exc}); fell back to Median."
            )
            imputed_series = df[col].fillna(series.median())

    results["imputed_series"] = imputed_series

    # ── 2. Skewness & Outliers ──
    skew = series.skew()
    q1_before, q3_before = series.quantile(0.25), series.quantile(0.75)
    iqr_before = q3_before - q1_before
    outliers_before = int(
        ((series < q1_before - 1.5 * iqr_before) | (series > q3_before + 1.5 * iqr_before)).sum()
    )

    q1_after, q3_after = imputed_series.quantile(0.25), imputed_series.quantile(0.75)
    iqr_after = q3_after - q1_after
    outliers_after = int(
        ((imputed_series < q1_after - 1.5 * iqr_after) | (imputed_series > q3_after + 1.5 * iqr_after)).sum()
    )
    new_outliers = max(0, outliers_after - outliers_before)

    if impute_method == "Mean":
        skew_verdict = "fail" if abs(skew) > 1 else "ok"
    elif impute_method == "Median":
        skew_verdict = "warn" if abs(skew) > 3 else "ok"
    else:
        skew_verdict = "ok"

    results["skewness"] = {"verdict": skew_verdict, "value": skew, "msg": f"Skewness = {skew:.3f}"}

    # More than 5% of the observed count turning into outliers is a warning.
    out_verdict = "warn" if new_outliers > (len(series) * 0.05) else "ok"
    results["outliers"] = {
        "verdict": out_verdict,
        "new_outliers": new_outliers,
        "outliers_before": outliers_before,
        "outliers_after": outliers_after,
    }

    # ── 3. Variance Impact ──
    var_before = series.var()
    var_after = imputed_series.var()
    # Positive values mean the variance shrank after imputation.
    var_drop_pct = (var_before - var_after) / var_before * 100 if var_before > 1e-12 else 0

    if var_drop_pct <= 10:
        var_verdict = "ok"
    elif var_drop_pct <= 20:
        var_verdict = "warn"
    else:
        var_verdict = "fail"
    var_msg = f"Variance Change: {var_drop_pct:.1f}%"

    results["variance"] = {"verdict": var_verdict, "msg": var_msg, "var_drop_pct": var_drop_pct}

    # ── 4. Correlation Preservation ──
    numeric_others = [
        c for c in df.select_dtypes(include=[np.number]).columns
        if c != col and c != target_col
    ]
    corr_results, max_corr_shift, sign_flip = {}, 0.0, False

    # Only the first ten numeric columns are compared.
    for other in numeric_others[:10]:
        s_before = df[[col, other]].dropna()
        if len(s_before) < 5:
            continue
        r_before = s_before[col].corr(s_before[other])
        r_after = imputed_series.corr(df[other])
        # Correlation is undefined for (near-)constant columns; nothing to compare.
        if pd.isna(r_before) or pd.isna(r_after):
            continue

        delta = abs(r_before - r_after)
        flipped = bool((r_before * r_after < 0) and (abs(r_before) > 0.1))

        corr_results[other] = {
            "r_before": round(r_before, 4),
            "r_after": round(r_after, 4),
            "delta": round(delta, 4),
            "sign_flip": flipped,
        }
        max_corr_shift = max(max_corr_shift, delta)
        if flipped:
            sign_flip = True

    if max_corr_shift <= 0.05 and not sign_flip:
        corr_verdict, corr_msg = "ok", f"Max Δ = {max_corr_shift:.3f} — Correlation well preserved"
    elif sign_flip:
        corr_verdict, corr_msg = "fail", "Sign flip detected! Correlation direction reversed."
    elif max_corr_shift <= 0.10:
        corr_verdict, corr_msg = "warn", f"Max Δ = {max_corr_shift:.3f} — Moderate correlation shift"
    else:
        corr_verdict, corr_msg = "fail", f"Max Δ = {max_corr_shift:.3f} — Large correlation shift detected"

    results["correlation"] = {
        "details": corr_results,
        "verdict": corr_verdict,
        "msg": corr_msg,
        "max_shift": round(max_corr_shift, 4),
    }

    return results

def get_auto_recommendation(df, col, target, mechanism, miss_pct, dtype):
    """Choose an imputation strategy label for ``col``.

    Decision order: drop the column above 70% missing; drop rows for MCAR
    with at most 5% missing; mode imputation for non-numeric columns;
    otherwise a Median feasibility simulation decides between Mean/Median
    (variance and correlation acceptable) and KNN/MICE. A
    " + Missing Indicator" suffix is added for MNAR, and for MAR with at
    least 10% missing.

    Args:
        df: Data containing ``col``.
        col: Column to recommend a strategy for.
        target: Target column name, forwarded to the feasibility simulation.
        mechanism: ``"MCAR"``, ``"MAR"`` or ``"MNAR"``.
        miss_pct: Percentage of missing values in ``col``.
        dtype: Not used by this function.

    Returns:
        The recommendation as a display string.
    """
    # Hard rules that need no simulation.
    if miss_pct > 70:
        return "Drop Column"
    if mechanism == "MCAR" and miss_pct <= 5:
        return "Drop Rows"

    wants_indicator = mechanism == "MNAR" or (mechanism == "MAR" and miss_pct >= 10)
    suffix = " + Missing Indicator" if wants_indicator else ""

    # Categorical / non-numeric columns can only take the mode.
    if not pd.api.types.is_numeric_dtype(df[col]):
        return "Mode Imputation" + suffix

    # Numeric: simulate a Median fill and inspect the damage.
    simulation = feasibility_checks(df, col, target, "Median")
    if not simulation.get("applicable"):
        return "Median Imputation" + suffix

    variance_ok = simulation["variance"]["var_drop_pct"] <= 20
    correlation_ok = simulation["correlation"]["verdict"] != "fail"
    abs_skew = abs(simulation["skewness"].get("value", 0))

    if not (variance_ok and correlation_ok):
        # Single-value fills distort too much; use a model-based imputer.
        strategy = "MICE Imputer" if miss_pct > 30 else "KNN Imputer"
    elif abs_skew <= 1:
        strategy = "Mean Imputation"
    else:
        strategy = "Median Imputation"
    return strategy + suffix


# ════════════════════════════════════════════════════════════════════
#  SIDEBAR NAVIGATION
# ════════════════════════════════════════════════════════════════════
# Labels of the five wizard steps, in the order they appear in the sidebar radio.
STEPS = ["1 · Upload & Split", "2 · Overview", "3 · Column Diagnostics", "4 · Feasibility Gate", "5 · Final Report"]

with st.sidebar:
    st.markdown("## 🔬 Missing Value Analyzer")
    st.markdown("---")
    # `step` holds the label selected in the radio; the label itself is hidden.
    step = st.radio("Navigate:", STEPS, label_visibility="collapsed")
    st.markdown("---")
    # Status lines are shown only once a train set exists in session state.
    if st.session_state.get("df_train") is not None:
        st.markdown(f"**Train set:** {st.session_state['df_train'].shape[0]} rows × {st.session_state['df_train'].shape[1]} cols")
        st.markdown(f"**Diagnosed:** {len(st.session_state['col_diagnostics'])} columns")
    st.markdown("<small style='color:#9090c0'>Analysis runs on TRAIN SET only to prevent data leakage.</small>", unsafe_allow_html=True)


# ════════════════════════════════════════════════════════════════════
#  STEP 1 — UPLOAD & SPLIT
# ════════════════════════════════════════════════════════════════════
def render_step1():
    """Render Step 1: upload a CSV, choose the target column and split train/test.

    When "Confirm & Split" is pressed, the full frame, both partitions (with
    fresh 0..n-1 indexes) and the chosen target are written to
    ``st.session_state`` and previously stored column diagnostics are cleared.
    Unreadable uploads and frames too small to split are reported with
    ``st.error`` instead of raising.
    """
    st.markdown('<div class="main-title">📂 Step 1 — Upload CSV & Train/Test Split</div>', unsafe_allow_html=True)
    uploaded = st.file_uploader("Choose a CSV file", type=["csv"])
    if not uploaded:
        return st.info("👆 Upload a CSV file to begin.")

    # The upload is untrusted input: an empty, malformed or non-UTF-8 file
    # should produce a message, not a stack trace.
    try:
        df = pd.read_csv(uploaded)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        return st.error(f"❌ Could not read **{uploaded.name}**: {exc}")

    # A split needs at least one row on each side, and the target selector
    # needs at least one column to offer.
    if df.shape[1] == 0 or len(df) < 2:
        return st.error("❌ The file needs at least one column and two rows to be split.")

    st.success(f"✅ Loaded **{uploaded.name}**")

    col1, col2 = st.columns(2)
    # Default the target to the last column.
    target = col1.selectbox("Target column (Y):", df.columns.tolist(), index=len(df.columns)-1)
    split_pct = col2.slider("Train size:", 50, 95, 80, 5, format="%d%%")

    if st.button("✅ Confirm & Split", type="primary"):
        # Fixed seed so the same upload always yields the same partitions.
        df_train, df_test = train_test_split(df, train_size=split_pct/100.0, random_state=42)
        st.session_state.update({
            "df_full": df,
            "df_train": df_train.reset_index(drop=True),
            "df_test": df_test.reset_index(drop=True),
            "target_col": target,
            "col_diagnostics": {},
        })
        st.success("✅ Split complete!")
        st.dataframe(df_train.head(), use_container_width=True)

# ════════════════════════════════════════════════════════════════════
#  STEP 2 — OVERVIEW
# ════════════════════════════════════════════════════════════════════
def _strongest_missingness_pair(corr_matrix: pd.DataFrame):
    """Return ``(col_a, col_b, r)`` for the off-diagonal pair with the largest ``|r|``, or ``None``."""
    labels = list(corr_matrix.columns)
    best_pair, best_abs = None, 0.0
    for i in range(len(labels)):
        for j in range(i):
            r = corr_matrix.iloc[i, j]
            # NaN (constant indicator) compares False and is therefore ignored.
            if abs(r) > best_abs:
                best_abs = abs(r)
                best_pair = (labels[i], labels[j], r)
    return best_pair


def render_step2():
    """Render Step 2: missing-value summary table, pattern heatmap and missingness correlation.

    Works on the train set stored in ``st.session_state["df_train"]`` and
    shows a warning when Step 1 has not been completed.
    """
    from html import escape
    from matplotlib.colors import ListedColormap

    st.markdown('<div class="main-title">📊 Step 2 — Missing Value Overview</div>', unsafe_allow_html=True)
    df = st.session_state.get("df_train")
    if df is None:
        return st.warning("⚠️ Please complete Step 1.")

    miss_cols = [c for c in df.columns if df[c].isnull().any()]
    if not miss_cols:
        return st.success("🎉 No missing values!")

    # ── Summary table ──
    null_counts = df[miss_cols].isnull().sum()
    summary = pd.DataFrame({
        "Missing Count": null_counts,
        "Missing %": (null_counts / len(df) * 100).round(2)
    }).sort_values("Missing %", ascending=False)
    st.dataframe(
        summary.style.background_gradient(cmap="YlOrRd", subset=["Missing %"]),
        use_container_width=True
    )

    st.markdown("---")

    # ── Missingness Heatmap ──
    st.markdown("### 🗺️ Missingness Heatmap")
    st.caption("Each dark stripe = a missing value in that row. Aligned stripes across columns = rows missing together (MAR signal).")

    fig_h, ax_h = plt.subplots(figsize=(14, max(3, len(miss_cols) * 0.6)))
    fig_h.patch.set_facecolor('#f8f8f8')
    ax_h.set_facecolor('#f0f0f0')

    miss_matrix = df[miss_cols].isnull().astype(int)
    # Subsample rows for performance if large; sort so the sampled rows keep
    # their original relative order on the x axis.
    if len(miss_matrix) > 2000:
        miss_matrix = miss_matrix.sample(2000, random_state=42).sort_index().reset_index(drop=True)

    ax_h.imshow(
        miss_matrix.T.values,
        aspect='auto',
        # imshow needs a Colormap object (or a registered name); a plain
        # list of colours is not accepted.
        cmap=ListedColormap(["#f0f0f0", "#17172b"]),
        # Pin the scale so 0 is always light and 1 always dark, even if the
        # plotted matrix happens to contain a single value.
        vmin=0, vmax=1,
        interpolation='none'
    )
    ax_h.set_yticks(range(len(miss_cols)))
    ax_h.set_yticklabels(miss_cols, fontsize=10)
    ax_h.set_xlabel("Row index (sampled)" if len(df) > 2000 else "Row index", fontsize=10)
    ax_h.set_title("Missing Value Pattern (dark = missing)", fontsize=12, fontweight='bold', pad=10)
    ax_h.spines[['top','right','bottom','left']].set_visible(False)
    fig_h.tight_layout()
    st.pyplot(fig_h, use_container_width=True)
    # Close this specific figure rather than whatever pyplot considers current.
    plt.close(fig_h)

    st.markdown("---")

    # ── Missingness Correlation Heatmap ──
    st.markdown("### 🔗 Missingness Correlation")
    st.caption("Correlation between missing patterns of columns. Values near 1.0 = these columns tend to be missing in the same rows — strong MAR signal.")

    if len(miss_cols) < 2:
        st.info("Only one column with missing values — correlation requires at least two.")
        return

    miss_indicator = df[miss_cols].isnull().astype(int)
    corr_matrix = miss_indicator.corr()

    fig_c, ax_c = plt.subplots(figsize=(max(6, len(miss_cols) * 1.2), max(5, len(miss_cols) * 1.0)))
    fig_c.patch.set_facecolor('#f8f8f8')

    mask = np.zeros_like(corr_matrix, dtype=bool)
    mask[np.triu_indices_from(mask, k=1)] = True  # show lower triangle only

    sns.heatmap(
        corr_matrix,
        mask=mask,
        annot=True,
        fmt=".2f",
        cmap="RdYlGn",
        vmin=-1, vmax=1,
        center=0,
        ax=ax_c,
        square=True,
        linewidths=0.5,
        linecolor='white',
        annot_kws={"size": 10, "weight": "bold"},
        cbar_kws={"shrink": 0.8}
    )
    ax_c.set_title("Pairwise Missingness Correlation", fontsize=12, fontweight='bold', pad=12)
    ax_c.tick_params(axis='x', rotation=45, labelsize=10)
    ax_c.tick_params(axis='y', rotation=0, labelsize=10)
    fig_c.tight_layout()
    st.pyplot(fig_c, use_container_width=True)
    plt.close(fig_c)

    # Interpretation callout for the strongest pair (by absolute correlation).
    strongest = _strongest_missingness_pair(corr_matrix)
    if strongest is None:
        return

    c1, c2, v = strongest
    # Column names come from the uploaded file and are embedded in raw HTML.
    c1, c2 = escape(str(c1)), escape(str(c2))
    if v >= 0.9:
        st.markdown(f'<div class="card-danger">🚨 <b>Very high missingness correlation ({v:.2f})</b> between <code>{c1}</code> and <code>{c2}</code> — these rows go missing together. Strong MAR signal; consider joint imputation (KNN/MICE).</div>', unsafe_allow_html=True)
    elif v >= 0.5:
        st.markdown(f'<div class="card-warn">⚠️ <b>Moderate missingness correlation ({v:.2f})</b> between <code>{c1}</code> and <code>{c2}</code> — partial co-occurrence of missingness detected.</div>', unsafe_allow_html=True)
    elif v <= -0.5:
        # A strong negative value is still a dependency: one column tends to
        # be observed exactly where the other is missing.
        st.markdown(f'<div class="card-warn">⚠️ <b>Strong negative missingness correlation ({v:.2f})</b> between <code>{c1}</code> and <code>{c2}</code> — when one is missing the other tends to be observed, so missingness is not independent.</div>', unsafe_allow_html=True)
    else:
        st.markdown(f'<div class="card-ok">✅ <b>Low missingness correlation (max {v:.2f})</b> — columns appear to be missing independently.</div>', unsafe_allow_html=True)

# ════════════════════════════════════════════════════════════════════
#  STEP 3 — DIAGNOSTICS
# ════════════════════════════════════════════════════════════════════
def render_step3():
    """Render Step 3: run and display per-column missingness diagnostics.

    Reads the train set and target column from ``st.session_state``. The
    buttons run ``run_single_diagnostic`` for the selected column or for every
    column with missing values; results are read back from
    ``st.session_state["col_diagnostics"]`` and shown as a verdict card plus
    one table per test and a summary of the decision inputs.
    """
    st.markdown('<div class="main-title">🧪 Step 3 — Per-Column Diagnostics</div>', unsafe_allow_html=True)
    df, target = st.session_state.get("df_train"), st.session_state.get("target_col")
    if df is None: return st.warning("⚠️ Please complete Step 1.")

    miss_cols = [c for c in df.columns if df[c].isnull().any()]
    if not miss_cols: return st.success("🎉 No missing values.")

    col1, col2 = st.columns([1, 4])
    selected_col = col1.selectbox("Select column to view:", miss_cols)
    run_single = col1.button("▶ Run Diagnostics")
    run_all = col2.button("▶ Run ALL columns", type="primary")

    if run_single:
        run_single_diagnostic(df, selected_col, target)
    if run_all:
        # Diagnose every column with missing values, updating a progress bar as we go.
        progress = st.progress(0, text="Running diagnostics...")
        for i, c in enumerate(miss_cols):
            run_single_diagnostic(df, c, target)
            progress.progress((i+1)/len(miss_cols), text=f"Diagnosing: {c}")
        progress.empty()
        st.success(f"✅ Diagnosed {len(miss_cols)} columns.")

    # Results are only rendered for the selected column, and only once it has been diagnosed.
    if selected_col in st.session_state["col_diagnostics"]:
        res = st.session_state["col_diagnostics"][selected_col]
        little, t_feat, t_target = res["little"], res["t_feat"], res["t_target"]
        
        st.markdown("---")

        # ── Mechanism verdict card ──
        # Card colour and emoji are keyed by the classified mechanism.
        card_class = {"MCAR":"card-mcar","MAR":"card-mar","MNAR":"card-mnar"}[res["mechanism"]]
        emoji = {"MCAR":"🟢","MAR":"🟠","MNAR":"🔴"}[res["mechanism"]]
        st.markdown(
            f'<div class="{card_class}">'
            f'<div class="verdict-label">{emoji} Mechanism: {res["mechanism"]} — {res["confidence"]} Confidence</div>'
            f'<div class="verdict-desc">{res["explanation"]}</div>'
            f'<div class="verdict-desc" style="margin-top:6px">Missing: <b>{res["miss_pct"]}%</b> &nbsp;|&nbsp; dtype: <b>{res["dtype"]}</b></div>'
            f'</div>',
            unsafe_allow_html=True
        )

        # ══ TEST 1: Little's MCAR ══
        st.markdown('<div class="test-header">🔬 Test 1 — Little\'s MCAR Test</div>', unsafe_allow_html=True)
        with st.expander("ℹ️ What does this test measure?", expanded=False):
            st.markdown("""
            **Little's MCAR test** checks if missingness is completely random.
            - **H₀ (null):** Data is Missing Completely At Random (MCAR)  
            - **p ≥ 0.05:** Fail to reject → data may be MCAR  
            - **p < 0.05:** Reject → systematic missingness detected
            """)

        # Keys absent from the result dict fall back to "N/A"; a missing or
        # falsy "reject_mcar" is displayed as "No".
        little_rows = [{
            "Test": "Little's MCAR",
            "χ² Statistic": little.get("chi2", "N/A"),
            "Degrees of Freedom": little.get("df", "N/A"),
            "p-value": little.get("p_value", "N/A"),
            "Verdict": little.get("verdict", "N/A"),
            "Reject MCAR?": "✅ Yes — systematic" if little.get("reject_mcar") else "❌ No — may be MCAR"
        }]
        st.dataframe(pd.DataFrame(little_rows), use_container_width=True, hide_index=True)

        # ══ TEST 2: Target Dependency ══
        st.markdown('<div class="test-header">🎯 Test 2 — Target Dependency Test</div>', unsafe_allow_html=True)
        with st.expander("ℹ️ What does this test measure?", expanded=False):
            st.markdown("""
            Tests if the **target variable** has different values when this column is missing vs. observed.
            - **Numeric target:** z-test or Welch t-test  
            - **Categorical target:** Chi-squared test  
            - **Significant (p<0.05) + large diff % → MNAR** (missingness depends on outcome)
            """)

        # "Target Diff %" shows "N/A" when the result has no diff_pct (test could not be run).
        tgt_rows = [{
            "Test Applied": "z-test / Welch t-test / Chi²",
            "p-value": t_target.get("p_value", "N/A"),
            "Target Diff %": f'{t_target.get("diff_pct", 0):.1f}%' if t_target.get("diff_pct") is not None else "N/A",
            "Significant (p<0.05)?": "✅ Yes" if t_target.get("significant") else "❌ No",
            "Interpretation": t_target.get("signal", "N/A")
        }]
        st.dataframe(pd.DataFrame(tgt_rows), use_container_width=True, hide_index=True)

        # ══ TEST 3: Feature Dependency ══
        st.markdown('<div class="test-header">🔗 Test 3 — Feature Dependency Tests</div>', unsafe_allow_html=True)
        with st.expander("ℹ️ What does this test measure?", expanded=False):
            st.markdown("""
            For each other feature, tests if values differ **significantly** between rows where this column is missing vs. observed.
            - **Numeric features:** z-test (n≥30) or Welch t-test  
            - **Categorical features:** Chi-squared test  
            - **Many significant features (>30%) → MAR** (missingness explained by observed data)
            """)

        # Summary row first
        summary_cols = st.columns(3)
        summary_cols[0].metric("Features Tested", t_feat.get("total_tested", 0))
        summary_cols[1].metric("Significant (p<0.05)", t_feat.get("n_significant", 0))
        summary_cols[2].metric("% Significant", f'{t_feat.get("sig_pct", 0):.1f}%')

        if t_feat["results"]:
            # One table row per tested feature, most significant first.
            rows = []
            for f, r in t_feat["results"].items():
                rows.append({
                    "Feature": f,
                    "Data Type": r["type"].capitalize(),
                    "Test Used": r["test"],
                    "Test Statistic": r["stat"],
                    "p-value": r["p_value"],
                    "p < 0.05?": "✅ Significant" if r["significant"] else "—"
                })
            feat_df = pd.DataFrame(rows).sort_values("p-value")

            def highlight_sig(row):
                """Return per-cell CSS that tints the whole row when the feature is significant."""
                if row["p < 0.05?"] == "✅ Significant":
                    return ["background-color:#ffe4e1; color:#900000"] * len(row)
                return [""] * len(row)

            st.dataframe(
                feat_df.style.apply(highlight_sig, axis=1),
                use_container_width=True,
                hide_index=True
            )
        else:
            st.info("No feature dependency results available (insufficient data or no other columns).")

        # ══ Decision Logic Summary ══
        # Recap of the inputs that fed the mechanism classification, followed by its outcome.
        st.markdown('<div class="test-header">🧠 Decision Logic Summary</div>', unsafe_allow_html=True)
        logic_rows = [
            {"Rule Check": "Little's test rejects MCAR?", "Result": "✅ Yes" if little.get("reject_mcar") else "❌ No"},
            {"Rule Check": "Target differs significantly?", "Result": "✅ Yes" if t_target.get("significant") else "❌ No"},
            {"Rule Check": "Target diff magnitude", "Result": f'{t_target.get("diff_pct", 0):.1f}% difference'},
            {"Rule Check": "% of features with significant diff", "Result": f'{t_feat.get("sig_pct", 0):.1f}%'},
            {"Rule Check": "→ Final Mechanism", "Result": f'{res["mechanism"]} ({res["confidence"]} confidence)'},
        ]
        st.dataframe(pd.DataFrame(logic_rows), use_container_width=True, hide_index=True)


# ════════════════════════════════════════════════════════════════════
#  STEP 4 — FEASIBILITY GATE (Interactive)
# ════════════════════════════════════════════════════════════════════
def render_step4():
    """Render Step 4: simulate one imputation method on one numeric column.

    Reads ``df_train``, ``target_col`` and ``col_diagnostics`` from
    ``st.session_state``. After the user picks a column and a method and
    presses the button, ``feasibility_checks`` is called and its result is
    shown as a four-tile statistics banner, a KDE overlay plus side-by-side
    box plots, and a per-feature correlation-shift table.

    The result dict is read under these keys: ``applicable``, ``variance``
    (``var_drop_pct``, ``verdict``), ``outliers`` (``new_outliers``,
    ``outliers_before``, ``outliers_after``, ``verdict``), ``correlation``
    (``max_shift``, ``verdict``, ``msg``, ``details``), ``skewness``
    (``value``, ``verdict``) and ``imputed_series``.

    Returns:
        The Streamlit element produced by an early-exit message, otherwise
        ``None``.
    """
    st.markdown('<div class="main-title">⚖️ Step 4 — Imputation Feasibility Gate</div>', unsafe_allow_html=True)

    with st.expander("📚 Theory & Guide: Why test imputation mathematically? (Click to expand)"):
        st.markdown("""
        <div class="theory-box">
            <h4>Why test imputation mathematically?</h4>
            <p>Single-value imputations (like filling blanks with Mean or Median) are dangerous if overused. They can:</p>
            <ul>
                <li><b>Collapse Variance:</b> If you fill 20% of the data with the same number, the spread of your data shrinks unnaturally.</li>
                <li><b>Create Artificial Outliers:</b> Because the variance (IQR) shrank, real valid data points at the edges suddenly look like outliers!</li>
                <li><b>Destroy Correlation:</b> Assigning a median weight to someone without considering their height breaks the natural relationship between features.</li>
            </ul>
            <p><b>KNN and MICE</b> solve this by acting like mini machine-learning models — they look at other features to make an educated guess, preserving variance and correlations.</p>
        </div>
        """, unsafe_allow_html=True)

    df, target = st.session_state.get("df_train"), st.session_state.get("target_col")
    col_diag = st.session_state.get("col_diagnostics", {})
    # Without a training frame there is nothing to index into, so treat it
    # the same as "diagnostics not run yet".
    if df is None or not col_diag:
        return st.warning("⚠️ Please run diagnostics in Step 3 first.")

    # Skip diagnostics entries that are no longer columns of the current
    # frame instead of failing on df[c] with a KeyError.
    numeric_cols = [
        c for c in col_diag
        if c in df.columns and pd.api.types.is_numeric_dtype(df[c])
    ]
    if not numeric_cols:
        return st.info("No numeric columns available.")

    col1, col2 = st.columns([1, 2])
    selected_col  = col1.selectbox("Select numeric column:", numeric_cols)
    impute_choice = col2.radio("Simulate impact of:", ["Mean", "Median", "KNN", "MICE"], horizontal=True)

    if st.button(f"▶ Simulate {impute_choice} Imputation", type="primary"):
        with st.spinner(f"Running {impute_choice} simulation (may take a moment for KNN/MICE)..."):
            feas = feasibility_checks(df, selected_col, target, impute_choice)

        if not feas.get("applicable"):
            return st.error("Column not applicable for numeric feasibility checks.")

        ICONS  = {"ok": "✅", "warn": "⚠️", "fail": "❌"}
        COLORS = {"ok": "stat-ok", "warn": "stat-warn", "fail": "stat-fail"}

        # ── Big Stats Banner ──
        st.markdown("### 📊 Imputation Impact — Key Statistics")
        m1, m2, m3, m4 = st.columns(4)

        var_pct   = feas["variance"]["var_drop_pct"]
        var_verd  = feas["variance"]["verdict"]
        new_out   = feas["outliers"]["new_outliers"]
        out_verd  = feas["outliers"]["verdict"]
        corr_verd = feas["correlation"]["verdict"]
        corr_max  = feas["correlation"]["max_shift"]
        skew_val  = feas["skewness"]["value"]
        skew_verd = feas["skewness"]["verdict"]

        # ``var_drop_pct`` is a drop, so the signed change is its negation.
        # Formatting with an explicit sign avoids "--x%" when the variance
        # actually went up, and keeps banner and plot title in agreement.
        var_change = -var_pct
        var_label = "Safe" if var_verd == "ok" else "Caution" if var_verd == "warn" else "High Risk"
        out_before = feas["outliers"]["outliers_before"]
        out_after = feas["outliers"]["outliers_after"]
        skew_label = "Low" if abs(skew_val) <= 1 else "Moderate" if abs(skew_val) <= 3 else "High"

        m1.markdown(
            f'<div class="big-stat-box {COLORS[var_verd]}">'
            f'<div class="big-stat-val">{var_change:+.1f}%</div>'
            f'<div class="big-stat-lbl">Variance Change</div>'
            f'<div class="big-stat-sub">{ICONS[var_verd]} {var_label}</div>'
            f'</div>', unsafe_allow_html=True
        )
        m2.markdown(
            f'<div class="big-stat-box {COLORS[out_verd]}">'
            f'<div class="big-stat-val">+{new_out}</div>'
            f'<div class="big-stat-lbl">New Outliers Created</div>'
            f'<div class="big-stat-sub">{ICONS[out_verd]} Before: {out_before} → After: {out_after}</div>'
            f'</div>', unsafe_allow_html=True
        )
        m3.markdown(
            f'<div class="big-stat-box {COLORS[corr_verd]}">'
            f'<div class="big-stat-val">Δ{corr_max:.3f}</div>'
            f'<div class="big-stat-lbl">Max Corr. Shift</div>'
            f'<div class="big-stat-sub">{ICONS[corr_verd]} {corr_verd.capitalize()}</div>'
            f'</div>', unsafe_allow_html=True
        )
        m4.markdown(
            f'<div class="big-stat-box {COLORS[skew_verd]}">'
            f'<div class="big-stat-val">{skew_val:.3f}</div>'
            f'<div class="big-stat-lbl">Skewness</div>'
            f'<div class="big-stat-sub">{ICONS[skew_verd]} {skew_label} skew</div>'
            f'</div>', unsafe_allow_html=True
        )

        st.markdown("---")

        # ── KDE Plots — Two clear separate charts ──
        st.markdown("### 📈 Distribution Comparison (KDE)")

        series   = df[selected_col].dropna()
        imputed  = feas["imputed_series"]
        miss_pct_col = df[selected_col].isnull().mean() * 100

        fig, axes = plt.subplots(1, 2, figsize=(16, 5))
        fig.patch.set_facecolor('#fafafa')

        # Plot 1: Overlapping KDE
        ax = axes[0]
        ax.set_facecolor('#f8f8f8')
        try:
            from scipy.stats import gaussian_kde
            # Fit and evaluate BOTH densities before drawing anything, so a
            # failure on either one leaves the axes clean for the fallback.
            kde_orig = gaussian_kde(series.values, bw_method='scott')
            kde_imp = gaussian_kde(imputed.values, bw_method='scott')
            x_range = np.linspace(min(series.min(), imputed.min()), max(series.max(), imputed.max()), 300)
            dens_orig = kde_orig(x_range)
            dens_imp = kde_imp(x_range)
        except Exception:
            # Best-effort fallback: if a density cannot be estimated, show
            # normalised histograms rather than failing the whole page.
            ax.hist(series.values, bins=25, alpha=0.5, color='#17172b', label='Original', density=True)
            ax.hist(imputed.values, bins=25, alpha=0.4, color='#d6336c', label=f'After {impute_choice}', density=True)
        else:
            # Original KDE
            ax.fill_between(x_range, dens_orig, alpha=0.35, color='#17172b', label='Original (observed only)')
            ax.plot(x_range, dens_orig, color='#17172b', lw=2.5)

            # Imputed KDE
            ax.fill_between(x_range, dens_imp, alpha=0.35, color='#d6336c', label=f'After {impute_choice}')
            ax.plot(x_range, dens_imp, color='#d6336c', lw=2.5, linestyle='--')

        ax.set_title(f'KDE: Original vs After {impute_choice}\n({miss_pct_col:.1f}% was missing)', fontsize=13, fontweight='bold', pad=12)
        ax.set_xlabel(selected_col, fontsize=11)
        ax.set_ylabel('Density', fontsize=11)
        ax.legend(fontsize=10)
        ax.grid(axis='y', alpha=0.3)
        ax.spines[['top','right']].set_visible(False)

        # Plot 2: Box plots side by side
        ax2 = axes[1]
        ax2.set_facecolor('#f8f8f8')
        bp = ax2.boxplot(
            [series.values, imputed.values],
            patch_artist=True,
            widths=0.5,
            medianprops=dict(color='#d6336c', linewidth=2.5),
            flierprops=dict(marker='o', markerfacecolor='#d6336c', markersize=5, alpha=0.5),
            whiskerprops=dict(linewidth=1.5),
            capprops=dict(linewidth=1.5),
        )
        # Label the two boxes after plotting: the ``labels=`` keyword of
        # boxplot is deprecated in newer Matplotlib, while relabelling the
        # ticks works on every version.
        ax2.set_xticklabels(['Original\n(non-missing)', f'After\n{impute_choice}'])
        bp['boxes'][0].set_facecolor('#c8d8f0')
        bp['boxes'][1].set_facecolor('#f5c6d0')

        # Annotate variance change (same signed value as the banner)
        ax2.set_title(
            f'Spread & Outliers\nVariance Change: {var_change:+.1f}% | New Outliers: +{new_out}',
            fontsize=13, fontweight='bold', pad=12
        )
        ax2.set_ylabel('Value', fontsize=11)
        ax2.grid(axis='y', alpha=0.3)
        ax2.spines[['top','right']].set_visible(False)

        fig.tight_layout(pad=2.5)
        st.pyplot(fig, use_container_width=True)
        # Close this specific figure rather than pyplot's "current" one.
        plt.close(fig)

        # ── Correlation Details ──
        st.markdown("---")
        st.markdown("#### 🔗 Correlation Preservation Details")
        corr_card = "ok" if corr_verd == "ok" else "warn" if corr_verd == "warn" else "danger"
        corr_msg = feas["correlation"]["msg"]
        st.markdown(f'<div class="card-{corr_card}">{ICONS[corr_verd]} <b>{corr_msg}</b></div>', unsafe_allow_html=True)
        if feas["correlation"]["details"]:
            rows = [{
                "Feature": f,
                "r (before)": r["r_before"],
                "r (after)": r["r_after"],
                "Δ (shift)": r["delta"],
                "Sign Flip?": "🚨 YES" if r["sign_flip"] else "No"
            } for f, r in feas["correlation"]["details"].items()]
            corr_df = pd.DataFrame(rows).sort_values("Δ (shift)", ascending=False)

            def highlight_corr(row):
                """Shade rows with a sign flip or a shift above 0.10."""
                if row["Sign Flip?"] == "🚨 YES": return ["background-color:#fde8e8; color:#900000"] * len(row)
                if row["Δ (shift)"] > 0.10:       return ["background-color:#fff0ed; color:#900000"] * len(row)
                return [""] * len(row)

            st.dataframe(corr_df.style.apply(highlight_corr, axis=1), use_container_width=True, hide_index=True)


# ════════════════════════════════════════════════════════════════════
#  STEP 5 — FINAL REPORT
# ════════════════════════════════════════════════════════════════════
def render_step5():
    """Render Step 5: the per-column report and mechanism summary counts.

    Reads ``df_train``, ``target_col`` and ``col_diagnostics`` from
    ``st.session_state``. For every diagnosed column it asks
    ``get_auto_recommendation`` for a strategy string, shows the result as a
    table ordered by missing percentage (highest first) with rows tinted by
    mechanism, and finally shows how many columns fall under MCAR, MAR and
    MNAR.

    Returns:
        The Streamlit warning element when no diagnostics are stored,
        otherwise ``None``.
    """
    st.markdown('<div class="main-title">📋 Step 5 — Final Diagnostic Report</div>', unsafe_allow_html=True)

    df, target = st.session_state.get("df_train"), st.session_state.get("target_col")
    col_diag = st.session_state.get("col_diagnostics", {})
    if not col_diag: return st.warning("⚠️ Run diagnostics in Step 3 first.")

    # ── Legend ──
    with st.expander("📖 How to read the Recommended Strategy column"):
        st.markdown("""
        | Label | Meaning |
        |-------|---------|
        | **Drop Rows** | MCAR + <5% missing — safe to delete affected rows |
        | **Drop Column** | >70% missing — too little data to impute reliably |
        | **Mean Imputation** | Low-skew numeric, variance loss is acceptable |
        | **Median Imputation** | Skewed numeric; median is more robust than mean |
        | **Mode Imputation** | Categorical / non-numeric columns |
        | **KNN Imputer** | Moderate missingness; feature relationships preserved |
        | **MICE Imputer** | High missingness (>30%); multiple-imputation approach |
        | **+ Missing Indicator** | Added when mechanism is MNAR, or MAR ≥ 10% missing — add a binary flag column `col_missing` alongside imputed values |
        """)

    # Order on the numeric percentage BEFORE it is formatted. Sorting the
    # formatted strings would be lexicographic ("5.0%" above "12.3%").
    ordered = sorted(col_diag.items(), key=lambda item: item[1]["miss_pct"], reverse=True)

    table_rows = [
        {
            "Column": col,
            "dtype": res["dtype"],
            "Missing %": f'{res["miss_pct"]:.1f}%',
            "Mechanism": res["mechanism"],
            "Confidence": res["confidence"],
            "Recommended Strategy": get_auto_recommendation(
                df, col, target, res["mechanism"], res["miss_pct"], res["dtype"]
            ),
        }
        for col, res in ordered
    ]
    report_df = pd.DataFrame(table_rows)

    def color_rows(row):
        """Tint a whole row according to its missingness mechanism."""
        mech_colors = {
            "MNAR": "background-color:#fff0ed; color:#000",
            "MAR":  "background-color:#fffaeb; color:#000",
            "MCAR": "background-color:#edfaf3; color:#000"
        }
        return [mech_colors.get(row["Mechanism"], "")] * len(row)

    st.dataframe(
        report_df.style.apply(color_rows, axis=1),
        use_container_width=True,
        hide_index=True
    )

    # ── Summary counts ──
    st.markdown("---")
    c1, c2, c3 = st.columns(3)
    mcar_n = sum(1 for r in col_diag.values() if r["mechanism"] == "MCAR")
    mar_n  = sum(1 for r in col_diag.values() if r["mechanism"] == "MAR")
    mnar_n = sum(1 for r in col_diag.values() if r["mechanism"] == "MNAR")
    # The page stylesheet sets `.metric-val{color:... !important}`, which
    # beats a plain inline colour; the inline value needs `!important` too
    # for the per-mechanism colours to show.
    c1.markdown(f'<div class="metric-box"><div class="metric-val" style="color:#0a5c30 !important">🟢 {mcar_n}</div><div class="metric-lbl">MCAR columns</div></div>', unsafe_allow_html=True)
    c2.markdown(f'<div class="metric-box"><div class="metric-val" style="color:#7a4f00 !important">🟠 {mar_n}</div><div class="metric-lbl">MAR columns</div></div>', unsafe_allow_html=True)
    c3.markdown(f'<div class="metric-box"><div class="metric-val" style="color:#900000 !important">🔴 {mnar_n}</div><div class="metric-lbl">MNAR columns</div></div>', unsafe_allow_html=True)


# Call the renderer whose position matches the entry of STEPS equal to
# `step`. STEPS is indexed lazily, one position at a time, and the first
# match wins; nothing is rendered when no entry matches.
for _pos, _renderer in enumerate(
    (render_step1, render_step2, render_step3, render_step4, render_step5)
):
    if step == STEPS[_pos]:
        _renderer()
        break