diff --git "a/src/streamlit_app.py" "b/src/streamlit_app.py"
--- "a/src/streamlit_app.py"
+++ "b/src/streamlit_app.py"
@@ -1,1497 +1,770 @@
 """
-Missing Value Intelligence Suite — Merged App
-Combines the stepwise pipeline (app.py) with the comprehensive dashboard (app_tanisha.py)
-into a unified 7-step workflow.
+Missing Value Analyzer — Statistically Rigorous Pipeline
+=========================================================
+Phases:
+  1  Upload CSV & Train/Test Split
+  2  Missing Value Overview (train set only)
+  3  Per-Column Diagnostics (Tables for all tests)
+  4  Imputation Feasibility Gate (KDE plots, Variance %, New Outliers)
+  5  Final Report & Recommendations
 """
 
 import streamlit as st
 import pandas as pd
 import numpy as np
+import matplotlib
+matplotlib.use("Agg")
 import matplotlib.pyplot as plt
-import matplotlib.patches as mpatches
 import seaborn as sns
 from scipy import stats
-from scipy.stats import chi2_contingency, ks_2samp, shapiro, skew, kurtosis
-from sklearn.preprocessing import LabelEncoder, StandardScaler
-from sklearn.linear_model import LogisticRegression
+from scipy.stats import chi2_contingency, ttest_ind, norm, chi2
 from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.impute import KNNImputer
+from sklearn.experimental import enable_iterative_imputer
+from sklearn.impute import IterativeImputer
 import warnings
 warnings.filterwarnings("ignore")
 
 # ─────────────────────────── Page config ────────────────────────────
 st.set_page_config(
-    page_title="Missing Value Intelligence Suite",
+    page_title="Missing Value Analyzer",
     page_icon="🔬",
     layout="wide",
     initial_sidebar_state="expanded",
 )
 
-# ─────────────────────────── Custom CSS ─────────────────────────────
+# ─────────────────────────── CSS ────────────────────────────────────
 st.markdown("""
 <style>
 @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
+html,body,[class*="css"]{font-family:'Inter',sans-serif;}
 
-html, body, [class*="css"] { font-family: 'Inter', sans-serif; }
-
-section[data-testid="stSidebar"] {
-    background: #17172b;
-    color: #ffffff;
-}
-section[data-testid="stSidebar"] * { color: #ffffff !important; }
-section[data-testid="stSidebar"] .stSelectbox label,
-section[data-testid="stSidebar"] .stRadio label { color: #c0c0e0 !important; }
-
-.main-title {
-    font-size: 2rem;
-    font-weight: 700;
-    color: #17172b;
-    margin-bottom: 0.2rem;
-}
-.main-sub {
-    font-size: 1rem;
-    color: #6060a0;
-    margin-bottom: 1.5rem;
-}
-
-.section-header {
-    font-size: 1.3rem; font-weight: 600; color: #1a1a2e;
-    background: linear-gradient(90deg, #eef2ff, transparent);
-    padding: 10px 16px; border-left: 4px solid #4f8ef7;
-    border-radius: 4px; margin: 24px 0 14px 0;
-}
-
-.step-badge {
-    display: inline-block;
-    background: #17172b;
-    color: #fff;
-    font-size: 0.72rem;
-    font-weight: 700;
-    padding: 3px 10px;
-    border-radius: 20px;
-    margin-bottom: 6px;
-    letter-spacing: 0.08em;
-    text-transform: uppercase;
-}
-
-.card-mcar { background:#edfaf3; border:2px solid #89d9ac; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
-.card-mar  { background:#fffaeb; border:2px solid #f0cc7a; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
-.card-mnar { background:#fff0ed; border:2px solid #f5a898; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
-.card-info { background:#eef2ff; border:2px solid #bdc8f5; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
-.card-warn { background:#fff8e1; border:2px solid #ffe082; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
-.card-strat{ background:#f8f0ff; border:2px solid #c8a0f0; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
-
-.verdict-label { font-size: 1.1rem; font-weight: 700; margin-bottom: 4px; }
-.verdict-desc  { font-size: 0.88rem; color: #444; }
-
-.metric-box {
-    background: #f5f3ee;
-    border-radius: 8px;
-    padding: 12px 16px;
-    text-align: center;
-}
-.metric-val { font-size: 1.4rem; font-weight: 700; color: #17172b; }
-.metric-lbl { font-size: 0.78rem; color: #6060a0; margin-top: 2px; }
-
-.metric-card {
-    background: white; border-radius: 10px; padding: 18px 24px;
-    box-shadow: 0 2px 8px rgba(0,0,0,0.08); text-align: center;
-}
-.metric-card .val { font-size: 2rem; font-weight: 700; color: #4f8ef7; }
-.metric-card .lbl { font-size: 0.82rem; color: #666; margin-top: 4px; }
-
-.col-stat-card {
-    background: white; border-radius: 10px; padding: 14px 18px;
-    box-shadow: 0 1px 6px rgba(0,0,0,0.07); text-align: center;
-}
-.col-stat-card .cv { font-size: 1.5rem; font-weight: 700; color: #1a1a2e; }
-.col-stat-card .ck { font-size: 0.75rem; color: #888; margin-top: 3px;
-    text-transform: uppercase; letter-spacing: .05em; }
-
-.badge-mcar { background:#d4edda; color:#155724; padding:3px 10px; border-radius:12px; font-size:0.82rem; font-weight:600; }
-.badge-mar  { background:#fff3cd; color:#856404; padding:3px 10px; border-radius:12px; font-size:0.82rem; font-weight:600; }
-.badge-mnar { background:#f8d7da; color:#721c24; padding:3px 10px; border-radius:12px; font-size:0.82rem; font-weight:600; }
-
-.strat-chip { display:inline-block; padding:4px 14px; border-radius:20px;
-    font-size:0.82rem; font-weight:600; margin:3px 3px; }
-.chip-green  { background:#d4edda; color:#155724; border:1px solid #89d9ac; }
-.chip-yellow { background:#fff3cd; color:#856404; border:1px solid #f0cc7a; }
-.chip-red    { background:#f8d7da; color:#721c24; border:1px solid #f5a898; }
-.chip-blue   { background:#dce3ff; color:#2a3da0; border:1px solid #bdc8f5; }
-
-.insight-box {
-    background: #f0f7ff; border: 1px solid #bdd5ff;
-    border-radius: 8px; padding: 16px 20px; margin: 12px 0;
-}
-.insight-box li { margin: 6px 0; color: #1a3a6e; font-size: 0.92rem; }
-.theory-box {
-    background: #fafafa; border: 1px solid #e0e0e0;
-    border-radius: 8px; padding: 16px 20px; margin: 12px 0;
-}
-.theory-box h4 { color: #333; margin-bottom: 8px; }
-.theory-box p  { color: #555; font-size: 0.91rem; line-height: 1.6; }
-
-code { background: #f0f0f8; padding: 2px 6px; border-radius: 4px; font-size: 0.85rem; }
-hr.divider { border: none; border-top: 2px solid #e0ddd8; margin: 1.5rem 0; }
-</style>
-""", unsafe_allow_html=True)
+section[data-testid="stSidebar"]{background:#17172b;}
+section[data-testid="stSidebar"] *{color:#ffffff !important;}
+section[data-testid="stSidebar"] hr{border-color:#ffffff33 !important;}
 
+.main-title{font-size:2rem;font-weight:700;color:#17172b;margin-bottom:.2rem;}
+.main-sub{font-size:1rem;color:#6060a0;margin-bottom:1.5rem;}
 
-# ════════════════════════════════════════════════════════════════════
-#  SHARED HELPER FUNCTIONS
-# ════════════════════════════════════════════════════════════════════
+.metric-box{background:#f5f3ee;border-radius:8px;padding:12px 16px;text-align:center;margin-bottom:8px;}
+.metric-val{font-size:1.4rem;font-weight:700;color:#17172b !important;}
+.metric-lbl{font-size:.78rem;color:#6060a0 !important;margin-top:2px;}
 
-def missing_summary_df(df: pd.DataFrame) -> pd.DataFrame:
-    total = len(df)
-    counts = df.isnull().sum()
-    pct = counts / total * 100
-    summary = pd.DataFrame({
-        "Missing Count": counts,
-        "Missing %": pct.round(2),
-        "Dtype": df.dtypes.astype(str),
-    })
-    return summary[summary["Missing Count"] > 0].sort_values("Missing %", ascending=False)
-
-
-def missing_summary_typed(df, num_cols, cat_cols):
-    rows = []
-    for col in df.columns:
-        mc  = df[col].isnull().sum()
-        pct = mc / len(df) * 100
-        dtype = "Numerical" if col in num_cols else "Categorical"
-        rows.append({"Column": col, "Data Type": dtype,
-                     "Missing Count": mc, "Missing %": round(pct, 2)})
-    result = pd.DataFrame(rows).sort_values("Missing %", ascending=False).reset_index(drop=True)
-    return result[result["Missing Count"] > 0].reset_index(drop=True)
-
-
-def severity(pct):
-    if pct < 5:  return "Low"
-    if pct < 20: return "Moderate"
-    return "High"
-
-
-def identify_columns(df):
-    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
-    return num_cols, cat_cols
-
-
-def missingness_risk_level(pct: float) -> tuple:
-    if pct <= 5:
-        return "≤5%", "Very low missingness. Low risk of bias.", "#edfaf3", "#0d6b3a"
-    elif pct <= 15:
-        return "5–15%", "Moderate. Imputation preferred over dropping.", "#fffaeb", "#7a4d00"
-    elif pct <= 30:
-        return "15–30%", "High. Dropping loses too much data. Advanced imputation + missing indicator mandatory.", "#fff0ed", "#9e2210"
-    else:
-        return ">30%", "Very high. Consider dropping the column. Re-evaluate column usefulness + domain check.", "#fde8e8", "#7a0000"
-
-
-# ── Statistical Tests (from app.py) ──────────────────────────────────
-
-def test1_pattern_analysis(df: pd.DataFrame, col: str) -> dict:
-    indicator = df[col].isnull().astype(int)
-    miss_pct = indicator.mean() * 100
-    runs = (indicator != indicator.shift()).sum()
-    max_possible_runs = min(len(indicator) * 2, len(indicator[indicator == 1]) * 2 + 1)
-    cluster_ratio = runs / max(max_possible_runs, 1)
-    scattered = cluster_ratio > 0.5
-    return {
-        "indicator": indicator,
-        "miss_pct": miss_pct,
-        "scattered": scattered,
-        "cluster_ratio": cluster_ratio,
-        "signal": "MCAR signal" if scattered else "MAR / MNAR signal (clustered rows)",
-    }
+.big-stat-box{border-radius:12px;padding:20px 24px;text-align:center;margin-bottom:8px;}
+.big-stat-val{font-size:2.2rem;font-weight:800;margin-bottom:4px;}
+.big-stat-lbl{font-size:.82rem;font-weight:500;opacity:0.8;text-transform:uppercase;letter-spacing:.05em;}
+.big-stat-sub{font-size:.78rem;opacity:0.65;margin-top:4px;}
 
+.stat-ok{background:#edfaf3;border:2px solid #89d9ac;}
+.stat-ok .big-stat-val{color:#0a5c30;}
+.stat-warn{background:#fffaeb;border:2px solid #f0cc7a;}
+.stat-warn .big-stat-val{color:#7a4f00;}
+.stat-fail{background:#fff0ed;border:2px solid #f5a898;}
+.stat-fail .big-stat-val{color:#900000;}
 
-def test2_feature_dependency(df: pd.DataFrame, col: str) -> dict:
-    missing_mask = df[col].isnull()
-    if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
-        return {"diffs": {}, "max_diff": 0.0, "signal": "Insufficient data"}
+.card-mcar{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
+.card-mar {background:#fffaeb;border:2px solid #f0cc7a;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
+.card-mnar{background:#fff0ed;border:2px solid #f5a898;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
+.card-info{background:#eef2ff;border:2px solid #bdc8f5;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
+.card-warn{background:#fff8e1;border:2px solid #ffe082;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
+.card-danger{background:#fde8e8;border:2px solid #f5a8a8;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
+.card-ok{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
 
-    diffs = {}
-    for other_col in df.columns:
-        if other_col == col:
-            continue
-        try:
-            miss_vals = df.loc[missing_mask, other_col].dropna()
-            obs_vals  = df.loc[~missing_mask, other_col].dropna()
-            if len(miss_vals) < 3 or len(obs_vals) < 3:
-                continue
-            if pd.api.types.is_numeric_dtype(df[other_col]):
-                m1, m2 = miss_vals.mean(), obs_vals.mean()
-                denom = max(abs(m2), 1e-9)
-                diff_pct = abs(m1 - m2) / denom * 100
-                diffs[other_col] = diff_pct
-            else:
-                ct = pd.crosstab(
-                    pd.concat([pd.Series(["missing"] * len(miss_vals)),
-                               pd.Series(["present"] * len(obs_vals))]),
-                    pd.concat([miss_vals, obs_vals])
-                )
-                chi2, _, _, _ = chi2_contingency(ct)
-                n = ct.values.sum()
-                k = min(ct.shape) - 1
-                cramers_v = np.sqrt(chi2 / (n * max(k, 1))) * 100
-                diffs[other_col] = cramers_v
-        except Exception:
-            continue
+.card-mcar *, .card-mar *, .card-mnar *, .card-info *, .card-warn *, .card-danger *, .card-ok * {color: #1a1a2e !important;}
 
-    if not diffs:
-        return {"diffs": {}, "max_diff": 0.0, "signal": "No comparable features"}
+.verdict-label{font-size:1.1rem;font-weight:700;margin-bottom:4px;}
+.verdict-desc{font-size:.88rem;color:#333 !important;}
 
-    max_diff = max(diffs.values())
-    if max_diff < 5:
-        signal = "Weak signal — MCAR likely"
-    elif max_diff < 30:
-        signal = "Strong MAR signal (feature dependency detected)"
-    else:
-        signal = "Very strong dependency — MAR or MNAR"
+code{background:#e8e8eb;padding:2px 6px;border-radius:4px;font-size:.85rem; color:#d6336c !important;}
+hr.divider{border:none;border-top:2px solid #e0ddd8;margin:1.5rem 0;}
 
-    return {"diffs": diffs, "max_diff": max_diff, "signal": signal}
+.theory-box {background:#fafafa; border-left:4px solid #4f8ef7; border-radius:4px; padding:12px 18px; margin-bottom:16px;}
+.theory-box h4 {color:#17172b; margin-bottom:6px; font-size:1.05rem;}
+.theory-box p {color:#444; font-size:0.92rem; line-height:1.5;}
 
+.stat-highlight { font-size: 1.2rem; font-weight: bold; color: #d6336c; background: #ffe4e1; padding: 2px 8px; border-radius: 4px;}
 
-def test3_target_dependency(df: pd.DataFrame, col: str, target_col: str) -> dict:
-    missing_mask = df[col].isnull()
-    if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
-        return {"diff_pct": None, "signal": "Insufficient data"}
+.test-header{font-size:1.05rem;font-weight:700;color:#17172b;margin:18px 0 8px;}
+</style>
+""", unsafe_allow_html=True)
+
+
+# ════════════════════════════════════════════════════════════════════
+#  SESSION STATE INIT
+# ════════════════════════════════════════════════════════════════════
+defaults = {"df_full": None, "df_train": None, "df_test": None, "target_col": None, "split_ratio": 0.8, "col_diagnostics": {}}
+for k, v in defaults.items():
+    if k not in st.session_state: st.session_state[k] = v
 
-    try:
-        miss_target = df.loc[missing_mask, target_col].dropna()
-        obs_target  = df.loc[~missing_mask, target_col].dropna()
 
+# ════════════════════════════════════════════════════════════════════
+#  STATISTICAL TEST HELPERS
+# ════════════════════════════════════════════════════════════════════
+def littles_mcar_test(df: pd.DataFrame, cols_with_missing: list) -> dict:
+    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+    chi2_total, df_total = 0.0, 0
+    for col in cols_with_missing:
+        if col not in numeric_cols: continue
+        missing_mask = df[col].isnull()
+        if missing_mask.sum() < 5 or (~missing_mask).sum() < 5: continue
+        for other in numeric_cols:
+            if other == col: continue
+            g1, g2 = df.loc[missing_mask, other].dropna(), df.loc[~missing_mask, other].dropna()
+            if len(g1) < 3 or len(g2) < 3: continue
+            grand_mean, grand_var = df[other].mean(), df[other].var()
+            if grand_var < 1e-12: continue
+            chi2_total += (len(g1)*(g1.mean() - grand_mean)**2 + len(g2)*(g2.mean() - grand_mean)**2) / grand_var
+            df_total += 1
+    if df_total == 0: return {"chi2": None, "p_value": None, "verdict": "Insufficient numeric data"}
+    p_val = 1 - chi2.cdf(chi2_total, df_total)
+    verdict = f"Fail to reject MCAR" if p_val >= 0.05 else f"Reject MCAR"
+    return {"chi2": round(chi2_total, 4), "df": df_total, "p_value": round(p_val, 4), "verdict": verdict, "reject_mcar": p_val < 0.05}
+
+def feature_dependency_tests(df: pd.DataFrame, col: str) -> dict:
+    missing_mask = df[col].isnull()
+    if missing_mask.sum() < 5 or (~missing_mask).sum() < 5: return {"results": {}, "n_significant": 0, "signal": "Insufficient data"}
+    results = {}
+    for other in df.columns:
+        if other == col: continue
+        g_miss, g_obs = df.loc[missing_mask, other].dropna(), df.loc[~missing_mask, other].dropna()
+        if len(g_miss) < 3 or len(g_obs) < 3: continue
+        try:
+            if pd.api.types.is_numeric_dtype(df[other]):
+                n1, n2 = len(g_miss), len(g_obs)
+                if min(n1, n2) >= 30:
+                    se = np.sqrt(g_miss.var()/n1 + g_obs.var()/n2)
+                    if se < 1e-12: continue
+                    z_stat = (g_miss.mean() - g_obs.mean()) / se
+                    p_val = 2 * (1 - norm.cdf(abs(z_stat)))
+                    test_name, stat = "z-test", round(z_stat, 4)
+                else:
+                    t_stat, p_val = ttest_ind(g_miss, g_obs, equal_var=False)
+                    test_name, stat = "Welch t-test", round(t_stat, 4)
+                results[other] = {"test": test_name, "stat": stat, "p_value": round(p_val, 4), "significant": p_val < 0.05, "type": "numeric"}
+            else:
+                ct = pd.crosstab(missing_mask.astype(int), df[other])
+                if ct.shape[0] < 2 or ct.shape[1] < 2: continue
+                chi2_stat, p_val, _, _ = chi2_contingency(ct)
+                results[other] = {"test": "chi²", "stat": round(chi2_stat, 4), "p_value": round(p_val, 4), "significant": p_val < 0.05, "type": "categorical"}
+        except Exception: continue
+    n_sig = sum(1 for r in results.values() if r["significant"])
+    sig_pct = n_sig / max(len(results), 1) * 100
+    signal = "No features differ significantly" if sig_pct == 0 else f"{n_sig}/{len(results)} features differ (p<0.05)"
+    return {"results": results, "n_significant": n_sig, "total_tested": len(results), "sig_pct": round(sig_pct, 1), "signal": signal}
+
+def target_dependency_test(df: pd.DataFrame, col: str, target_col: str) -> dict:
+    missing_mask = df[col].isnull()
+    if missing_mask.sum() < 5 or (~missing_mask).sum() < 5: return {"p_value": None, "signal": "Insufficient data", "significant": False}
+    try:
+        g_miss, g_obs = df.loc[missing_mask, target_col].dropna(), df.loc[~missing_mask, target_col].dropna()
         if pd.api.types.is_numeric_dtype(df[target_col]):
-            m1, m2 = miss_target.mean(), obs_target.mean()
-            denom = max(abs(m2), 1e-9)
-            diff_pct = abs(m1 - m2) / denom * 100
+            n1, n2 = len(g_miss), len(g_obs)
+            if min(n1, n2) >= 30:
+                se = np.sqrt(g_miss.var()/n1 + g_obs.var()/n2)
+                if se < 1e-12: return {"p_value": None, "signal": "Zero variance", "significant": False}
+                z_stat = (g_miss.mean() - g_obs.mean()) / se
+                p_val = 2 * (1 - norm.cdf(abs(z_stat)))
+            else:
+                _, p_val = ttest_ind(g_miss, g_obs, equal_var=False)
+            diff_pct = abs(g_miss.mean() - g_obs.mean()) / max(abs(g_obs.mean()), 1e-9) * 100
         else:
-            p1 = miss_target.value_counts(normalize=True).iloc[0] * 100
-            p2 = obs_target.value_counts(normalize=True).iloc[0] * 100
+            ct = pd.crosstab(missing_mask.astype(int), df[target_col])
+            _, p_val, _, _ = chi2_contingency(ct)
+            p1, p2 = g_miss.value_counts(normalize=True).iloc[0]*100, g_obs.value_counts(normalize=True).iloc[0]*100
             diff_pct = abs(p1 - p2)
 
-        if diff_pct < 5:
-            signal = "No strong signal (<5% target diff)"
-        elif diff_pct < 10:
-            signal = "Moderate target dependency — possible MAR/MNAR"
-        else:
-            signal = "Strong target dependency → MNAR likely (>10% target diff)"
-
-        return {"diff_pct": round(diff_pct, 2), "signal": signal}
-    except Exception as e:
-        return {"diff_pct": None, "signal": f"Could not compute: {e}"}
-
+        sig = p_val < 0.05
+        signal = f"Not significant (p={p_val:.4f})" if not sig else f"Significant — target differs by {diff_pct:.1f}%"
+        return {"p_value": round(p_val, 4), "significant": sig, "diff_pct": round(diff_pct, 2), "signal": signal}
+    except Exception as e: return {"p_value": None, "signal": f"Error: {e}", "significant": False}
+
+def classify_mechanism(t_feat, t_target, little):
+    tgt_sig, tgt_diff = t_target.get("significant", False), t_target.get("diff_pct", 0)
+    sig_pct = t_feat.get("sig_pct", 0)
+    
+    if tgt_sig and tgt_diff >= 10: return "MNAR", "High", "Missingness strongly correlates with the outcome."
+    elif tgt_sig and tgt_diff >= 5: return "MNAR", "Moderate", "Moderate dependency on target. Treat conservatively as MNAR."
+    elif sig_pct > 30: return "MAR", "High", "Strong dependency on observed features detected."
+    elif sig_pct > 0: return "MAR", "Moderate", "Weak but present dependency on observed features."
+    elif little.get("reject_mcar"): return "MAR", "Low", "Little's test rejects MCAR, but feature tests show weak dependency."
+    else: return "MCAR", "High", "No statistical evidence of systematic missingness."
+
+def run_single_diagnostic(df, col, target_col):
+    little, t_feat = littles_mcar_test(df, [col]), feature_dependency_tests(df, col)
+    t_target = {"p_value": None, "significant": False, "signal": "Skipped (Is Target)", "diff_pct": 0} if col == target_col else target_dependency_test(df, col, target_col)
+    mech, conf, expl = classify_mechanism(t_feat, t_target, little)
+    st.session_state["col_diagnostics"][col] = {
+        "mechanism": mech, "confidence": conf, "explanation": expl,
+        "miss_pct": round(df[col].isnull().mean()*100, 2),
+        "dtype": str(df[col].dtype),
+        "little": little, "t_feat": t_feat, "t_target": t_target
+    }
 
-def classify_mechanism(t1: dict, t2: dict, t3: dict) -> tuple:
-    feat_dep  = t2.get("max_diff", 0)
-    tgt_dep   = t3.get("diff_pct") or 0
-    scattered = t1.get("scattered", True)
 
-    if tgt_dep > 10:
-        return "MNAR", "High", (
-            f"Target variable differs by {tgt_dep:.1f}% between missing/present rows. "
-            "The probability of missingness depends on the unobserved value itself."
-        )
-    elif feat_dep >= 10 and not scattered:
-        return "MAR", "High", (
-            f"Feature distributions differ by up to {feat_dep:.1f}% and missing values appear "
-            "clustered — missingness depends on observed features."
-        )
-    elif feat_dep >= 5:
-        return "MAR", "Moderate", (
-            f"Feature distributions differ by up to {feat_dep:.1f}%. "
-            "Missingness likely depends on observed features."
-        )
-    elif scattered and feat_dep < 5 and tgt_dep < 5:
-        return "MCAR", "High", (
-            "Values appear randomly scattered, feature distributions are similar across "
-            "groups, and target shows no dependency — consistent with MCAR."
-        )
+# ════════════════════════════════════════════════════════════════════
+#  IMPUTATION SIMULATION HELPERS
+# ════════════════════════════════════════════════════════════════════
+def feasibility_checks(df: pd.DataFrame, col: str, target_col: str, impute_method: str) -> dict:
+    series = df[col].dropna()
+    if len(series) < 5 or not pd.api.types.is_numeric_dtype(df[col]):
+        return {"applicable": False}
+
+    results = {"applicable": True, "escalate_to_knn": False, "reasons": []}
+    
+    # ── 1. Impute ──
+    if impute_method == "Mean": imputed_series = df[col].fillna(series.mean())
+    elif impute_method == "Median": imputed_series = df[col].fillna(series.median())
     else:
-        return "MCAR", "Low", (
-            "Weak signals across all three tests. Treated as MCAR but verify with domain knowledge."
-        )
-
-
-# ── Logistic Regression-based mechanism diagnosis (from app_tanisha.py) ──
-
-def diagnose_mechanism_lr(df, col, num_cols):
-    miss_mask  = df[col].isnull().astype(int)
-    predictors = [c for c in df.columns if c != col and df[c].isnull().mean() < 0.9]
-    if not predictors or miss_mask.sum() < 5:
-        return "MNAR", "Insufficient data to test; assumed MNAR."
-    mcar_p_vals = []
-    for p in predictors:
-        if p in num_cols and df[p].dropna().nunique() > 1:
-            try:
-                binned = pd.qcut(df[p].fillna(df[p].median()), q=4, duplicates="drop", labels=False)
-                ct = pd.crosstab(binned, miss_mask)
-                if ct.shape[0] > 1 and ct.shape[1] > 1:
-                    _, p_val, _, _ = chi2_contingency(ct)
-                    mcar_p_vals.append(p_val)
-            except Exception:
-                pass
-    if mcar_p_vals and np.mean(mcar_p_vals) > 0.05:
-        return "MCAR", (f"Chi-square tests show no significant dependency "
-                        f"(avg p={np.mean(mcar_p_vals):.3f} > 0.05). Missingness appears random.")
-    try:
-        X_pred = df[predictors].copy()
-        for c in X_pred.select_dtypes(include="object").columns:
-            X_pred[c] = X_pred[c].astype("category").cat.codes
-        X_pred = X_pred.fillna(X_pred.median(numeric_only=True))
-        scaler   = StandardScaler()
-        X_scaled = scaler.fit_transform(X_pred)
-        lr = LogisticRegression(max_iter=300, solver="lbfgs")
-        lr.fit(X_scaled, miss_mask)
-        score    = lr.score(X_scaled, miss_mask)
-        baseline = max(miss_mask.mean(), 1 - miss_mask.mean())
-        if score > baseline + 0.05:
-            return "MAR", (f"Logistic Regression predicts missingness with accuracy {score:.2%} "
-                           f"(baseline {baseline:.2%}). Missingness is related to observed variables.")
-    except Exception:
-        pass
-    return "MNAR", "Missingness not explained by observed data. Likely related to the missing value itself — assumed MNAR."
-
-
-def recommend_strategy(mechanism: str, miss_pct: float, dtype: str) -> dict:
-    is_num = "float" in dtype or "int" in dtype
-    add_indicator = (mechanism == "MNAR") or (mechanism == "MAR" and miss_pct >= 10)
-
-    if mechanism == "MCAR" and miss_pct <= 5:
-        method = "Drop rows"
-        reason = "MCAR confirmed and loss is minimal (≤5%). Safe to drop."
-        adv = "✓ No artificial data introduced"
-        disadv = "✗ Loses data — only safe at very low %"
-    elif mechanism in ("MCAR", "MAR") and miss_pct <= 15:
-        if is_num:
-            method = "Median imputation"
-            reason = "Low-moderate missingness. Median is robust to skew and outliers."
-            adv = "✓ Outlier-resistant; recommended default for numeric"
-            disadv = "✗ Reduces variance slightly"
-        else:
-            method = "Mode imputation"
-            reason = "Low-moderate missingness on categorical data."
-            adv = "✓ Preserves category structure"
-            disadv = "✗ Can over-represent dominant category"
-    elif mechanism == "MAR" and miss_pct <= 30:
-        method = "KNN Imputation" if is_num else "Mode / KNN Imputation"
-        reason = "Moderate MAR missingness. KNN leverages feature relationships."
-        adv = "✓ Preserves local patterns; captures inter-feature structure"
-        disadv = "✗ Slow on large datasets; requires scaling"
-    elif mechanism == "MAR" and miss_pct > 30:
-        method = "Iterative Imputer (MICE)"
-        reason = "High MAR missingness. MICE models each column as a function of others."
-        adv = "✓ Most statistically principled; accounts for all feature relationships"
-        disadv = "✗ Computationally expensive; risk of instability"
-    elif mechanism == "MNAR":
-        method = "Median + Missing Indicator (mandatory)"
-        reason = "MNAR: the fact of missingness is informative. Indicator must be created BEFORE imputation."
-        adv = "✓ Preserves MNAR signal; lets model learn from missingness"
-        disadv = "✗ Imputation may still be biased; domain expertise required"
+        numeric_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c != target_col]
+        X_num = df[numeric_cols].copy()
+        try:
+            scaler = StandardScaler()
+            X_scaled = pd.DataFrame(scaler.fit_transform(X_num), columns=X_num.columns)
+            imputer = KNNImputer(n_neighbors=5) if impute_method == "KNN" else IterativeImputer(random_state=42, max_iter=10)
+            X_imputed_scaled = pd.DataFrame(imputer.fit_transform(X_scaled), columns=X_num.columns)
+            X_imputed = pd.DataFrame(scaler.inverse_transform(X_imputed_scaled), columns=X_num.columns)
+            imputed_series = X_imputed[col]
+        except Exception:
+            imputed_series = df[col].fillna(series.median())
+
+    results["imputed_series"] = imputed_series
+
+    # ── 2. Skewness & Outliers ──
+    skew = series.skew()
+    Q1_b, Q3_b = series.quantile(0.25), series.quantile(0.75)
+    IQR_b = Q3_b - Q1_b
+    outliers_before = ((series < Q1_b - 1.5*IQR_b) | (series > Q3_b + 1.5*IQR_b)).sum()
+    
+    Q1_a, Q3_a = imputed_series.quantile(0.25), imputed_series.quantile(0.75)
+    IQR_a = Q3_a - Q1_a
+    outliers_after = ((imputed_series < Q1_a - 1.5*IQR_a) | (imputed_series > Q3_a + 1.5*IQR_a)).sum()
+    new_outliers = max(0, outliers_after - outliers_before)
+
+    if impute_method == "Mean":
+        skew_verdict = "fail" if abs(skew) > 1 else "ok"
+    elif impute_method == "Median":
+        skew_verdict = "warn" if abs(skew) > 3 else "ok"
+    else: 
+        skew_verdict = "ok"
+
+    results["skewness"] = {"verdict": skew_verdict, "value": skew, "msg": f"Skewness = {skew:.3f}"}
+    
+    if new_outliers > (len(series) * 0.05):
+        out_verdict = "warn"
     else:
-        method = "Consider dropping column"
-        reason = f"Missing > 30% with {mechanism}. Evaluate predictive value vs. cost of imputation."
-        adv = "✓ Eliminates noise if column is uninformative"
-        disadv = "✗ Irreversible — verify with domain expert first"
-
-    return {
-        "method": method,
-        "reason": reason,
-        "adv": adv,
-        "disadv": disadv,
-        "add_indicator": add_indicator,
-    }
-
+        out_verdict = "ok"
 
-def strategy_chips_html(mech, miss_pct, col_type):
-    chips = []
-    if mech == "CLEAN":
-        return '<span class="strat-chip chip-green">✅ No action needed — column is complete</span>'
-    if miss_pct > 50:
-        chips.append(("⚠ Consider Dropping Column (>50% missing)", "chip-red"))
-    if mech == "MCAR":
-        if miss_pct < 5:
-            chips.append(("Listwise Deletion (safe)", "chip-green"))
-        chips.append(("Median Imputation" if col_type == "Numerical" else "Mode Imputation", "chip-green"))
-    if mech == "MAR":
-        chips.append(("KNN Imputation", "chip-blue"))
-        chips.append(("Iterative Imputer (MICE)", "chip-blue"))
-        chips.append(("Group-wise Imputation", "chip-blue"))
-        if miss_pct >= 10:
-            chips.append(("Create Missing Indicator (≥10% MAR)", "chip-yellow"))
-    if mech == "MNAR":
-        chips.append(("⚠ Create Missing Indicator FIRST (mandatory)", "chip-red"))
-        chips.append(("Constant / Domain-Specific Value", "chip-yellow"))
-        chips.append(("Sensitivity Analysis Required", "chip-yellow"))
-    return " ".join(f'<span class="strat-chip {cls}">{lbl}</span>' for lbl, cls in chips)
-
-
-def validation_checks(df_before: pd.Series, df_after: pd.Series) -> dict:
-    m_shift   = abs(df_before.mean() - df_after.mean()) / max(abs(df_before.mean()), 1e-9) * 100
-    med_shift = abs(df_before.median() - df_after.median()) / max(abs(df_before.median()), 1e-9) * 100
-    var_change = abs(df_before.var() - df_after.var()) / max(df_before.var(), 1e-9) * 100
-    return {
-        "mean_shift_pct":   round(m_shift, 2),
-        "median_shift_pct": round(med_shift, 2),
-        "var_change_pct":   round(var_change, 2),
-        "mean_ok":   m_shift   <= 5,
-        "median_ok": med_shift <= 3,
-        "var_ok":    var_change <= 20,
+    results["outliers"] = {
+        "verdict": out_verdict,
+        "new_outliers": new_outliers,
+        "outliers_before": outliers_before,
+        "outliers_after": outliers_after
     }
 
+    # ── 3. Variance Impact ──
+    var_before = series.var()
+    var_after  = imputed_series.var()
+    var_drop_pct = (var_before - var_after) / var_before * 100 if var_before > 1e-12 else 0
 
-# ── Outlier & Variance helpers (from app_tanisha.py) ──────────────────
-
-def detect_outliers_iqr(series):
-    s = series.dropna()
-    if len(s) < 4: return 0
-    Q1, Q3 = s.quantile(0.25), s.quantile(0.75)
-    IQR = Q3 - Q1
-    return int(((s < Q1 - 1.5 * IQR) | (s > Q3 + 1.5 * IQR)).sum())
-
-
-def variance_impact(series):
-    s = series.dropna()
-    if len(s) < 2: return 0.0, 0.0, 0.0
-    var_before = float(s.var())
-    var_after  = float(series.fillna(s.mean()).var())
-    return round(var_before, 4), round(var_after, 4), round(var_before - var_after, 4)
-
-
-def stat_card(label, value, color="#1a1a2e"):
-    return (f'<div class="col-stat-card">'
-            f'<div class="cv" style="color:{color};">{value}</div>'
-            f'<div class="ck">{label}</div></div>')
-
-
-# ── Plot helpers ──────────────────────────────────────────────────────
-
-def plot_missing_heatmap(df):
-    missing_cols = [c for c in df.columns if df[c].isnull().any()]
-    if not missing_cols:
-        return None
-    sorted_cols = sorted(missing_cols, key=lambda c: df[c].isnull().mean(), reverse=True)
-    sample_size = min(300, len(df))
-    df_s = df[sorted_cols].sample(n=sample_size, random_state=42) if len(df) > sample_size else df[sorted_cols]
-    mask_df = df_s.isnull().astype(int)
-    fig, ax = plt.subplots(figsize=(max(10, len(sorted_cols) * 0.7), 5))
-    sns.heatmap(mask_df.T, cmap=["#f5f3ee", "#17172b"], cbar=True,
-                yticklabels=sorted_cols, xticklabels=False, linewidths=0, ax=ax)
-    ax.set_title(f"Missing Value Heatmap — sample of {sample_size} rows", fontsize=13, fontweight="bold", pad=12)
-    ax.set_xlabel("Rows (observations)", fontsize=10)
-    ax.set_ylabel("Columns", fontsize=10)
-    plt.tight_layout()
-    return fig
-
-
-def plot_missingness_correlation(df):
-    missing_cols = [c for c in df.columns if df[c].isnull().any()]
-    if len(missing_cols) < 2:
-        return None
-    miss_bin = df[missing_cols].isnull().astype(int)
-    corr = miss_bin.corr()
-    fig, ax = plt.subplots(figsize=(max(7, len(missing_cols) * 0.9), max(6, len(missing_cols) * 0.8)))
-    mask = np.triu(np.ones_like(corr, dtype=bool))
-    sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", center=0,
-                mask=mask, square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax)
-    ax.set_title("Missingness Correlation Matrix", fontsize=13, fontweight="bold", pad=12)
-    plt.tight_layout()
-    return fig
-
-
-def plot_numerical_column(df, col):
-    s_original = df[col].dropna()
-    s_imputed  = df[col].fillna(s_original.mean())
-    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
-    fig.suptitle(f"Deep Distribution Analysis — {col}", fontsize=14, fontweight="bold")
-    sns.kdeplot(s_original, ax=axes[0], color="#4f8ef7", linewidth=3,
-                label="Original (Before)", fill=True, alpha=0.2)
-    sns.kdeplot(s_imputed,  ax=axes[0], color="#e07b54", linewidth=3,
-                label="Mean Imputed (After)", linestyle="--")
-    axes[0].set_title("Distribution Shift: Original vs. Imputed", fontsize=12)
-    axes[0].legend()
-    box_data = pd.DataFrame({
-        "Value": pd.concat([s_original, s_imputed]),
-        "Type":  ["Original"] * len(s_original) + ["Imputed"] * len(s_imputed),
-    })
-    sns.boxplot(data=box_data, x="Type", y="Value", ax=axes[1], palette=["#dce3ff", "#fce4d6"])
-    axes[1].set_title("Variance & Outlier Comparison", fontsize=12)
-    plt.tight_layout()
-    return fig
-
-
-def plot_categorical_column(df, col, top_n=10):
-    s_original = df[col].dropna()
-    s_imputed  = df[col].fillna(s_original.mode()[0] if not s_original.empty else "N/A")
-    fig, axes  = plt.subplots(1, 2, figsize=(16, 7))
-    fig.suptitle(f"Categorical Frequency Analysis — {col}", fontsize=14, fontweight="bold")
-    orig_counts = s_original.value_counts().head(top_n)
-    imp_counts  = s_imputed.value_counts().head(top_n)
-    compare_df  = pd.DataFrame({"Original": orig_counts, "Imputed (Mode)": imp_counts}).fillna(0)
-    compare_df.plot(kind="barh", ax=axes[0], color=["#4f8ef7", "#e07b54"], width=0.8)
-    axes[0].set_title(f"Top {top_n} Categories: Original vs Mode Imputed", fontsize=12)
-    axes[0].invert_yaxis()
-    top_pie = imp_counts.head(8)
-    axes[1].pie(top_pie, labels=top_pie.index.astype(str), autopct="%1.1f%%",
-                startangle=140, colors=plt.cm.Pastel1.colors, wedgeprops={"edgecolor": "white"})
-    axes[1].set_title("Final Proportion (After Imputation)", fontsize=12)
-    plt.tight_layout()
-    return fig
-
-
-def plot_missing_vs_features(df, col):
-    num_others = [c for c in df.select_dtypes(include=[np.number]).columns
-                  if c != col and df[c].isnull().mean() < 0.95]
-    if not num_others:
-        return None
-    means_present = df[df[col].notna()][num_others].mean()
-    means_missing = df[df[col].isnull()][num_others].mean()
-    diff_df = pd.DataFrame({"Present": means_present, "Missing": means_missing}).dropna().head(12)
-    if diff_df.empty:
-        return None
-    fig, ax = plt.subplots(figsize=(max(8, len(diff_df) * 0.9), 4))
-    x = np.arange(len(diff_df)); w = 0.35
-    ax.bar(x - w/2, diff_df["Present"], w, label="Present rows", color="#4f8ef7", alpha=0.85)
-    ax.bar(x + w/2, diff_df["Missing"], w, label="Missing rows",  color="#e07b54", alpha=0.85)
-    ax.set_xticks(x)
-    ax.set_xticklabels(diff_df.index, rotation=35, ha="right", fontsize=9)
-    ax.set_title(f"Feature Means — Rows where '{col}' is Present vs Missing",
-                 fontsize=11, fontweight="bold")
-    ax.set_ylabel("Mean value")
-    ax.legend(fontsize=9)
-    plt.tight_layout()
-    return fig
-
-
-def render_per_column_deep_analysis(df, col, num_cols, cat_cols, mechanism_results):
-    miss_count = int(df[col].isnull().sum())
-    miss_pct   = round(df[col].isnull().mean() * 100, 2)
-    total_rows = len(df)
-    present    = total_rows - miss_count
-    col_type   = "Numerical" if col in num_cols else "Categorical"
-    mech_info  = mechanism_results.get(col, {})
-    mech       = mech_info.get("mechanism", "N/A")
-    mech_reason = mech_info.get("reason", "Run the global diagnosis section above first.")
-    sev        = severity(miss_pct) if miss_pct > 0 else "None"
-
-    miss_color = "#dc2626" if miss_pct >= 20 else "#d97706" if miss_pct >= 5 else "#16a34a"
-    sev_color  = "#dc2626" if sev == "High" else "#d97706" if sev == "Moderate" else "#16a34a"
-    mech_color = {"MCAR": "#155724", "MAR": "#856404", "MNAR": "#721c24"}.get(mech, "#444")
-
-    st.markdown(f"#### 🔍 Deep Analysis — `{col}` &nbsp;·&nbsp; {col_type}", unsafe_allow_html=True)
-    m1, m2, m3, m4, m5 = st.columns(5)
-    with m1: st.markdown(stat_card("Total Rows", f"{total_rows:,}"), unsafe_allow_html=True)
-    with m2: st.markdown(stat_card("Present",    f"{present:,}"),    unsafe_allow_html=True)
-    with m3: st.markdown(stat_card("Missing",    f"{miss_pct}%",  miss_color), unsafe_allow_html=True)
-    with m4: st.markdown(stat_card("Severity",   sev,             sev_color),  unsafe_allow_html=True)
-    with m5: st.markdown(stat_card("Mechanism",  mech,            mech_color), unsafe_allow_html=True)
-    st.markdown("")
-
-    if col_type == "Numerical":
-        s = df[col].dropna()
-        if len(s) > 1:
-            col_skew = float(skew(s))
-            col_kurt = float(kurtosis(s))
-            Q1, Q3  = float(s.quantile(0.25)), float(s.quantile(0.75))
-            IQR     = Q3 - Q1
-            n_out   = detect_outliers_iqr(df[col])
-            vb, va, vi = variance_impact(df[col])
-            out_pct = n_out / max(len(s), 1)
-
-            r1 = st.columns(4)
-            for (lbl, val), col_ui in zip(
-                [("Mean", f"{s.mean():.4g}"), ("Median", f"{s.median():.4g}"),
-                 ("Std Dev", f"{s.std():.4g}"), ("Variance", f"{s.var():.4g}")], r1):
-                with col_ui: st.markdown(stat_card(lbl, val), unsafe_allow_html=True)
-            st.markdown("")
-
-            r2 = st.columns(4)
-            for (lbl, val), col_ui in zip(
-                [("Min", f"{s.min():.4g}"), ("Max", f"{s.max():.4g}"),
-                 ("Skewness", f"{col_skew:.3f}"), ("Kurtosis", f"{col_kurt:.3f}")], r2):
-                with col_ui: st.markdown(stat_card(lbl, val), unsafe_allow_html=True)
-            st.markdown("")
-
-            r3 = st.columns(4)
-            out_color = "#dc2626" if out_pct > 0.15 else "#d97706" if out_pct > 0.05 else "#16a34a"
-            for (lbl, val, clr), col_ui in zip(
-                [("Q1", f"{Q1:.4g}", "#1a1a2e"), ("Q3", f"{Q3:.4g}", "#1a1a2e"),
-                 ("IQR", f"{IQR:.4g}", "#1a1a2e"), ("Outliers (IQR)", str(n_out), out_color)], r3):
-                with col_ui: st.markdown(stat_card(lbl, val, clr), unsafe_allow_html=True)
-
-            if len(s) <= 5000:
-                try:
-                    _, p_norm = shapiro(s.sample(min(len(s), 5000), random_state=0))
-                    norm_txt = f"✅ Normal (p={p_norm:.4f})" if p_norm > 0.05 else f"⚠ Not Normal (p={p_norm:.4f})"
-                    st.caption(f"📐 Shapiro-Wilk normality test: {norm_txt}")
-                except Exception:
-                    pass
-
-            st.markdown("")
-            fig_dist = plot_numerical_column(df, col)
-            st.pyplot(fig_dist); plt.close(fig_dist)
-
-            st.markdown("**Variance Impact of Mean Imputation (simulated)**")
-            vc = st.columns(3)
-            delta_color = "#dc2626" if abs(vi)/max(vb,1e-9) > 0.3 else "#d97706" if abs(vi)/max(vb,1e-9) > 0.1 else "#16a34a"
-            with vc[0]: st.markdown(stat_card("Variance (before)", f"{vb:.4g}"), unsafe_allow_html=True)
-            with vc[1]: st.markdown(stat_card("Variance (after)",  f"{va:.4g}"), unsafe_allow_html=True)
-            with vc[2]: st.markdown(stat_card("Δ Variance", f"{vi:.4g}", delta_color), unsafe_allow_html=True)
-
-            pct_chg = abs(vi) / max(vb, 1e-9) * 100
-            if pct_chg >= 30:
-                st.warning(f"⚠ Variance drops by {pct_chg:.1f}% after mean imputation — over-smoothing risk. Use median or model-based imputation.")
-            elif pct_chg >= 10:
-                st.info(f"ℹ Variance drops by {pct_chg:.1f}% — acceptable, but monitor distribution shape.")
-            else:
-                st.success(f"✅ Variance change is small ({pct_chg:.1f}%) — mean imputation is statistically safe here.")
-    else:
-        s = df[col].dropna()
-        n_unique = s.nunique()
-        mode_val = str(s.mode().iloc[0]) if len(s) > 0 else "N/A"
-        mode_cnt = int((s == s.mode().iloc[0]).sum()) if len(s) > 0 else 0
-        mode_pct = round(mode_cnt / max(len(s), 1) * 100, 1)
-
-        r1 = st.columns(4)
-        for (lbl, val), col_ui in zip(
-            [("Unique Values", n_unique), ("Mode", mode_val[:12]),
-             ("Mode Count", f"{mode_cnt:,}"), ("Mode Freq %", f"{mode_pct}%")], r1):
-            with col_ui: st.markdown(stat_card(lbl, str(val)), unsafe_allow_html=True)
-
-        st.markdown("")
-        freq_table = s.value_counts().reset_index()
-        freq_table.columns = ["Value", "Count"]
-        freq_table["% of Present"] = (freq_table["Count"] / len(s) * 100).round(2)
-        tab_chart, tab_table = st.tabs(["📊 Frequency Chart", "📋 Frequency Table"])
-        with tab_chart:
-            fig_cat = plot_categorical_column(df, col)
-            st.pyplot(fig_cat); plt.close(fig_cat)
-        with tab_table:
-            st.dataframe(freq_table, use_container_width=True, hide_index=True)
-
-    st.markdown("")
-    if miss_count > 0:
-        st.markdown("**How Missingness Relates to Other Features**")
-        fig_pat = plot_missing_vs_features(df, col)
-        if fig_pat:
-            st.pyplot(fig_pat); plt.close(fig_pat)
-            st.caption("Large differences between blue (present) and orange (missing) bars signal MAR behavior.")
-        else:
-            st.info("No other numerical features available for pattern comparison.")
-
-    st.markdown("")
-    verdict_cls = {"MCAR": "card-mcar", "MAR": "card-mar", "MNAR": "card-mnar"}.get(mech, "card-info")
-    mech_icon   = {"MCAR": "🟢", "MAR": "🟡", "MNAR": "🔴"}.get(mech, "✅")
-    mech_label  = {"MCAR": "Missing Completely At Random (MCAR)",
-                   "MAR":  "Missing At Random (MAR)",
-                   "MNAR": "Missing Not At Random (MNAR)",
-                   "N/A":  "No Missing Values"}.get(mech, mech)
-
-    st.markdown(
-        f'<div class="{verdict_cls}"><strong>{mech_icon} {mech_label}</strong><br>'
-        f'<span style="font-size:0.9rem;color:#444;">{mech_reason}</span></div>',
-        unsafe_allow_html=True)
-
-    chips_html = strategy_chips_html(mech, miss_pct, col_type)
-    if chips_html:
-        st.markdown("")
-        st.markdown("**Recommended Strategies**")
-        st.markdown(chips_html, unsafe_allow_html=True)
-
-    pointer = {
-        "MCAR": ("📍 **MCAR**: Missing% <5% → listwise deletion is safe. 5–15% → median/mode imputation. "
-                 "15–30% → advanced imputation with missing indicator."),
-        "MAR":  ("📍 **MAR**: KNN / MICE preferred. Create a missing indicator if missing% ≥10%."),
-        "MNAR": ("📍 **MNAR**: **Create the missing indicator FIRST**, then use constant or sensitivity analysis. "
-                 "Domain knowledge is essential."),
-        "N/A":  "📍 No action needed — this column is complete. Proceed to feature engineering.",
-    }.get(mech, "")
-    if pointer:
-        st.markdown("")
-        st.info(pointer)
+    if var_drop_pct <= 10: var_verdict, var_msg = "ok", f"Variance Change: {var_drop_pct:.1f}%"
+    elif var_drop_pct <= 20: var_verdict, var_msg = "warn", f"Variance Change: {var_drop_pct:.1f}%"
+    else: var_verdict, var_msg = "fail", f"Variance Change: {var_drop_pct:.1f}%"
 
+    results["variance"] = {"verdict": var_verdict, "msg": var_msg, "var_drop_pct": var_drop_pct}
 
-# ════════════════════════════════════════════════════════════════════
-#  SIDEBAR — NAVIGATION
-# ════════════════════════════════════════════════════════════════════
+    # ── 4. Correlation Preservation ──
+    numeric_others = [c for c in df.select_dtypes(include=[np.number]).columns if c != col and c != target_col]
+    corr_results, max_corr_shift, sign_flip = {}, 0.0, False
 
-STEPS = [
-    "1 · Upload CSV",
-    "2 · Select Target Column",
-    "3 · Overview & Patterns",
-    "4 · Mechanism Dashboard",
-    "5 · Column Diagnostics",
-    "6 · Strategy & Imputation",
-    "7 · Validation Checks",
-]
+    for other in numeric_others[:10]:
+        s_before = df[[col, other]].dropna()
+        if len(s_before) < 5: continue
+        r_before = s_before[col].corr(s_before[other])
+        r_after  = imputed_series.corr(df[other])
+        
+        delta = abs(r_before - r_after)
+        flipped = (r_before * r_after < 0) and (abs(r_before) > 0.1)
+        
+        corr_results[other] = {"r_before": round(r_before, 4), "r_after": round(r_after, 4), "delta": round(delta, 4), "sign_flip": flipped}
+        max_corr_shift = max(max_corr_shift, delta)
+        if flipped: sign_flip = True
 
-with st.sidebar:
-    st.markdown("## 🔬 Missing Value Intelligence Suite")
-    st.markdown("---")
-    st.markdown("**Navigation**")
-    step = st.radio("Go to step:", STEPS, label_visibility="collapsed")
-    st.markdown("---")
-    st.markdown(
-        "<small style='color:#9090c0'>Follow the steps in order for a complete analysis pipeline. "
-        "Steps 3–4 are exploratory; Steps 5–7 form the diagnostic pipeline.</small>",
-        unsafe_allow_html=True,
-    )
+    if max_corr_shift <= 0.05 and not sign_flip: corr_verdict, corr_msg = "ok", f"Max Δ = {max_corr_shift:.3f} — Correlation well preserved"
+    elif sign_flip: corr_verdict, corr_msg = "fail", f"Sign flip detected! Correlation direction reversed."
+    elif max_corr_shift <= 0.10: corr_verdict, corr_msg = "warn", f"Max Δ = {max_corr_shift:.3f} — Moderate correlation shift"
+    else: corr_verdict, corr_msg = "fail", f"Max Δ = {max_corr_shift:.3f} — Large correlation shift detected"
 
+    results["correlation"] = {"details": corr_results, "verdict": corr_verdict, "msg": corr_msg, "max_shift": round(max_corr_shift, 4)}
 
-# ════════════════════════════════════════════════════════════════════
-#  SESSION STATE
-# ════════════════════════════════════════════════════════════════════
+    return results
 
-for key in ["df", "target_col", "col_results", "df_imputed", "mechanism_results_lr"]:
-    if key not in st.session_state:
-        st.session_state[key] = None
-if st.session_state["col_results"] is None:
-    st.session_state["col_results"] = {}
-if st.session_state["mechanism_results_lr"] is None:
-    st.session_state["mechanism_results_lr"] = {}
+def get_auto_recommendation(df, col, target, mechanism, miss_pct, dtype):
+    """Return a strategy label for `col`: 'Drop Column', 'Drop Rows', or an imputation method (suffixed with ' + Missing Indicator' for MNAR, or MAR at >= 10% missing); `dtype` is not read here."""
+    needs_indicator = (mechanism == "MNAR") or (mechanism == "MAR" and miss_pct >= 10)  # any MNAR, or MAR with at least 10% missing
+    indicator_suffix = " + Missing Indicator" if needs_indicator else ""
 
+    # Above 70% missing: recommend dropping the column (no indicator suffix is appended)
+    if miss_pct > 70:
+        return f"Drop Column"
 
-# ════════════════════════════════════════════════════════════════════
-#  STEP 1 — UPLOAD CSV
-# ════════════════════════════════════════════════════════════════════
+    if mechanism == "MCAR" and miss_pct <= 5:  # MCAR with at most 5% missing: row deletion, checked before the dtype branch
+        return "Drop Rows"
 
-if step == STEPS[0]:
-    st.markdown('<div class="main-title">📂 Step 1 — Upload Your CSV</div>', unsafe_allow_html=True)
-    st.markdown('<div class="main-sub">Upload a CSV file to begin the missing-value analysis pipeline.</div>', unsafe_allow_html=True)
+    # Non-numeric columns get mode imputation; the feasibility checks below are numeric-only
+    if not pd.api.types.is_numeric_dtype(df[col]):
+        return f"Mode Imputation{indicator_suffix}"
 
-    uploaded = st.file_uploader("Choose a CSV file", type=["csv"])
+    # Numeric: run feasibility_checks with the "Median" method and branch on its results
+    feas_med = feasibility_checks(df, col, target, "Median")
+    if not feas_med.get("applicable"):  # presumably falsy when the checks could not be run — confirm in feasibility_checks
+        return f"Median Imputation{indicator_suffix}"
 
-    if uploaded:
-        try:
-            df = pd.read_csv(uploaded)
-            # Auto-remove ID-like columns
-            id_cols = [c for c in df.columns if c.strip().lower() in ("id", "index", "row", "rowid", "row_id")]
-            if id_cols:
-                df.drop(columns=id_cols, inplace=True)
-                st.toast(f"Auto-removed non-informative column(s): {id_cols}", icon="🗑️")
-
-            st.session_state["df"] = df
-            st.session_state["col_results"] = {}
-            st.session_state["mechanism_results_lr"] = {}
-            st.session_state["df_imputed"] = df.copy()
-
-            st.success(f"✅ File loaded: **{uploaded.name}** — {df.shape[0]} rows × {df.shape[1]} columns")
-            st.markdown("### Preview (first 10 rows)")
-            st.dataframe(df.head(10), use_container_width=True)
-
-            c1, c2, c3, c4 = st.columns(4)
-            with c1:
-                st.markdown(f'<div class="metric-box"><div class="metric-val">{df.shape[0]:,}</div><div class="metric-lbl">Rows</div></div>', unsafe_allow_html=True)
-            with c2:
-                st.markdown(f'<div class="metric-box"><div class="metric-val">{df.shape[1]}</div><div class="metric-lbl">Columns</div></div>', unsafe_allow_html=True)
-            with c3:
-                n_miss_cols = df.isnull().any().sum()
-                st.markdown(f'<div class="metric-box"><div class="metric-val">{n_miss_cols}</div><div class="metric-lbl">Columns w/ Missings</div></div>', unsafe_allow_html=True)
-            with c4:
-                total_miss = df.isnull().sum().sum()
-                pct_miss = round(total_miss / df.size * 100, 1)
-                st.markdown(f'<div class="metric-box"><div class="metric-val">{pct_miss}%</div><div class="metric-lbl">Overall Missing Rate</div></div>', unsafe_allow_html=True)
-
-            st.markdown("### Column Types & Missingness")
-            type_df = pd.DataFrame({
-                "Column":    df.columns,
-                "Dtype":     df.dtypes.astype(str).values,
-                "Missing":   df.isnull().sum().values,
-                "Missing %": (df.isnull().mean() * 100).round(2).values,
-            })
-            st.dataframe(type_df, use_container_width=True, hide_index=True)
-
-        except Exception as e:
-            st.error(f"Could not read file: {e}")
+    var_ok = feas_med["variance"]["var_drop_pct"] <= 20  # a negative value (variance increase) also passes this test
+    corr_ok = feas_med["correlation"]["verdict"] != "fail"  # "ok" and "warn" both count as acceptable
+    skew_val = abs(feas_med["skewness"].get("value", 0))  # magnitude only; a missing "value" key is treated as 0
+
+    if var_ok and corr_ok:  # both checks passed: recommend a simple central-tendency fill
+        if skew_val <= 1:  # |skew| <= 1 -> mean, otherwise median
+            return f"Mean Imputation{indicator_suffix}"
+        else:
+            return f"Median Imputation{indicator_suffix}"
     else:
-        st.info("👆 Upload a CSV to get started.")
+        if miss_pct > 30:  # a check failed: MICE above 30% missing, KNN otherwise
+            return f"MICE Imputer{indicator_suffix}"
+        else:
+            return f"KNN Imputer{indicator_suffix}"
 
 
 # ════════════════════════════════════════════════════════════════════
-#  STEP 2 — SELECT TARGET COLUMN
+#  SIDEBAR NAVIGATION
 # ════════════════════════════════════════════════════════════════════
+STEPS = ["1 · Upload & Split", "2 · Overview", "3 · Column Diagnostics", "4 · Feasibility Gate", "5 · Final Report"]  # sidebar labels, one per phase
 
-elif step == STEPS[1]:
-    st.markdown('<div class="main-title">🎯 Step 2 — Select Target Column</div>', unsafe_allow_html=True)
-    st.markdown('<div class="main-sub">The target column (y) is used in Test 3 to detect MNAR patterns and is excluded from feature analysis.</div>', unsafe_allow_html=True)
-
-    df = st.session_state.get("df")
-    if df is None:
-        st.warning("⚠️ Please upload a CSV in Step 1 first.")
-    else:
-        target = st.selectbox(
-            "Select the output / target column:",
-            options=df.columns.tolist(),
-            index=len(df.columns) - 1,
-        )
-        if st.button("✅ Confirm Target Column", type="primary"):
-            st.session_state["target_col"] = target
-            st.success(f"Target column set to: **{target}**")
-
-        if st.session_state.get("target_col"):
-            st.info(f"Current target: **{st.session_state['target_col']}**")
-            tc = st.session_state["target_col"]
-            col_data = df[tc]
-            st.markdown("#### Target Column Distribution")
-            fig, ax = plt.subplots(figsize=(7, 3))
-            if pd.api.types.is_numeric_dtype(col_data):
-                col_data.dropna().hist(bins=30, ax=ax, color="#17172b", edgecolor="white")
-                ax.set_xlabel(tc); ax.set_ylabel("Count")
-            else:
-                vc = col_data.value_counts().head(15)
-                vc.plot(kind="bar", ax=ax, color="#17172b")
-                ax.set_ylabel("Count")
-            ax.set_title(f"Distribution of '{tc}'")
-            plt.tight_layout()
-            st.pyplot(fig)
-            plt.close()
+with st.sidebar:  # everything in this block renders inside the sidebar
+    st.markdown("## 🔬 Missing Value Analyzer")
+    st.markdown("---")
+    step = st.radio("Navigate:", STEPS, label_visibility="collapsed")  # the chosen label is kept in module-level `step`
+    st.markdown("---")
+    if st.session_state.get("df_train") is not None:  # status lines appear only once a train set is stored
+        st.markdown(f"**Train set:** {st.session_state['df_train'].shape[0]} rows × {st.session_state['df_train'].shape[1]} cols")
+        st.markdown(f"**Diagnosed:** {len(st.session_state['col_diagnostics'])} columns")  # direct [] lookup: relies on col_diagnostics existing whenever df_train does (render_step1 stores both in one update)
+    st.markdown("<small style='color:#9090c0'>Analysis runs on TRAIN SET only to prevent data leakage.</small>", unsafe_allow_html=True)
 
 
 # ════════════════════════════════════════════════════════════════════
-#  STEP 3 — OVERVIEW & PATTERNS
+#  STEP 1 — UPLOAD & SPLIT
 # ════════════════════════════════════════════════════════════════════
+def render_step1():  # Step 1 page: upload a CSV, choose the target column and train size, then split and store the frames
+    st.markdown('<div class="main-title">📂 Step 1 — Upload CSV & Train/Test Split</div>', unsafe_allow_html=True)
+    uploaded = st.file_uploader("Choose a CSV file", type=["csv"])
+    if not uploaded: return st.info("👆 Upload a CSV file to begin.")  # nothing uploaded yet: show the prompt and stop
 
-elif step == STEPS[2]:
-    st.markdown('<div class="main-title">📊 Step 3 — Overview & Patterns</div>', unsafe_allow_html=True)
-    st.markdown('<div class="main-sub">Bird\'s-eye view of missingness across the dataset, including heatmaps and co-missingness patterns.</div>', unsafe_allow_html=True)
-
-    df = st.session_state.get("df")
-    target_col = st.session_state.get("target_col")
-
-    if df is None:
-        st.warning("⚠️ Please upload a CSV in Step 1 first.")
-    else:
-        X = df.drop(columns=[target_col]) if target_col and target_col in df.columns else df
-        summary = missing_summary_df(X)
+    df = pd.read_csv(uploaded)  # NOTE(review): no error handling here — a parse failure raises out of this function
+    st.success(f"✅ Loaded **{uploaded.name}**")
 
-        if summary.empty:
-            st.success("🎉 No missing values found in the dataset features!")
-        else:
-            st.markdown(f"### {len(summary)} column(s) have missing values")
-            st.dataframe(summary.style.background_gradient(subset=["Missing %"], cmap="YlOrRd"),
-                         use_container_width=True)
-
-            # ── Missing % bar chart
-            st.markdown('<div class="section-header">📉 Missing % per Column</div>', unsafe_allow_html=True)
-            miss_cols = summary.index.tolist()
-            fig_bar, ax_bar = plt.subplots(figsize=(max(7, len(miss_cols) * 0.9), 4))
-            colors = ["#9e2210" if v > 30 else "#7a4d00" if v > 15 else "#0d6b3a" for v in summary["Missing %"]]
-            ax_bar.barh(summary.index[::-1], summary["Missing %"][::-1], color=colors[::-1], edgecolor="white")
-            ax_bar.axvline(5,  color="#89d9ac", linewidth=1.5, linestyle="--", label="5% threshold")
-            ax_bar.axvline(15, color="#f0cc7a", linewidth=1.5, linestyle="--", label="15% threshold")
-            ax_bar.axvline(30, color="#f5a898", linewidth=1.5, linestyle="--", label="30% threshold")
-            ax_bar.set_xlabel("Missing %"); ax_bar.set_title("Missing % per Column")
-            ax_bar.legend(loc="lower right", fontsize=8)
-            plt.tight_layout()
-            st.pyplot(fig_bar)
-            plt.close()
-
-            # ── Heatmap + Correlation tabs
-            st.markdown('<div class="section-header">🗺 Missingness Patterns</div>', unsafe_allow_html=True)
-            tab_hm, tab_corr = st.tabs(["Missing Heatmap", "Missingness Correlation"])
-            with tab_hm:
-                fig_hm = plot_missing_heatmap(X)
-                if fig_hm:
-                    st.pyplot(fig_hm); plt.close(fig_hm)
-                    st.caption("Dark = missing, light = present. Each column is a row.")
-                else:
-                    st.info("No missing values to display.")
-            with tab_corr:
-                fig_corr = plot_missingness_correlation(X)
-                if fig_corr:
-                    st.pyplot(fig_corr); plt.close(fig_corr)
-                    st.caption("Near +1: columns tend to be missing together. Near −1: rarely missing simultaneously.")
-                else:
-                    st.info("Need at least 2 columns with missing values for this chart.")
-
-            # ── Correlation among numerical features
-            num_cols_x, _ = identify_columns(X)
-            if len(num_cols_x) >= 2:
-                st.markdown('<div class="section-header">📈 Feature Correlations (Numerical)</div>', unsafe_allow_html=True)
-                valid = [c for c in num_cols_x if X[c].isnull().mean() < 1.0]
-                if len(valid) >= 2:
-                    corr = X[valid].corr()
-                    strong = (corr.abs() > 0.5) & (corr != 1.0)
-                    if strong.any().any():
-                        fig_fc, ax_fc = plt.subplots(figsize=(max(8, len(valid) * 0.9), max(7, len(valid) * 0.8)))
-                        mask = np.triu(np.ones_like(corr, dtype=bool))
-                        display_corr = corr.where(corr.abs() > 0.5)
-                        sns.heatmap(display_corr, annot=False, cmap="RdYlGn", center=0,
-                                    mask=mask, square=True, linewidths=0.5,
-                                    cbar_kws={"shrink": 0.8}, ax=ax_fc, vmin=-1, vmax=1)
-                        ax_fc.set_title("Strong Correlations (|r| > 0.5) — Numerical Features",
-                                        fontsize=13, fontweight="bold", pad=12)
-                        plt.tight_layout()
-                        st.pyplot(fig_fc); plt.close(fig_fc)
-
-                        # Correlation pairs table
-                        pairs = []
-                        seen = set()
-                        for i, c1 in enumerate(corr.columns):
-                            for j, c2 in enumerate(corr.columns):
-                                if i >= j: continue
-                                v = corr.loc[c1, c2]
-                                if abs(v) > 0.5:
-                                    key = tuple(sorted([c1, c2]))
-                                    if key not in seen:
-                                        seen.add(key)
-                                        pairs.append({"Column A": c1, "Column B": c2,
-                                                      "Correlation": round(v, 4),
-                                                      "Correlation %": f"{round(v * 100, 2)}%"})
-                        if pairs:
-                            corr_table = pd.DataFrame(pairs).sort_values("Correlation", key=abs, ascending=False)
-                            st.markdown("**Strong Correlation Pairs (|r| > 0.5)**")
-                            st.dataframe(corr_table, use_container_width=True, hide_index=True)
-                    else:
-                        st.info("No strong correlations (|r| > 0.5) found among numerical features.")
+    col1, col2 = st.columns(2)  # target picker on the left, train-size slider on the right
+    target = col1.selectbox("Target column (Y):", df.columns.tolist(), index=len(df.columns)-1)  # defaults to the last column
+    split_pct = col2.slider("Train size:", 50, 95, 80, 5, format="%d%%")  # positional args: min 50, max 95, default 80, step 5
 
+    if st.button("✅ Confirm & Split", type="primary"):
+        df_train, df_test = train_test_split(df, train_size=split_pct/100.0, random_state=42)  # fixed seed keeps the split reproducible
+        st.session_state.update({"df_full": df, "df_train": df_train.reset_index(drop=True), "df_test": df_test.reset_index(drop=True), "target_col": target, "col_diagnostics": {}})  # stored frames get a fresh index; col_diagnostics is reset to {} on every split
+        st.success("✅ Split complete!")
+        st.dataframe(df_train.head(), use_container_width=True)  # previews the local df_train, which still carries its pre-reset index
 
 # ════════════════════════════════════════════════════════════════════
-#  STEP 4 — MECHANISM DASHBOARD  (from app_tanisha.py)
+#  STEP 2 — OVERVIEW
 # ════════════════════════════════════════════════════════════════════
+def render_step2():
+    st.markdown('<div class="main-title">📊 Step 2 — Missing Value Overview</div>', unsafe_allow_html=True)
+    df = st.session_state.get("df_train")
+    if df is None: return st.warning("⚠️ Please complete Step 1.")
 
-elif step == STEPS[3]:
-    st.markdown('<div class="main-title">🧪 Step 4 — Mechanism Dashboard</div>', unsafe_allow_html=True)
-    st.markdown('<div class="main-sub">Automated MCAR/MAR/MNAR detection via Chi-square & Logistic Regression, plus outlier/variance analysis and deep per-column exploration.</div>', unsafe_allow_html=True)
-
-    df = st.session_state.get("df")
-    target_col = st.session_state.get("target_col")
-
-    if df is None:
-        st.warning("⚠️ Please upload a CSV in Step 1 first.")
-    elif target_col is None:
-        st.warning("⚠️ Please select a target column in Step 2 first.")
-    else:
-        X = df.drop(columns=[target_col])
-        y = df[target_col]
-        num_cols, cat_cols = identify_columns(X)
-
-        # ── Train-test split
-        st.markdown('<div class="section-header">✂️ Train-Test Split (80 / 20)</div>', unsafe_allow_html=True)
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-        sc1, sc2 = st.columns(2)
-        with sc1: st.markdown(f"**Training Set** — X_train: `{X_train.shape}` · y_train: `{y_train.shape}`")
-        with sc2: st.markdown(f"**Test Set** — X_test: `{X_test.shape}` · y_test: `{y_test.shape}`")
-
-        # ── Mechanism diagnosis
-        st.markdown('<div class="section-header">🔬 Missing Data Mechanism Diagnosis (Chi-square + Logistic Regression)</div>', unsafe_allow_html=True)
-        missing_feature_cols = [c for c in X.columns if X[c].isnull().any()]
-
-        if not missing_feature_cols:
-            st.success("No missing values in feature columns — nothing to diagnose.")
-            mechanism_results = {}
-        else:
-            cached = st.session_state.get("mechanism_results_lr", {})
-            if not cached:
-                with st.spinner("Running MCAR (Chi-square) and MAR (Logistic Regression) tests…"):
-                    mechanism_results = {}
-                    for col in missing_feature_cols:
-                        mech, reason = diagnose_mechanism_lr(X, col, num_cols)
-                        mechanism_results[col] = {"mechanism": mech, "reason": reason}
-                st.session_state["mechanism_results_lr"] = mechanism_results
-            else:
-                mechanism_results = cached
-
-            badge_map = {"MCAR": "badge-mcar", "MAR": "badge-mar", "MNAR": "badge-mnar"}
-            for col, res in mechanism_results.items():
-                mech = res["mechanism"]
-                pct  = round(X[col].isnull().mean() * 100, 2)
-                with st.expander(f"🔎 **{col}** — {mech}  |  {pct}% missing"):
-                    st.markdown(f'<span class="{badge_map[mech]}">{mech}</span>&nbsp;&nbsp;{res["reason"]}',
-                                unsafe_allow_html=True)
-
-        # ── Outlier Detection & Variance Impact
-        st.markdown('<div class="section-header">⚡ Outlier Detection & Variance Impact</div>', unsafe_allow_html=True)
-        outlier_data = {}
-        for col in num_cols:
-            n_out = detect_outliers_iqr(X[col])
-            vb, va, vi = variance_impact(X[col])
-            outlier_data[col] = {
-                "Missing %": round(X[col].isnull().mean() * 100, 2),
-                "Outliers (IQR)": n_out,
-                "Variance (before impute)": vb,
-                "Variance (after mean impute)": va,
-                "Variance Impact (Δ)": vi,
-            }
-        if outlier_data:
-            out_df = (pd.DataFrame(outlier_data).T.reset_index()
-                      .rename(columns={"index": "Column"})
-                      .sort_values("Outliers (IQR)", ascending=False))
-
-            def color_outliers(val):
-                if isinstance(val, (int, float)):
-                    if val > 50: return "background-color: #f8d7da; color: #721c24;"
-                    if val > 10: return "background-color: #fff3cd; color: #856404;"
-                return ""
-            st.dataframe(out_df.style.applymap(color_outliers, subset=["Outliers (IQR)"]),
-                         use_container_width=True, hide_index=True)
-        else:
-            st.info("No numerical columns available for outlier analysis.")
-
-        # ── Final Diagnosis Table
-        st.markdown('<div class="section-header">📋 Final Diagnosis Table</div>', unsafe_allow_html=True)
-        diag_rows = []
-        for col in X.columns:
-            mp   = round(X[col].isnull().mean() * 100, 2)
-            mech = mechanism_results.get(col, {}).get("mechanism", "N/A") if col in missing_feature_cols else "N/A"
-            diag_rows.append({
-                "Column": col, "Missing %": mp,
-                "Mechanism": mech, "Severity": severity(mp) if mp > 0 else "None",
-                "Outliers": outlier_data.get(col, {}).get("Outliers (IQR)", "—"),
-                "Variance Impact (Δ)": outlier_data.get(col, {}).get("Variance Impact (Δ)", "—"),
-            })
-        diag_df = pd.DataFrame(diag_rows).sort_values("Missing %", ascending=False).reset_index(drop=True)
-
-        sev_colors  = {"High": "background-color: #f8d7da; color: #721c24;",
-                       "Moderate": "background-color: #fff3cd; color: #856404;",
-                       "Low": "background-color: #d4edda; color: #155724;"}
-        mech_colors = {"MCAR": "background-color: #d4edda; color: #155724;",
-                       "MAR":  "background-color: #fff3cd; color: #856404;",
-                       "MNAR": "background-color: #f8d7da; color: #721c24;"}
-
-        def color_diag_row(row):
-            mech_style = mech_colors.get(row["Mechanism"], "")
-            sev_style  = sev_colors.get(row["Severity"], "")
-            return ["", "", mech_style, sev_style, "", ""]
-
-        st.dataframe(diag_df.style.apply(color_diag_row, axis=1),
-                     use_container_width=True, hide_index=True)
-
-        # ── Per-Column Deep Analysis
-        st.markdown('<div class="section-header">🔬 Per-Column Deep Analysis</div>', unsafe_allow_html=True)
-        col_label_to_name = {}
-        for col in X.columns:
-            mp_l     = round(X[col].isnull().mean() * 100, 1)
-            type_lbl = "Num" if col in num_cols else "Cat"
-            mech_lbl = mechanism_results.get(col, {}).get("mechanism", "—") if col in missing_feature_cols else "complete"
-            label    = f"{col}  [{type_lbl} · {mp_l}% missing · {mech_lbl}]"
-            col_label_to_name[label] = col
-
-        chosen_label = st.selectbox(
-            "Select a column to analyse in detail:",
-            options=["— choose a column —"] + list(col_label_to_name.keys()),
-            key="deep_col_select"
-        )
-        if chosen_label != "— choose a column —":
-            chosen_col = col_label_to_name[chosen_label]
-            with st.spinner(f"Analysing `{chosen_col}`…"):
-                st.markdown("---")
-                render_per_column_deep_analysis(
-                    df=X, col=chosen_col,
-                    num_cols=num_cols, cat_cols=cat_cols,
-                    mechanism_results=mechanism_results,
-                )
-                st.markdown("---")
-
-        # ── Insights
-        st.markdown('<div class="section-header">💡 Data Analysis Insights</div>', unsafe_allow_html=True)
-        high_miss = diag_df[diag_df["Missing %"] >= 20]["Column"].tolist()
-        mar_cols  = diag_df[diag_df["Mechanism"] == "MAR"]["Column"].tolist()
-        mnar_cols = diag_df[diag_df["Mechanism"] == "MNAR"]["Column"].tolist()
-        high_out  = [c for c in num_cols if outlier_data.get(c, {}).get("Outliers (IQR)", 0) > 10]
-
-        insights = [
-            "Missing data must be understood <b>before</b> any imputation or modeling to avoid biased results.",
-            (f"<b>{', '.join(high_miss)}</b> have ≥20% missing values — treat with caution or consider dropping."
-             if high_miss else "No columns have critically high (≥20%) missing rates — dataset quality looks reasonable."),
-            (f"Columns <b>{', '.join(mar_cols)}</b> show MAR behavior — KNN/MICE imputation is viable."
-             if mar_cols else "No columns confirmed MAR."),
-            (f"Columns <b>{', '.join(mnar_cols)}</b> are likely MNAR — create a missing indicator before imputing."
-             if mnar_cols else "No columns flagged as MNAR."),
-            (f"Columns <b>{', '.join(high_out)}</b> have many outliers — prefer median over mean imputation."
-             if high_out else "Outlier counts appear manageable across numerical columns."),
-            "Correlated missingness indicates data is likely <b>not MCAR</b> — jointly missing due to a common cause.",
-            "MCAR is rare in real-world datasets. Most missingness in practice is MAR or MNAR.",
-            "MNAR <b>cannot be confirmed statistically</b> from observed data alone — domain knowledge is essential.",
-        ]
-        st.markdown('<div class="insight-box"><ul>' + "".join(f"<li>{i}</li>" for i in insights) + "</ul></div>",
-                    unsafe_allow_html=True)
-
-        # ── Theory
-        st.markdown('<div class="section-header">📚 Theoretical Background</div>', unsafe_allow_html=True)
-        theories = [
-            ("🔵 MCAR — Missing Completely At Random",
-             "The probability of missingness is entirely independent of observed and unobserved data. "
-             "Listwise deletion is unbiased under MCAR, though it reduces sample size."),
-            ("🟡 MAR — Missing At Random",
-             "Missingness depends on <i>observed</i> data but not on the missing value itself. "
-             "Multiple imputation or FIML methods produce valid estimates under MAR."),
-            ("🔴 MNAR — Missing Not At Random",
-             "Missingness depends on the <i>unobserved value itself</i>. Cannot be detected from observed data. "
-             "Requires sensitivity analysis and domain knowledge. Ignoring MNAR produces biased results."),
-            ("📐 Why Chi-Square for MCAR Testing?",
-             "Chi-square tests independence between the binary missingness indicator and binned numeric predictors. "
-             "No significant association is consistent with MCAR, though this only confirms pairwise independence."),
-            ("🤖 Why Logistic Regression for MAR Detection?",
-             "LR models the binary missingness indicator as a function of all observed features. "
-             "Accuracy substantially above the majority-class baseline indicates MAR."),
-            ("📉 Why MNAR Cannot Be Confirmed Statistically",
-             "MNAR depends on unobserved values — data we do not have. No statistical test on observed data "
-             "can definitively confirm it. Domain reasoning about the data generation process is required."),
-            ("📦 Outliers and Their Impact on Variance",
-             "Outliers (>1.5×IQR) inflate variance and distort the mean. Mean imputation artificially collapses "
-             "variance because all missing cells receive the same central value, masking true data spread."),
-        ]
-        for title, body in theories:
-            st.markdown(f'<div class="theory-box"><h4>{title}</h4><p>{body}</p></div>', unsafe_allow_html=True)
+    miss_cols = [c for c in df.columns if df[c].isnull().any()]
+    if not miss_cols: return st.success("🎉 No missing values!")
 
+    summary = pd.DataFrame({"Missing Count": df.isnull().sum(), "Missing %": (df.isnull().sum()/len(df)*100).round(2)})
+    st.dataframe(summary[summary["Missing Count"] > 0].sort_values("Missing %", ascending=False).style.background_gradient(cmap="YlOrRd"), use_container_width=True)
 
 # ════════════════════════════════════════════════════════════════════
-#  STEP 5 — COLUMN DIAGNOSTICS  (from app.py — 3 statistical tests)
+#  STEP 3 — DIAGNOSTICS
 # ════════════════════════════════════════════════════════════════════
+def render_step3():
+    st.markdown('<div class="main-title">🧪 Step 3 — Per-Column Diagnostics</div>', unsafe_allow_html=True)
+    df, target = st.session_state.get("df_train"), st.session_state.get("target_col")
+    if df is None: return st.warning("⚠️ Please complete Step 1.")
+
+    miss_cols = [c for c in df.columns if df[c].isnull().any()]
+    if not miss_cols: return st.success("🎉 No missing values.")
+
+    col1, col2 = st.columns([1, 4])
+    selected_col = col1.selectbox("Select column to view:", miss_cols)
+    run_single = col1.button("▶ Run Diagnostics")
+    run_all = col2.button("▶ Run ALL columns", type="primary")
+
+    if run_single:
+        run_single_diagnostic(df, selected_col, target)
+    if run_all:
+        progress = st.progress(0, text="Running diagnostics...")
+        for i, c in enumerate(miss_cols):
+            run_single_diagnostic(df, c, target)
+            progress.progress((i+1)/len(miss_cols), text=f"Diagnosing: {c}")
+        progress.empty()
+        st.success(f"✅ Diagnosed {len(miss_cols)} columns.")
+
+    if selected_col in st.session_state["col_diagnostics"]:
+        res = st.session_state["col_diagnostics"][selected_col]
+        little, t_feat, t_target = res["little"], res["t_feat"], res["t_target"]
+        
+        st.markdown("---")
+
+        # ── Mechanism verdict card ──
+        card_class = {"MCAR":"card-mcar","MAR":"card-mar","MNAR":"card-mnar"}[res["mechanism"]]
+        emoji = {"MCAR":"🟢","MAR":"🟠","MNAR":"🔴"}[res["mechanism"]]
+        st.markdown(
+            f'<div class="{card_class}">'
+            f'<div class="verdict-label">{emoji} Mechanism: {res["mechanism"]} — {res["confidence"]} Confidence</div>'
+            f'<div class="verdict-desc">{res["explanation"]}</div>'
+            f'<div class="verdict-desc" style="margin-top:6px">Missing: <b>{res["miss_pct"]}%</b> &nbsp;|&nbsp; dtype: <b>{res["dtype"]}</b></div>'
+            f'</div>',
+            unsafe_allow_html=True
+        )
 
-elif step == STEPS[4]:
-    st.markdown('<div class="main-title">🔬 Step 5 — Column Diagnostics</div>', unsafe_allow_html=True)
-    st.markdown('<div class="main-sub">Run three independent statistical tests per column to determine the missing-data mechanism (MCAR / MAR / MNAR).</div>', unsafe_allow_html=True)
-
-    df = st.session_state.get("df")
-    target_col = st.session_state.get("target_col")
-
-    if df is None:
-        st.warning("⚠️ Please upload a CSV in Step 1 first.")
-    elif target_col is None:
-        st.warning("⚠️ Please select a target column in Step 2 first.")
-    else:
-        summary = missing_summary_df(df)
-        if summary.empty:
-            st.success("🎉 No missing values — nothing to diagnose.")
-        else:
-            miss_cols = summary.index.tolist()
-            selected_col = st.selectbox("Select a column to analyse:", miss_cols)
-            miss_pct = summary.loc[selected_col, "Missing %"]
-            dtype_str = str(df[selected_col].dtype)
-
-            st.markdown(f"---")
-            st.markdown(f"### Analysing column: `{selected_col}`")
-
-            lv, risk_txt, risk_bg, risk_fg = missingness_risk_level(miss_pct)
-            c1, c2, c3 = st.columns(3)
-            with c1:
-                st.markdown(f'<div class="metric-box"><div class="metric-val" style="color:#9e2210">{miss_pct:.1f}%</div><div class="metric-lbl">Missing</div></div>', unsafe_allow_html=True)
-            with c2:
-                st.markdown(f'<div class="metric-box"><div class="metric-val">{dtype_str}</div><div class="metric-lbl">Data Type</div></div>', unsafe_allow_html=True)
-            with c3:
-                n_miss = int(summary.loc[selected_col, "Missing Count"])
-                st.markdown(f'<div class="metric-box"><div class="metric-val">{n_miss:,}</div><div class="metric-lbl">Missing Rows</div></div>', unsafe_allow_html=True)
-
-            st.markdown(
-                f'<div style="background:{risk_bg};border:1.5px solid {risk_fg};border-radius:8px;padding:10px 16px;margin:12px 0;">'
-                f'<b style="color:{risk_fg}">{lv} Missingness</b> — {risk_txt}</div>',
-                unsafe_allow_html=True,
-            )
-
-            # ── Test 1
-            st.markdown("#### 🔬 Test 1 — Pattern Analysis (Missingness Map)")
-            t1 = test1_pattern_analysis(df, selected_col)
-            fig, axes = plt.subplots(1, 2, figsize=(12, 3))
-            sample_size = min(300, len(df))
-            idx_sample = df.sample(n=sample_size, random_state=42).index if len(df) > sample_size else df.index
-            ind_sample = t1["indicator"].loc[idx_sample]
-            axes[0].scatter(range(len(ind_sample)), ind_sample.values,
-                c=["#9e2210" if v else "#89d9ac" for v in ind_sample.values], s=8, alpha=0.8)
-            axes[0].set_yticks([0, 1]); axes[0].set_yticklabels(["Present", "Missing"])
-            axes[0].set_title(f"Missingness Pattern ({sample_size} rows)")
-            axes[0].set_xlabel("Row index")
-            roll = t1["indicator"].rolling(50, min_periods=1).mean()
-            axes[1].plot(roll.values, color="#17172b", linewidth=1.2)
-            axes[1].set_title("Rolling Miss Rate (window=50)")
-            axes[1].set_xlabel("Row index"); axes[1].set_ylabel("Miss rate")
-            axes[1].axhline(t1["miss_pct"] / 100, color="#9e2210", linestyle="--", label="Mean miss rate")
-            axes[1].legend(fontsize=8)
-            plt.tight_layout()
-            st.pyplot(fig); plt.close()
-
-            scatter_icon = "🟢" if t1["scattered"] else "🟠"
-            st.markdown(f'<div class="card-info"><b>{scatter_icon} {t1["signal"]}</b><br><small>Cluster ratio: {t1["cluster_ratio"]:.2f} (higher = more scattered = MCAR signal)</small></div>', unsafe_allow_html=True)
-
-            # ── Test 2
-            st.markdown("#### 🔬 Test 2 — Feature Dependency")
-            t2 = test2_feature_dependency(df, selected_col)
-            if t2["diffs"]:
-                top_diffs = dict(sorted(t2["diffs"].items(), key=lambda x: -x[1])[:15])
-                fig2, ax2 = plt.subplots(figsize=(10, max(3, len(top_diffs) * 0.45)))
-                colors = ["#9e2210" if v >= 30 else "#f0a040" if v >= 10 else "#89d9ac" for v in top_diffs.values()]
-                ax2.barh(list(top_diffs.keys())[::-1], list(top_diffs.values())[::-1], color=colors[::-1], edgecolor="white")
-                ax2.axvline(5,  color="#89d9ac", linewidth=1.5, linestyle="--", label="5% weak")
-                ax2.axvline(10, color="#f0cc7a", linewidth=1.5, linestyle="--", label="10% MAR signal")
-                ax2.axvline(30, color="#f5a898", linewidth=1.5, linestyle="--", label="30% strong")
-                ax2.set_xlabel("Distribution Difference (%)")
-                ax2.set_title("Feature Distribution Difference")
-                ax2.legend(fontsize=8)
-                plt.tight_layout()
-                st.pyplot(fig2); plt.close()
-                dep_icon = "🟢" if t2["max_diff"] < 5 else "🟠" if t2["max_diff"] < 30 else "🔴"
-                st.markdown(f'<div class="card-info"><b>{dep_icon} {t2["signal"]}</b><br><small>Max difference: {t2["max_diff"]:.1f}%</small></div>', unsafe_allow_html=True)
-            else:
-                st.info("Not enough data to compare feature distributions.")
-                t2 = {"diffs": {}, "max_diff": 0.0, "signal": "Insufficient data"}
-
-            # ── Test 3
-            st.markdown("#### 🔬 Test 3 — Target Dependency")
-            if selected_col == target_col:
-                st.warning("⚠️ Selected column IS the target column. Test 3 skipped.")
-                t3 = {"diff_pct": None, "signal": "Skipped — column is target"}
-            else:
-                t3 = test3_target_dependency(df, selected_col, target_col)
-                if t3["diff_pct"] is not None:
-                    missing_mask = df[selected_col].isnull()
-                    fig3, ax3 = plt.subplots(figsize=(7, 3.5))
-                    if pd.api.types.is_numeric_dtype(df[target_col]):
-                        miss_target = df.loc[missing_mask, target_col].dropna()
-                        obs_target  = df.loc[~missing_mask, target_col].dropna()
-                        ax3.hist(obs_target,  bins=25, alpha=0.7, label="Target when present",  color="#17172b", edgecolor="white")
-                        ax3.hist(miss_target, bins=25, alpha=0.7, label="Target when missing",  color="#9e2210", edgecolor="white")
-                        ax3.set_xlabel(target_col); ax3.set_ylabel("Count")
-                        ax3.legend()
-                    else:
-                        miss_target = df.loc[missing_mask, target_col].value_counts(normalize=True) * 100
-                        obs_target  = df.loc[~missing_mask, target_col].value_counts(normalize=True) * 100
-                        cats = list(set(miss_target.index) | set(obs_target.index))
-                        x = np.arange(len(cats))
-                        ax3.bar(x - 0.2, [obs_target.get(c, 0) for c in cats],  0.4, label="Present",  color="#17172b")
-                        ax3.bar(x + 0.2, [miss_target.get(c, 0) for c in cats], 0.4, label="Missing", color="#9e2210")
-                        ax3.set_xticks(x); ax3.set_xticklabels(cats, rotation=30)
-                        ax3.set_ylabel("% of group"); ax3.legend()
-                    ax3.set_title(f"Target ({target_col}) dist: present vs missing in '{selected_col}'")
-                    plt.tight_layout()
-                    st.pyplot(fig3); plt.close()
-                    dep_icon = "🟢" if (t3["diff_pct"] or 0) < 5 else "🟠" if (t3["diff_pct"] or 0) < 10 else "🔴"
-                    st.markdown(f'<div class="card-info"><b>{dep_icon} {t3["signal"]}</b><br><small>Target diff: {t3["diff_pct"]}%</small></div>', unsafe_allow_html=True)
-                else:
-                    st.info(t3["signal"])
-
-            # ── Verdict
-            st.markdown("---")
-            st.markdown("### 🏁 Mechanism Verdict")
-            mechanism, confidence, explanation = classify_mechanism(t1, t2, t3)
-            card_class = {"MCAR": "card-mcar", "MAR": "card-mar", "MNAR": "card-mnar"}[mechanism]
-            emoji = {"MCAR": "🟢", "MAR": "🟠", "MNAR": "🔴"}[mechanism]
-            st.markdown(
-                f'<div class="{card_class}">'
-                f'<div class="verdict-label">{emoji} {mechanism} — {confidence} confidence</div>'
-                f'<div class="verdict-desc">{explanation}</div>'
-                f'</div>',
-                unsafe_allow_html=True,
+        # ══ TEST 1: Little's MCAR ══
+        st.markdown('<div class="test-header">🔬 Test 1 — Little\'s MCAR Test</div>', unsafe_allow_html=True)
+        with st.expander("ℹ️ What does this test measure?", expanded=False):
+            st.markdown("""
+            **Little's MCAR test** checks if missingness is completely random.
+            - **H₀ (null):** Data is Missing Completely At Random (MCAR)  
+            - **p ≥ 0.05:** Fail to reject → data may be MCAR  
+            - **p < 0.05:** Reject → systematic missingness detected
+            """)
+
+        little_rows = [{
+            "Test": "Little's MCAR",
+            "χ² Statistic": little.get("chi2", "N/A"),
+            "Degrees of Freedom": little.get("df", "N/A"),
+            "p-value": little.get("p_value", "N/A"),
+            "Verdict": little.get("verdict", "N/A"),
+            "Reject MCAR?": "✅ Yes — systematic" if little.get("reject_mcar") else "❌ No — may be MCAR"
+        }]
+        st.dataframe(pd.DataFrame(little_rows), use_container_width=True, hide_index=True)
+
+        # ══ TEST 2: Target Dependency ══
+        st.markdown('<div class="test-header">🎯 Test 2 — Target Dependency Test</div>', unsafe_allow_html=True)
+        with st.expander("ℹ️ What does this test measure?", expanded=False):
+            st.markdown("""
+            Tests if the **target variable** has different values when this column is missing vs. observed.
+            - **Numeric target:** z-test or Welch t-test  
+            - **Categorical target:** Chi-squared test  
+            - **Significant (p<0.05) + large diff % → MNAR** (missingness depends on outcome)
+            """)
+
+        tgt_rows = [{
+            "Test Applied": "z-test / Welch t-test / Chi²",
+            "p-value": t_target.get("p_value", "N/A"),
+            "Target Diff %": f'{t_target.get("diff_pct", 0):.1f}%' if t_target.get("diff_pct") is not None else "N/A",
+            "Significant (p<0.05)?": "✅ Yes" if t_target.get("significant") else "❌ No",
+            "Interpretation": t_target.get("signal", "N/A")
+        }]
+        st.dataframe(pd.DataFrame(tgt_rows), use_container_width=True, hide_index=True)
+
+        # ══ TEST 3: Feature Dependency ══
+        st.markdown('<div class="test-header">🔗 Test 3 — Feature Dependency Tests</div>', unsafe_allow_html=True)
+        with st.expander("ℹ️ What does this test measure?", expanded=False):
+            st.markdown("""
+            For each other feature, tests if values differ **significantly** between rows where this column is missing vs. observed.
+            - **Numeric features:** z-test (n≥30) or Welch t-test  
+            - **Categorical features:** Chi-squared test  
+            - **Many significant features (>30%) → MAR** (missingness explained by observed data)
+            """)
+
+        # Summary row first
+        summary_cols = st.columns(3)
+        summary_cols[0].metric("Features Tested", t_feat.get("total_tested", 0))
+        summary_cols[1].metric("Significant (p<0.05)", t_feat.get("n_significant", 0))
+        summary_cols[2].metric("% Significant", f'{t_feat.get("sig_pct", 0):.1f}%')
+
+        if t_feat["results"]:
+            rows = []
+            for f, r in t_feat["results"].items():
+                rows.append({
+                    "Feature": f,
+                    "Data Type": r["type"].capitalize(),
+                    "Test Used": r["test"],
+                    "Test Statistic": r["stat"],
+                    "p-value": r["p_value"],
+                    "p < 0.05?": "✅ Significant" if r["significant"] else "—"
+                })
+            feat_df = pd.DataFrame(rows).sort_values("p-value")
+
+            def highlight_sig(row):
+                if row["p < 0.05?"] == "✅ Significant":
+                    return ["background-color:#ffe4e1; color:#900000"] * len(row)
+                return [""] * len(row)
+
+            st.dataframe(
+                feat_df.style.apply(highlight_sig, axis=1),
+                use_container_width=True,
+                hide_index=True
             )
-
-            # Strategy chips
-            col_type_str = "Numerical" if pd.api.types.is_numeric_dtype(df[selected_col]) else "Categorical"
-            chips_html = strategy_chips_html(mechanism, miss_pct, col_type_str)
-            if chips_html:
-                st.markdown("**Recommended Strategy Options**")
-                st.markdown(chips_html, unsafe_allow_html=True)
-
-            st.session_state["col_results"][selected_col] = {
-                "mechanism": mechanism,
-                "confidence": confidence,
-                "miss_pct": miss_pct,
-                "dtype": dtype_str,
-                "t1": t1, "t2": t2, "t3": t3,
-            }
+        else:
+            st.info("No feature dependency results available (insufficient data or no other columns).")
+
+        # ══ Decision Logic Summary ══
+        st.markdown('<div class="test-header">🧠 Decision Logic Summary</div>', unsafe_allow_html=True)
+        logic_rows = [
+            {"Rule Check": "Little's test rejects MCAR?", "Result": "✅ Yes" if little.get("reject_mcar") else "❌ No"},
+            {"Rule Check": "Target differs significantly?", "Result": "✅ Yes" if t_target.get("significant") else "❌ No"},
+            {"Rule Check": "Target diff magnitude", "Result": f'{t_target.get("diff_pct", 0):.1f}% difference'},
+            {"Rule Check": "% of features with significant diff", "Result": f'{t_feat.get("sig_pct", 0):.1f}%'},
+            {"Rule Check": "→ Final Mechanism", "Result": f'{res["mechanism"]} ({res["confidence"]} confidence)'},
+        ]
+        st.dataframe(pd.DataFrame(logic_rows), use_container_width=True, hide_index=True)
 
 
 # ════════════════════════════════════════════════════════════════════
-#  STEP 6 — STRATEGY & IMPUTATION
+#  STEP 4 — FEASIBILITY GATE (Interactive)
 # ════════════════════════════════════════════════════════════════════
+def render_step4():
+    st.markdown('<div class="main-title">⚖️ Step 4 — Imputation Feasibility Gate</div>', unsafe_allow_html=True)
+    
+    with st.expander("📚 Theory & Guide: Why test imputation mathematically? (Click to expand)"):
+        st.markdown("""
+        <div class="theory-box">
+            <h4>Why test imputation mathematically?</h4>
+            <p>Single-value imputations (like filling blanks with Mean or Median) are dangerous if overused. They can:</p>
+            <ul>
+                <li><b>Collapse Variance:</b> If you fill 20% of the data with the same number, the spread of your data shrinks unnaturally.</li>
+                <li><b>Create Artificial Outliers:</b> Because the variance (IQR) shrank, real valid data points at the edges suddenly look like outliers!</li>
+                <li><b>Destroy Correlation:</b> Assigning a median weight to someone without considering their height breaks the natural relationship between features.</li>
+            </ul>
+            <p><b>KNN and MICE</b> solve this by acting like mini machine-learning models — they look at other features to make an educated guess, preserving variance and correlations.</p>
+        </div>
+        """, unsafe_allow_html=True)
+
+    df, target = st.session_state.get("df_train"), st.session_state.get("target_col")
+    col_diag = st.session_state.get("col_diagnostics", {})
+    if not col_diag: return st.warning("⚠️ Please run diagnostics in Step 3 first.")
+
+    numeric_diag = {c: v for c, v in col_diag.items() if pd.api.types.is_numeric_dtype(df[c])}
+    if not numeric_diag: return st.info("No numeric columns available.")
+
+    col1, col2 = st.columns([1, 2])
+    selected_col  = col1.selectbox("Select numeric column:", list(numeric_diag.keys()))
+    impute_choice = col2.radio("Simulate impact of:", ["Mean", "Median", "KNN", "MICE"], horizontal=True)
+
+    if st.button(f"▶ Simulate {impute_choice} Imputation", type="primary"):
+        with st.spinner(f"Running {impute_choice} simulation (may take a moment for KNN/MICE)..."):
+            feas = feasibility_checks(df, selected_col, target, impute_choice)
+
+        if not feas.get("applicable"):
+            return st.error("Column not applicable for numeric feasibility checks.")
+
+        ICONS  = {"ok": "✅", "warn": "⚠️", "fail": "❌"}
+        COLORS = {"ok": "stat-ok", "warn": "stat-warn", "fail": "stat-fail"}
+
+        # ── Big Stats Banner ──
+        st.markdown("### 📊 Imputation Impact — Key Statistics")
+        m1, m2, m3, m4 = st.columns(4)
+
+        var_pct   = feas["variance"]["var_drop_pct"]
+        var_verd  = feas["variance"]["verdict"]
+        new_out   = feas["outliers"]["new_outliers"]
+        out_verd  = feas["outliers"]["verdict"]
+        corr_verd = feas["correlation"]["verdict"]
+        corr_max  = feas["correlation"]["max_shift"]
+        skew_val  = feas["skewness"]["value"]
+        skew_verd = feas["skewness"]["verdict"]
+
+        var_color  = "#900000" if var_verd == "fail"  else ("#7a4f00" if var_verd == "warn"  else "#0a5c30")
+        out_color  = "#900000" if out_verd == "fail"  else ("#7a4f00" if out_verd == "warn"  else "#0a5c30")
+        corr_color = "#900000" if corr_verd == "fail" else ("#7a4f00" if corr_verd == "warn" else "#0a5c30")
+        skew_color = "#900000" if skew_verd == "fail" else ("#7a4f00" if skew_verd == "warn" else "#0a5c30")
+
+        m1.markdown(
+            f'<div class="big-stat-box {COLORS[var_verd]}">'
+            f'<div class="big-stat-val" style="color:{var_color}">-{var_pct:.1f}%</div>'
+            f'<div class="big-stat-lbl">Variance Change</div>'
+            f'<div class="big-stat-sub">{ICONS[var_verd]} {"Safe" if var_verd=="ok" else "Caution" if var_verd=="warn" else "High Risk"}</div>'
+            f'</div>', unsafe_allow_html=True
+        )
+        m2.markdown(
+            f'<div class="big-stat-box {COLORS[out_verd]}">'
+            f'<div class="big-stat-val" style="color:{out_color}">+{new_out}</div>'
+            f'<div class="big-stat-lbl">New Outliers Created</div>'
+            f'<div class="big-stat-sub">{ICONS[out_verd]} Before: {feas["outliers"]["outliers_before"]} → After: {feas["outliers"]["outliers_after"]}</div>'
+            f'</div>', unsafe_allow_html=True
+        )
+        m3.markdown(
+            f'<div class="big-stat-box {COLORS[corr_verd]}">'
+            f'<div class="big-stat-val" style="color:{corr_color}">Δ{corr_max:.3f}</div>'
+            f'<div class="big-stat-lbl">Max Corr. Shift</div>'
+            f'<div class="big-stat-sub">{ICONS[corr_verd]} {corr_verd.capitalize()}</div>'
+            f'</div>', unsafe_allow_html=True
+        )
+        m4.markdown(
+            f'<div class="big-stat-box {COLORS[skew_verd]}">'
+            f'<div class="big-stat-val" style="color:{skew_color}">{skew_val:.3f}</div>'
+            f'<div class="big-stat-lbl">Skewness</div>'
+            f'<div class="big-stat-sub">{ICONS[skew_verd]} {"Low" if abs(skew_val)<=1 else "Moderate" if abs(skew_val)<=3 else "High"} skew</div>'
+            f'</div>', unsafe_allow_html=True
+        )
 
-elif step == STEPS[5]:
-    st.markdown('<div class="main-title">🛠 Step 6 — Strategy & Imputation</div>', unsafe_allow_html=True)
-    st.markdown('<div class="main-sub">Based on the mechanism and missing %, select and apply the right strategy for each column.</div>', unsafe_allow_html=True)
-
-    df = st.session_state.get("df")
-    col_results = st.session_state.get("col_results", {})
-
-    if df is None:
-        st.warning("⚠️ Please upload a CSV in Step 1 first.")
-    elif not col_results:
-        st.warning("⚠️ Please run diagnostics in Step 5 for at least one column first.")
-    else:
-        df_imputed = (df.copy() if st.session_state.get("df_imputed") is None
-                      else st.session_state["df_imputed"].copy())
-
-        for col, res in col_results.items():
-            mechanism = res["mechanism"]
-            miss_pct  = res["miss_pct"]
-            dtype_str = res["dtype"]
-
-            st.markdown(f"### Column: `{col}`")
-            st.markdown(f"**Mechanism:** {mechanism} | **Missing:** {miss_pct:.1f}% | **Type:** `{dtype_str}`")
-
-            rec = recommend_strategy(mechanism, miss_pct, dtype_str)
-            card_class = "card-mcar" if mechanism == "MCAR" else "card-mar" if mechanism == "MAR" else "card-mnar"
-            st.markdown(
-                f'<div class="{card_class}">'
-                f'<b>Recommended: {rec["method"]}</b><br>'
-                f'<small>{rec["reason"]}</small><br>'
-                f'<small>{rec["adv"]}</small><br>'
-                f'<small style="color:#888">{rec["disadv"]}</small>'
-                f'</div>',
-                unsafe_allow_html=True,
-            )
+        st.markdown("---")
+        
+        # ── KDE Plots — Two clear separate charts ──
+        st.markdown("### 📈 Distribution Comparison (KDE)")
 
-            if rec["add_indicator"]:
-                st.markdown(
-                    '<div class="card-warn">🚩 <b>Missing Indicator will be added BEFORE imputation</b> — '
-                    "missingness itself carries signal for this column.</div>",
-                    unsafe_allow_html=True,
-                )
-
-            is_num = "float" in dtype_str or "int" in dtype_str
-            strategy_options = (
-                ["Mean", "Median", "Constant (0)", "Drop rows", "Keep as-is"] if is_num
-                else ["Mode", "Constant ('Unknown')", "Drop rows", "Keep as-is"]
-            )
-            chosen = st.selectbox(
-                f"Apply strategy for `{col}`:",
-                options=strategy_options,
-                key=f"strategy_{col}",
-            )
+        series   = df[selected_col].dropna()
+        imputed  = feas["imputed_series"]
+        miss_pct_col = df[selected_col].isnull().mean() * 100
 
-            if st.button(f"▶ Apply to `{col}`", key=f"apply_{col}"):
-                if rec["add_indicator"]:
-                    indicator_col = f"{col}_was_missing"
-                    df_imputed[indicator_col] = df[col].isnull().astype(int)
-                    st.info(f"✅ Created indicator column: `{indicator_col}`")
-
-                if chosen == "Mean":
-                    fill_val = df[col].mean()
-                    df_imputed[col] = df_imputed[col].fillna(fill_val)
-                    st.success(f"✅ Imputed with mean = {fill_val:.4f}")
-                elif chosen == "Median":
-                    fill_val = df[col].median()
-                    df_imputed[col] = df_imputed[col].fillna(fill_val)
-                    st.success(f"✅ Imputed with median = {fill_val:.4f}")
-                elif chosen == "Mode":
-                    fill_val = df[col].mode().iloc[0]
-                    df_imputed[col] = df_imputed[col].fillna(fill_val)
-                    st.success(f"✅ Imputed with mode = {fill_val}")
-                elif chosen in ("Constant (0)", "Constant ('Unknown')"):
-                    fill_val = 0 if is_num else "Unknown"
-                    df_imputed[col] = df_imputed[col].fillna(fill_val)
-                    st.success(f"✅ Imputed with constant = {fill_val}")
-                elif chosen == "Drop rows":
-                    before = len(df_imputed)
-                    df_imputed = df_imputed.dropna(subset=[col])
-                    after = len(df_imputed)
-                    st.success(f"✅ Dropped {before - after} rows with missing `{col}`")
-                else:
-                    st.info("No imputation applied.")
+        fig, axes = plt.subplots(1, 2, figsize=(16, 5))
+        fig.patch.set_facecolor('#fafafa')
 
-                st.session_state["df_imputed"] = df_imputed
-
-            st.markdown("<hr class='divider'>", unsafe_allow_html=True)
+        # Plot 1: Overlapping KDE
+        ax = axes[0]
+        ax.set_facecolor('#f8f8f8')
+        try:
+            from scipy.stats import gaussian_kde
+            # Original KDE
+            kde_orig = gaussian_kde(series.values, bw_method='scott')
+            x_range = np.linspace(min(series.min(), imputed.min()), max(series.max(), imputed.max()), 300)
+            ax.fill_between(x_range, kde_orig(x_range), alpha=0.35, color='#17172b', label='Original (observed only)')
+            ax.plot(x_range, kde_orig(x_range), color='#17172b', lw=2.5)
+
+            # Imputed KDE
+            kde_imp = gaussian_kde(imputed.values, bw_method='scott')
+            ax.fill_between(x_range, kde_imp(x_range), alpha=0.35, color='#d6336c', label=f'After {impute_choice}')
+            ax.plot(x_range, kde_imp(x_range), color='#d6336c', lw=2.5, linestyle='--')
+        except Exception:
+            ax.hist(series.values, bins=25, alpha=0.5, color='#17172b', label='Original', density=True)
+            ax.hist(imputed.values, bins=25, alpha=0.4, color='#d6336c', label=f'After {impute_choice}', density=True)
+
+        ax.set_title(f'KDE: Original vs After {impute_choice}\n({miss_pct_col:.1f}% was missing)', fontsize=13, fontweight='bold', pad=12)
+        ax.set_xlabel(selected_col, fontsize=11)
+        ax.set_ylabel('Density', fontsize=11)
+        ax.legend(fontsize=10)
+        ax.grid(axis='y', alpha=0.3)
+        ax.spines[['top','right']].set_visible(False)
+
+        # Plot 2: Box plots side by side
+        ax2 = axes[1]
+        ax2.set_facecolor('#f8f8f8')
+        bp = ax2.boxplot(
+            [series.values, imputed.values],
+            labels=['Original\n(non-missing)', f'After\n{impute_choice}'],
+            patch_artist=True,
+            widths=0.5,
+            medianprops=dict(color='#d6336c', linewidth=2.5),
+            flierprops=dict(marker='o', markerfacecolor='#d6336c', markersize=5, alpha=0.5),
+            whiskerprops=dict(linewidth=1.5),
+            capprops=dict(linewidth=1.5),
+        )
+        bp['boxes'][0].set_facecolor('#c8d8f0')
+        bp['boxes'][1].set_facecolor('#f5c6d0')
 
-        st.markdown("### 📥 Download Imputed Dataset")
-        df_out = st.session_state.get("df_imputed", df)
-        csv_bytes = df_out.to_csv(index=False).encode("utf-8")
-        st.download_button(
-            label="⬇ Download imputed CSV",
-            data=csv_bytes,
-            file_name="imputed_dataset.csv",
-            mime="text/csv",
+        # Annotate variance change
+        ax2.set_title(
+            f'Spread & Outliers\nVariance Change: {var_pct:.1f}% | New Outliers: +{new_out}',
+            fontsize=13, fontweight='bold', pad=12
         )
-        st.dataframe(df_out.head(10), use_container_width=True)
+        ax2.set_ylabel('Value', fontsize=11)
+        ax2.grid(axis='y', alpha=0.3)
+        ax2.spines[['top','right']].set_visible(False)
+
+        plt.tight_layout(pad=2.5)
+        st.pyplot(fig, use_container_width=True)
+        plt.close()
+
+        # ── Correlation Details ──
+        st.markdown("---")
+        st.markdown("#### 🔗 Correlation Preservation Details")
+        st.markdown(f'<div class="card-{"ok" if corr_verd=="ok" else "warn" if corr_verd=="warn" else "danger"}">{ICONS[corr_verd]} <b>{feas["correlation"]["msg"]}</b></div>', unsafe_allow_html=True)
+        if feas["correlation"]["details"]:
+            rows = [{
+                "Feature": f,
+                "r (before)": r["r_before"],
+                "r (after)": r["r_after"],
+                "Δ (shift)": r["delta"],
+                "Sign Flip?": "🚨 YES" if r["sign_flip"] else "No"
+            } for f, r in feas["correlation"]["details"].items()]
+            corr_df = pd.DataFrame(rows).sort_values("Δ (shift)", ascending=False)
+            def highlight_corr(row):
+                if row["Sign Flip?"] == "🚨 YES": return ["background-color:#fde8e8; color:#900000"] * len(row)
+                if row["Δ (shift)"] > 0.10:       return ["background-color:#fff0ed; color:#900000"] * len(row)
+                return [""] * len(row)
+            st.dataframe(corr_df.style.apply(highlight_corr, axis=1), use_container_width=True, hide_index=True)
 
 
 # ════════════════════════════════════════════════════════════════════
-#  STEP 7 — VALIDATION CHECKS
+#  STEP 5 — FINAL REPORT
 # ════════════════════════════════════════════════════════════════════
+def render_step5():
+    """Step 5 page: legend plus one report row per diagnosed column, highest missing % first, tinted by mechanism."""
+    st.markdown('<div class="main-title">📋 Step 5 — Final Diagnostic Report</div>', unsafe_allow_html=True)
+    df, target = st.session_state.get("df_train"), st.session_state.get("target_col")
+    col_diag = st.session_state.get("col_diagnostics", {})
+    if not col_diag: return st.warning("⚠️ Run diagnostics in Step 3 first.")  # nothing diagnosed yet
+
+    # ── Legend ──
+    with st.expander("📖 How to read the Recommended Strategy column"):
+        st.markdown("""
+        | Label | Meaning |
+        |-------|---------|
+        | **Drop Rows** | MCAR + <5% missing — safe to delete affected rows |
+        | **Drop Column** | >70% missing — too little data to impute reliably |
+        | **Mean Imputation** | Low-skew numeric, variance loss is acceptable |
+        | **Median Imputation** | Skewed numeric; median is more robust than mean |
+        | **Mode Imputation** | Categorical / non-numeric columns |
+        | **KNN Imputer** | Moderate missingness; feature relationships preserved |
+        | **MICE Imputer** | High missingness (>30%); multiple-imputation approach |
+        | **+ Missing Indicator** | Added when mechanism is MNAR, or MAR ≥ 10% missing — add a binary flag column `col_missing` alongside imputed values |
+        """)
+
+    # Rank on the numeric miss_pct: sorting the formatted "12.3%" strings is lexicographic ("5.0%" would outrank "12.3%").
+    table_rows = []
+    for col, res in sorted(col_diag.items(), key=lambda item: item[1]["miss_pct"], reverse=True):
+        rec_string = get_auto_recommendation(df, col, target, res["mechanism"], res["miss_pct"], res["dtype"])
+        table_rows.append({
+            "Column": col,
+            "dtype": res["dtype"],
+            "Missing %": f'{res["miss_pct"]:.1f}%',
+            "Mechanism": res["mechanism"],
+            "Confidence": res["confidence"],
+            "Recommended Strategy": rec_string
+        })
+    report_df = pd.DataFrame(table_rows)  # rows are already in display order
+
+    def color_rows(row):  # one CSS string per cell; mechanisms not listed below stay unstyled
+        mech_colors = {
+            "MNAR": "background-color:#fff0ed; color:#000",
+            "MAR":  "background-color:#fffaeb; color:#000",
+            "MCAR": "background-color:#edfaf3; color:#000"
+        }
+        return [mech_colors.get(row["Mechanism"], "")] * len(row)
+
+    st.dataframe(
+        report_df.style.apply(color_rows, axis=1),
+        use_container_width=True,
+        hide_index=True
+    )
 
-elif step == STEPS[6]:
-    st.markdown('<div class="main-title">✅ Step 7 — Validation Checks</div>', unsafe_allow_html=True)
-    st.markdown('<div class="main-sub">Confirm that imputation preserved statistical properties and did not introduce bias.</div>', unsafe_allow_html=True)
-
-    df_orig    = st.session_state.get("df")
-    df_imputed = st.session_state.get("df_imputed")
-    col_results = st.session_state.get("col_results", {})
-
-    if df_orig is None or df_imputed is None:
-        st.warning("⚠️ Complete Steps 1–6 first.")
-    elif not col_results:
-        st.warning("⚠️ Run diagnostics in Step 5 and apply a strategy in Step 6 first.")
-    else:
-        numeric_cols = [c for c in col_results if pd.api.types.is_numeric_dtype(df_orig[c])]
-
-        if not numeric_cols:
-            st.info("Validation checks apply to numeric columns only. No numeric columns were diagnosed.")
-        else:
-            for col in numeric_cols:
-                before = df_orig[col].dropna()
-                after  = df_imputed[col].dropna()
-
-                if len(after) == 0 or len(before) == 0:
-                    continue
-
-                st.markdown(f"### `{col}`")
-                chk = validation_checks(before, after)
-
-                c1, c2, c3 = st.columns(3)
-                def chk_icon(ok): return "✅" if ok else "⚠️"
-                with c1:
-                    st.markdown(
-                        f'<div class="metric-box">'
-                        f'<div class="metric-val">{chk_icon(chk["mean_ok"])} {chk["mean_shift_pct"]}%</div>'
-                        f'<div class="metric-lbl">Mean shift (≤5% OK)</div>'
-                        f'</div>', unsafe_allow_html=True)
-                with c2:
-                    st.markdown(
-                        f'<div class="metric-box">'
-                        f'<div class="metric-val">{chk_icon(chk["median_ok"])} {chk["median_shift_pct"]}%</div>'
-                        f'<div class="metric-lbl">Median shift (≤3% OK)</div>'
-                        f'</div>', unsafe_allow_html=True)
-                with c3:
-                    st.markdown(
-                        f'<div class="metric-box">'
-                        f'<div class="metric-val">{chk_icon(chk["var_ok"])} {chk["var_change_pct"]}%</div>'
-                        f'<div class="metric-lbl">Variance change (≤20% OK)</div>'
-                        f'</div>', unsafe_allow_html=True)
-
-                fig_v, ax_v = plt.subplots(figsize=(8, 3.5))
-                ax_v.hist(before.values, bins=30, alpha=0.55, label="Before imputation", color="#17172b", edgecolor="white")
-                ax_v.hist(after.values,  bins=30, alpha=0.55, label="After imputation",  color="#6020a0", edgecolor="white")
-                ax_v.axvline(before.mean(), color="#17172b", linewidth=1.5, linestyle="--", label=f"Mean before: {before.mean():.2f}")
-                ax_v.axvline(after.mean(),  color="#6020a0", linewidth=1.5, linestyle="--", label=f"Mean after: {after.mean():.2f}")
-                ax_v.set_title(f"Distribution: '{col}' before vs after imputation")
-                ax_v.legend(fontsize=8)
-                plt.tight_layout()
-                st.pyplot(fig_v); plt.close()
-
-                target_col = st.session_state.get("target_col")
-                if target_col and target_col in df_orig.columns and pd.api.types.is_numeric_dtype(df_orig[target_col]):
-                    corr_before = df_orig[[col, target_col]].dropna().corr().iloc[0, 1]
-                    corr_after  = df_imputed[[col, target_col]].dropna().corr().iloc[0, 1]
-                    delta = abs(corr_before - corr_after)
-                    sign_flip = (corr_before * corr_after < 0)
-                    icon = "✅" if delta <= 0.05 and not sign_flip else "⚠️"
-                    st.markdown(
-                        f'<div class="card-info">{icon} <b>Correlation with target:</b> '
-                        f'Before = {corr_before:.3f} → After = {corr_after:.3f} | Δ = {delta:.3f}'
-                        + (" 🚨 Sign flipped!" if sign_flip else "")
-                        + "</div>",
-                        unsafe_allow_html=True,
-                    )
-
-                st.markdown("<hr class='divider'>", unsafe_allow_html=True)
-
-        st.markdown("### ⚠️ Common Pitfalls Checklist")
-        pitfalls = [
-            "Each column treated independently?",
-            "Imputation done AFTER train-test split?",
-            "Target variable NOT used as imputation predictor?",
-            "Missing indicator created BEFORE imputation for MNAR/MAR ≥10%?",
-            "Validation checked beyond just accuracy?",
-        ]
-        for txt in pitfalls:
-            st.checkbox(txt, value=False, key=f"pitfall_{txt[:20]}")
-
-        st.markdown(
-            '<div class="card-warn">'
-            '<b>↻ Repeat Steps 5–6 for every column independently.</b><br>'
-            'One column may be MCAR (drop rows), another MAR (KNN), another MNAR (indicator + median). '
-            'Never apply one method to all columns at once.'
-            '</div>',
-            unsafe_allow_html=True,
-        )
-
-st.markdown("---")
-st.caption("🔬 Missing Value Intelligence Suite · Merged from app.py + app_tanisha.py · Built with Streamlit, pandas, scikit-learn, scipy, seaborn")
\ No newline at end of file
+    # ── Summary counts: number of diagnosed columns classified under each mechanism ──
+    st.markdown("---")
+    c1, c2, c3 = st.columns(3)  # one tile each for MCAR / MAR / MNAR
+    mcar_n = sum(1 for r in col_diag.values() if r["mechanism"] == "MCAR")
+    mar_n  = sum(1 for r in col_diag.values() if r["mechanism"] == "MAR")
+    mnar_n = sum(1 for r in col_diag.values() if r["mechanism"] == "MNAR")
+    c1.markdown(f'<div class="metric-box"><div class="metric-val" style="color:#0a5c30">🟢 {mcar_n}</div><div class="metric-lbl">MCAR columns</div></div>', unsafe_allow_html=True)
+    c2.markdown(f'<div class="metric-box"><div class="metric-val" style="color:#7a4f00">🟠 {mar_n}</div><div class="metric-lbl">MAR columns</div></div>', unsafe_allow_html=True)
+    c3.markdown(f'<div class="metric-box"><div class="metric-val" style="color:#900000">🔴 {mnar_n}</div><div class="metric-lbl">MNAR columns</div></div>', unsafe_allow_html=True)
+
+
+# Route the selected step to its page renderer; entries are checked in order and the first match wins.
+for _idx, _render in enumerate((render_step1, render_step2, render_step3, render_step4, render_step5)):
+    if step == STEPS[_idx]:
+        _render()
+        break