Spaces:

GS123
/

Missing_values_app

Running

App Files Files Community

GS123 committed 22 days ago

Commit

9057123

verified ·

1 Parent(s): 0f10ed4

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +1492 -35

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,1497 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+"""
+Missing Value Intelligence Suite — Merged App
+Combines the stepwise pipeline (app.py) with the comprehensive dashboard (app_tanisha.py)
+into a unified 7-step workflow.
+"""
 import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import seaborn as sns
+from scipy import stats
+from scipy.stats import chi2_contingency, ks_2samp, shapiro, skew, kurtosis
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+import warnings
+warnings.filterwarnings("ignore")
+# ─────────────────────────── Page config ────────────────────────────
+# Sets the page title and icon, uses the wide layout, and starts with the sidebar expanded.
+st.set_page_config(
+    page_title="Missing Value Intelligence Suite",
+    page_icon="🔬",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
+# ─────────────────────────── Custom CSS ─────────────────────────────
+st.markdown("""
+<style>
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
+html, body, [class*="css"] { font-family: 'Inter', sans-serif; }
+section[data-testid="stSidebar"] {
+    background: #17172b;
+    color: #ffffff;
+}
+section[data-testid="stSidebar"] * { color: #ffffff !important; }
+section[data-testid="stSidebar"] .stSelectbox label,
+section[data-testid="stSidebar"] .stRadio label { color: #c0c0e0 !important; }
+.main-title {
+    font-size: 2rem;
+    font-weight: 700;
+    color: #17172b;
+    margin-bottom: 0.2rem;
+}
+.main-sub {
+    font-size: 1rem;
+    color: #6060a0;
+    margin-bottom: 1.5rem;
+}
+.section-header {
+    font-size: 1.3rem; font-weight: 600; color: #1a1a2e;
+    background: linear-gradient(90deg, #eef2ff, transparent);
+    padding: 10px 16px; border-left: 4px solid #4f8ef7;
+    border-radius: 4px; margin: 24px 0 14px 0;
+}
+.step-badge {
+    display: inline-block;
+    background: #17172b;
+    color: #fff;
+    font-size: 0.72rem;
+    font-weight: 700;
+    padding: 3px 10px;
+    border-radius: 20px;
+    margin-bottom: 6px;
+    letter-spacing: 0.08em;
+    text-transform: uppercase;
+}
+.card-mcar { background:#edfaf3; border:2px solid #89d9ac; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
+.card-mar  { background:#fffaeb; border:2px solid #f0cc7a; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
+.card-mnar { background:#fff0ed; border:2px solid #f5a898; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
+.card-info { background:#eef2ff; border:2px solid #bdc8f5; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
+.card-warn { background:#fff8e1; border:2px solid #ffe082; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
+.card-strat{ background:#f8f0ff; border:2px solid #c8a0f0; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
+.verdict-label { font-size: 1.1rem; font-weight: 700; margin-bottom: 4px; }
+.verdict-desc  { font-size: 0.88rem; color: #444; }
+.metric-box {
+    background: #f5f3ee;
+    border-radius: 8px;
+    padding: 12px 16px;
+    text-align: center;
+}
+.metric-val { font-size: 1.4rem; font-weight: 700; color: #17172b; }
+.metric-lbl { font-size: 0.78rem; color: #6060a0; margin-top: 2px; }
+.metric-card {
+    background: white; border-radius: 10px; padding: 18px 24px;
+    box-shadow: 0 2px 8px rgba(0,0,0,0.08); text-align: center;
+}
+.metric-card .val { font-size: 2rem; font-weight: 700; color: #4f8ef7; }
+.metric-card .lbl { font-size: 0.82rem; color: #666; margin-top: 4px; }
+.col-stat-card {
+    background: white; border-radius: 10px; padding: 14px 18px;
+    box-shadow: 0 1px 6px rgba(0,0,0,0.07); text-align: center;
+}
+.col-stat-card .cv { font-size: 1.5rem; font-weight: 700; color: #1a1a2e; }
+.col-stat-card .ck { font-size: 0.75rem; color: #888; margin-top: 3px;
+    text-transform: uppercase; letter-spacing: .05em; }
+.badge-mcar { background:#d4edda; color:#155724; padding:3px 10px; border-radius:12px; font-size:0.82rem; font-weight:600; }
+.badge-mar  { background:#fff3cd; color:#856404; padding:3px 10px; border-radius:12px; font-size:0.82rem; font-weight:600; }
+.badge-mnar { background:#f8d7da; color:#721c24; padding:3px 10px; border-radius:12px; font-size:0.82rem; font-weight:600; }
+.strat-chip { display:inline-block; padding:4px 14px; border-radius:20px;
+    font-size:0.82rem; font-weight:600; margin:3px 3px; }
+.chip-green  { background:#d4edda; color:#155724; border:1px solid #89d9ac; }
+.chip-yellow { background:#fff3cd; color:#856404; border:1px solid #f0cc7a; }
+.chip-red    { background:#f8d7da; color:#721c24; border:1px solid #f5a898; }
+.chip-blue   { background:#dce3ff; color:#2a3da0; border:1px solid #bdc8f5; }
+.insight-box {
+    background: #f0f7ff; border: 1px solid #bdd5ff;
+    border-radius: 8px; padding: 16px 20px; margin: 12px 0;
+}
+.insight-box li { margin: 6px 0; color: #1a3a6e; font-size: 0.92rem; }
+.theory-box {
+    background: #fafafa; border: 1px solid #e0e0e0;
+    border-radius: 8px; padding: 16px 20px; margin: 12px 0;
+}
+.theory-box h4 { color: #333; margin-bottom: 8px; }
+.theory-box p  { color: #555; font-size: 0.91rem; line-height: 1.6; }
+code { background: #f0f0f8; padding: 2px 6px; border-radius: 4px; font-size: 0.85rem; }
+hr.divider { border: none; border-top: 2px solid #e0ddd8; margin: 1.5rem 0; }
+</style>
+""", unsafe_allow_html=True)
+# ════════════════════════════════════════════════════════════════════
+#  SHARED HELPER FUNCTIONS
+# ════════════════════════════════════════════════════════════════════
+def missing_summary_df(df: pd.DataFrame) -> pd.DataFrame:
+    """Summarise the columns of ``df`` that contain at least one null.
+
+    The result is indexed by column name and carries "Missing Count",
+    "Missing %" (rounded to two decimals) and "Dtype" (dtype as a string),
+    ordered by "Missing %" from highest to lowest.
+    """
+    null_counts = df.isnull().sum()
+    null_pct = null_counts / len(df) * 100
+    table = pd.DataFrame({
+        "Missing Count": null_counts,
+        "Missing %": null_pct.round(2),
+        "Dtype": df.dtypes.astype(str),
+    })
+    has_missing = table["Missing Count"] > 0
+    return table[has_missing].sort_values("Missing %", ascending=False)
+def missing_summary_typed(df, num_cols, cat_cols):
+    """List the columns of ``df`` that have missing values, tagged by data type.
+
+    Args:
+        df: frame to inspect.
+        num_cols: names treated as "Numerical"; every other column is
+            labelled "Categorical".
+        cat_cols: accepted for call-site symmetry; not consulted.
+
+    Returns:
+        A frame with columns "Column", "Data Type", "Missing Count" and
+        "Missing %" (two decimals), sorted by "Missing %" descending with a
+        fresh 0..n-1 index. Columns without missing values are left out.
+        An input with no columns yields an empty frame with the same four
+        columns rather than raising.
+    """
+    out_columns = ["Column", "Data Type", "Missing Count", "Missing %"]
+    total = len(df)
+    rows = []
+    for col in df.columns:
+        mc = int(df[col].isnull().sum())
+        # An empty frame has nothing missing; avoid the 0/0 division.
+        pct = mc / total * 100 if total else 0.0
+        dtype = "Numerical" if col in num_cols else "Categorical"
+        rows.append({"Column": col, "Data Type": dtype,
+                     "Missing Count": mc, "Missing %": round(pct, 2)})
+    # Passing the column list keeps the schema even when `rows` is empty,
+    # so the sort below cannot fail with KeyError.
+    result = pd.DataFrame(rows, columns=out_columns)
+    result = result.sort_values("Missing %", ascending=False)
+    return result[result["Missing Count"] > 0].reset_index(drop=True)
+def severity(pct):
+    """Bucket a missing percentage: under 5 is "Low", under 20 is "Moderate", otherwise "High"."""
+    for upper_bound, label in ((5, "Low"), (20, "Moderate")):
+        if pct < upper_bound:
+            return label
+    return "High"
+def identify_columns(df):
+    """Split the column names of ``df`` into (numeric, non-numeric) lists."""
+    def _names(**selector):
+        return df.select_dtypes(**selector).columns.tolist()
+    return _names(include=[np.number]), _names(exclude=[np.number])
+def missingness_risk_level(pct: float) -> tuple:
+    """Map a missing percentage to a 4-tuple ``(band label, guidance, hex colour, hex colour)``.
+
+    Bands are up to 5, up to 15, up to 30 (upper bounds inclusive) and above 30.
+    NOTE(review): the two hex strings look like a background/text colour pair —
+    confirm against the caller.
+    """
+    bands = (
+        (5, ("≤5%", "Very low missingness. Low risk of bias.", "#edfaf3", "#0d6b3a")),
+        (15, ("5–15%", "Moderate. Imputation preferred over dropping.", "#fffaeb", "#7a4d00")),
+        (30, ("15–30%", "High. Dropping loses too much data. Advanced imputation + missing indicator mandatory.", "#fff0ed", "#9e2210")),
+    )
+    for upper_bound, verdict in bands:
+        if pct <= upper_bound:
+            return verdict
+    return ">30%", "Very high. Consider dropping the column. Re-evaluate column usefulness + domain check.", "#fde8e8", "#7a0000"
+# ── Statistical Tests (from app.py) ──────────────────────────────────
+def test1_pattern_analysis(df: pd.DataFrame, col: str) -> dict:
+    """Test 1: judge whether the nulls of ``col`` are scattered or clustered by row order.
+
+    The missing indicator is split into runs of equal values. The run count is
+    divided by the largest run count achievable with this many missing and
+    observed rows; a ratio above 0.5 is reported as scattered.
+
+    Returns:
+        dict with "indicator" (0/1 Series), "miss_pct", "scattered",
+        "cluster_ratio" and a human-readable "signal".
+    """
+    indicator = df[col].isnull().astype(int)
+    n_rows = len(indicator)
+    n_missing = int(indicator.sum())
+    miss_pct = indicator.mean() * 100
+    # The first row always differs from its (NaN) shifted neighbour, so this
+    # counts runs, not transitions.
+    runs = int((indicator != indicator.shift()).sum())
+    # A 0/1 sequence cannot alternate more often than its rarer value allows:
+    # at most 2*min(missing, observed)+1 runs, and never more than n_rows.
+    # Bounding by the missing count alone made mostly-missing columns look
+    # clustered regardless of their layout.
+    max_possible_runs = min(n_rows, 2 * min(n_missing, n_rows - n_missing) + 1)
+    cluster_ratio = runs / max(max_possible_runs, 1)
+    scattered = cluster_ratio > 0.5
+    return {
+        "indicator": indicator,
+        "miss_pct": miss_pct,
+        "scattered": scattered,
+        "cluster_ratio": cluster_ratio,
+        "signal": "MCAR signal" if scattered else "MAR / MNAR signal (clustered rows)",
+    }
+def test2_feature_dependency(df: pd.DataFrame, col: str) -> dict:
+    """Test 2: measure how much other features differ between rows where ``col`` is missing vs present.
+
+    Numeric features contribute the relative difference of group means (in %).
+    Other features contribute Cramér's V of the missing/present × category
+    table, scaled to 0–100. Features with fewer than 3 non-null values in
+    either group are skipped.
+
+    Returns:
+        dict with "diffs" (feature -> score), "max_diff" and "signal".
+        When either group has fewer than 5 rows, or no feature is comparable,
+        "diffs" is empty and "max_diff" is 0.0.
+    """
+    missing_mask = df[col].isnull()
+    if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
+        return {"diffs": {}, "max_diff": 0.0, "signal": "Insufficient data"}
+    diffs = {}
+    for other_col in df.columns:
+        if other_col == col:
+            continue
+        other = df[other_col]
+        valid = other.notna()
+        miss_vals = other[missing_mask & valid]
+        obs_vals = other[~missing_mask & valid]
+        if len(miss_vals) < 3 or len(obs_vals) < 3:
+            continue
+        try:
+            if pd.api.types.is_numeric_dtype(other):
+                m1, m2 = miss_vals.mean(), obs_vals.mean()
+                denom = max(abs(m2), 1e-9)
+                diffs[other_col] = abs(m1 - m2) / denom * 100
+            else:
+                # Use plain arrays so crosstab pairs labels and values by
+                # position. Passing Series made it align on index; the
+                # hand-built label Series had a duplicated 0..k index, the
+                # call raised, and categorical features were silently skipped.
+                groups = np.where(missing_mask[valid].to_numpy(), "missing", "present")
+                ct = pd.crosstab(groups, other[valid].to_numpy())
+                chi2, _, _, _ = chi2_contingency(ct)
+                n = ct.to_numpy().sum()
+                k = min(ct.shape) - 1
+                diffs[other_col] = np.sqrt(chi2 / (n * max(k, 1))) * 100
+        except (ValueError, TypeError):
+            # Best effort: a feature that cannot be tabulated or tested
+            # (e.g. zero expected counts, unorderable values) is left out.
+            continue
+    if not diffs:
+        return {"diffs": {}, "max_diff": 0.0, "signal": "No comparable features"}
+    max_diff = max(diffs.values())
+    if max_diff < 5:
+        signal = "Weak signal — MCAR likely"
+    elif max_diff < 30:
+        signal = "Strong MAR signal (feature dependency detected)"
+    else:
+        signal = "Very strong dependency — MAR or MNAR"
+    return {"diffs": diffs, "max_diff": max_diff, "signal": signal}
+def test3_target_dependency(df: pd.DataFrame, col: str, target_col: str) -> dict:
+    """Test 3: compare the target between rows where ``col`` is missing and rows where it is present.
+
+    For a numeric target the score is the relative difference of group means
+    (in %). Otherwise it is the largest absolute gap, in percentage points,
+    between the two groups' shares of any single target class.
+
+    Returns:
+        dict with "diff_pct" (rounded to two decimals, or None when it cannot
+        be computed) and a "signal" message. Failures are reported in
+        "signal" instead of being raised.
+    """
+    missing_mask = df[col].isnull()
+    if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
+        return {"diff_pct": None, "signal": "Insufficient data"}
+    try:
+        miss_target = df.loc[missing_mask, target_col].dropna()
+        obs_target = df.loc[~missing_mask, target_col].dropna()
+        # Without this guard an empty group gives a NaN score, which fails
+        # every threshold check and would be reported as "Strong".
+        if miss_target.empty or obs_target.empty:
+            return {"diff_pct": None, "signal": "Insufficient data"}
+        if pd.api.types.is_numeric_dtype(df[target_col]):
+            m1, m2 = miss_target.mean(), obs_target.mean()
+            denom = max(abs(m2), 1e-9)
+            diff_pct = abs(m1 - m2) / denom * 100
+        else:
+            # Compare class by class. Taking each group's own top share
+            # compared different classes whenever the modes differed.
+            share_missing = miss_target.value_counts(normalize=True)
+            share_present = obs_target.value_counts(normalize=True)
+            gaps = share_missing.sub(share_present, fill_value=0).abs()
+            diff_pct = float(gaps.max() * 100)
+        if diff_pct < 5:
+            signal = "No strong signal (<5% target diff)"
+        elif diff_pct < 10:
+            signal = "Moderate target dependency — possible MAR/MNAR"
+        else:
+            signal = "Strong target dependency → MNAR likely (>10% target diff)"
+        return {"diff_pct": round(diff_pct, 2), "signal": signal}
+    except Exception as e:
+        return {"diff_pct": None, "signal": f"Could not compute: {e}"}
+def classify_mechanism(t1: dict, t2: dict, t3: dict) -> tuple:
+    """Combine the three test results into ``(mechanism, confidence, explanation)``.
+
+    Reads "scattered" from ``t1`` (default True), "max_diff" from ``t2``
+    (default 0) and "diff_pct" from ``t3`` (None or missing counts as 0).
+    Rules are checked in order: target dependency, then clustered feature
+    dependency, then any feature dependency, then the clean MCAR case.
+    """
+    is_scattered = t1.get("scattered", True)
+    feature_gap = t2.get("max_diff", 0)
+    target_gap = t3.get("diff_pct") or 0
+    if target_gap > 10:
+        explanation = (
+            f"Target variable differs by {target_gap:.1f}% between missing/present rows. "
+            "The probability of missingness depends on the unobserved value itself."
+        )
+        return "MNAR", "High", explanation
+    if feature_gap >= 10 and not is_scattered:
+        explanation = (
+            f"Feature distributions differ by up to {feature_gap:.1f}% and missing values appear "
+            "clustered — missingness depends on observed features."
+        )
+        return "MAR", "High", explanation
+    if feature_gap >= 5:
+        explanation = (
+            f"Feature distributions differ by up to {feature_gap:.1f}%. "
+            "Missingness likely depends on observed features."
+        )
+        return "MAR", "Moderate", explanation
+    if is_scattered and feature_gap < 5 and target_gap < 5:
+        explanation = (
+            "Values appear randomly scattered, feature distributions are similar across "
+            "groups, and target shows no dependency — consistent with MCAR."
+        )
+        return "MCAR", "High", explanation
+    return "MCAR", "Low", (
+        "Weak signals across all three tests. Treated as MCAR but verify with domain knowledge."
+    )
+# ── Logistic Regression-based mechanism diagnosis (from app_tanisha.py) ──
+def diagnose_mechanism_lr(df, col, num_cols):
+    """Label the missingness of ``col`` as MCAR, MAR or MNAR; returns ``(label, explanation)``.
+
+    Steps, in order:
+      1. With no usable predictor (other columns under 90% null) or fewer
+         than 5 missing rows, return MNAR.
+      2. Chi-square each numeric predictor (median-filled, quartile-binned)
+         against the missing indicator; an average p-value above 0.05 means MCAR.
+      3. Fit a logistic regression of the indicator on all predictors and
+         score it on the same rows; beating the majority-class rate by more
+         than 0.05 means MAR.
+      4. Otherwise (including any failure in step 3) return MNAR.
+    """
+    is_missing = df[col].isnull().astype(int)
+    candidates = [name for name in df.columns
+                  if name != col and df[name].isnull().mean() < 0.9]
+    if not candidates or is_missing.sum() < 5:
+        return "MNAR", "Insufficient data to test; assumed MNAR."
+    p_values = []
+    for name in candidates:
+        if name not in num_cols or df[name].dropna().nunique() <= 1:
+            continue
+        try:
+            filled = df[name].fillna(df[name].median())
+            quartile = pd.qcut(filled, q=4, duplicates="drop", labels=False)
+            table = pd.crosstab(quartile, is_missing)
+            if table.shape[0] > 1 and table.shape[1] > 1:
+                _, p_value, _, _ = chi2_contingency(table)
+                p_values.append(p_value)
+        except Exception:
+            # Best effort: a predictor that cannot be binned or tested is ignored.
+            pass
+    if p_values:
+        avg_p = np.mean(p_values)
+        if avg_p > 0.05:
+            return "MCAR", (f"Chi-square tests show no significant dependency "
+                            f"(avg p={avg_p:.3f} > 0.05). Missingness appears random.")
+    try:
+        features = df[candidates].copy()
+        for name in features.select_dtypes(include="object").columns:
+            features[name] = features[name].astype("category").cat.codes
+        features = features.fillna(features.median(numeric_only=True))
+        scaled = StandardScaler().fit_transform(features)
+        model = LogisticRegression(max_iter=300, solver="lbfgs")
+        model.fit(scaled, is_missing)
+        accuracy = model.score(scaled, is_missing)
+        missing_rate = is_missing.mean()
+        majority_rate = max(missing_rate, 1 - missing_rate)
+        if accuracy > majority_rate + 0.05:
+            return "MAR", (f"Logistic Regression predicts missingness with accuracy {accuracy:.2%} "
+                           f"(baseline {majority_rate:.2%}). Missingness is related to observed variables.")
+    except Exception:
+        # Any failure while fitting falls through to the MNAR default below.
+        pass
+    return "MNAR", "Missingness not explained by observed data. Likely related to the missing value itself — assumed MNAR."
+def recommend_strategy(mechanism: str, miss_pct: float, dtype: str) -> dict:
+    """Pick an imputation strategy from mechanism, missing percentage and dtype.
+
+    Args:
+        mechanism: "MCAR", "MAR" or "MNAR".
+        miss_pct: percentage of missing values in the column.
+        dtype: dtype name; one containing "float" or "int" (case-insensitive,
+            so nullable "Int64"/"Float64" count too) is treated as numeric.
+
+    Returns:
+        dict with "method", "reason", "adv", "disadv" and "add_indicator".
+        A missing indicator is requested for MNAR, for MAR at 10% or more,
+        and for MCAR in the 15–30% band.
+    """
+    dtype_name = dtype.lower()
+    is_num = "float" in dtype_name or "int" in dtype_name
+    mcar_high = mechanism == "MCAR" and 15 < miss_pct <= 30
+    add_indicator = (
+        mechanism == "MNAR"
+        or (mechanism == "MAR" and miss_pct >= 10)
+        or mcar_high
+    )
+    if mechanism == "MCAR" and miss_pct <= 5:
+        method = "Drop rows"
+        reason = "MCAR confirmed and loss is minimal (≤5%). Safe to drop."
+        adv = "✓ No artificial data introduced"
+        disadv = "✗ Loses data — only safe at very low %"
+    elif mechanism in ("MCAR", "MAR") and miss_pct <= 15:
+        if is_num:
+            method = "Median imputation"
+            reason = "Low-moderate missingness. Median is robust to skew and outliers."
+            adv = "✓ Outlier-resistant; recommended default for numeric"
+            disadv = "✗ Reduces variance slightly"
+        else:
+            method = "Mode imputation"
+            reason = "Low-moderate missingness on categorical data."
+            adv = "✓ Preserves category structure"
+            disadv = "✗ Can over-represent dominant category"
+    elif mechanism == "MAR" and miss_pct <= 30:
+        method = "KNN Imputation" if is_num else "Mode / KNN Imputation"
+        reason = "Moderate MAR missingness. KNN leverages feature relationships."
+        adv = "✓ Preserves local patterns; captures inter-feature structure"
+        disadv = "✗ Slow on large datasets; requires scaling"
+    elif mcar_high:
+        # MCAR at 15–30% used to fall through to "drop column" with a reason
+        # claiming >30%. The 15–30% guidance elsewhere in this file calls for
+        # advanced imputation plus a missing indicator, so reuse the KNN
+        # method labels here.
+        method = "KNN Imputation" if is_num else "Mode / KNN Imputation"
+        reason = ("High MCAR missingness (15–30%). Dropping loses too much data; "
+                  "use advanced imputation with a missing indicator.")
+        adv = "✓ Preserves local patterns; captures inter-feature structure"
+        disadv = "✗ Slow on large datasets; requires scaling"
+    elif mechanism == "MAR" and miss_pct > 30:
+        method = "Iterative Imputer (MICE)"
+        reason = "High MAR missingness. MICE models each column as a function of others."
+        adv = "✓ Most statistically principled; accounts for all feature relationships"
+        disadv = "✗ Computationally expensive; risk of instability"
+    elif mechanism == "MNAR":
+        method = "Median + Missing Indicator (mandatory)"
+        reason = "MNAR: the fact of missingness is informative. Indicator must be created BEFORE imputation."
+        adv = "✓ Preserves MNAR signal; lets model learn from missingness"
+        disadv = "✗ Imputation may still be biased; domain expertise required"
+    else:
+        # Reached for MCAR above 30% (and for any unrecognised mechanism).
+        method = "Consider dropping column"
+        reason = f"Missing > 30% with {mechanism}. Evaluate predictive value vs. cost of imputation."
+        adv = "✓ Eliminates noise if column is uninformative"
+        disadv = "✗ Irreversible — verify with domain expert first"
+    return {
+        "method": method,
+        "reason": reason,
+        "adv": adv,
+        "disadv": disadv,
+        "add_indicator": add_indicator,
+    }
+def strategy_chips_html(mech, miss_pct, col_type):
+    """Render the recommended strategies for a column as space-joined ``strat-chip`` spans.
+
+    "CLEAN" yields a single "no action" chip. More than 50% missing adds a
+    drop-column warning ahead of the mechanism-specific chips. A mechanism
+    other than MCAR / MAR / MNAR adds no chips of its own.
+    """
+    if mech == "CLEAN":
+        return '<span class="strat-chip chip-green">✅ No action needed — column is complete</span>'
+    chips = []
+    if miss_pct > 50:
+        chips.append(("⚠ Consider Dropping Column (>50% missing)", "chip-red"))
+    if mech == "MCAR":
+        if miss_pct < 5:
+            chips.append(("Listwise Deletion (safe)", "chip-green"))
+        central = "Median Imputation" if col_type == "Numerical" else "Mode Imputation"
+        chips.append((central, "chip-green"))
+    elif mech == "MAR":
+        chips.extend([
+            ("KNN Imputation", "chip-blue"),
+            ("Iterative Imputer (MICE)", "chip-blue"),
+            ("Group-wise Imputation", "chip-blue"),
+        ])
+        if miss_pct >= 10:
+            chips.append(("Create Missing Indicator (≥10% MAR)", "chip-yellow"))
+    elif mech == "MNAR":
+        chips.extend([
+            ("⚠ Create Missing Indicator FIRST (mandatory)", "chip-red"),
+            ("Constant / Domain-Specific Value", "chip-yellow"),
+            ("Sensitivity Analysis Required", "chip-yellow"),
+        ])
+    spans = [f'<span class="strat-chip {cls}">{lbl}</span>' for lbl, cls in chips]
+    return " ".join(spans)
+def validation_checks(df_before: pd.Series, df_after: pd.Series) -> dict:
+    """Compare a column before and after imputation.
+
+    Reports the relative shift (in %, two decimals) of mean, median and
+    variance, each measured against the "before" value, plus pass flags:
+    mean within 5%, median within 3%, variance within 20%.
+    """
+    mean_before, mean_after = df_before.mean(), df_after.mean()
+    median_before, median_after = df_before.median(), df_after.median()
+    var_before, var_after = df_before.var(), df_after.var()
+    # The 1e-9 floor keeps a zero baseline from dividing by zero.
+    mean_shift = abs(mean_before - mean_after) / max(abs(mean_before), 1e-9) * 100
+    median_shift = abs(median_before - median_after) / max(abs(median_before), 1e-9) * 100
+    variance_shift = abs(var_before - var_after) / max(var_before, 1e-9) * 100
+    report = {
+        "mean_shift_pct": round(mean_shift, 2),
+        "median_shift_pct": round(median_shift, 2),
+        "var_change_pct": round(variance_shift, 2),
+    }
+    report["mean_ok"] = mean_shift <= 5
+    report["median_ok"] = median_shift <= 3
+    report["var_ok"] = variance_shift <= 20
+    return report
+# ── Outlier & Variance helpers (from app_tanisha.py) ──────────────────
+def detect_outliers_iqr(series):
+    """Count the non-null values lying more than 1.5×IQR outside the quartiles.
+
+    Returns 0 when fewer than 4 non-null values are available.
+    """
+    observed = series.dropna()
+    if len(observed) < 4:
+        return 0
+    lower_quartile = observed.quantile(0.25)
+    upper_quartile = observed.quantile(0.75)
+    spread = upper_quartile - lower_quartile
+    too_low = observed < lower_quartile - 1.5 * spread
+    too_high = observed > upper_quartile + 1.5 * spread
+    return int((too_low | too_high).sum())
+def variance_impact(series):
+    """Simulate mean imputation and report ``(variance before, variance after, before - after)``.
+
+    All three values are rounded to four decimals. With fewer than 2 non-null
+    values the result is ``(0.0, 0.0, 0.0)``.
+    """
+    observed = series.dropna()
+    if len(observed) < 2:
+        return 0.0, 0.0, 0.0
+    mean_filled = series.fillna(observed.mean())
+    before = float(observed.var())
+    after = float(mean_filled.var())
+    drop = before - after
+    return round(before, 4), round(after, 4), round(drop, 4)
+def stat_card(label, value, color="#1a1a2e"):
+    """Build the HTML for one ``col-stat-card`` tile showing ``value`` above ``label``.
+
+    Args:
+        label: caption under the value.
+        value: text shown in the card; may come from the uploaded dataset.
+        color: CSS colour applied to the value.
+
+    All three inputs are HTML-escaped, because callers render the result with
+    ``unsafe_allow_html=True`` and values such as a column's mode are
+    user-supplied data.
+    """
+    import html  # local import keeps this block self-contained
+    safe_label = html.escape(str(label))
+    safe_value = html.escape(str(value))
+    safe_color = html.escape(str(color), quote=True)
+    return (f'<div class="col-stat-card">'
+            f'<div class="cv" style="color:{safe_color};">{safe_value}</div>'
+            f'<div class="ck">{safe_label}</div></div>')
+# ── Plot helpers ──────────────────────────────────────────────────────
+def plot_missing_heatmap(df):
+    """Draw a heatmap of null positions for every column that has missing values.
+
+    Columns are ordered by missing share, highest first. Frames with more
+    than 300 rows are sampled down to 300 with a fixed seed. Returns the
+    Matplotlib figure, or None when no column has nulls.
+    """
+    cols_with_nulls = [name for name in df.columns if df[name].isnull().any()]
+    if not cols_with_nulls:
+        return None
+    ordered = sorted(cols_with_nulls, key=lambda name: df[name].isnull().mean(), reverse=True)
+    sample_size = min(300, len(df))
+    subset = df[ordered]
+    if len(df) > sample_size:
+        subset = subset.sample(n=sample_size, random_state=42)
+    null_flags = subset.isnull().astype(int)
+    fig_width = max(10, len(ordered) * 0.7)
+    fig, ax = plt.subplots(figsize=(fig_width, 5))
+    # Transposed so columns run down the y-axis and rows along the x-axis.
+    sns.heatmap(null_flags.T, cmap=["#f5f3ee", "#17172b"], cbar=True,
+                yticklabels=ordered, xticklabels=False, linewidths=0, ax=ax)
+    ax.set_title(f"Missing Value Heatmap — sample of {sample_size} rows", fontsize=13, fontweight="bold", pad=12)
+    ax.set_xlabel("Rows (observations)", fontsize=10)
+    ax.set_ylabel("Columns", fontsize=10)
+    plt.tight_layout()
+    return fig
+def plot_missingness_correlation(df):
+    """Plot the correlation matrix of the missing indicators of columns that have nulls.
+
+    The upper triangle, diagonal included, is masked out. Returns the
+    Matplotlib figure, or None when fewer than two columns have nulls.
+    """
+    cols_with_nulls = [name for name in df.columns if df[name].isnull().any()]
+    n_cols = len(cols_with_nulls)
+    if n_cols < 2:
+        return None
+    indicator_corr = df[cols_with_nulls].isnull().astype(int).corr()
+    fig_size = (max(7, n_cols * 0.9), max(6, n_cols * 0.8))
+    fig, ax = plt.subplots(figsize=fig_size)
+    upper_triangle = np.triu(np.ones_like(indicator_corr, dtype=bool))
+    sns.heatmap(indicator_corr, annot=True, fmt=".2f", cmap="coolwarm", center=0,
+                mask=upper_triangle, square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax)
+    ax.set_title("Missingness Correlation Matrix", fontsize=13, fontweight="bold", pad=12)
+    plt.tight_layout()
+    return fig
+def plot_numerical_column(df, col):
+    """Plot a numeric column before and after simulated mean imputation.
+
+    Left panel: overlaid KDE curves of the observed values and the mean-filled
+    column. Right panel: box plots of the same two series. Returns the figure.
+    """
+    observed = df[col].dropna()
+    mean_filled = df[col].fillna(observed.mean())
+    fig, (kde_ax, box_ax) = plt.subplots(1, 2, figsize=(16, 6))
+    fig.suptitle(f"Deep Distribution Analysis — {col}", fontsize=14, fontweight="bold")
+    sns.kdeplot(observed, ax=kde_ax, color="#4f8ef7", linewidth=3,
+                label="Original (Before)", fill=True, alpha=0.2)
+    sns.kdeplot(mean_filled, ax=kde_ax, color="#e07b54", linewidth=3,
+                label="Mean Imputed (After)", linestyle="--")
+    kde_ax.set_title("Distribution Shift: Original vs. Imputed", fontsize=12)
+    kde_ax.legend()
+    # Long-format frame: one row per value, tagged with the series it came from.
+    stacked_values = pd.concat([observed, mean_filled])
+    series_tags = ["Original"] * len(observed) + ["Imputed"] * len(mean_filled)
+    long_frame = pd.DataFrame({"Value": stacked_values, "Type": series_tags})
+    sns.boxplot(data=long_frame, x="Type", y="Value", ax=box_ax, palette=["#dce3ff", "#fce4d6"])
+    box_ax.set_title("Variance & Outlier Comparison", fontsize=12)
+    plt.tight_layout()
+    return fig
+def plot_categorical_column(df, col, top_n=10):
+    """Plot category frequencies of a column before and after simulated mode imputation.
+
+    Left panel: horizontal bars comparing the ``top_n`` most frequent
+    categories of the observed and mode-filled series. Right panel: pie chart
+    of up to 8 of the most frequent categories after imputation. When the
+    column has no observed values the fill value is "N/A". Returns the figure.
+    """
+    observed = df[col].dropna()
+    fill_value = "N/A" if observed.empty else observed.mode()[0]
+    mode_filled = df[col].fillna(fill_value)
+    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(16, 7))
+    fig.suptitle(f"Categorical Frequency Analysis — {col}", fontsize=14, fontweight="bold")
+    counts_before = observed.value_counts().head(top_n)
+    counts_after = mode_filled.value_counts().head(top_n)
+    # Categories present in only one of the two top-N lists get a zero bar.
+    side_by_side = pd.DataFrame({"Original": counts_before, "Imputed (Mode)": counts_after}).fillna(0)
+    side_by_side.plot(kind="barh", ax=bar_ax, color=["#4f8ef7", "#e07b54"], width=0.8)
+    bar_ax.set_title(f"Top {top_n} Categories: Original vs Mode Imputed", fontsize=12)
+    bar_ax.invert_yaxis()
+    pie_slices = counts_after.head(8)
+    pie_ax.pie(pie_slices, labels=pie_slices.index.astype(str), autopct="%1.1f%%",
+               startangle=140, colors=plt.cm.Pastel1.colors, wedgeprops={"edgecolor": "white"})
+    pie_ax.set_title("Final Proportion (After Imputation)", fontsize=12)
+    plt.tight_layout()
+    return fig
+def plot_missing_vs_features(df, col):
+    """Bar-chart the means of other numeric features, split by whether ``col`` is present or missing.
+
+    Only numeric columns other than ``col`` with under 95% nulls are used, and
+    at most the first 12 features that have a mean in both groups are drawn.
+    Returns the figure, or None when nothing can be compared.
+    """
+    numeric_names = df.select_dtypes(include=[np.number]).columns
+    num_others = [name for name in numeric_names
+                  if name != col and df[name].isnull().mean() < 0.95]
+    if not num_others:
+        return None
+    is_present = df[col].notna()
+    group_means = pd.DataFrame({
+        "Present": df[is_present][num_others].mean(),
+        "Missing": df[df[col].isnull()][num_others].mean(),
+    })
+    diff_df = group_means.dropna().head(12)
+    if diff_df.empty:
+        return None
+    fig, ax = plt.subplots(figsize=(max(8, len(diff_df) * 0.9), 4))
+    positions = np.arange(len(diff_df))
+    bar_width = 0.35
+    ax.bar(positions - bar_width/2, diff_df["Present"], bar_width, label="Present rows", color="#4f8ef7", alpha=0.85)
+    ax.bar(positions + bar_width/2, diff_df["Missing"], bar_width, label="Missing rows",  color="#e07b54", alpha=0.85)
+    ax.set_xticks(positions)
+    ax.set_xticklabels(diff_df.index, rotation=35, ha="right", fontsize=9)
+    ax.set_title(f"Feature Means — Rows where '{col}' is Present vs Missing",
+                 fontsize=11, fontweight="bold")
+    ax.set_ylabel("Mean value")
+    ax.legend(fontsize=9)
+    plt.tight_layout()
+    return fig
+def render_per_column_deep_analysis(df, col, num_cols, cat_cols, mechanism_results):
+    """Render the Streamlit "deep analysis" panel for a single column.
+
+    Emits, in order: a header with five summary cards, type-specific
+    statistics and plots, a present-vs-missing feature comparison (only when
+    the column has nulls), the mechanism verdict card, strategy chips, and a
+    closing guidance note.
+
+    Args:
+        df: the dataset being analysed.
+        col: name of the column to analyse.
+        num_cols: column names treated as numerical; any other column is
+            handled as categorical.
+        cat_cols: not read by this function.
+        mechanism_results: mapping of column name to a dict with "mechanism"
+            and "reason" keys; a column without an entry is shown as "N/A".
+
+    Returns:
+        None. All output goes to the Streamlit page.
+    """
+    miss_count = int(df[col].isnull().sum())
+    miss_pct   = round(df[col].isnull().mean() * 100, 2)
+    total_rows = len(df)
+    present    = total_rows - miss_count
+    col_type   = "Numerical" if col in num_cols else "Categorical"
+    mech_info  = mechanism_results.get(col, {})
+    mech       = mech_info.get("mechanism", "N/A")
+    mech_reason = mech_info.get("reason", "Run the global diagnosis section above first.")
+    sev        = severity(miss_pct) if miss_pct > 0 else "None"
+    # Red / amber / green colour coding for the summary cards.
+    miss_color = "#dc2626" if miss_pct >= 20 else "#d97706" if miss_pct >= 5 else "#16a34a"
+    sev_color  = "#dc2626" if sev == "High" else "#d97706" if sev == "Moderate" else "#16a34a"
+    mech_color = {"MCAR": "#155724", "MAR": "#856404", "MNAR": "#721c24"}.get(mech, "#444")
+    st.markdown(f"#### 🔍 Deep Analysis — `{col}` &nbsp;·&nbsp; {col_type}", unsafe_allow_html=True)
+    m1, m2, m3, m4, m5 = st.columns(5)
+    with m1: st.markdown(stat_card("Total Rows", f"{total_rows:,}"), unsafe_allow_html=True)
+    with m2: st.markdown(stat_card("Present",    f"{present:,}"),    unsafe_allow_html=True)
+    with m3: st.markdown(stat_card("Missing",    f"{miss_pct}%",  miss_color), unsafe_allow_html=True)
+    with m4: st.markdown(stat_card("Severity",   sev,             sev_color),  unsafe_allow_html=True)
+    with m5: st.markdown(stat_card("Mechanism",  mech,            mech_color), unsafe_allow_html=True)
+    st.markdown("")
+    if col_type == "Numerical":
+        s = df[col].dropna()
+        # The whole numeric section is skipped when fewer than two values are present.
+        if len(s) > 1:
+            col_skew = float(skew(s))
+            col_kurt = float(kurtosis(s))
+            Q1, Q3  = float(s.quantile(0.25)), float(s.quantile(0.75))
+            IQR     = Q3 - Q1
+            n_out   = detect_outliers_iqr(df[col])
+            vb, va, vi = variance_impact(df[col])
+            out_pct = n_out / max(len(s), 1)
+            # Three rows of four stat cards: location/spread, range/shape, quartiles/outliers.
+            r1 = st.columns(4)
+            for (lbl, val), col_ui in zip(
+                [("Mean", f"{s.mean():.4g}"), ("Median", f"{s.median():.4g}"),
+                 ("Std Dev", f"{s.std():.4g}"), ("Variance", f"{s.var():.4g}")], r1):
+                with col_ui: st.markdown(stat_card(lbl, val), unsafe_allow_html=True)
+            st.markdown("")
+            r2 = st.columns(4)
+            for (lbl, val), col_ui in zip(
+                [("Min", f"{s.min():.4g}"), ("Max", f"{s.max():.4g}"),
+                 ("Skewness", f"{col_skew:.3f}"), ("Kurtosis", f"{col_kurt:.3f}")], r2):
+                with col_ui: st.markdown(stat_card(lbl, val), unsafe_allow_html=True)
+            st.markdown("")
+            r3 = st.columns(4)
+            out_color = "#dc2626" if out_pct > 0.15 else "#d97706" if out_pct > 0.05 else "#16a34a"
+            for (lbl, val, clr), col_ui in zip(
+                [("Q1", f"{Q1:.4g}", "#1a1a2e"), ("Q3", f"{Q3:.4g}", "#1a1a2e"),
+                 ("IQR", f"{IQR:.4g}", "#1a1a2e"), ("Outliers (IQR)", str(n_out), out_color)], r3):
+                with col_ui: st.markdown(stat_card(lbl, val, clr), unsafe_allow_html=True)
+            # The Shapiro-Wilk test only runs when at most 5000 values are present;
+            # a failing test is skipped silently.
+            if len(s) <= 5000:
+                try:
+                    _, p_norm = shapiro(s.sample(min(len(s), 5000), random_state=0))
+                    norm_txt = f"✅ Normal (p={p_norm:.4f})" if p_norm > 0.05 else f"⚠ Not Normal (p={p_norm:.4f})"
+                    st.caption(f"📐 Shapiro-Wilk normality test: {norm_txt}")
+                except Exception:
+                    pass
+            st.markdown("")
+            fig_dist = plot_numerical_column(df, col)
+            st.pyplot(fig_dist); plt.close(fig_dist)
+            st.markdown("**Variance Impact of Mean Imputation (simulated)**")
+            vc = st.columns(3)
+            # Colour by the relative variance change: above 30% red, above 10% amber, otherwise green.
+            delta_color = "#dc2626" if abs(vi)/max(vb,1e-9) > 0.3 else "#d97706" if abs(vi)/max(vb,1e-9) > 0.1 else "#16a34a"
+            with vc[0]: st.markdown(stat_card("Variance (before)", f"{vb:.4g}"), unsafe_allow_html=True)
+            with vc[1]: st.markdown(stat_card("Variance (after)",  f"{va:.4g}"), unsafe_allow_html=True)
+            with vc[2]: st.markdown(stat_card("Δ Variance", f"{vi:.4g}", delta_color), unsafe_allow_html=True)
+            pct_chg = abs(vi) / max(vb, 1e-9) * 100
+            if pct_chg >= 30:
+                st.warning(f"⚠ Variance drops by {pct_chg:.1f}% after mean imputation — over-smoothing risk. Use median or model-based imputation.")
+            elif pct_chg >= 10:
+                st.info(f"ℹ Variance drops by {pct_chg:.1f}% — acceptable, but monitor distribution shape.")
+            else:
+                st.success(f"✅ Variance change is small ({pct_chg:.1f}%) — mean imputation is statistically safe here.")
+    else:
+        s = df[col].dropna()
+        n_unique = s.nunique()
+        mode_val = str(s.mode().iloc[0]) if len(s) > 0 else "N/A"
+        mode_cnt = int((s == s.mode().iloc[0]).sum()) if len(s) > 0 else 0
+        mode_pct = round(mode_cnt / max(len(s), 1) * 100, 1)
+        r1 = st.columns(4)
+        # The mode is cut to 12 characters for display in its card.
+        for (lbl, val), col_ui in zip(
+            [("Unique Values", n_unique), ("Mode", mode_val[:12]),
+             ("Mode Count", f"{mode_cnt:,}"), ("Mode Freq %", f"{mode_pct}%")], r1):
+            with col_ui: st.markdown(stat_card(lbl, str(val)), unsafe_allow_html=True)
+        st.markdown("")
+        freq_table = s.value_counts().reset_index()
+        freq_table.columns = ["Value", "Count"]
+        freq_table["% of Present"] = (freq_table["Count"] / len(s) * 100).round(2)
+        tab_chart, tab_table = st.tabs(["📊 Frequency Chart", "📋 Frequency Table"])
+        with tab_chart:
+            fig_cat = plot_categorical_column(df, col)
+            st.pyplot(fig_cat); plt.close(fig_cat)
+        with tab_table:
+            st.dataframe(freq_table, use_container_width=True, hide_index=True)
+    st.markdown("")
+    # The present-vs-missing comparison is only shown when the column has nulls.
+    if miss_count > 0:
+        st.markdown("**How Missingness Relates to Other Features**")
+        fig_pat = plot_missing_vs_features(df, col)
+        if fig_pat:
+            st.pyplot(fig_pat); plt.close(fig_pat)
+            st.caption("Large differences between blue (present) and orange (missing) bars signal MAR behavior.")
+        else:
+            st.info("No other numerical features available for pattern comparison.")
+    st.markdown("")
+    # Verdict card: CSS class, icon and label are looked up by mechanism, with
+    # fallbacks for any other value.
+    verdict_cls = {"MCAR": "card-mcar", "MAR": "card-mar", "MNAR": "card-mnar"}.get(mech, "card-info")
+    mech_icon   = {"MCAR": "🟢", "MAR": "🟡", "MNAR": "🔴"}.get(mech, "✅")
+    mech_label  = {"MCAR": "Missing Completely At Random (MCAR)",
+                   "MAR":  "Missing At Random (MAR)",
+                   "MNAR": "Missing Not At Random (MNAR)",
+                   "N/A":  "No Missing Values"}.get(mech, mech)
+    st.markdown(
+        f'<div class="{verdict_cls}"><strong>{mech_icon} {mech_label}</strong><br>'
+        f'<span style="font-size:0.9rem;color:#444;">{mech_reason}</span></div>',
+        unsafe_allow_html=True)
+    chips_html = strategy_chips_html(mech, miss_pct, col_type)
+    # An empty chip string means there is nothing to recommend, so the heading is skipped too.
+    if chips_html:
+        st.markdown("")
+        st.markdown("**Recommended Strategies**")
+        st.markdown(chips_html, unsafe_allow_html=True)
+    pointer = {
+        "MCAR": ("📍 **MCAR**: Missing% <5% → listwise deletion is safe. 5–15% → median/mode imputation. "
+                 "15–30% → advanced imputation with missing indicator."),
+        "MAR":  ("📍 **MAR**: KNN / MICE preferred. Create a missing indicator if missing% ≥10%."),
+        "MNAR": ("📍 **MNAR**: **Create the missing indicator FIRST**, then use constant or sensitivity analysis. "
+                 "Domain knowledge is essential."),
+        "N/A":  "📍 No action needed — this column is complete. Proceed to feature engineering.",
+    }.get(mech, "")
+    if pointer:
+        st.markdown("")
+        st.info(pointer)
+# ════════════════════════════════════════════════════════════════════
+#  SIDEBAR — NAVIGATION
+# ════════════════════════════════════════════════════════════════════
+STEPS = [
+    "1 · Upload CSV",
+    "2 · Select Target Column",
+    "3 · Overview & Patterns",
+    "4 · Mechanism Dashboard",
+    "5 · Column Diagnostics",
+    "6 · Strategy & Imputation",
+    "7 · Validation Checks",
+]
+with st.sidebar:
+    st.markdown("## 🔬 Missing Value Intelligence Suite")
+    st.markdown("---")
+    st.markdown("**Navigation**")
+    step = st.radio("Go to step:", STEPS, label_visibility="collapsed")
+    st.markdown("---")
+    st.markdown(
+        "<small style='color:#9090c0'>Follow the steps in order for a complete analysis pipeline. "
+        "Steps 3–4 are exploratory; Steps 5–7 form the diagnostic pipeline.</small>",
+        unsafe_allow_html=True,
+    )
+# ════════════════════════════════════════════════════════════════════
+#  SESSION STATE
+# ════════════════════════════════════════════════════════════════════
+for key in ["df", "target_col", "col_results", "df_imputed", "mechanism_results_lr"]:
+    if key not in st.session_state:
+        st.session_state[key] = None
+if st.session_state["col_results"] is None:
+    st.session_state["col_results"] = {}
+if st.session_state["mechanism_results_lr"] is None:
+    st.session_state["mechanism_results_lr"] = {}
+# ════════════════════════════════════════════════════════════════════
+#  STEP 1 — UPLOAD CSV
+# ════════════════════════════════════════════════════════════════════
+if step == STEPS[0]:
+    st.markdown('<div class="main-title">📂 Step 1 — Upload Your CSV</div>', unsafe_allow_html=True)
+    st.markdown('<div class="main-sub">Upload a CSV file to begin the missing-value analysis pipeline.</div>', unsafe_allow_html=True)
+    uploaded = st.file_uploader("Choose a CSV file", type=["csv"])
+    if uploaded:
+        try:
+            df = pd.read_csv(uploaded)
+            # Auto-remove ID-like columns
+            id_cols = [c for c in df.columns if c.strip().lower() in ("id", "index", "row", "rowid", "row_id")]
+            if id_cols:
+                df.drop(columns=id_cols, inplace=True)
+                st.toast(f"Auto-removed non-informative column(s): {id_cols}", icon="🗑️")
+            st.session_state["df"] = df
+            st.session_state["col_results"] = {}
+            st.session_state["mechanism_results_lr"] = {}
+            st.session_state["df_imputed"] = df.copy()
+            st.success(f"✅ File loaded: **{uploaded.name}** — {df.shape[0]} rows × {df.shape[1]} columns")
+            st.markdown("### Preview (first 10 rows)")
+            st.dataframe(df.head(10), use_container_width=True)
+            c1, c2, c3, c4 = st.columns(4)
+            with c1:
+                st.markdown(f'<div class="metric-box"><div class="metric-val">{df.shape[0]:,}</div><div class="metric-lbl">Rows</div></div>', unsafe_allow_html=True)
+            with c2:
+                st.markdown(f'<div class="metric-box"><div class="metric-val">{df.shape[1]}</div><div class="metric-lbl">Columns</div></div>', unsafe_allow_html=True)
+            with c3:
+                n_miss_cols = df.isnull().any().sum()
+                st.markdown(f'<div class="metric-box"><div class="metric-val">{n_miss_cols}</div><div class="metric-lbl">Columns w/ Missings</div></div>', unsafe_allow_html=True)
+            with c4:
+                total_miss = df.isnull().sum().sum()
+                pct_miss = round(total_miss / df.size * 100, 1)
+                st.markdown(f'<div class="metric-box"><div class="metric-val">{pct_miss}%</div><div class="metric-lbl">Overall Missing Rate</div></div>', unsafe_allow_html=True)
+            st.markdown("### Column Types & Missingness")
+            type_df = pd.DataFrame({
+                "Column":    df.columns,
+                "Dtype":     df.dtypes.astype(str).values,
+                "Missing":   df.isnull().sum().values,
+                "Missing %": (df.isnull().mean() * 100).round(2).values,
+            })
+            st.dataframe(type_df, use_container_width=True, hide_index=True)
+        except Exception as e:
+            st.error(f"Could not read file: {e}")
+    else:
+        st.info("👆 Upload a CSV to get started.")
+# ════════════════════════════════════════════════════════════════════
+#  STEP 2 — SELECT TARGET COLUMN
+# ════════════════════════════════════════════════════════════════════
+elif step == STEPS[1]:
+    st.markdown('<div class="main-title">🎯 Step 2 — Select Target Column</div>', unsafe_allow_html=True)
+    st.markdown('<div class="main-sub">The target column (y) is used in Test 3 to detect MNAR patterns and is excluded from feature analysis.</div>', unsafe_allow_html=True)
+    df = st.session_state.get("df")
+    if df is None:
+        st.warning("⚠️ Please upload a CSV in Step 1 first.")
+    else:
+        target = st.selectbox(
+            "Select the output / target column:",
+            options=df.columns.tolist(),
+            index=len(df.columns) - 1,
+        )
+        if st.button("✅ Confirm Target Column", type="primary"):
+            st.session_state["target_col"] = target
+            st.success(f"Target column set to: **{target}**")
+        if st.session_state.get("target_col"):
+            st.info(f"Current target: **{st.session_state['target_col']}**")
+            tc = st.session_state["target_col"]
+            col_data = df[tc]
+            st.markdown("#### Target Column Distribution")
+            fig, ax = plt.subplots(figsize=(7, 3))
+            if pd.api.types.is_numeric_dtype(col_data):
+                col_data.dropna().hist(bins=30, ax=ax, color="#17172b", edgecolor="white")
+                ax.set_xlabel(tc); ax.set_ylabel("Count")
+            else:
+                vc = col_data.value_counts().head(15)
+                vc.plot(kind="bar", ax=ax, color="#17172b")
+                ax.set_ylabel("Count")
+            ax.set_title(f"Distribution of '{tc}'")
+            plt.tight_layout()
+            st.pyplot(fig)
+            plt.close()
+# ════════════════════════════════════════════════════════════════════
+#  STEP 3 — OVERVIEW & PATTERNS
+# ════════════════════════════════════════════════════════════════════
+elif step == STEPS[2]:
+    st.markdown('<div class="main-title">📊 Step 3 — Overview & Patterns</div>', unsafe_allow_html=True)
+    st.markdown('<div class="main-sub">Bird\'s-eye view of missingness across the dataset, including heatmaps and co-missingness patterns.</div>', unsafe_allow_html=True)
+    df = st.session_state.get("df")
+    target_col = st.session_state.get("target_col")
+    if df is None:
+        st.warning("⚠️ Please upload a CSV in Step 1 first.")
+    else:
+        X = df.drop(columns=[target_col]) if target_col and target_col in df.columns else df
+        summary = missing_summary_df(X)
+        if summary.empty:
+            st.success("🎉 No missing values found in the dataset features!")
+        else:
+            st.markdown(f"### {len(summary)} column(s) have missing values")
+            st.dataframe(summary.style.background_gradient(subset=["Missing %"], cmap="YlOrRd"),
+                         use_container_width=True)
+            # ── Missing % bar chart
+            st.markdown('<div class="section-header">📉 Missing % per Column</div>', unsafe_allow_html=True)
+            miss_cols = summary.index.tolist()
+            fig_bar, ax_bar = plt.subplots(figsize=(max(7, len(miss_cols) * 0.9), 4))
+            colors = ["#9e2210" if v > 30 else "#7a4d00" if v > 15 else "#0d6b3a" for v in summary["Missing %"]]
+            ax_bar.barh(summary.index[::-1], summary["Missing %"][::-1], color=colors[::-1], edgecolor="white")
+            ax_bar.axvline(5,  color="#89d9ac", linewidth=1.5, linestyle="--", label="5% threshold")
+            ax_bar.axvline(15, color="#f0cc7a", linewidth=1.5, linestyle="--", label="15% threshold")
+            ax_bar.axvline(30, color="#f5a898", linewidth=1.5, linestyle="--", label="30% threshold")
+            ax_bar.set_xlabel("Missing %"); ax_bar.set_title("Missing % per Column")
+            ax_bar.legend(loc="lower right", fontsize=8)
+            plt.tight_layout()
+            st.pyplot(fig_bar)
+            plt.close()
+            # ── Heatmap + Correlation tabs
+            st.markdown('<div class="section-header">🗺 Missingness Patterns</div>', unsafe_allow_html=True)
+            tab_hm, tab_corr = st.tabs(["Missing Heatmap", "Missingness Correlation"])
+            with tab_hm:
+                fig_hm = plot_missing_heatmap(X)
+                if fig_hm:
+                    st.pyplot(fig_hm); plt.close(fig_hm)
+                    st.caption("Dark = missing, light = present. Each column is a row.")
+                else:
+                    st.info("No missing values to display.")
+            with tab_corr:
+                fig_corr = plot_missingness_correlation(X)
+                if fig_corr:
+                    st.pyplot(fig_corr); plt.close(fig_corr)
+                    st.caption("Near +1: columns tend to be missing together. Near −1: rarely missing simultaneously.")
+                else:
+                    st.info("Need at least 2 columns with missing values for this chart.")
+            # ── Correlation among numerical features
+            num_cols_x, _ = identify_columns(X)
+            if len(num_cols_x) >= 2:
+                st.markdown('<div class="section-header">📈 Feature Correlations (Numerical)</div>', unsafe_allow_html=True)
+                valid = [c for c in num_cols_x if X[c].isnull().mean() < 1.0]
+                if len(valid) >= 2:
+                    corr = X[valid].corr()
+                    strong = (corr.abs() > 0.5) & (corr != 1.0)
+                    if strong.any().any():
+                        fig_fc, ax_fc = plt.subplots(figsize=(max(8, len(valid) * 0.9), max(7, len(valid) * 0.8)))
+                        mask = np.triu(np.ones_like(corr, dtype=bool))
+                        display_corr = corr.where(corr.abs() > 0.5)
+                        sns.heatmap(display_corr, annot=False, cmap="RdYlGn", center=0,
+                                    mask=mask, square=True, linewidths=0.5,
+                                    cbar_kws={"shrink": 0.8}, ax=ax_fc, vmin=-1, vmax=1)
+                        ax_fc.set_title("Strong Correlations (|r| > 0.5) — Numerical Features",
+                                        fontsize=13, fontweight="bold", pad=12)
+                        plt.tight_layout()
+                        st.pyplot(fig_fc); plt.close(fig_fc)
+                        # Correlation pairs table
+                        pairs = []
+                        seen = set()
+                        for i, c1 in enumerate(corr.columns):
+                            for j, c2 in enumerate(corr.columns):
+                                if i >= j: continue
+                                v = corr.loc[c1, c2]
+                                if abs(v) > 0.5:
+                                    key = tuple(sorted([c1, c2]))
+                                    if key not in seen:
+                                        seen.add(key)
+                                        pairs.append({"Column A": c1, "Column B": c2,
+                                                      "Correlation": round(v, 4),
+                                                      "Correlation %": f"{round(v * 100, 2)}%"})
+                        if pairs:
+                            corr_table = pd.DataFrame(pairs).sort_values("Correlation", key=abs, ascending=False)
+                            st.markdown("**Strong Correlation Pairs (|r| > 0.5)**")
+                            st.dataframe(corr_table, use_container_width=True, hide_index=True)
+                    else:
+                        st.info("No strong correlations (|r| > 0.5) found among numerical features.")
+# ════════════════════════════════════════════════════════════════════
+#  STEP 4 — MECHANISM DASHBOARD  (from app_tanisha.py)
+# ════════════════════════════════════════════════════════════════════
+elif step == STEPS[3]:
+    st.markdown('<div class="main-title">🧪 Step 4 — Mechanism Dashboard</div>', unsafe_allow_html=True)
+    st.markdown('<div class="main-sub">Automated MCAR/MAR/MNAR detection via Chi-square & Logistic Regression, plus outlier/variance analysis and deep per-column exploration.</div>', unsafe_allow_html=True)
+    df = st.session_state.get("df")
+    target_col = st.session_state.get("target_col")
+    if df is None:
+        st.warning("⚠️ Please upload a CSV in Step 1 first.")
+    elif target_col is None:
+        st.warning("⚠️ Please select a target column in Step 2 first.")
+    else:
+        X = df.drop(columns=[target_col])
+        y = df[target_col]
+        num_cols, cat_cols = identify_columns(X)
+        # ── Train-test split
+        st.markdown('<div class="section-header">✂️ Train-Test Split (80 / 20)</div>', unsafe_allow_html=True)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+        sc1, sc2 = st.columns(2)
+        with sc1: st.markdown(f"**Training Set** — X_train: `{X_train.shape}` · y_train: `{y_train.shape}`")
+        with sc2: st.markdown(f"**Test Set** — X_test: `{X_test.shape}` · y_test: `{y_test.shape}`")
+        # ── Mechanism diagnosis
+        st.markdown('<div class="section-header">🔬 Missing Data Mechanism Diagnosis (Chi-square + Logistic Regression)</div>', unsafe_allow_html=True)
+        missing_feature_cols = [c for c in X.columns if X[c].isnull().any()]
+        if not missing_feature_cols:
+            st.success("No missing values in feature columns — nothing to diagnose.")
+            mechanism_results = {}
+        else:
+            cached = st.session_state.get("mechanism_results_lr", {})
+            if not cached:
+                with st.spinner("Running MCAR (Chi-square) and MAR (Logistic Regression) tests…"):
+                    mechanism_results = {}
+                    for col in missing_feature_cols:
+                        mech, reason = diagnose_mechanism_lr(X, col, num_cols)
+                        mechanism_results[col] = {"mechanism": mech, "reason": reason}
+                st.session_state["mechanism_results_lr"] = mechanism_results
+            else:
+                mechanism_results = cached
+            badge_map = {"MCAR": "badge-mcar", "MAR": "badge-mar", "MNAR": "badge-mnar"}
+            for col, res in mechanism_results.items():
+                mech = res["mechanism"]
+                pct  = round(X[col].isnull().mean() * 100, 2)
+                with st.expander(f"🔎 **{col}** — {mech}  |  {pct}% missing"):
+                    st.markdown(f'<span class="{badge_map[mech]}">{mech}</span>&nbsp;&nbsp;{res["reason"]}',
+                                unsafe_allow_html=True)
+        # ── Outlier Detection & Variance Impact
+        st.markdown('<div class="section-header">⚡ Outlier Detection & Variance Impact</div>', unsafe_allow_html=True)
+        outlier_data = {}
+        for col in num_cols:
+            n_out = detect_outliers_iqr(X[col])
+            vb, va, vi = variance_impact(X[col])
+            outlier_data[col] = {
+                "Missing %": round(X[col].isnull().mean() * 100, 2),
+                "Outliers (IQR)": n_out,
+                "Variance (before impute)": vb,
+                "Variance (after mean impute)": va,
+                "Variance Impact (Δ)": vi,
+            }
+        if outlier_data:
+            out_df = (pd.DataFrame(outlier_data).T.reset_index()
+                      .rename(columns={"index": "Column"})
+                      .sort_values("Outliers (IQR)", ascending=False))
+            def color_outliers(val):
+                if isinstance(val, (int, float)):
+                    if val > 50: return "background-color: #f8d7da; color: #721c24;"
+                    if val > 10: return "background-color: #fff3cd; color: #856404;"
+                return ""
+            st.dataframe(out_df.style.applymap(color_outliers, subset=["Outliers (IQR)"]),
+                         use_container_width=True, hide_index=True)
+        else:
+            st.info("No numerical columns available for outlier analysis.")
+        # ── Final Diagnosis Table
+        st.markdown('<div class="section-header">📋 Final Diagnosis Table</div>', unsafe_allow_html=True)
+        diag_rows = []
+        for col in X.columns:
+            mp   = round(X[col].isnull().mean() * 100, 2)
+            mech = mechanism_results.get(col, {}).get("mechanism", "N/A") if col in missing_feature_cols else "N/A"
+            diag_rows.append({
+                "Column": col, "Missing %": mp,
+                "Mechanism": mech, "Severity": severity(mp) if mp > 0 else "None",
+                "Outliers": outlier_data.get(col, {}).get("Outliers (IQR)", "—"),
+                "Variance Impact (Δ)": outlier_data.get(col, {}).get("Variance Impact (Δ)", "—"),
+            })
+        diag_df = pd.DataFrame(diag_rows).sort_values("Missing %", ascending=False).reset_index(drop=True)
+        sev_colors  = {"High": "background-color: #f8d7da; color: #721c24;",
+                       "Moderate": "background-color: #fff3cd; color: #856404;",
+                       "Low": "background-color: #d4edda; color: #155724;"}
+        mech_colors = {"MCAR": "background-color: #d4edda; color: #155724;",
+                       "MAR":  "background-color: #fff3cd; color: #856404;",
+                       "MNAR": "background-color: #f8d7da; color: #721c24;"}
+        def color_diag_row(row):
+            mech_style = mech_colors.get(row["Mechanism"], "")
+            sev_style  = sev_colors.get(row["Severity"], "")
+            return ["", "", mech_style, sev_style, "", ""]
+        st.dataframe(diag_df.style.apply(color_diag_row, axis=1),
+                     use_container_width=True, hide_index=True)
+        # ── Per-Column Deep Analysis
+        st.markdown('<div class="section-header">🔬 Per-Column Deep Analysis</div>', unsafe_allow_html=True)
+        col_label_to_name = {}
+        for col in X.columns:
+            mp_l     = round(X[col].isnull().mean() * 100, 1)
+            type_lbl = "Num" if col in num_cols else "Cat"
+            mech_lbl = mechanism_results.get(col, {}).get("mechanism", "—") if col in missing_feature_cols else "complete"
+            label    = f"{col}  [{type_lbl} · {mp_l}% missing · {mech_lbl}]"
+            col_label_to_name[label] = col
+        chosen_label = st.selectbox(
+            "Select a column to analyse in detail:",
+            options=["— choose a column —"] + list(col_label_to_name.keys()),
+            key="deep_col_select"
+        )
+        if chosen_label != "— choose a column —":
+            chosen_col = col_label_to_name[chosen_label]
+            with st.spinner(f"Analysing `{chosen_col}`…"):
+                st.markdown("---")
+                render_per_column_deep_analysis(
+                    df=X, col=chosen_col,
+                    num_cols=num_cols, cat_cols=cat_cols,
+                    mechanism_results=mechanism_results,
+                )
+                st.markdown("---")
+        # ── Insights
+        st.markdown('<div class="section-header">💡 Data Analysis Insights</div>', unsafe_allow_html=True)
+        high_miss = diag_df[diag_df["Missing %"] >= 20]["Column"].tolist()
+        mar_cols  = diag_df[diag_df["Mechanism"] == "MAR"]["Column"].tolist()
+        mnar_cols = diag_df[diag_df["Mechanism"] == "MNAR"]["Column"].tolist()
+        high_out  = [c for c in num_cols if outlier_data.get(c, {}).get("Outliers (IQR)", 0) > 10]
+        insights = [
+            "Missing data must be understood <b>before</b> any imputation or modeling to avoid biased results.",
+            (f"<b>{', '.join(high_miss)}</b> have ≥20% missing values — treat with caution or consider dropping."
+             if high_miss else "No columns have critically high (≥20%) missing rates — dataset quality looks reasonable."),
+            (f"Columns <b>{', '.join(mar_cols)}</b> show MAR behavior — KNN/MICE imputation is viable."
+             if mar_cols else "No columns confirmed MAR."),
+            (f"Columns <b>{', '.join(mnar_cols)}</b> are likely MNAR — create a missing indicator before imputing."
+             if mnar_cols else "No columns flagged as MNAR."),
+            (f"Columns <b>{', '.join(high_out)}</b> have many outliers — prefer median over mean imputation."
+             if high_out else "Outlier counts appear manageable across numerical columns."),
+            "Correlated missingness indicates data is likely <b>not MCAR</b> — jointly missing due to a common cause.",
+            "MCAR is rare in real-world datasets. Most missingness in practice is MAR or MNAR.",
+            "MNAR <b>cannot be confirmed statistically</b> from observed data alone — domain knowledge is essential.",
+        ]
+        st.markdown('<div class="insight-box"><ul>' + "".join(f"<li>{i}</li>" for i in insights) + "</ul></div>",
+                    unsafe_allow_html=True)
+        # ── Theory
+        st.markdown('<div class="section-header">📚 Theoretical Background</div>', unsafe_allow_html=True)
+        theories = [
+            ("🔵 MCAR — Missing Completely At Random",
+             "The probability of missingness is entirely independent of observed and unobserved data. "
+             "Listwise deletion is unbiased under MCAR, though it reduces sample size."),
+            ("🟡 MAR — Missing At Random",
+             "Missingness depends on <i>observed</i> data but not on the missing value itself. "
+             "Multiple imputation or FIML methods produce valid estimates under MAR."),
+            ("🔴 MNAR — Missing Not At Random",
+             "Missingness depends on the <i>unobserved value itself</i>. Cannot be detected from observed data. "
+             "Requires sensitivity analysis and domain knowledge. Ignoring MNAR produces biased results."),
+            ("📐 Why Chi-Square for MCAR Testing?",
+             "Chi-square tests independence between the binary missingness indicator and binned numeric predictors. "
+             "No significant association is consistent with MCAR, though this only confirms pairwise independence."),
+            ("🤖 Why Logistic Regression for MAR Detection?",
+             "LR models the binary missingness indicator as a function of all observed features. "
+             "Accuracy substantially above the majority-class baseline indicates MAR."),
+            ("📉 Why MNAR Cannot Be Confirmed Statistically",
+             "MNAR depends on unobserved values — data we do not have. No statistical test on observed data "
+             "can definitively confirm it. Domain reasoning about the data generation process is required."),
+            ("📦 Outliers and Their Impact on Variance",
+             "Outliers (>1.5×IQR) inflate variance and distort the mean. Mean imputation artificially collapses "
+             "variance because all missing cells receive the same central value, masking true data spread."),
+        ]
+        for title, body in theories:
+            st.markdown(f'<div class="theory-box"><h4>{title}</h4><p>{body}</p></div>', unsafe_allow_html=True)
+# ════════════════════════════════════════════════════════════════════
+#  STEP 5 — COLUMN DIAGNOSTICS  (from app.py — 3 statistical tests)
+# ════════════════════════════════════════════════════════════════════
+elif step == STEPS[4]:
+    st.markdown('<div class="main-title">🔬 Step 5 — Column Diagnostics</div>', unsafe_allow_html=True)
+    st.markdown('<div class="main-sub">Run three independent statistical tests per column to determine the missing-data mechanism (MCAR / MAR / MNAR).</div>', unsafe_allow_html=True)
+    df = st.session_state.get("df")
+    target_col = st.session_state.get("target_col")
+    if df is None:
+        st.warning("⚠️ Please upload a CSV in Step 1 first.")
+    elif target_col is None:
+        st.warning("⚠️ Please select a target column in Step 2 first.")
+    else:
+        summary = missing_summary_df(df)
+        if summary.empty:
+            st.success("🎉 No missing values — nothing to diagnose.")
+        else:
+            miss_cols = summary.index.tolist()
+            selected_col = st.selectbox("Select a column to analyse:", miss_cols)
+            miss_pct = summary.loc[selected_col, "Missing %"]
+            dtype_str = str(df[selected_col].dtype)
+            st.markdown(f"---")
+            st.markdown(f"### Analysing column: `{selected_col}`")
+            lv, risk_txt, risk_bg, risk_fg = missingness_risk_level(miss_pct)
+            c1, c2, c3 = st.columns(3)
+            with c1:
+                st.markdown(f'<div class="metric-box"><div class="metric-val" style="color:#9e2210">{miss_pct:.1f}%</div><div class="metric-lbl">Missing</div></div>', unsafe_allow_html=True)
+            with c2:
+                st.markdown(f'<div class="metric-box"><div class="metric-val">{dtype_str}</div><div class="metric-lbl">Data Type</div></div>', unsafe_allow_html=True)
+            with c3:
+                n_miss = int(summary.loc[selected_col, "Missing Count"])
+                st.markdown(f'<div class="metric-box"><div class="metric-val">{n_miss:,}</div><div class="metric-lbl">Missing Rows</div></div>', unsafe_allow_html=True)
+            st.markdown(
+                f'<div style="background:{risk_bg};border:1.5px solid {risk_fg};border-radius:8px;padding:10px 16px;margin:12px 0;">'
+                f'<b style="color:{risk_fg}">{lv} Missingness</b> — {risk_txt}</div>',
+                unsafe_allow_html=True,
+            )
+            # ── Test 1
+            st.markdown("#### 🔬 Test 1 — Pattern Analysis (Missingness Map)")
+            t1 = test1_pattern_analysis(df, selected_col)
+            fig, axes = plt.subplots(1, 2, figsize=(12, 3))
+            sample_size = min(300, len(df))
+            idx_sample = df.sample(n=sample_size, random_state=42).index if len(df) > sample_size else df.index
+            ind_sample = t1["indicator"].loc[idx_sample]
+            axes[0].scatter(range(len(ind_sample)), ind_sample.values,
+                c=["#9e2210" if v else "#89d9ac" for v in ind_sample.values], s=8, alpha=0.8)
+            axes[0].set_yticks([0, 1]); axes[0].set_yticklabels(["Present", "Missing"])
+            axes[0].set_title(f"Missingness Pattern ({sample_size} rows)")
+            axes[0].set_xlabel("Row index")
+            roll = t1["indicator"].rolling(50, min_periods=1).mean()
+            axes[1].plot(roll.values, color="#17172b", linewidth=1.2)
+            axes[1].set_title("Rolling Miss Rate (window=50)")
+            axes[1].set_xlabel("Row index"); axes[1].set_ylabel("Miss rate")
+            axes[1].axhline(t1["miss_pct"] / 100, color="#9e2210", linestyle="--", label="Mean miss rate")
+            axes[1].legend(fontsize=8)
+            plt.tight_layout()
+            st.pyplot(fig); plt.close()
+            scatter_icon = "🟢" if t1["scattered"] else "🟠"
+            st.markdown(f'<div class="card-info"><b>{scatter_icon} {t1["signal"]}</b><br><small>Cluster ratio: {t1["cluster_ratio"]:.2f} (higher = more scattered = MCAR signal)</small></div>', unsafe_allow_html=True)
+            # ── Test 2
+            st.markdown("#### 🔬 Test 2 — Feature Dependency")
+            t2 = test2_feature_dependency(df, selected_col)
+            if t2["diffs"]:
+                top_diffs = dict(sorted(t2["diffs"].items(), key=lambda x: -x[1])[:15])
+                fig2, ax2 = plt.subplots(figsize=(10, max(3, len(top_diffs) * 0.45)))
+                colors = ["#9e2210" if v >= 30 else "#f0a040" if v >= 10 else "#89d9ac" for v in top_diffs.values()]
+                ax2.barh(list(top_diffs.keys())[::-1], list(top_diffs.values())[::-1], color=colors[::-1], edgecolor="white")
+                ax2.axvline(5,  color="#89d9ac", linewidth=1.5, linestyle="--", label="5% weak")
+                ax2.axvline(10, color="#f0cc7a", linewidth=1.5, linestyle="--", label="10% MAR signal")
+                ax2.axvline(30, color="#f5a898", linewidth=1.5, linestyle="--", label="30% strong")
+                ax2.set_xlabel("Distribution Difference (%)")
+                ax2.set_title("Feature Distribution Difference")
+                ax2.legend(fontsize=8)
+                plt.tight_layout()
+                st.pyplot(fig2); plt.close()
+                dep_icon = "🟢" if t2["max_diff"] < 5 else "🟠" if t2["max_diff"] < 30 else "🔴"
+                st.markdown(f'<div class="card-info"><b>{dep_icon} {t2["signal"]}</b><br><small>Max difference: {t2["max_diff"]:.1f}%</small></div>', unsafe_allow_html=True)
+            else:
+                st.info("Not enough data to compare feature distributions.")
+                t2 = {"diffs": {}, "max_diff": 0.0, "signal": "Insufficient data"}
+            # ── Test 3
+            st.markdown("#### 🔬 Test 3 — Target Dependency")
+            if selected_col == target_col:
+                st.warning("⚠️ Selected column IS the target column. Test 3 skipped.")
+                t3 = {"diff_pct": None, "signal": "Skipped — column is target"}
+            else:
+                t3 = test3_target_dependency(df, selected_col, target_col)
+                if t3["diff_pct"] is not None:
+                    missing_mask = df[selected_col].isnull()
+                    fig3, ax3 = plt.subplots(figsize=(7, 3.5))
+                    if pd.api.types.is_numeric_dtype(df[target_col]):
+                        miss_target = df.loc[missing_mask, target_col].dropna()
+                        obs_target  = df.loc[~missing_mask, target_col].dropna()
+                        ax3.hist(obs_target,  bins=25, alpha=0.7, label="Target when present",  color="#17172b", edgecolor="white")
+                        ax3.hist(miss_target, bins=25, alpha=0.7, label="Target when missing",  color="#9e2210", edgecolor="white")
+                        ax3.set_xlabel(target_col); ax3.set_ylabel("Count")
+                        ax3.legend()
+                    else:
+                        miss_target = df.loc[missing_mask, target_col].value_counts(normalize=True) * 100
+                        obs_target  = df.loc[~missing_mask, target_col].value_counts(normalize=True) * 100
+                        cats = list(set(miss_target.index) | set(obs_target.index))
+                        x = np.arange(len(cats))
+                        ax3.bar(x - 0.2, [obs_target.get(c, 0) for c in cats],  0.4, label="Present",  color="#17172b")
+                        ax3.bar(x + 0.2, [miss_target.get(c, 0) for c in cats], 0.4, label="Missing", color="#9e2210")
+                        ax3.set_xticks(x); ax3.set_xticklabels(cats, rotation=30)
+                        ax3.set_ylabel("% of group"); ax3.legend()
+                    ax3.set_title(f"Target ({target_col}) dist: present vs missing in '{selected_col}'")
+                    plt.tight_layout()
+                    st.pyplot(fig3); plt.close()
+                    dep_icon = "🟢" if (t3["diff_pct"] or 0) < 5 else "🟠" if (t3["diff_pct"] or 0) < 10 else "🔴"
+                    st.markdown(f'<div class="card-info"><b>{dep_icon} {t3["signal"]}</b><br><small>Target diff: {t3["diff_pct"]}%</small></div>', unsafe_allow_html=True)
+                else:
+                    st.info(t3["signal"])
+            # ── Verdict
+            st.markdown("---")
+            st.markdown("### 🏁 Mechanism Verdict")
+            mechanism, confidence, explanation = classify_mechanism(t1, t2, t3)
+            card_class = {"MCAR": "card-mcar", "MAR": "card-mar", "MNAR": "card-mnar"}[mechanism]
+            emoji = {"MCAR": "🟢", "MAR": "🟠", "MNAR": "🔴"}[mechanism]
+            st.markdown(
+                f'<div class="{card_class}">'
+                f'<div class="verdict-label">{emoji} {mechanism} — {confidence} confidence</div>'
+                f'<div class="verdict-desc">{explanation}</div>'
+                f'</div>',
+                unsafe_allow_html=True,
+            )
+            # Strategy chips
+            col_type_str = "Numerical" if pd.api.types.is_numeric_dtype(df[selected_col]) else "Categorical"
+            chips_html = strategy_chips_html(mechanism, miss_pct, col_type_str)
+            if chips_html:
+                st.markdown("**Recommended Strategy Options**")
+                st.markdown(chips_html, unsafe_allow_html=True)
+            st.session_state["col_results"][selected_col] = {
+                "mechanism": mechanism,
+                "confidence": confidence,
+                "miss_pct": miss_pct,
+                "dtype": dtype_str,
+                "t1": t1, "t2": t2, "t3": t3,
+            }
+# ════════════════════════════════════════════════════════════════════
+#  STEP 6 — STRATEGY & IMPUTATION
+# ════════════════════════════════════════════════════════════════════
+elif step == STEPS[5]:
+    st.markdown('<div class="main-title">🛠 Step 6 — Strategy & Imputation</div>', unsafe_allow_html=True)
+    st.markdown('<div class="main-sub">Based on the mechanism and missing %, select and apply the right strategy for each column.</div>', unsafe_allow_html=True)
+    df = st.session_state.get("df")
+    col_results = st.session_state.get("col_results", {})
+    if df is None:
+        st.warning("⚠️ Please upload a CSV in Step 1 first.")
+    elif not col_results:
+        st.warning("⚠️ Please run diagnostics in Step 5 for at least one column first.")
+    else:
+        df_imputed = (df.copy() if st.session_state.get("df_imputed") is None
+                      else st.session_state["df_imputed"].copy())
+        for col, res in col_results.items():
+            mechanism = res["mechanism"]
+            miss_pct  = res["miss_pct"]
+            dtype_str = res["dtype"]
+            st.markdown(f"### Column: `{col}`")
+            st.markdown(f"**Mechanism:** {mechanism} | **Missing:** {miss_pct:.1f}% | **Type:** `{dtype_str}`")
+            rec = recommend_strategy(mechanism, miss_pct, dtype_str)
+            card_class = "card-mcar" if mechanism == "MCAR" else "card-mar" if mechanism == "MAR" else "card-mnar"
+            st.markdown(
+                f'<div class="{card_class}">'
+                f'<b>Recommended: {rec["method"]}</b><br>'
+                f'<small>{rec["reason"]}</small><br>'
+                f'<small>{rec["adv"]}</small><br>'
+                f'<small style="color:#888">{rec["disadv"]}</small>'
+                f'</div>',
+                unsafe_allow_html=True,
+            )
+            if rec["add_indicator"]:
+                st.markdown(
+                    '<div class="card-warn">🚩 <b>Missing Indicator will be added BEFORE imputation</b> — '
+                    "missingness itself carries signal for this column.</div>",
+                    unsafe_allow_html=True,
+                )
+            is_num = "float" in dtype_str or "int" in dtype_str
+            strategy_options = (
+                ["Mean", "Median", "Constant (0)", "Drop rows", "Keep as-is"] if is_num
+                else ["Mode", "Constant ('Unknown')", "Drop rows", "Keep as-is"]
+            )
+            chosen = st.selectbox(
+                f"Apply strategy for `{col}`:",
+                options=strategy_options,
+                key=f"strategy_{col}",
+            )
+            if st.button(f"▶ Apply to `{col}`", key=f"apply_{col}"):
+                if rec["add_indicator"]:
+                    indicator_col = f"{col}_was_missing"
+                    df_imputed[indicator_col] = df[col].isnull().astype(int)
+                    st.info(f"✅ Created indicator column: `{indicator_col}`")
+                if chosen == "Mean":
+                    fill_val = df[col].mean()
+                    df_imputed[col] = df_imputed[col].fillna(fill_val)
+                    st.success(f"✅ Imputed with mean = {fill_val:.4f}")
+                elif chosen == "Median":
+                    fill_val = df[col].median()
+                    df_imputed[col] = df_imputed[col].fillna(fill_val)
+                    st.success(f"✅ Imputed with median = {fill_val:.4f}")
+                elif chosen == "Mode":
+                    fill_val = df[col].mode().iloc[0]
+                    df_imputed[col] = df_imputed[col].fillna(fill_val)
+                    st.success(f"✅ Imputed with mode = {fill_val}")
+                elif chosen in ("Constant (0)", "Constant ('Unknown')"):
+                    fill_val = 0 if is_num else "Unknown"
+                    df_imputed[col] = df_imputed[col].fillna(fill_val)
+                    st.success(f"✅ Imputed with constant = {fill_val}")
+                elif chosen == "Drop rows":
+                    before = len(df_imputed)
+                    df_imputed = df_imputed.dropna(subset=[col])
+                    after = len(df_imputed)
+                    st.success(f"✅ Dropped {before - after} rows with missing `{col}`")
+                else:
+                    st.info("No imputation applied.")
+                st.session_state["df_imputed"] = df_imputed
+            st.markdown("<hr class='divider'>", unsafe_allow_html=True)
+        st.markdown("### 📥 Download Imputed Dataset")
+        df_out = st.session_state.get("df_imputed", df)
+        csv_bytes = df_out.to_csv(index=False).encode("utf-8")
+        st.download_button(
+            label="⬇ Download imputed CSV",
+            data=csv_bytes,
+            file_name="imputed_dataset.csv",
+            mime="text/csv",
+        )
+        st.dataframe(df_out.head(10), use_container_width=True)
+# ════════════════════════════════════════════════════════════════════
+#  STEP 7 — VALIDATION CHECKS
+# ════════════════════════════════════════════════════════════════════
+elif step == STEPS[6]:
+    st.markdown('<div class="main-title">✅ Step 7 — Validation Checks</div>', unsafe_allow_html=True)
+    st.markdown('<div class="main-sub">Confirm that imputation preserved statistical properties and did not introduce bias.</div>', unsafe_allow_html=True)
+    df_orig    = st.session_state.get("df")
+    df_imputed = st.session_state.get("df_imputed")
+    col_results = st.session_state.get("col_results", {})
+    if df_orig is None or df_imputed is None:
+        st.warning("⚠️ Complete Steps 1–6 first.")
+    elif not col_results:
+        st.warning("⚠️ Run diagnostics in Step 5 and apply a strategy in Step 6 first.")
+    else:
+        numeric_cols = [c for c in col_results if pd.api.types.is_numeric_dtype(df_orig[c])]
+        if not numeric_cols:
+            st.info("Validation checks apply to numeric columns only. No numeric columns were diagnosed.")
+        else:
+            for col in numeric_cols:
+                before = df_orig[col].dropna()
+                after  = df_imputed[col].dropna()
+                if len(after) == 0 or len(before) == 0:
+                    continue
+                st.markdown(f"### `{col}`")
+                chk = validation_checks(before, after)
+                c1, c2, c3 = st.columns(3)
+                def chk_icon(ok): return "✅" if ok else "⚠️"
+                with c1:
+                    st.markdown(
+                        f'<div class="metric-box">'
+                        f'<div class="metric-val">{chk_icon(chk["mean_ok"])} {chk["mean_shift_pct"]}%</div>'
+                        f'<div class="metric-lbl">Mean shift (≤5% OK)</div>'
+                        f'</div>', unsafe_allow_html=True)
+                with c2:
+                    st.markdown(
+                        f'<div class="metric-box">'
+                        f'<div class="metric-val">{chk_icon(chk["median_ok"])} {chk["median_shift_pct"]}%</div>'
+                        f'<div class="metric-lbl">Median shift (≤3% OK)</div>'
+                        f'</div>', unsafe_allow_html=True)
+                with c3:
+                    st.markdown(
+                        f'<div class="metric-box">'
+                        f'<div class="metric-val">{chk_icon(chk["var_ok"])} {chk["var_change_pct"]}%</div>'
+                        f'<div class="metric-lbl">Variance change (≤20% OK)</div>'
+                        f'</div>', unsafe_allow_html=True)
+                fig_v, ax_v = plt.subplots(figsize=(8, 3.5))
+                ax_v.hist(before.values, bins=30, alpha=0.55, label="Before imputation", color="#17172b", edgecolor="white")
+                ax_v.hist(after.values,  bins=30, alpha=0.55, label="After imputation",  color="#6020a0", edgecolor="white")
+                ax_v.axvline(before.mean(), color="#17172b", linewidth=1.5, linestyle="--", label=f"Mean before: {before.mean():.2f}")
+                ax_v.axvline(after.mean(),  color="#6020a0", linewidth=1.5, linestyle="--", label=f"Mean after: {after.mean():.2f}")
+                ax_v.set_title(f"Distribution: '{col}' before vs after imputation")
+                ax_v.legend(fontsize=8)
+                plt.tight_layout()
+                st.pyplot(fig_v); plt.close()
+                target_col = st.session_state.get("target_col")
+                if target_col and target_col in df_orig.columns and pd.api.types.is_numeric_dtype(df_orig[target_col]):
+                    corr_before = df_orig[[col, target_col]].dropna().corr().iloc[0, 1]
+                    corr_after  = df_imputed[[col, target_col]].dropna().corr().iloc[0, 1]
+                    delta = abs(corr_before - corr_after)
+                    sign_flip = (corr_before * corr_after < 0)
+                    icon = "✅" if delta <= 0.05 and not sign_flip else "⚠️"
+                    st.markdown(
+                        f'<div class="card-info">{icon} <b>Correlation with target:</b> '
+                        f'Before = {corr_before:.3f} → After = {corr_after:.3f} | Δ = {delta:.3f}'
+                        + (" 🚨 Sign flipped!" if sign_flip else "")
+                        + "</div>",
+                        unsafe_allow_html=True,
+                    )
+                st.markdown("<hr class='divider'>", unsafe_allow_html=True)
+        st.markdown("### ⚠️ Common Pitfalls Checklist")
+        pitfalls = [
+            "Each column treated independently?",
+            "Imputation done AFTER train-test split?",
+            "Target variable NOT used as imputation predictor?",
+            "Missing indicator created BEFORE imputation for MNAR/MAR ≥10%?",
+            "Validation checked beyond just accuracy?",
+        ]
+        for txt in pitfalls:
+            st.checkbox(txt, value=False, key=f"pitfall_{txt[:20]}")
+        st.markdown(
+            '<div class="card-warn">'
+            '<b>↻ Repeat Steps 5–6 for every column independently.</b><br>'
+            'One column may be MCAR (drop rows), another MAR (KNN), another MNAR (indicator + median). '
+            'Never apply one method to all columns at once.'
+            '</div>',
+            unsafe_allow_html=True,
+        )
+st.markdown("---")
+st.caption("🔬 Missing Value Intelligence Suite · Merged from app.py + app_tanisha.py · Built with Streamlit, pandas, scikit-learn, scipy, seaborn")