diff --git "a/src/streamlit_app.py" "b/src/streamlit_app.py"
--- "a/src/streamlit_app.py"
+++ "b/src/streamlit_app.py"
@@ -1,1497 +1,770 @@
"""
-Missing Value Intelligence Suite — Merged App
-Combines the stepwise pipeline (app.py) with the comprehensive dashboard (app_tanisha.py)
-into a unified 7-step workflow.
+Missing Value Analyzer — Statistically Rigorous Pipeline
+=========================================================
+Phases:
+ 1 Upload CSV & Train/Test Split
+ 2 Missing Value Overview (train set only)
+ 3 Per-Column Diagnostics (Tables for all tests)
+ 4 Imputation Feasibility Gate (KDE plots, Variance %, New Outliers)
+ 5 Final Report & Recommendations
"""
import streamlit as st
import pandas as pd
import numpy as np
+import matplotlib
+matplotlib.use("Agg")
import matplotlib.pyplot as plt
-import matplotlib.patches as mpatches
import seaborn as sns
from scipy import stats
-from scipy.stats import chi2_contingency, ks_2samp, shapiro, skew, kurtosis
-from sklearn.preprocessing import LabelEncoder, StandardScaler
-from sklearn.linear_model import LogisticRegression
+from scipy.stats import chi2_contingency, ttest_ind, norm, chi2
from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.impute import KNNImputer
+from sklearn.experimental import enable_iterative_imputer
+from sklearn.impute import IterativeImputer
import warnings
warnings.filterwarnings("ignore")
# ─────────────────────────── Page config ────────────────────────────
st.set_page_config(
- page_title="Missing Value Intelligence Suite",
+ page_title="Missing Value Analyzer",
page_icon="🔬",
layout="wide",
initial_sidebar_state="expanded",
)
-# ─────────────────────────── Custom CSS ─────────────────────────────
+# ─────────────────────────── CSS ────────────────────────────────────
st.markdown("""
-""", unsafe_allow_html=True)
+section[data-testid="stSidebar"]{background:#17172b;}
+section[data-testid="stSidebar"] *{color:#ffffff !important;}
+section[data-testid="stSidebar"] hr{border-color:#ffffff33 !important;}
+.main-title{font-size:2rem;font-weight:700;color:#17172b;margin-bottom:.2rem;}
+.main-sub{font-size:1rem;color:#6060a0;margin-bottom:1.5rem;}
-# ════════════════════════════════════════════════════════════════════
-# SHARED HELPER FUNCTIONS
-# ════════════════════════════════════════════════════════════════════
+.metric-box{background:#f5f3ee;border-radius:8px;padding:12px 16px;text-align:center;margin-bottom:8px;}
+.metric-val{font-size:1.4rem;font-weight:700;color:#17172b !important;}
+.metric-lbl{font-size:.78rem;color:#6060a0 !important;margin-top:2px;}
-def missing_summary_df(df: pd.DataFrame) -> pd.DataFrame:
- total = len(df)
- counts = df.isnull().sum()
- pct = counts / total * 100
- summary = pd.DataFrame({
- "Missing Count": counts,
- "Missing %": pct.round(2),
- "Dtype": df.dtypes.astype(str),
- })
- return summary[summary["Missing Count"] > 0].sort_values("Missing %", ascending=False)
-
-
-def missing_summary_typed(df, num_cols, cat_cols):
- rows = []
- for col in df.columns:
- mc = df[col].isnull().sum()
- pct = mc / len(df) * 100
- dtype = "Numerical" if col in num_cols else "Categorical"
- rows.append({"Column": col, "Data Type": dtype,
- "Missing Count": mc, "Missing %": round(pct, 2)})
- result = pd.DataFrame(rows).sort_values("Missing %", ascending=False).reset_index(drop=True)
- return result[result["Missing Count"] > 0].reset_index(drop=True)
-
-
-def severity(pct):
- if pct < 5: return "Low"
- if pct < 20: return "Moderate"
- return "High"
-
-
-def identify_columns(df):
- num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
- cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
- return num_cols, cat_cols
-
-
-def missingness_risk_level(pct: float) -> tuple:
- if pct <= 5:
- return "≤5%", "Very low missingness. Low risk of bias.", "#edfaf3", "#0d6b3a"
- elif pct <= 15:
- return "5–15%", "Moderate. Imputation preferred over dropping.", "#fffaeb", "#7a4d00"
- elif pct <= 30:
- return "15–30%", "High. Dropping loses too much data. Advanced imputation + missing indicator mandatory.", "#fff0ed", "#9e2210"
- else:
- return ">30%", "Very high. Consider dropping the column. Re-evaluate column usefulness + domain check.", "#fde8e8", "#7a0000"
-
-
-# ── Statistical Tests (from app.py) ──────────────────────────────────
-
-def test1_pattern_analysis(df: pd.DataFrame, col: str) -> dict:
- indicator = df[col].isnull().astype(int)
- miss_pct = indicator.mean() * 100
- runs = (indicator != indicator.shift()).sum()
- max_possible_runs = min(len(indicator) * 2, len(indicator[indicator == 1]) * 2 + 1)
- cluster_ratio = runs / max(max_possible_runs, 1)
- scattered = cluster_ratio > 0.5
- return {
- "indicator": indicator,
- "miss_pct": miss_pct,
- "scattered": scattered,
- "cluster_ratio": cluster_ratio,
- "signal": "MCAR signal" if scattered else "MAR / MNAR signal (clustered rows)",
- }
+.big-stat-box{border-radius:12px;padding:20px 24px;text-align:center;margin-bottom:8px;}
+.big-stat-val{font-size:2.2rem;font-weight:800;margin-bottom:4px;}
+.big-stat-lbl{font-size:.82rem;font-weight:500;opacity:0.8;text-transform:uppercase;letter-spacing:.05em;}
+.big-stat-sub{font-size:.78rem;opacity:0.65;margin-top:4px;}
+.stat-ok{background:#edfaf3;border:2px solid #89d9ac;}
+.stat-ok .big-stat-val{color:#0a5c30;}
+.stat-warn{background:#fffaeb;border:2px solid #f0cc7a;}
+.stat-warn .big-stat-val{color:#7a4f00;}
+.stat-fail{background:#fff0ed;border:2px solid #f5a898;}
+.stat-fail .big-stat-val{color:#900000;}
-def test2_feature_dependency(df: pd.DataFrame, col: str) -> dict:
- missing_mask = df[col].isnull()
- if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
- return {"diffs": {}, "max_diff": 0.0, "signal": "Insufficient data"}
+.card-mcar{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
+.card-mar {background:#fffaeb;border:2px solid #f0cc7a;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
+.card-mnar{background:#fff0ed;border:2px solid #f5a898;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
+.card-info{background:#eef2ff;border:2px solid #bdc8f5;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
+.card-warn{background:#fff8e1;border:2px solid #ffe082;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
+.card-danger{background:#fde8e8;border:2px solid #f5a8a8;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
+.card-ok{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
- diffs = {}
- for other_col in df.columns:
- if other_col == col:
- continue
- try:
- miss_vals = df.loc[missing_mask, other_col].dropna()
- obs_vals = df.loc[~missing_mask, other_col].dropna()
- if len(miss_vals) < 3 or len(obs_vals) < 3:
- continue
- if pd.api.types.is_numeric_dtype(df[other_col]):
- m1, m2 = miss_vals.mean(), obs_vals.mean()
- denom = max(abs(m2), 1e-9)
- diff_pct = abs(m1 - m2) / denom * 100
- diffs[other_col] = diff_pct
- else:
- ct = pd.crosstab(
- pd.concat([pd.Series(["missing"] * len(miss_vals)),
- pd.Series(["present"] * len(obs_vals))]),
- pd.concat([miss_vals, obs_vals])
- )
- chi2, _, _, _ = chi2_contingency(ct)
- n = ct.values.sum()
- k = min(ct.shape) - 1
- cramers_v = np.sqrt(chi2 / (n * max(k, 1))) * 100
- diffs[other_col] = cramers_v
- except Exception:
- continue
+.card-mcar *, .card-mar *, .card-mnar *, .card-info *, .card-warn *, .card-danger *, .card-ok * {color: #1a1a2e !important;}
- if not diffs:
- return {"diffs": {}, "max_diff": 0.0, "signal": "No comparable features"}
+.verdict-label{font-size:1.1rem;font-weight:700;margin-bottom:4px;}
+.verdict-desc{font-size:.88rem;color:#333 !important;}
- max_diff = max(diffs.values())
- if max_diff < 5:
- signal = "Weak signal — MCAR likely"
- elif max_diff < 30:
- signal = "Strong MAR signal (feature dependency detected)"
- else:
- signal = "Very strong dependency — MAR or MNAR"
+code{background:#e8e8eb;padding:2px 6px;border-radius:4px;font-size:.85rem; color:#d6336c !important;}
+hr.divider{border:none;border-top:2px solid #e0ddd8;margin:1.5rem 0;}
- return {"diffs": diffs, "max_diff": max_diff, "signal": signal}
+.theory-box {background:#fafafa; border-left:4px solid #4f8ef7; border-radius:4px; padding:12px 18px; margin-bottom:16px;}
+.theory-box h4 {color:#17172b; margin-bottom:6px; font-size:1.05rem;}
+.theory-box p {color:#444; font-size:0.92rem; line-height:1.5;}
+.stat-highlight { font-size: 1.2rem; font-weight: bold; color: #d6336c; background: #ffe4e1; padding: 2px 8px; border-radius: 4px;}
-def test3_target_dependency(df: pd.DataFrame, col: str, target_col: str) -> dict:
- missing_mask = df[col].isnull()
- if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
- return {"diff_pct": None, "signal": "Insufficient data"}
+.test-header{font-size:1.05rem;font-weight:700;color:#17172b;margin:18px 0 8px;}
+
+""", unsafe_allow_html=True)
+
+
+# ════════════════════════════════════════════════════════════════════
+# SESSION STATE INIT
+# ════════════════════════════════════════════════════════════════════
+# Seed missing keys only; values already present in st.session_state are left untouched.
+defaults = {
+    "df_full": None,
+    "df_train": None,
+    "df_test": None,
+    "target_col": None,
+    "split_ratio": 0.8,
+    "col_diagnostics": {},
+}
+for k, v in defaults.items():
+    if k in st.session_state:
+        continue
+    st.session_state[k] = v
- try:
- miss_target = df.loc[missing_mask, target_col].dropna()
- obs_target = df.loc[~missing_mask, target_col].dropna()
+# ════════════════════════════════════════════════════════════════════
+# STATISTICAL TEST HELPERS
+# ════════════════════════════════════════════════════════════════════
+def littles_mcar_test(df: pd.DataFrame, cols_with_missing: list) -> dict:
+    """Run a pooled chi-square screen for MCAR on numeric columns.
+
+    For every numeric column in ``cols_with_missing`` the rows are split into
+    a "missing" and an "observed" group. For every other numeric column the
+    between-group sum of squares, scaled by that column's overall variance,
+    is added to one statistic; each contributing column pair adds one degree
+    of freedom.
+
+    NOTE(review): this is a mean-comparison screen, not the covariance-based
+    statistic of Little's original test -- confirm the simplification is
+    intended before quoting the p-value as "Little's test".
+
+    Args:
+        df: Data to inspect.
+        cols_with_missing: Names of the columns whose missingness is tested.
+            Non-numeric columns are ignored.
+
+    Returns:
+        A dict with keys ``chi2``, ``df``, ``p_value``, ``verdict`` and
+        ``reject_mcar``. When no column pair has enough data, ``chi2`` and
+        ``p_value`` are ``None``, ``df`` is 0 and ``reject_mcar`` is False.
+    """
+    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+    chi2_total, df_total = 0.0, 0
+    for col in cols_with_missing:
+        if col not in numeric_cols:
+            continue
+        missing_mask = df[col].isnull()
+        # Both groups need at least five rows before group means are compared.
+        if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
+            continue
+        for other in numeric_cols:
+            if other == col:
+                continue
+            g1 = df.loc[missing_mask, other].dropna()
+            g2 = df.loc[~missing_mask, other].dropna()
+            if len(g1) < 3 or len(g2) < 3:
+                continue
+            grand_mean, grand_var = df[other].mean(), df[other].var()
+            # A (near-)constant column would blow up the ratio below.
+            if grand_var < 1e-12:
+                continue
+            between_ss = (
+                len(g1) * (g1.mean() - grand_mean) ** 2
+                + len(g2) * (g2.mean() - grand_mean) ** 2
+            )
+            chi2_total += between_ss / grand_var
+            df_total += 1
+    if df_total == 0:
+        # Same key set as the success path so callers can index either result.
+        return {
+            "chi2": None,
+            "df": 0,
+            "p_value": None,
+            "verdict": "Insufficient numeric data",
+            "reject_mcar": False,
+        }
+    # sf() is the upper tail computed directly; 1 - cdf() loses precision
+    # when the statistic is large and the p-value is tiny.
+    p_val = chi2.sf(chi2_total, df_total)
+    reject = bool(p_val < 0.05)
+    verdict = "Reject MCAR" if reject else "Fail to reject MCAR"
+    return {
+        "chi2": round(chi2_total, 4),
+        "df": df_total,
+        "p_value": round(p_val, 4),
+        "verdict": verdict,
+        "reject_mcar": reject,
+    }
+
+def feature_dependency_tests(df: pd.DataFrame, col: str) -> dict:
+    """Test whether missingness in ``col`` is associated with other columns.
+
+    Rows are split by whether ``col`` is null. Each other column is compared
+    across the two groups: numeric columns with a two-sided z-test when both
+    groups have at least 30 values (Welch t-test otherwise), non-numeric
+    columns with a chi-square test on the missing-flag x category table.
+
+    Args:
+        df: Data to inspect.
+        col: Column whose missingness pattern is being tested.
+
+    Returns:
+        A dict with keys ``results`` (per-column test details),
+        ``n_significant``, ``total_tested``, ``sig_pct`` and ``signal``.
+        When either group has fewer than five rows, ``results`` is empty and
+        ``signal`` is ``"Insufficient data"``.
+    """
+    missing_mask = df[col].isnull()
+    if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
+        # Same key set as the normal return so callers can index either result.
+        return {
+            "results": {},
+            "n_significant": 0,
+            "total_tested": 0,
+            "sig_pct": 0.0,
+            "signal": "Insufficient data",
+        }
+    results = {}
+    for other in df.columns:
+        if other == col:
+            continue
+        g_miss = df.loc[missing_mask, other].dropna()
+        g_obs = df.loc[~missing_mask, other].dropna()
+        if len(g_miss) < 3 or len(g_obs) < 3:
+            continue
+        try:
+            if pd.api.types.is_numeric_dtype(df[other]):
+                n1, n2 = len(g_miss), len(g_obs)
+                if min(n1, n2) >= 30:
+                    se = np.sqrt(g_miss.var() / n1 + g_obs.var() / n2)
+                    if se < 1e-12:
+                        continue
+                    z_stat = (g_miss.mean() - g_obs.mean()) / se
+                    # sf() keeps precision for large |z| where 1 - cdf() underflows.
+                    p_val = 2 * norm.sf(abs(z_stat))
+                    test_name, stat = "z-test", round(z_stat, 4)
+                else:
+                    t_stat, p_val = ttest_ind(g_miss, g_obs, equal_var=False)
+                    test_name, stat = "Welch t-test", round(t_stat, 4)
+                kind = "numeric"
+            else:
+                ct = pd.crosstab(missing_mask.astype(int), df[other])
+                if ct.shape[0] < 2 or ct.shape[1] < 2:
+                    continue
+                chi2_stat, p_val, _, _ = chi2_contingency(ct)
+                test_name, stat, kind = "chi²", round(chi2_stat, 4), "categorical"
+            # An undefined p-value (e.g. constant groups) is not a completed
+            # test; counting it would dilute the share of significant features.
+            if np.isnan(p_val):
+                continue
+            results[other] = {
+                "test": test_name,
+                "stat": stat,
+                "p_value": round(p_val, 4),
+                "significant": bool(p_val < 0.05),
+                "type": kind,
+            }
+        except Exception:
+            # Deliberate best-effort: a column that cannot be tested is
+            # skipped rather than aborting the whole diagnostic.
+            continue
+    n_sig = sum(1 for r in results.values() if r["significant"])
+    sig_pct = n_sig / max(len(results), 1) * 100
+    if sig_pct == 0:
+        signal = "No features differ significantly"
+    else:
+        signal = f"{n_sig}/{len(results)} features differ (p<0.05)"
+    return {
+        "results": results,
+        "n_significant": n_sig,
+        "total_tested": len(results),
+        "sig_pct": round(sig_pct, 1),
+        "signal": signal,
+    }
+
+def target_dependency_test(df: pd.DataFrame, col: str, target_col: str) -> dict:
+ missing_mask = df[col].isnull()
+ if missing_mask.sum() < 5 or (~missing_mask).sum() < 5: return {"p_value": None, "signal": "Insufficient data", "significant": False}
+ try:
+ g_miss, g_obs = df.loc[missing_mask, target_col].dropna(), df.loc[~missing_mask, target_col].dropna()
if pd.api.types.is_numeric_dtype(df[target_col]):
- m1, m2 = miss_target.mean(), obs_target.mean()
- denom = max(abs(m2), 1e-9)
- diff_pct = abs(m1 - m2) / denom * 100
+ n1, n2 = len(g_miss), len(g_obs)
+ if min(n1, n2) >= 30:
+ se = np.sqrt(g_miss.var()/n1 + g_obs.var()/n2)
+ if se < 1e-12: return {"p_value": None, "signal": "Zero variance", "significant": False}
+ z_stat = (g_miss.mean() - g_obs.mean()) / se
+ p_val = 2 * (1 - norm.cdf(abs(z_stat)))
+ else:
+ _, p_val = ttest_ind(g_miss, g_obs, equal_var=False)
+ diff_pct = abs(g_miss.mean() - g_obs.mean()) / max(abs(g_obs.mean()), 1e-9) * 100
else:
- p1 = miss_target.value_counts(normalize=True).iloc[0] * 100
- p2 = obs_target.value_counts(normalize=True).iloc[0] * 100
+ ct = pd.crosstab(missing_mask.astype(int), df[target_col])
+ _, p_val, _, _ = chi2_contingency(ct)
+ p1, p2 = g_miss.value_counts(normalize=True).iloc[0]*100, g_obs.value_counts(normalize=True).iloc[0]*100
diff_pct = abs(p1 - p2)
- if diff_pct < 5:
- signal = "No strong signal (<5% target diff)"
- elif diff_pct < 10:
- signal = "Moderate target dependency — possible MAR/MNAR"
- else:
- signal = "Strong target dependency → MNAR likely (>10% target diff)"
-
- return {"diff_pct": round(diff_pct, 2), "signal": signal}
- except Exception as e:
- return {"diff_pct": None, "signal": f"Could not compute: {e}"}
-
+ sig = p_val < 0.05
+ signal = f"Not significant (p={p_val:.4f})" if not sig else f"Significant — target differs by {diff_pct:.1f}%"
+ return {"p_value": round(p_val, 4), "significant": sig, "diff_pct": round(diff_pct, 2), "signal": signal}
+ except Exception as e: return {"p_value": None, "signal": f"Error: {e}", "significant": False}
+
+def classify_mechanism(t_feat, t_target, little):
+    """Turn the three test summaries into a missingness-mechanism verdict.
+
+    Args:
+        t_feat: Feature-dependency summary; ``sig_pct`` is read (default 0).
+        t_target: Target-dependency summary; ``significant`` and ``diff_pct``
+            are read (defaults False and 0).
+        little: MCAR-screen summary; ``reject_mcar`` is read.
+
+    Returns:
+        A ``(mechanism, confidence, explanation)`` tuple of strings.
+    """
+    target_is_significant = t_target.get("significant", False)
+    target_gap = t_target.get("diff_pct", 0)
+    share_significant = t_feat.get("sig_pct", 0)
+
+    # Target dependency takes priority; the gap is only compared once the
+    # test itself was significant.
+    if target_is_significant:
+        if target_gap >= 10:
+            return "MNAR", "High", "Missingness strongly correlates with the outcome."
+        if target_gap >= 5:
+            return "MNAR", "Moderate", "Moderate dependency on target. Treat conservatively as MNAR."
+
+    if share_significant > 30:
+        return "MAR", "High", "Strong dependency on observed features detected."
+    if share_significant > 0:
+        return "MAR", "Moderate", "Weak but present dependency on observed features."
+    if little.get("reject_mcar"):
+        return "MAR", "Low", "Little's test rejects MCAR, but feature tests show weak dependency."
+    return "MCAR", "High", "No statistical evidence of systematic missingness."
+
+def run_single_diagnostic(df, col, target_col):
+    """Run every mechanism test for one column and store the outcome.
+
+    The result is written to ``st.session_state["col_diagnostics"][col]``;
+    nothing is returned. The target-dependency test is skipped when ``col``
+    is the target column itself.
+    """
+    little = littles_mcar_test(df, [col])
+    t_feat = feature_dependency_tests(df, col)
+    if col == target_col:
+        t_target = {
+            "p_value": None,
+            "significant": False,
+            "signal": "Skipped (Is Target)",
+            "diff_pct": 0,
+        }
+    else:
+        t_target = target_dependency_test(df, col, target_col)
+
+    mech, conf, expl = classify_mechanism(t_feat, t_target, little)
+    missing_share = round(df[col].isnull().mean() * 100, 2)
+    column_dtype = str(df[col].dtype)
+    st.session_state["col_diagnostics"][col] = {
+        "mechanism": mech,
+        "confidence": conf,
+        "explanation": expl,
+        "miss_pct": missing_share,
+        "dtype": column_dtype,
+        "little": little,
+        "t_feat": t_feat,
+        "t_target": t_target,
+    }
-def classify_mechanism(t1: dict, t2: dict, t3: dict) -> tuple:
- feat_dep = t2.get("max_diff", 0)
- tgt_dep = t3.get("diff_pct") or 0
- scattered = t1.get("scattered", True)
- if tgt_dep > 10:
- return "MNAR", "High", (
- f"Target variable differs by {tgt_dep:.1f}% between missing/present rows. "
- "The probability of missingness depends on the unobserved value itself."
- )
- elif feat_dep >= 10 and not scattered:
- return "MAR", "High", (
- f"Feature distributions differ by up to {feat_dep:.1f}% and missing values appear "
- "clustered — missingness depends on observed features."
- )
- elif feat_dep >= 5:
- return "MAR", "Moderate", (
- f"Feature distributions differ by up to {feat_dep:.1f}%. "
- "Missingness likely depends on observed features."
- )
- elif scattered and feat_dep < 5 and tgt_dep < 5:
- return "MCAR", "High", (
- "Values appear randomly scattered, feature distributions are similar across "
- "groups, and target shows no dependency — consistent with MCAR."
- )
+# ════════════════════════════════════════════════════════════════════
+# IMPUTATION SIMULATION HELPERS
+# ════════════════════════════════════════════════════════════════════
+def feasibility_checks(df: pd.DataFrame, col: str, target_col: str, impute_method: str) -> dict:
+ series = df[col].dropna()
+ if len(series) < 5 or not pd.api.types.is_numeric_dtype(df[col]):
+ return {"applicable": False}
+
+ results = {"applicable": True, "escalate_to_knn": False, "reasons": []}
+
+ # ── 1. Impute ──
+ if impute_method == "Mean": imputed_series = df[col].fillna(series.mean())
+ elif impute_method == "Median": imputed_series = df[col].fillna(series.median())
else:
- return "MCAR", "Low", (
- "Weak signals across all three tests. Treated as MCAR but verify with domain knowledge."
- )
-
-
-# ── Logistic Regression-based mechanism diagnosis (from app_tanisha.py) ──
-
-def diagnose_mechanism_lr(df, col, num_cols):
- miss_mask = df[col].isnull().astype(int)
- predictors = [c for c in df.columns if c != col and df[c].isnull().mean() < 0.9]
- if not predictors or miss_mask.sum() < 5:
- return "MNAR", "Insufficient data to test; assumed MNAR."
- mcar_p_vals = []
- for p in predictors:
- if p in num_cols and df[p].dropna().nunique() > 1:
- try:
- binned = pd.qcut(df[p].fillna(df[p].median()), q=4, duplicates="drop", labels=False)
- ct = pd.crosstab(binned, miss_mask)
- if ct.shape[0] > 1 and ct.shape[1] > 1:
- _, p_val, _, _ = chi2_contingency(ct)
- mcar_p_vals.append(p_val)
- except Exception:
- pass
- if mcar_p_vals and np.mean(mcar_p_vals) > 0.05:
- return "MCAR", (f"Chi-square tests show no significant dependency "
- f"(avg p={np.mean(mcar_p_vals):.3f} > 0.05). Missingness appears random.")
- try:
- X_pred = df[predictors].copy()
- for c in X_pred.select_dtypes(include="object").columns:
- X_pred[c] = X_pred[c].astype("category").cat.codes
- X_pred = X_pred.fillna(X_pred.median(numeric_only=True))
- scaler = StandardScaler()
- X_scaled = scaler.fit_transform(X_pred)
- lr = LogisticRegression(max_iter=300, solver="lbfgs")
- lr.fit(X_scaled, miss_mask)
- score = lr.score(X_scaled, miss_mask)
- baseline = max(miss_mask.mean(), 1 - miss_mask.mean())
- if score > baseline + 0.05:
- return "MAR", (f"Logistic Regression predicts missingness with accuracy {score:.2%} "
- f"(baseline {baseline:.2%}). Missingness is related to observed variables.")
- except Exception:
- pass
- return "MNAR", "Missingness not explained by observed data. Likely related to the missing value itself — assumed MNAR."
-
-
-def recommend_strategy(mechanism: str, miss_pct: float, dtype: str) -> dict:
- is_num = "float" in dtype or "int" in dtype
- add_indicator = (mechanism == "MNAR") or (mechanism == "MAR" and miss_pct >= 10)
-
- if mechanism == "MCAR" and miss_pct <= 5:
- method = "Drop rows"
- reason = "MCAR confirmed and loss is minimal (≤5%). Safe to drop."
- adv = "✓ No artificial data introduced"
- disadv = "✗ Loses data — only safe at very low %"
- elif mechanism in ("MCAR", "MAR") and miss_pct <= 15:
- if is_num:
- method = "Median imputation"
- reason = "Low-moderate missingness. Median is robust to skew and outliers."
- adv = "✓ Outlier-resistant; recommended default for numeric"
- disadv = "✗ Reduces variance slightly"
- else:
- method = "Mode imputation"
- reason = "Low-moderate missingness on categorical data."
- adv = "✓ Preserves category structure"
- disadv = "✗ Can over-represent dominant category"
- elif mechanism == "MAR" and miss_pct <= 30:
- method = "KNN Imputation" if is_num else "Mode / KNN Imputation"
- reason = "Moderate MAR missingness. KNN leverages feature relationships."
- adv = "✓ Preserves local patterns; captures inter-feature structure"
- disadv = "✗ Slow on large datasets; requires scaling"
- elif mechanism == "MAR" and miss_pct > 30:
- method = "Iterative Imputer (MICE)"
- reason = "High MAR missingness. MICE models each column as a function of others."
- adv = "✓ Most statistically principled; accounts for all feature relationships"
- disadv = "✗ Computationally expensive; risk of instability"
- elif mechanism == "MNAR":
- method = "Median + Missing Indicator (mandatory)"
- reason = "MNAR: the fact of missingness is informative. Indicator must be created BEFORE imputation."
- adv = "✓ Preserves MNAR signal; lets model learn from missingness"
- disadv = "✗ Imputation may still be biased; domain expertise required"
+ numeric_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c != target_col]
+ X_num = df[numeric_cols].copy()
+ try:
+ scaler = StandardScaler()
+ X_scaled = pd.DataFrame(scaler.fit_transform(X_num), columns=X_num.columns)
+ imputer = KNNImputer(n_neighbors=5) if impute_method == "KNN" else IterativeImputer(random_state=42, max_iter=10)
+ X_imputed_scaled = pd.DataFrame(imputer.fit_transform(X_scaled), columns=X_num.columns)
+ X_imputed = pd.DataFrame(scaler.inverse_transform(X_imputed_scaled), columns=X_num.columns)
+ imputed_series = X_imputed[col]
+ except Exception:
+ imputed_series = df[col].fillna(series.median())
+
+ results["imputed_series"] = imputed_series
+
+ # ── 2. Skewness & Outliers ──
+ skew = series.skew()
+ Q1_b, Q3_b = series.quantile(0.25), series.quantile(0.75)
+ IQR_b = Q3_b - Q1_b
+ outliers_before = ((series < Q1_b - 1.5*IQR_b) | (series > Q3_b + 1.5*IQR_b)).sum()
+
+ Q1_a, Q3_a = imputed_series.quantile(0.25), imputed_series.quantile(0.75)
+ IQR_a = Q3_a - Q1_a
+ outliers_after = ((imputed_series < Q1_a - 1.5*IQR_a) | (imputed_series > Q3_a + 1.5*IQR_a)).sum()
+ new_outliers = max(0, outliers_after - outliers_before)
+
+ if impute_method == "Mean":
+ skew_verdict = "fail" if abs(skew) > 1 else "ok"
+ elif impute_method == "Median":
+ skew_verdict = "warn" if abs(skew) > 3 else "ok"
+ else:
+ skew_verdict = "ok"
+
+ results["skewness"] = {"verdict": skew_verdict, "value": skew, "msg": f"Skewness = {skew:.3f}"}
+
+ if new_outliers > (len(series) * 0.05):
+ out_verdict = "warn"
else:
- method = "Consider dropping column"
- reason = f"Missing > 30% with {mechanism}. Evaluate predictive value vs. cost of imputation."
- adv = "✓ Eliminates noise if column is uninformative"
- disadv = "✗ Irreversible — verify with domain expert first"
-
- return {
- "method": method,
- "reason": reason,
- "adv": adv,
- "disadv": disadv,
- "add_indicator": add_indicator,
- }
-
+ out_verdict = "ok"
-def strategy_chips_html(mech, miss_pct, col_type):
- chips = []
- if mech == "CLEAN":
- return '✅ No action needed — column is complete'
- if miss_pct > 50:
- chips.append(("⚠ Consider Dropping Column (>50% missing)", "chip-red"))
- if mech == "MCAR":
- if miss_pct < 5:
- chips.append(("Listwise Deletion (safe)", "chip-green"))
- chips.append(("Median Imputation" if col_type == "Numerical" else "Mode Imputation", "chip-green"))
- if mech == "MAR":
- chips.append(("KNN Imputation", "chip-blue"))
- chips.append(("Iterative Imputer (MICE)", "chip-blue"))
- chips.append(("Group-wise Imputation", "chip-blue"))
- if miss_pct >= 10:
- chips.append(("Create Missing Indicator (≥10% MAR)", "chip-yellow"))
- if mech == "MNAR":
- chips.append(("⚠ Create Missing Indicator FIRST (mandatory)", "chip-red"))
- chips.append(("Constant / Domain-Specific Value", "chip-yellow"))
- chips.append(("Sensitivity Analysis Required", "chip-yellow"))
- return " ".join(f'{lbl}' for lbl, cls in chips)
-
-
-def validation_checks(df_before: pd.Series, df_after: pd.Series) -> dict:
- m_shift = abs(df_before.mean() - df_after.mean()) / max(abs(df_before.mean()), 1e-9) * 100
- med_shift = abs(df_before.median() - df_after.median()) / max(abs(df_before.median()), 1e-9) * 100
- var_change = abs(df_before.var() - df_after.var()) / max(df_before.var(), 1e-9) * 100
- return {
- "mean_shift_pct": round(m_shift, 2),
- "median_shift_pct": round(med_shift, 2),
- "var_change_pct": round(var_change, 2),
- "mean_ok": m_shift <= 5,
- "median_ok": med_shift <= 3,
- "var_ok": var_change <= 20,
+ results["outliers"] = {
+ "verdict": out_verdict,
+ "new_outliers": new_outliers,
+ "outliers_before": outliers_before,
+ "outliers_after": outliers_after
}
+ # ── 3. Variance Impact ──
+ var_before = series.var()
+ var_after = imputed_series.var()
+ var_drop_pct = (var_before - var_after) / var_before * 100 if var_before > 1e-12 else 0
-# ── Outlier & Variance helpers (from app_tanisha.py) ──────────────────
-
-def detect_outliers_iqr(series):
- s = series.dropna()
- if len(s) < 4: return 0
- Q1, Q3 = s.quantile(0.25), s.quantile(0.75)
- IQR = Q3 - Q1
- return int(((s < Q1 - 1.5 * IQR) | (s > Q3 + 1.5 * IQR)).sum())
-
-
-def variance_impact(series):
- s = series.dropna()
- if len(s) < 2: return 0.0, 0.0, 0.0
- var_before = float(s.var())
- var_after = float(series.fillna(s.mean()).var())
- return round(var_before, 4), round(var_after, 4), round(var_before - var_after, 4)
-
-
-def stat_card(label, value, color="#1a1a2e"):
- return (f'
'
- f'
{value}
'
- f'
{label}
')
-
-
-# ── Plot helpers ──────────────────────────────────────────────────────
-
-def plot_missing_heatmap(df):
- missing_cols = [c for c in df.columns if df[c].isnull().any()]
- if not missing_cols:
- return None
- sorted_cols = sorted(missing_cols, key=lambda c: df[c].isnull().mean(), reverse=True)
- sample_size = min(300, len(df))
- df_s = df[sorted_cols].sample(n=sample_size, random_state=42) if len(df) > sample_size else df[sorted_cols]
- mask_df = df_s.isnull().astype(int)
- fig, ax = plt.subplots(figsize=(max(10, len(sorted_cols) * 0.7), 5))
- sns.heatmap(mask_df.T, cmap=["#f5f3ee", "#17172b"], cbar=True,
- yticklabels=sorted_cols, xticklabels=False, linewidths=0, ax=ax)
- ax.set_title(f"Missing Value Heatmap — sample of {sample_size} rows", fontsize=13, fontweight="bold", pad=12)
- ax.set_xlabel("Rows (observations)", fontsize=10)
- ax.set_ylabel("Columns", fontsize=10)
- plt.tight_layout()
- return fig
-
-
-def plot_missingness_correlation(df):
- missing_cols = [c for c in df.columns if df[c].isnull().any()]
- if len(missing_cols) < 2:
- return None
- miss_bin = df[missing_cols].isnull().astype(int)
- corr = miss_bin.corr()
- fig, ax = plt.subplots(figsize=(max(7, len(missing_cols) * 0.9), max(6, len(missing_cols) * 0.8)))
- mask = np.triu(np.ones_like(corr, dtype=bool))
- sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", center=0,
- mask=mask, square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax)
- ax.set_title("Missingness Correlation Matrix", fontsize=13, fontweight="bold", pad=12)
- plt.tight_layout()
- return fig
-
-
-def plot_numerical_column(df, col):
- s_original = df[col].dropna()
- s_imputed = df[col].fillna(s_original.mean())
- fig, axes = plt.subplots(1, 2, figsize=(16, 6))
- fig.suptitle(f"Deep Distribution Analysis — {col}", fontsize=14, fontweight="bold")
- sns.kdeplot(s_original, ax=axes[0], color="#4f8ef7", linewidth=3,
- label="Original (Before)", fill=True, alpha=0.2)
- sns.kdeplot(s_imputed, ax=axes[0], color="#e07b54", linewidth=3,
- label="Mean Imputed (After)", linestyle="--")
- axes[0].set_title("Distribution Shift: Original vs. Imputed", fontsize=12)
- axes[0].legend()
- box_data = pd.DataFrame({
- "Value": pd.concat([s_original, s_imputed]),
- "Type": ["Original"] * len(s_original) + ["Imputed"] * len(s_imputed),
- })
- sns.boxplot(data=box_data, x="Type", y="Value", ax=axes[1], palette=["#dce3ff", "#fce4d6"])
- axes[1].set_title("Variance & Outlier Comparison", fontsize=12)
- plt.tight_layout()
- return fig
-
-
-def plot_categorical_column(df, col, top_n=10):
- s_original = df[col].dropna()
- s_imputed = df[col].fillna(s_original.mode()[0] if not s_original.empty else "N/A")
- fig, axes = plt.subplots(1, 2, figsize=(16, 7))
- fig.suptitle(f"Categorical Frequency Analysis — {col}", fontsize=14, fontweight="bold")
- orig_counts = s_original.value_counts().head(top_n)
- imp_counts = s_imputed.value_counts().head(top_n)
- compare_df = pd.DataFrame({"Original": orig_counts, "Imputed (Mode)": imp_counts}).fillna(0)
- compare_df.plot(kind="barh", ax=axes[0], color=["#4f8ef7", "#e07b54"], width=0.8)
- axes[0].set_title(f"Top {top_n} Categories: Original vs Mode Imputed", fontsize=12)
- axes[0].invert_yaxis()
- top_pie = imp_counts.head(8)
- axes[1].pie(top_pie, labels=top_pie.index.astype(str), autopct="%1.1f%%",
- startangle=140, colors=plt.cm.Pastel1.colors, wedgeprops={"edgecolor": "white"})
- axes[1].set_title("Final Proportion (After Imputation)", fontsize=12)
- plt.tight_layout()
- return fig
-
-
-def plot_missing_vs_features(df, col):
- num_others = [c for c in df.select_dtypes(include=[np.number]).columns
- if c != col and df[c].isnull().mean() < 0.95]
- if not num_others:
- return None
- means_present = df[df[col].notna()][num_others].mean()
- means_missing = df[df[col].isnull()][num_others].mean()
- diff_df = pd.DataFrame({"Present": means_present, "Missing": means_missing}).dropna().head(12)
- if diff_df.empty:
- return None
- fig, ax = plt.subplots(figsize=(max(8, len(diff_df) * 0.9), 4))
- x = np.arange(len(diff_df)); w = 0.35
- ax.bar(x - w/2, diff_df["Present"], w, label="Present rows", color="#4f8ef7", alpha=0.85)
- ax.bar(x + w/2, diff_df["Missing"], w, label="Missing rows", color="#e07b54", alpha=0.85)
- ax.set_xticks(x)
- ax.set_xticklabels(diff_df.index, rotation=35, ha="right", fontsize=9)
- ax.set_title(f"Feature Means — Rows where '{col}' is Present vs Missing",
- fontsize=11, fontweight="bold")
- ax.set_ylabel("Mean value")
- ax.legend(fontsize=9)
- plt.tight_layout()
- return fig
-
-
-def render_per_column_deep_analysis(df, col, num_cols, cat_cols, mechanism_results):
- miss_count = int(df[col].isnull().sum())
- miss_pct = round(df[col].isnull().mean() * 100, 2)
- total_rows = len(df)
- present = total_rows - miss_count
- col_type = "Numerical" if col in num_cols else "Categorical"
- mech_info = mechanism_results.get(col, {})
- mech = mech_info.get("mechanism", "N/A")
- mech_reason = mech_info.get("reason", "Run the global diagnosis section above first.")
- sev = severity(miss_pct) if miss_pct > 0 else "None"
-
- miss_color = "#dc2626" if miss_pct >= 20 else "#d97706" if miss_pct >= 5 else "#16a34a"
- sev_color = "#dc2626" if sev == "High" else "#d97706" if sev == "Moderate" else "#16a34a"
- mech_color = {"MCAR": "#155724", "MAR": "#856404", "MNAR": "#721c24"}.get(mech, "#444")
-
- st.markdown(f"#### 🔍 Deep Analysis — `{col}` · {col_type}", unsafe_allow_html=True)
- m1, m2, m3, m4, m5 = st.columns(5)
- with m1: st.markdown(stat_card("Total Rows", f"{total_rows:,}"), unsafe_allow_html=True)
- with m2: st.markdown(stat_card("Present", f"{present:,}"), unsafe_allow_html=True)
- with m3: st.markdown(stat_card("Missing", f"{miss_pct}%", miss_color), unsafe_allow_html=True)
- with m4: st.markdown(stat_card("Severity", sev, sev_color), unsafe_allow_html=True)
- with m5: st.markdown(stat_card("Mechanism", mech, mech_color), unsafe_allow_html=True)
- st.markdown("")
-
- if col_type == "Numerical":
- s = df[col].dropna()
- if len(s) > 1:
- col_skew = float(skew(s))
- col_kurt = float(kurtosis(s))
- Q1, Q3 = float(s.quantile(0.25)), float(s.quantile(0.75))
- IQR = Q3 - Q1
- n_out = detect_outliers_iqr(df[col])
- vb, va, vi = variance_impact(df[col])
- out_pct = n_out / max(len(s), 1)
-
- r1 = st.columns(4)
- for (lbl, val), col_ui in zip(
- [("Mean", f"{s.mean():.4g}"), ("Median", f"{s.median():.4g}"),
- ("Std Dev", f"{s.std():.4g}"), ("Variance", f"{s.var():.4g}")], r1):
- with col_ui: st.markdown(stat_card(lbl, val), unsafe_allow_html=True)
- st.markdown("")
-
- r2 = st.columns(4)
- for (lbl, val), col_ui in zip(
- [("Min", f"{s.min():.4g}"), ("Max", f"{s.max():.4g}"),
- ("Skewness", f"{col_skew:.3f}"), ("Kurtosis", f"{col_kurt:.3f}")], r2):
- with col_ui: st.markdown(stat_card(lbl, val), unsafe_allow_html=True)
- st.markdown("")
-
- r3 = st.columns(4)
- out_color = "#dc2626" if out_pct > 0.15 else "#d97706" if out_pct > 0.05 else "#16a34a"
- for (lbl, val, clr), col_ui in zip(
- [("Q1", f"{Q1:.4g}", "#1a1a2e"), ("Q3", f"{Q3:.4g}", "#1a1a2e"),
- ("IQR", f"{IQR:.4g}", "#1a1a2e"), ("Outliers (IQR)", str(n_out), out_color)], r3):
- with col_ui: st.markdown(stat_card(lbl, val, clr), unsafe_allow_html=True)
-
- if len(s) <= 5000:
- try:
- _, p_norm = shapiro(s.sample(min(len(s), 5000), random_state=0))
- norm_txt = f"✅ Normal (p={p_norm:.4f})" if p_norm > 0.05 else f"⚠ Not Normal (p={p_norm:.4f})"
- st.caption(f"📐 Shapiro-Wilk normality test: {norm_txt}")
- except Exception:
- pass
-
- st.markdown("")
- fig_dist = plot_numerical_column(df, col)
- st.pyplot(fig_dist); plt.close(fig_dist)
-
- st.markdown("**Variance Impact of Mean Imputation (simulated)**")
- vc = st.columns(3)
- delta_color = "#dc2626" if abs(vi)/max(vb,1e-9) > 0.3 else "#d97706" if abs(vi)/max(vb,1e-9) > 0.1 else "#16a34a"
- with vc[0]: st.markdown(stat_card("Variance (before)", f"{vb:.4g}"), unsafe_allow_html=True)
- with vc[1]: st.markdown(stat_card("Variance (after)", f"{va:.4g}"), unsafe_allow_html=True)
- with vc[2]: st.markdown(stat_card("Δ Variance", f"{vi:.4g}", delta_color), unsafe_allow_html=True)
-
- pct_chg = abs(vi) / max(vb, 1e-9) * 100
- if pct_chg >= 30:
- st.warning(f"⚠ Variance drops by {pct_chg:.1f}% after mean imputation — over-smoothing risk. Use median or model-based imputation.")
- elif pct_chg >= 10:
- st.info(f"ℹ Variance drops by {pct_chg:.1f}% — acceptable, but monitor distribution shape.")
- else:
- st.success(f"✅ Variance change is small ({pct_chg:.1f}%) — mean imputation is statistically safe here.")
- else:
- s = df[col].dropna()
- n_unique = s.nunique()
- mode_val = str(s.mode().iloc[0]) if len(s) > 0 else "N/A"
- mode_cnt = int((s == s.mode().iloc[0]).sum()) if len(s) > 0 else 0
- mode_pct = round(mode_cnt / max(len(s), 1) * 100, 1)
-
- r1 = st.columns(4)
- for (lbl, val), col_ui in zip(
- [("Unique Values", n_unique), ("Mode", mode_val[:12]),
- ("Mode Count", f"{mode_cnt:,}"), ("Mode Freq %", f"{mode_pct}%")], r1):
- with col_ui: st.markdown(stat_card(lbl, str(val)), unsafe_allow_html=True)
-
- st.markdown("")
- freq_table = s.value_counts().reset_index()
- freq_table.columns = ["Value", "Count"]
- freq_table["% of Present"] = (freq_table["Count"] / len(s) * 100).round(2)
- tab_chart, tab_table = st.tabs(["📊 Frequency Chart", "📋 Frequency Table"])
- with tab_chart:
- fig_cat = plot_categorical_column(df, col)
- st.pyplot(fig_cat); plt.close(fig_cat)
- with tab_table:
- st.dataframe(freq_table, use_container_width=True, hide_index=True)
-
- st.markdown("")
- if miss_count > 0:
- st.markdown("**How Missingness Relates to Other Features**")
- fig_pat = plot_missing_vs_features(df, col)
- if fig_pat:
- st.pyplot(fig_pat); plt.close(fig_pat)
- st.caption("Large differences between blue (present) and orange (missing) bars signal MAR behavior.")
- else:
- st.info("No other numerical features available for pattern comparison.")
-
- st.markdown("")
- verdict_cls = {"MCAR": "card-mcar", "MAR": "card-mar", "MNAR": "card-mnar"}.get(mech, "card-info")
- mech_icon = {"MCAR": "🟢", "MAR": "🟡", "MNAR": "🔴"}.get(mech, "✅")
- mech_label = {"MCAR": "Missing Completely At Random (MCAR)",
- "MAR": "Missing At Random (MAR)",
- "MNAR": "Missing Not At Random (MNAR)",
- "N/A": "No Missing Values"}.get(mech, mech)
-
- st.markdown(
- f'{mech_icon} {mech_label}
'
- f'{mech_reason}
',
- unsafe_allow_html=True)
-
- chips_html = strategy_chips_html(mech, miss_pct, col_type)
- if chips_html:
- st.markdown("")
- st.markdown("**Recommended Strategies**")
- st.markdown(chips_html, unsafe_allow_html=True)
-
- pointer = {
- "MCAR": ("📍 **MCAR**: Missing% <5% → listwise deletion is safe. 5–15% → median/mode imputation. "
- "15–30% → advanced imputation with missing indicator."),
- "MAR": ("📍 **MAR**: KNN / MICE preferred. Create a missing indicator if missing% ≥10%."),
- "MNAR": ("📍 **MNAR**: **Create the missing indicator FIRST**, then use constant or sensitivity analysis. "
- "Domain knowledge is essential."),
- "N/A": "📍 No action needed — this column is complete. Proceed to feature engineering.",
- }.get(mech, "")
- if pointer:
- st.markdown("")
- st.info(pointer)
+ if var_drop_pct <= 10: var_verdict, var_msg = "ok", f"Variance Change: {var_drop_pct:.1f}%"
+ elif var_drop_pct <= 20: var_verdict, var_msg = "warn", f"Variance Change: {var_drop_pct:.1f}%"
+ else: var_verdict, var_msg = "fail", f"Variance Change: {var_drop_pct:.1f}%"
+ results["variance"] = {"verdict": var_verdict, "msg": var_msg, "var_drop_pct": var_drop_pct}
-# ════════════════════════════════════════════════════════════════════
-# SIDEBAR — NAVIGATION
-# ════════════════════════════════════════════════════════════════════
+ # ── 4. Correlation Preservation ──
+ numeric_others = [c for c in df.select_dtypes(include=[np.number]).columns if c != col and c != target_col]
+ corr_results, max_corr_shift, sign_flip = {}, 0.0, False
-STEPS = [
- "1 · Upload CSV",
- "2 · Select Target Column",
- "3 · Overview & Patterns",
- "4 · Mechanism Dashboard",
- "5 · Column Diagnostics",
- "6 · Strategy & Imputation",
- "7 · Validation Checks",
-]
+ for other in numeric_others[:10]:
+ s_before = df[[col, other]].dropna()
+ if len(s_before) < 5: continue
+ r_before = s_before[col].corr(s_before[other])
+ r_after = imputed_series.corr(df[other])
+
+ delta = abs(r_before - r_after)
+ flipped = (r_before * r_after < 0) and (abs(r_before) > 0.1)
+
+ corr_results[other] = {"r_before": round(r_before, 4), "r_after": round(r_after, 4), "delta": round(delta, 4), "sign_flip": flipped}
+ max_corr_shift = max(max_corr_shift, delta)
+ if flipped: sign_flip = True
-with st.sidebar:
- st.markdown("## 🔬 Missing Value Intelligence Suite")
- st.markdown("---")
- st.markdown("**Navigation**")
- step = st.radio("Go to step:", STEPS, label_visibility="collapsed")
- st.markdown("---")
- st.markdown(
- "Follow the steps in order for a complete analysis pipeline. "
- "Steps 3–4 are exploratory; Steps 5–7 form the diagnostic pipeline.",
- unsafe_allow_html=True,
- )
+ if max_corr_shift <= 0.05 and not sign_flip: corr_verdict, corr_msg = "ok", f"Max Δ = {max_corr_shift:.3f} — Correlation well preserved"
+ elif sign_flip: corr_verdict, corr_msg = "fail", f"Sign flip detected! Correlation direction reversed."
+ elif max_corr_shift <= 0.10: corr_verdict, corr_msg = "warn", f"Max Δ = {max_corr_shift:.3f} — Moderate correlation shift"
+ else: corr_verdict, corr_msg = "fail", f"Max Δ = {max_corr_shift:.3f} — Large correlation shift detected"
+ results["correlation"] = {"details": corr_results, "verdict": corr_verdict, "msg": corr_msg, "max_shift": round(max_corr_shift, 4)}
-# ════════════════════════════════════════════════════════════════════
-# SESSION STATE
-# ════════════════════════════════════════════════════════════════════
+ return results
-for key in ["df", "target_col", "col_results", "df_imputed", "mechanism_results_lr"]:
- if key not in st.session_state:
- st.session_state[key] = None
-if st.session_state["col_results"] is None:
- st.session_state["col_results"] = {}
-if st.session_state["mechanism_results_lr"] is None:
- st.session_state["mechanism_results_lr"] = {}
+def get_auto_recommendation(df, col, target, mechanism, miss_pct, dtype):
+ """Determine best imputation strategy with explicit labeling."""
+ needs_indicator = (mechanism == "MNAR") or (mechanism == "MAR" and miss_pct >= 10)
+ indicator_suffix = " + Missing Indicator" if needs_indicator else ""
+ # High missingness — always flag
+ if miss_pct > 70:
+ return f"Drop Column"
-# ════════════════════════════════════════════════════════════════════
-# STEP 1 — UPLOAD CSV
-# ════════════════════════════════════════════════════════════════════
+ if mechanism == "MCAR" and miss_pct <= 5:
+ return "Drop Rows"
-if step == STEPS[0]:
- st.markdown('📂 Step 1 — Upload Your CSV
', unsafe_allow_html=True)
- st.markdown('Upload a CSV file to begin the missing-value analysis pipeline.
', unsafe_allow_html=True)
+ # Categorical / non-numeric
+ if not pd.api.types.is_numeric_dtype(df[col]):
+ return f"Mode Imputation{indicator_suffix}"
- uploaded = st.file_uploader("Choose a CSV file", type=["csv"])
+ # Numeric: run quick feasibility to decide
+ feas_med = feasibility_checks(df, col, target, "Median")
+ if not feas_med.get("applicable"):
+ return f"Median Imputation{indicator_suffix}"
- if uploaded:
- try:
- df = pd.read_csv(uploaded)
- # Auto-remove ID-like columns
- id_cols = [c for c in df.columns if c.strip().lower() in ("id", "index", "row", "rowid", "row_id")]
- if id_cols:
- df.drop(columns=id_cols, inplace=True)
- st.toast(f"Auto-removed non-informative column(s): {id_cols}", icon="🗑️")
-
- st.session_state["df"] = df
- st.session_state["col_results"] = {}
- st.session_state["mechanism_results_lr"] = {}
- st.session_state["df_imputed"] = df.copy()
-
- st.success(f"✅ File loaded: **{uploaded.name}** — {df.shape[0]} rows × {df.shape[1]} columns")
- st.markdown("### Preview (first 10 rows)")
- st.dataframe(df.head(10), use_container_width=True)
-
- c1, c2, c3, c4 = st.columns(4)
- with c1:
- st.markdown(f'', unsafe_allow_html=True)
- with c2:
- st.markdown(f'', unsafe_allow_html=True)
- with c3:
- n_miss_cols = df.isnull().any().sum()
- st.markdown(f'{n_miss_cols}
Columns w/ Missings
', unsafe_allow_html=True)
- with c4:
- total_miss = df.isnull().sum().sum()
- pct_miss = round(total_miss / df.size * 100, 1)
- st.markdown(f'{pct_miss}%
Overall Missing Rate
', unsafe_allow_html=True)
-
- st.markdown("### Column Types & Missingness")
- type_df = pd.DataFrame({
- "Column": df.columns,
- "Dtype": df.dtypes.astype(str).values,
- "Missing": df.isnull().sum().values,
- "Missing %": (df.isnull().mean() * 100).round(2).values,
- })
- st.dataframe(type_df, use_container_width=True, hide_index=True)
-
- except Exception as e:
- st.error(f"Could not read file: {e}")
+ var_ok = feas_med["variance"]["var_drop_pct"] <= 20
+ corr_ok = feas_med["correlation"]["verdict"] != "fail"
+ skew_val = abs(feas_med["skewness"].get("value", 0))
+
+ if var_ok and corr_ok:
+ if skew_val <= 1:
+ return f"Mean Imputation{indicator_suffix}"
+ else:
+ return f"Median Imputation{indicator_suffix}"
else:
- st.info("👆 Upload a CSV to get started.")
+ if miss_pct > 30:
+ return f"MICE Imputer{indicator_suffix}"
+ else:
+ return f"KNN Imputer{indicator_suffix}"
# ════════════════════════════════════════════════════════════════════
-# STEP 2 — SELECT TARGET COLUMN
+# SIDEBAR NAVIGATION
# ════════════════════════════════════════════════════════════════════
+STEPS = ["1 · Upload & Split", "2 · Overview", "3 · Column Diagnostics", "4 · Feasibility Gate", "5 · Final Report"]
-elif step == STEPS[1]:
- st.markdown('🎯 Step 2 — Select Target Column
', unsafe_allow_html=True)
- st.markdown('The target column (y) is used in Test 3 to detect MNAR patterns and is excluded from feature analysis.
', unsafe_allow_html=True)
-
- df = st.session_state.get("df")
- if df is None:
- st.warning("⚠️ Please upload a CSV in Step 1 first.")
- else:
- target = st.selectbox(
- "Select the output / target column:",
- options=df.columns.tolist(),
- index=len(df.columns) - 1,
- )
- if st.button("✅ Confirm Target Column", type="primary"):
- st.session_state["target_col"] = target
- st.success(f"Target column set to: **{target}**")
-
- if st.session_state.get("target_col"):
- st.info(f"Current target: **{st.session_state['target_col']}**")
- tc = st.session_state["target_col"]
- col_data = df[tc]
- st.markdown("#### Target Column Distribution")
- fig, ax = plt.subplots(figsize=(7, 3))
- if pd.api.types.is_numeric_dtype(col_data):
- col_data.dropna().hist(bins=30, ax=ax, color="#17172b", edgecolor="white")
- ax.set_xlabel(tc); ax.set_ylabel("Count")
- else:
- vc = col_data.value_counts().head(15)
- vc.plot(kind="bar", ax=ax, color="#17172b")
- ax.set_ylabel("Count")
- ax.set_title(f"Distribution of '{tc}'")
- plt.tight_layout()
- st.pyplot(fig)
- plt.close()
+with st.sidebar:
+ st.markdown("## 🔬 Missing Value Analyzer")
+ st.markdown("---")
+ step = st.radio("Navigate:", STEPS, label_visibility="collapsed")
+ st.markdown("---")
+ if st.session_state.get("df_train") is not None:
+ st.markdown(f"**Train set:** {st.session_state['df_train'].shape[0]} rows × {st.session_state['df_train'].shape[1]} cols")
+ st.markdown(f"**Diagnosed:** {len(st.session_state['col_diagnostics'])} columns")
+ st.markdown("Analysis runs on TRAIN SET only to prevent data leakage.", unsafe_allow_html=True)
# ════════════════════════════════════════════════════════════════════
-# STEP 3 — OVERVIEW & PATTERNS
+# STEP 1 — UPLOAD & SPLIT
# ════════════════════════════════════════════════════════════════════
+def render_step1():
+ st.markdown('📂 Step 1 — Upload CSV & Train/Test Split
', unsafe_allow_html=True)
+ uploaded = st.file_uploader("Choose a CSV file", type=["csv"])
+ if not uploaded: return st.info("👆 Upload a CSV file to begin.")
-elif step == STEPS[2]:
- st.markdown('📊 Step 3 — Overview & Patterns
', unsafe_allow_html=True)
- st.markdown('Bird\'s-eye view of missingness across the dataset, including heatmaps and co-missingness patterns.
', unsafe_allow_html=True)
-
- df = st.session_state.get("df")
- target_col = st.session_state.get("target_col")
-
- if df is None:
- st.warning("⚠️ Please upload a CSV in Step 1 first.")
- else:
- X = df.drop(columns=[target_col]) if target_col and target_col in df.columns else df
- summary = missing_summary_df(X)
+ df = pd.read_csv(uploaded)
+ st.success(f"✅ Loaded **{uploaded.name}**")
- if summary.empty:
- st.success("🎉 No missing values found in the dataset features!")
- else:
- st.markdown(f"### {len(summary)} column(s) have missing values")
- st.dataframe(summary.style.background_gradient(subset=["Missing %"], cmap="YlOrRd"),
- use_container_width=True)
-
- # ── Missing % bar chart
- st.markdown('', unsafe_allow_html=True)
- miss_cols = summary.index.tolist()
- fig_bar, ax_bar = plt.subplots(figsize=(max(7, len(miss_cols) * 0.9), 4))
- colors = ["#9e2210" if v > 30 else "#7a4d00" if v > 15 else "#0d6b3a" for v in summary["Missing %"]]
- ax_bar.barh(summary.index[::-1], summary["Missing %"][::-1], color=colors[::-1], edgecolor="white")
- ax_bar.axvline(5, color="#89d9ac", linewidth=1.5, linestyle="--", label="5% threshold")
- ax_bar.axvline(15, color="#f0cc7a", linewidth=1.5, linestyle="--", label="15% threshold")
- ax_bar.axvline(30, color="#f5a898", linewidth=1.5, linestyle="--", label="30% threshold")
- ax_bar.set_xlabel("Missing %"); ax_bar.set_title("Missing % per Column")
- ax_bar.legend(loc="lower right", fontsize=8)
- plt.tight_layout()
- st.pyplot(fig_bar)
- plt.close()
-
- # ── Heatmap + Correlation tabs
- st.markdown('', unsafe_allow_html=True)
- tab_hm, tab_corr = st.tabs(["Missing Heatmap", "Missingness Correlation"])
- with tab_hm:
- fig_hm = plot_missing_heatmap(X)
- if fig_hm:
- st.pyplot(fig_hm); plt.close(fig_hm)
- st.caption("Dark = missing, light = present. Each column is a row.")
- else:
- st.info("No missing values to display.")
- with tab_corr:
- fig_corr = plot_missingness_correlation(X)
- if fig_corr:
- st.pyplot(fig_corr); plt.close(fig_corr)
- st.caption("Near +1: columns tend to be missing together. Near −1: rarely missing simultaneously.")
- else:
- st.info("Need at least 2 columns with missing values for this chart.")
-
- # ── Correlation among numerical features
- num_cols_x, _ = identify_columns(X)
- if len(num_cols_x) >= 2:
- st.markdown('', unsafe_allow_html=True)
- valid = [c for c in num_cols_x if X[c].isnull().mean() < 1.0]
- if len(valid) >= 2:
- corr = X[valid].corr()
- strong = (corr.abs() > 0.5) & (corr != 1.0)
- if strong.any().any():
- fig_fc, ax_fc = plt.subplots(figsize=(max(8, len(valid) * 0.9), max(7, len(valid) * 0.8)))
- mask = np.triu(np.ones_like(corr, dtype=bool))
- display_corr = corr.where(corr.abs() > 0.5)
- sns.heatmap(display_corr, annot=False, cmap="RdYlGn", center=0,
- mask=mask, square=True, linewidths=0.5,
- cbar_kws={"shrink": 0.8}, ax=ax_fc, vmin=-1, vmax=1)
- ax_fc.set_title("Strong Correlations (|r| > 0.5) — Numerical Features",
- fontsize=13, fontweight="bold", pad=12)
- plt.tight_layout()
- st.pyplot(fig_fc); plt.close(fig_fc)
-
- # Correlation pairs table
- pairs = []
- seen = set()
- for i, c1 in enumerate(corr.columns):
- for j, c2 in enumerate(corr.columns):
- if i >= j: continue
- v = corr.loc[c1, c2]
- if abs(v) > 0.5:
- key = tuple(sorted([c1, c2]))
- if key not in seen:
- seen.add(key)
- pairs.append({"Column A": c1, "Column B": c2,
- "Correlation": round(v, 4),
- "Correlation %": f"{round(v * 100, 2)}%"})
- if pairs:
- corr_table = pd.DataFrame(pairs).sort_values("Correlation", key=abs, ascending=False)
- st.markdown("**Strong Correlation Pairs (|r| > 0.5)**")
- st.dataframe(corr_table, use_container_width=True, hide_index=True)
- else:
- st.info("No strong correlations (|r| > 0.5) found among numerical features.")
+ col1, col2 = st.columns(2)
+ target = col1.selectbox("Target column (Y):", df.columns.tolist(), index=len(df.columns)-1)
+ split_pct = col2.slider("Train size:", 50, 95, 80, 5, format="%d%%")
+ if st.button("✅ Confirm & Split", type="primary"):
+ df_train, df_test = train_test_split(df, train_size=split_pct/100.0, random_state=42)
+ st.session_state.update({"df_full": df, "df_train": df_train.reset_index(drop=True), "df_test": df_test.reset_index(drop=True), "target_col": target, "col_diagnostics": {}})
+ st.success("✅ Split complete!")
+ st.dataframe(df_train.head(), use_container_width=True)
# ════════════════════════════════════════════════════════════════════
-# STEP 4 — MECHANISM DASHBOARD (from app_tanisha.py)
+# STEP 2 — OVERVIEW
# ════════════════════════════════════════════════════════════════════
+def render_step2():
+ st.markdown('📊 Step 2 — Missing Value Overview
', unsafe_allow_html=True)
+ df = st.session_state.get("df_train")
+ if df is None: return st.warning("⚠️ Please complete Step 1.")
-elif step == STEPS[3]:
- st.markdown('🧪 Step 4 — Mechanism Dashboard
', unsafe_allow_html=True)
- st.markdown('Automated MCAR/MAR/MNAR detection via Chi-square & Logistic Regression, plus outlier/variance analysis and deep per-column exploration.
', unsafe_allow_html=True)
-
- df = st.session_state.get("df")
- target_col = st.session_state.get("target_col")
-
- if df is None:
- st.warning("⚠️ Please upload a CSV in Step 1 first.")
- elif target_col is None:
- st.warning("⚠️ Please select a target column in Step 2 first.")
- else:
- X = df.drop(columns=[target_col])
- y = df[target_col]
- num_cols, cat_cols = identify_columns(X)
-
- # ── Train-test split
- st.markdown('', unsafe_allow_html=True)
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
- sc1, sc2 = st.columns(2)
- with sc1: st.markdown(f"**Training Set** — X_train: `{X_train.shape}` · y_train: `{y_train.shape}`")
- with sc2: st.markdown(f"**Test Set** — X_test: `{X_test.shape}` · y_test: `{y_test.shape}`")
-
- # ── Mechanism diagnosis
- st.markdown('', unsafe_allow_html=True)
- missing_feature_cols = [c for c in X.columns if X[c].isnull().any()]
-
- if not missing_feature_cols:
- st.success("No missing values in feature columns — nothing to diagnose.")
- mechanism_results = {}
- else:
- cached = st.session_state.get("mechanism_results_lr", {})
- if not cached:
- with st.spinner("Running MCAR (Chi-square) and MAR (Logistic Regression) tests…"):
- mechanism_results = {}
- for col in missing_feature_cols:
- mech, reason = diagnose_mechanism_lr(X, col, num_cols)
- mechanism_results[col] = {"mechanism": mech, "reason": reason}
- st.session_state["mechanism_results_lr"] = mechanism_results
- else:
- mechanism_results = cached
-
- badge_map = {"MCAR": "badge-mcar", "MAR": "badge-mar", "MNAR": "badge-mnar"}
- for col, res in mechanism_results.items():
- mech = res["mechanism"]
- pct = round(X[col].isnull().mean() * 100, 2)
- with st.expander(f"🔎 **{col}** — {mech} | {pct}% missing"):
- st.markdown(f'{mech} {res["reason"]}',
- unsafe_allow_html=True)
-
- # ── Outlier Detection & Variance Impact
- st.markdown('', unsafe_allow_html=True)
- outlier_data = {}
- for col in num_cols:
- n_out = detect_outliers_iqr(X[col])
- vb, va, vi = variance_impact(X[col])
- outlier_data[col] = {
- "Missing %": round(X[col].isnull().mean() * 100, 2),
- "Outliers (IQR)": n_out,
- "Variance (before impute)": vb,
- "Variance (after mean impute)": va,
- "Variance Impact (Δ)": vi,
- }
- if outlier_data:
- out_df = (pd.DataFrame(outlier_data).T.reset_index()
- .rename(columns={"index": "Column"})
- .sort_values("Outliers (IQR)", ascending=False))
-
- def color_outliers(val):
- if isinstance(val, (int, float)):
- if val > 50: return "background-color: #f8d7da; color: #721c24;"
- if val > 10: return "background-color: #fff3cd; color: #856404;"
- return ""
- st.dataframe(out_df.style.applymap(color_outliers, subset=["Outliers (IQR)"]),
- use_container_width=True, hide_index=True)
- else:
- st.info("No numerical columns available for outlier analysis.")
-
- # ── Final Diagnosis Table
- st.markdown('', unsafe_allow_html=True)
- diag_rows = []
- for col in X.columns:
- mp = round(X[col].isnull().mean() * 100, 2)
- mech = mechanism_results.get(col, {}).get("mechanism", "N/A") if col in missing_feature_cols else "N/A"
- diag_rows.append({
- "Column": col, "Missing %": mp,
- "Mechanism": mech, "Severity": severity(mp) if mp > 0 else "None",
- "Outliers": outlier_data.get(col, {}).get("Outliers (IQR)", "—"),
- "Variance Impact (Δ)": outlier_data.get(col, {}).get("Variance Impact (Δ)", "—"),
- })
- diag_df = pd.DataFrame(diag_rows).sort_values("Missing %", ascending=False).reset_index(drop=True)
-
- sev_colors = {"High": "background-color: #f8d7da; color: #721c24;",
- "Moderate": "background-color: #fff3cd; color: #856404;",
- "Low": "background-color: #d4edda; color: #155724;"}
- mech_colors = {"MCAR": "background-color: #d4edda; color: #155724;",
- "MAR": "background-color: #fff3cd; color: #856404;",
- "MNAR": "background-color: #f8d7da; color: #721c24;"}
-
- def color_diag_row(row):
- mech_style = mech_colors.get(row["Mechanism"], "")
- sev_style = sev_colors.get(row["Severity"], "")
- return ["", "", mech_style, sev_style, "", ""]
-
- st.dataframe(diag_df.style.apply(color_diag_row, axis=1),
- use_container_width=True, hide_index=True)
-
- # ── Per-Column Deep Analysis
- st.markdown('', unsafe_allow_html=True)
- col_label_to_name = {}
- for col in X.columns:
- mp_l = round(X[col].isnull().mean() * 100, 1)
- type_lbl = "Num" if col in num_cols else "Cat"
- mech_lbl = mechanism_results.get(col, {}).get("mechanism", "—") if col in missing_feature_cols else "complete"
- label = f"{col} [{type_lbl} · {mp_l}% missing · {mech_lbl}]"
- col_label_to_name[label] = col
-
- chosen_label = st.selectbox(
- "Select a column to analyse in detail:",
- options=["— choose a column —"] + list(col_label_to_name.keys()),
- key="deep_col_select"
- )
- if chosen_label != "— choose a column —":
- chosen_col = col_label_to_name[chosen_label]
- with st.spinner(f"Analysing `{chosen_col}`…"):
- st.markdown("---")
- render_per_column_deep_analysis(
- df=X, col=chosen_col,
- num_cols=num_cols, cat_cols=cat_cols,
- mechanism_results=mechanism_results,
- )
- st.markdown("---")
-
- # ── Insights
- st.markdown('', unsafe_allow_html=True)
- high_miss = diag_df[diag_df["Missing %"] >= 20]["Column"].tolist()
- mar_cols = diag_df[diag_df["Mechanism"] == "MAR"]["Column"].tolist()
- mnar_cols = diag_df[diag_df["Mechanism"] == "MNAR"]["Column"].tolist()
- high_out = [c for c in num_cols if outlier_data.get(c, {}).get("Outliers (IQR)", 0) > 10]
-
- insights = [
- "Missing data must be understood before any imputation or modeling to avoid biased results.",
- (f"{', '.join(high_miss)} have ≥20% missing values — treat with caution or consider dropping."
- if high_miss else "No columns have critically high (≥20%) missing rates — dataset quality looks reasonable."),
- (f"Columns {', '.join(mar_cols)} show MAR behavior — KNN/MICE imputation is viable."
- if mar_cols else "No columns confirmed MAR."),
- (f"Columns {', '.join(mnar_cols)} are likely MNAR — create a missing indicator before imputing."
- if mnar_cols else "No columns flagged as MNAR."),
- (f"Columns {', '.join(high_out)} have many outliers — prefer median over mean imputation."
- if high_out else "Outlier counts appear manageable across numerical columns."),
- "Correlated missingness indicates data is likely not MCAR — jointly missing due to a common cause.",
- "MCAR is rare in real-world datasets. Most missingness in practice is MAR or MNAR.",
- "MNAR cannot be confirmed statistically from observed data alone — domain knowledge is essential.",
- ]
- st.markdown('' + "".join(f"- {i}
" for i in insights) + "
",
- unsafe_allow_html=True)
-
- # ── Theory
- st.markdown('', unsafe_allow_html=True)
- theories = [
- ("🔵 MCAR — Missing Completely At Random",
- "The probability of missingness is entirely independent of observed and unobserved data. "
- "Listwise deletion is unbiased under MCAR, though it reduces sample size."),
- ("🟡 MAR — Missing At Random",
- "Missingness depends on observed data but not on the missing value itself. "
- "Multiple imputation or FIML methods produce valid estimates under MAR."),
- ("🔴 MNAR — Missing Not At Random",
- "Missingness depends on the unobserved value itself. Cannot be detected from observed data. "
- "Requires sensitivity analysis and domain knowledge. Ignoring MNAR produces biased results."),
- ("📐 Why Chi-Square for MCAR Testing?",
- "Chi-square tests independence between the binary missingness indicator and binned numeric predictors. "
- "No significant association is consistent with MCAR, though this only confirms pairwise independence."),
- ("🤖 Why Logistic Regression for MAR Detection?",
- "LR models the binary missingness indicator as a function of all observed features. "
- "Accuracy substantially above the majority-class baseline indicates MAR."),
- ("📉 Why MNAR Cannot Be Confirmed Statistically",
- "MNAR depends on unobserved values — data we do not have. No statistical test on observed data "
- "can definitively confirm it. Domain reasoning about the data generation process is required."),
- ("📦 Outliers and Their Impact on Variance",
- "Outliers (>1.5×IQR) inflate variance and distort the mean. Mean imputation artificially collapses "
- "variance because all missing cells receive the same central value, masking true data spread."),
- ]
- for title, body in theories:
- st.markdown(f'', unsafe_allow_html=True)
+ miss_cols = [c for c in df.columns if df[c].isnull().any()]
+ if not miss_cols: return st.success("🎉 No missing values!")
+ summary = pd.DataFrame({"Missing Count": df.isnull().sum(), "Missing %": (df.isnull().sum()/len(df)*100).round(2)})
+ st.dataframe(summary[summary["Missing Count"] > 0].sort_values("Missing %", ascending=False).style.background_gradient(cmap="YlOrRd"), use_container_width=True)
# ════════════════════════════════════════════════════════════════════
-# STEP 5 — COLUMN DIAGNOSTICS (from app.py — 3 statistical tests)
+# STEP 3 — DIAGNOSTICS
# ════════════════════════════════════════════════════════════════════
+def render_step3():
+ st.markdown('🧪 Step 3 — Per-Column Diagnostics
', unsafe_allow_html=True)
+ df, target = st.session_state.get("df_train"), st.session_state.get("target_col")
+ if df is None: return st.warning("⚠️ Please complete Step 1.")
+
+ miss_cols = [c for c in df.columns if df[c].isnull().any()]
+ if not miss_cols: return st.success("🎉 No missing values.")
+
+ col1, col2 = st.columns([1, 4])
+ selected_col = col1.selectbox("Select column to view:", miss_cols)
+ run_single = col1.button("▶ Run Diagnostics")
+ run_all = col2.button("▶ Run ALL columns", type="primary")
+
+ if run_single:
+ run_single_diagnostic(df, selected_col, target)
+ if run_all:
+ progress = st.progress(0, text="Running diagnostics...")
+ for i, c in enumerate(miss_cols):
+ run_single_diagnostic(df, c, target)
+ progress.progress((i+1)/len(miss_cols), text=f"Diagnosing: {c}")
+ progress.empty()
+ st.success(f"✅ Diagnosed {len(miss_cols)} columns.")
+
+ if selected_col in st.session_state["col_diagnostics"]:
+ res = st.session_state["col_diagnostics"][selected_col]
+ little, t_feat, t_target = res["little"], res["t_feat"], res["t_target"]
+
+ st.markdown("---")
+
+ # ── Mechanism verdict card ──
+ card_class = {"MCAR":"card-mcar","MAR":"card-mar","MNAR":"card-mnar"}[res["mechanism"]]
+ emoji = {"MCAR":"🟢","MAR":"🟠","MNAR":"🔴"}[res["mechanism"]]
+ st.markdown(
+ f''
+ f'
{emoji} Mechanism: {res["mechanism"]} — {res["confidence"]} Confidence
'
+ f'
{res["explanation"]}
'
+ f'
Missing: {res["miss_pct"]}% | dtype: {res["dtype"]}
'
+ f'
',
+ unsafe_allow_html=True
+ )
-elif step == STEPS[4]:
- st.markdown('🔬 Step 5 — Column Diagnostics
', unsafe_allow_html=True)
- st.markdown('Run three independent statistical tests per column to determine the missing-data mechanism (MCAR / MAR / MNAR).
', unsafe_allow_html=True)
-
- df = st.session_state.get("df")
- target_col = st.session_state.get("target_col")
-
- if df is None:
- st.warning("⚠️ Please upload a CSV in Step 1 first.")
- elif target_col is None:
- st.warning("⚠️ Please select a target column in Step 2 first.")
- else:
- summary = missing_summary_df(df)
- if summary.empty:
- st.success("🎉 No missing values — nothing to diagnose.")
- else:
- miss_cols = summary.index.tolist()
- selected_col = st.selectbox("Select a column to analyse:", miss_cols)
- miss_pct = summary.loc[selected_col, "Missing %"]
- dtype_str = str(df[selected_col].dtype)
-
- st.markdown(f"---")
- st.markdown(f"### Analysing column: `{selected_col}`")
-
- lv, risk_txt, risk_bg, risk_fg = missingness_risk_level(miss_pct)
- c1, c2, c3 = st.columns(3)
- with c1:
- st.markdown(f'', unsafe_allow_html=True)
- with c2:
- st.markdown(f'', unsafe_allow_html=True)
- with c3:
- n_miss = int(summary.loc[selected_col, "Missing Count"])
- st.markdown(f'', unsafe_allow_html=True)
-
- st.markdown(
- f''
- f'{lv} Missingness — {risk_txt}
',
- unsafe_allow_html=True,
- )
-
- # ── Test 1
- st.markdown("#### 🔬 Test 1 — Pattern Analysis (Missingness Map)")
- t1 = test1_pattern_analysis(df, selected_col)
- fig, axes = plt.subplots(1, 2, figsize=(12, 3))
- sample_size = min(300, len(df))
- idx_sample = df.sample(n=sample_size, random_state=42).index if len(df) > sample_size else df.index
- ind_sample = t1["indicator"].loc[idx_sample]
- axes[0].scatter(range(len(ind_sample)), ind_sample.values,
- c=["#9e2210" if v else "#89d9ac" for v in ind_sample.values], s=8, alpha=0.8)
- axes[0].set_yticks([0, 1]); axes[0].set_yticklabels(["Present", "Missing"])
- axes[0].set_title(f"Missingness Pattern ({sample_size} rows)")
- axes[0].set_xlabel("Row index")
- roll = t1["indicator"].rolling(50, min_periods=1).mean()
- axes[1].plot(roll.values, color="#17172b", linewidth=1.2)
- axes[1].set_title("Rolling Miss Rate (window=50)")
- axes[1].set_xlabel("Row index"); axes[1].set_ylabel("Miss rate")
- axes[1].axhline(t1["miss_pct"] / 100, color="#9e2210", linestyle="--", label="Mean miss rate")
- axes[1].legend(fontsize=8)
- plt.tight_layout()
- st.pyplot(fig); plt.close()
-
- scatter_icon = "🟢" if t1["scattered"] else "🟠"
- st.markdown(f'{scatter_icon} {t1["signal"]}
Cluster ratio: {t1["cluster_ratio"]:.2f} (higher = more scattered = MCAR signal)
', unsafe_allow_html=True)
-
- # ── Test 2
- st.markdown("#### 🔬 Test 2 — Feature Dependency")
- t2 = test2_feature_dependency(df, selected_col)
- if t2["diffs"]:
- top_diffs = dict(sorted(t2["diffs"].items(), key=lambda x: -x[1])[:15])
- fig2, ax2 = plt.subplots(figsize=(10, max(3, len(top_diffs) * 0.45)))
- colors = ["#9e2210" if v >= 30 else "#f0a040" if v >= 10 else "#89d9ac" for v in top_diffs.values()]
- ax2.barh(list(top_diffs.keys())[::-1], list(top_diffs.values())[::-1], color=colors[::-1], edgecolor="white")
- ax2.axvline(5, color="#89d9ac", linewidth=1.5, linestyle="--", label="5% weak")
- ax2.axvline(10, color="#f0cc7a", linewidth=1.5, linestyle="--", label="10% MAR signal")
- ax2.axvline(30, color="#f5a898", linewidth=1.5, linestyle="--", label="30% strong")
- ax2.set_xlabel("Distribution Difference (%)")
- ax2.set_title("Feature Distribution Difference")
- ax2.legend(fontsize=8)
- plt.tight_layout()
- st.pyplot(fig2); plt.close()
- dep_icon = "🟢" if t2["max_diff"] < 5 else "🟠" if t2["max_diff"] < 30 else "🔴"
- st.markdown(f'{dep_icon} {t2["signal"]}
Max difference: {t2["max_diff"]:.1f}%
', unsafe_allow_html=True)
- else:
- st.info("Not enough data to compare feature distributions.")
- t2 = {"diffs": {}, "max_diff": 0.0, "signal": "Insufficient data"}
-
- # ── Test 3
- st.markdown("#### 🔬 Test 3 — Target Dependency")
- if selected_col == target_col:
- st.warning("⚠️ Selected column IS the target column. Test 3 skipped.")
- t3 = {"diff_pct": None, "signal": "Skipped — column is target"}
- else:
- t3 = test3_target_dependency(df, selected_col, target_col)
- if t3["diff_pct"] is not None:
- missing_mask = df[selected_col].isnull()
- fig3, ax3 = plt.subplots(figsize=(7, 3.5))
- if pd.api.types.is_numeric_dtype(df[target_col]):
- miss_target = df.loc[missing_mask, target_col].dropna()
- obs_target = df.loc[~missing_mask, target_col].dropna()
- ax3.hist(obs_target, bins=25, alpha=0.7, label="Target when present", color="#17172b", edgecolor="white")
- ax3.hist(miss_target, bins=25, alpha=0.7, label="Target when missing", color="#9e2210", edgecolor="white")
- ax3.set_xlabel(target_col); ax3.set_ylabel("Count")
- ax3.legend()
- else:
- miss_target = df.loc[missing_mask, target_col].value_counts(normalize=True) * 100
- obs_target = df.loc[~missing_mask, target_col].value_counts(normalize=True) * 100
- cats = list(set(miss_target.index) | set(obs_target.index))
- x = np.arange(len(cats))
- ax3.bar(x - 0.2, [obs_target.get(c, 0) for c in cats], 0.4, label="Present", color="#17172b")
- ax3.bar(x + 0.2, [miss_target.get(c, 0) for c in cats], 0.4, label="Missing", color="#9e2210")
- ax3.set_xticks(x); ax3.set_xticklabels(cats, rotation=30)
- ax3.set_ylabel("% of group"); ax3.legend()
- ax3.set_title(f"Target ({target_col}) dist: present vs missing in '{selected_col}'")
- plt.tight_layout()
- st.pyplot(fig3); plt.close()
- dep_icon = "🟢" if (t3["diff_pct"] or 0) < 5 else "🟠" if (t3["diff_pct"] or 0) < 10 else "🔴"
- st.markdown(f'{dep_icon} {t3["signal"]}
Target diff: {t3["diff_pct"]}%
', unsafe_allow_html=True)
- else:
- st.info(t3["signal"])
-
- # ── Verdict
- st.markdown("---")
- st.markdown("### 🏁 Mechanism Verdict")
- mechanism, confidence, explanation = classify_mechanism(t1, t2, t3)
- card_class = {"MCAR": "card-mcar", "MAR": "card-mar", "MNAR": "card-mnar"}[mechanism]
- emoji = {"MCAR": "🟢", "MAR": "🟠", "MNAR": "🔴"}[mechanism]
- st.markdown(
- f''
- f'
{emoji} {mechanism} — {confidence} confidence
'
- f'
{explanation}
'
- f'
',
- unsafe_allow_html=True,
+ # ══ TEST 1: Little's MCAR ══
+ st.markdown('', unsafe_allow_html=True)
+ with st.expander("ℹ️ What does this test measure?", expanded=False):
+ st.markdown("""
+ **Little's MCAR test** checks if missingness is completely random.
+ - **H₀ (null):** Data is Missing Completely At Random (MCAR)
+ - **p ≥ 0.05:** Fail to reject → data may be MCAR
+ - **p < 0.05:** Reject → systematic missingness detected
+ """)
+
+ little_rows = [{
+ "Test": "Little's MCAR",
+ "χ² Statistic": little.get("chi2", "N/A"),
+ "Degrees of Freedom": little.get("df", "N/A"),
+ "p-value": little.get("p_value", "N/A"),
+ "Verdict": little.get("verdict", "N/A"),
+ "Reject MCAR?": "✅ Yes — systematic" if little.get("reject_mcar") else "❌ No — may be MCAR"
+ }]
+ st.dataframe(pd.DataFrame(little_rows), use_container_width=True, hide_index=True)
+
+ # ══ TEST 2: Target Dependency ══
+ st.markdown('', unsafe_allow_html=True)
+ with st.expander("ℹ️ What does this test measure?", expanded=False):
+ st.markdown("""
+ Tests if the **target variable** has different values when this column is missing vs. observed.
+ - **Numeric target:** z-test or Welch t-test
+ - **Categorical target:** Chi-squared test
+ - **Significant (p<0.05) + large diff % → MNAR** (missingness depends on outcome)
+ """)
+
+ tgt_rows = [{
+ "Test Applied": "z-test / Welch t-test / Chi²",
+ "p-value": t_target.get("p_value", "N/A"),
+ "Target Diff %": f'{t_target.get("diff_pct", 0):.1f}%' if t_target.get("diff_pct") is not None else "N/A",
+ "Significant (p<0.05)?": "✅ Yes" if t_target.get("significant") else "❌ No",
+ "Interpretation": t_target.get("signal", "N/A")
+ }]
+ st.dataframe(pd.DataFrame(tgt_rows), use_container_width=True, hide_index=True)
+
+ # ══ TEST 3: Feature Dependency ══
+ st.markdown('', unsafe_allow_html=True)
+ with st.expander("ℹ️ What does this test measure?", expanded=False):
+ st.markdown("""
+ For each other feature, tests if values differ **significantly** between rows where this column is missing vs. observed.
+ - **Numeric features:** z-test (n≥30) or Welch t-test
+ - **Categorical features:** Chi-squared test
+ - **Many significant features (>30%) → MAR** (missingness explained by observed data)
+ """)
+
+ # Summary row first
+ summary_cols = st.columns(3)
+ summary_cols[0].metric("Features Tested", t_feat.get("total_tested", 0))
+ summary_cols[1].metric("Significant (p<0.05)", t_feat.get("n_significant", 0))
+ summary_cols[2].metric("% Significant", f'{t_feat.get("sig_pct", 0):.1f}%')
+
+ if t_feat["results"]:
+ rows = []
+ for f, r in t_feat["results"].items():
+ rows.append({
+ "Feature": f,
+ "Data Type": r["type"].capitalize(),
+ "Test Used": r["test"],
+ "Test Statistic": r["stat"],
+ "p-value": r["p_value"],
+ "p < 0.05?": "✅ Significant" if r["significant"] else "—"
+ })
+ feat_df = pd.DataFrame(rows).sort_values("p-value")
+
+ def highlight_sig(row):
+ if row["p < 0.05?"] == "✅ Significant":
+ return ["background-color:#ffe4e1; color:#900000"] * len(row)
+ return [""] * len(row)
+
+ st.dataframe(
+ feat_df.style.apply(highlight_sig, axis=1),
+ use_container_width=True,
+ hide_index=True
)
-
- # Strategy chips
- col_type_str = "Numerical" if pd.api.types.is_numeric_dtype(df[selected_col]) else "Categorical"
- chips_html = strategy_chips_html(mechanism, miss_pct, col_type_str)
- if chips_html:
- st.markdown("**Recommended Strategy Options**")
- st.markdown(chips_html, unsafe_allow_html=True)
-
- st.session_state["col_results"][selected_col] = {
- "mechanism": mechanism,
- "confidence": confidence,
- "miss_pct": miss_pct,
- "dtype": dtype_str,
- "t1": t1, "t2": t2, "t3": t3,
- }
+ else:
+ st.info("No feature dependency results available (insufficient data or no other columns).")
+
+ # ══ Decision Logic Summary ══
+ st.markdown('', unsafe_allow_html=True)
+ logic_rows = [
+ {"Rule Check": "Little's test rejects MCAR?", "Result": "✅ Yes" if little.get("reject_mcar") else "❌ No"},
+ {"Rule Check": "Target differs significantly?", "Result": "✅ Yes" if t_target.get("significant") else "❌ No"},
+ {"Rule Check": "Target diff magnitude", "Result": f'{t_target.get("diff_pct", 0):.1f}% difference'},
+ {"Rule Check": "% of features with significant diff", "Result": f'{t_feat.get("sig_pct", 0):.1f}%'},
+ {"Rule Check": "→ Final Mechanism", "Result": f'{res["mechanism"]} ({res["confidence"]} confidence)'},
+ ]
+ st.dataframe(pd.DataFrame(logic_rows), use_container_width=True, hide_index=True)
# ════════════════════════════════════════════════════════════════════
-# STEP 6 — STRATEGY & IMPUTATION
+# STEP 4 — FEASIBILITY GATE (Interactive)
# ════════════════════════════════════════════════════════════════════
+def render_step4():
+ st.markdown('⚖️ Step 4 — Imputation Feasibility Gate
', unsafe_allow_html=True)
+
+ with st.expander("📚 Theory & Guide: Why test imputation mathematically? (Click to expand)"):
+ st.markdown("""
+
+
Why test imputation mathematically?
+
Single-value imputations (like filling blanks with Mean or Median) are dangerous if overused. They can:
+
+ - Collapse Variance: If you fill 20% of the data with the same number, the spread of your data shrinks unnaturally.
+ - Create Artificial Outliers: Because the variance (IQR) shrank, real valid data points at the edges suddenly look like outliers!
+ - Destroy Correlation: Assigning a median weight to someone without considering their height breaks the natural relationship between features.
+
+
KNN and MICE solve this by acting like mini machine-learning models — they look at other features to make an educated guess, preserving variance and correlations.
+
+ """, unsafe_allow_html=True)
+
+ df, target = st.session_state.get("df_train"), st.session_state.get("target_col")
+ col_diag = st.session_state.get("col_diagnostics", {})
+ if not col_diag: return st.warning("⚠️ Please run diagnostics in Step 3 first.")
+
+ numeric_diag = {c: v for c, v in col_diag.items() if pd.api.types.is_numeric_dtype(df[c])}
+ if not numeric_diag: return st.info("No numeric columns available.")
+
+ col1, col2 = st.columns([1, 2])
+ selected_col = col1.selectbox("Select numeric column:", list(numeric_diag.keys()))
+ impute_choice = col2.radio("Simulate impact of:", ["Mean", "Median", "KNN", "MICE"], horizontal=True)
+
+ if st.button(f"▶ Simulate {impute_choice} Imputation", type="primary"):
+ with st.spinner(f"Running {impute_choice} simulation (may take a moment for KNN/MICE)..."):
+ feas = feasibility_checks(df, selected_col, target, impute_choice)
+
+ if not feas.get("applicable"):
+ return st.error("Column not applicable for numeric feasibility checks.")
+
+ ICONS = {"ok": "✅", "warn": "⚠️", "fail": "❌"}
+ COLORS = {"ok": "stat-ok", "warn": "stat-warn", "fail": "stat-fail"}
+
+ # ── Big Stats Banner ──
+ st.markdown("### 📊 Imputation Impact — Key Statistics")
+ m1, m2, m3, m4 = st.columns(4)
+
+ var_pct = feas["variance"]["var_drop_pct"]
+ var_verd = feas["variance"]["verdict"]
+ new_out = feas["outliers"]["new_outliers"]
+ out_verd = feas["outliers"]["verdict"]
+ corr_verd = feas["correlation"]["verdict"]
+ corr_max = feas["correlation"]["max_shift"]
+ skew_val = feas["skewness"]["value"]
+ skew_verd = feas["skewness"]["verdict"]
+
+ var_color = "#900000" if var_verd == "fail" else ("#7a4f00" if var_verd == "warn" else "#0a5c30")
+ out_color = "#900000" if out_verd == "fail" else ("#7a4f00" if out_verd == "warn" else "#0a5c30")
+ corr_color = "#900000" if corr_verd == "fail" else ("#7a4f00" if corr_verd == "warn" else "#0a5c30")
+ skew_color = "#900000" if skew_verd == "fail" else ("#7a4f00" if skew_verd == "warn" else "#0a5c30")
+
+ m1.markdown(
+ f''
+ f'
-{var_pct:.1f}%
'
+ f'
Variance Change
'
+ f'
{ICONS[var_verd]} {"Safe" if var_verd=="ok" else "Caution" if var_verd=="warn" else "High Risk"}
'
+ f'
', unsafe_allow_html=True
+ )
+ m2.markdown(
+ f''
+ f'
+{new_out}
'
+ f'
New Outliers Created
'
+ f'
{ICONS[out_verd]} Before: {feas["outliers"]["outliers_before"]} → After: {feas["outliers"]["outliers_after"]}
'
+ f'
', unsafe_allow_html=True
+ )
+ m3.markdown(
+ f''
+ f'
Δ{corr_max:.3f}
'
+ f'
Max Corr. Shift
'
+ f'
{ICONS[corr_verd]} {corr_verd.capitalize()}
'
+ f'
', unsafe_allow_html=True
+ )
+ m4.markdown(
+ f''
+ f'
{skew_val:.3f}
'
+ f'
Skewness
'
+ f'
{ICONS[skew_verd]} {"Low" if abs(skew_val)<=1 else "Moderate" if abs(skew_val)<=3 else "High"} skew
'
+ f'
', unsafe_allow_html=True
+ )
-elif step == STEPS[5]:
- st.markdown('🛠 Step 6 — Strategy & Imputation
', unsafe_allow_html=True)
- st.markdown('Based on the mechanism and missing %, select and apply the right strategy for each column.
', unsafe_allow_html=True)
-
- df = st.session_state.get("df")
- col_results = st.session_state.get("col_results", {})
-
- if df is None:
- st.warning("⚠️ Please upload a CSV in Step 1 first.")
- elif not col_results:
- st.warning("⚠️ Please run diagnostics in Step 5 for at least one column first.")
- else:
- df_imputed = (df.copy() if st.session_state.get("df_imputed") is None
- else st.session_state["df_imputed"].copy())
-
- for col, res in col_results.items():
- mechanism = res["mechanism"]
- miss_pct = res["miss_pct"]
- dtype_str = res["dtype"]
-
- st.markdown(f"### Column: `{col}`")
- st.markdown(f"**Mechanism:** {mechanism} | **Missing:** {miss_pct:.1f}% | **Type:** `{dtype_str}`")
-
- rec = recommend_strategy(mechanism, miss_pct, dtype_str)
- card_class = "card-mcar" if mechanism == "MCAR" else "card-mar" if mechanism == "MAR" else "card-mnar"
- st.markdown(
- f''
- f'Recommended: {rec["method"]}
'
- f'{rec["reason"]}
'
- f'{rec["adv"]}
'
- f'{rec["disadv"]}'
- f'
',
- unsafe_allow_html=True,
- )
+ st.markdown("---")
+
+ # ── KDE Plots — Two clear separate charts ──
+ st.markdown("### 📈 Distribution Comparison (KDE)")
- if rec["add_indicator"]:
- st.markdown(
- '🚩 Missing Indicator will be added BEFORE imputation — '
- "missingness itself carries signal for this column.
",
- unsafe_allow_html=True,
- )
-
- is_num = "float" in dtype_str or "int" in dtype_str
- strategy_options = (
- ["Mean", "Median", "Constant (0)", "Drop rows", "Keep as-is"] if is_num
- else ["Mode", "Constant ('Unknown')", "Drop rows", "Keep as-is"]
- )
- chosen = st.selectbox(
- f"Apply strategy for `{col}`:",
- options=strategy_options,
- key=f"strategy_{col}",
- )
+ series = df[selected_col].dropna()
+ imputed = feas["imputed_series"]
+ miss_pct_col = df[selected_col].isnull().mean() * 100
- if st.button(f"▶ Apply to `{col}`", key=f"apply_{col}"):
- if rec["add_indicator"]:
- indicator_col = f"{col}_was_missing"
- df_imputed[indicator_col] = df[col].isnull().astype(int)
- st.info(f"✅ Created indicator column: `{indicator_col}`")
-
- if chosen == "Mean":
- fill_val = df[col].mean()
- df_imputed[col] = df_imputed[col].fillna(fill_val)
- st.success(f"✅ Imputed with mean = {fill_val:.4f}")
- elif chosen == "Median":
- fill_val = df[col].median()
- df_imputed[col] = df_imputed[col].fillna(fill_val)
- st.success(f"✅ Imputed with median = {fill_val:.4f}")
- elif chosen == "Mode":
- fill_val = df[col].mode().iloc[0]
- df_imputed[col] = df_imputed[col].fillna(fill_val)
- st.success(f"✅ Imputed with mode = {fill_val}")
- elif chosen in ("Constant (0)", "Constant ('Unknown')"):
- fill_val = 0 if is_num else "Unknown"
- df_imputed[col] = df_imputed[col].fillna(fill_val)
- st.success(f"✅ Imputed with constant = {fill_val}")
- elif chosen == "Drop rows":
- before = len(df_imputed)
- df_imputed = df_imputed.dropna(subset=[col])
- after = len(df_imputed)
- st.success(f"✅ Dropped {before - after} rows with missing `{col}`")
- else:
- st.info("No imputation applied.")
+ fig, axes = plt.subplots(1, 2, figsize=(16, 5))
+ fig.patch.set_facecolor('#fafafa')
- st.session_state["df_imputed"] = df_imputed
-
- st.markdown("
", unsafe_allow_html=True)
+        # Plot 1: Overlapping KDE of the observed values vs. the imputed series.
+        ax = axes[0]
+        ax.set_facecolor('#f8f8f8')
+        try:
+            from scipy.stats import gaussian_kde
+            # Evaluate BOTH densities before drawing anything, so that a failure
+            # on either one leaves the axes untouched for the histogram fallback
+            # (instead of stacking histograms on top of a half-drawn KDE).
+            x_range = np.linspace(min(series.min(), imputed.min()), max(series.max(), imputed.max()), 300)
+            dens_orig = gaussian_kde(series.values, bw_method='scott')(x_range)
+            dens_imp = gaussian_kde(imputed.values, bw_method='scott')(x_range)
+        except Exception:
+            # Deliberate best-effort: gaussian_kde can fail (e.g. on constant
+            # data); fall back to normalised histograms rather than crash.
+            ax.hist(series.values, bins=25, alpha=0.5, color='#17172b', label='Original', density=True)
+            ax.hist(imputed.values, bins=25, alpha=0.4, color='#d6336c', label=f'After {impute_choice}', density=True)
+        else:
+            # Original KDE
+            ax.fill_between(x_range, dens_orig, alpha=0.35, color='#17172b', label='Original (observed only)')
+            ax.plot(x_range, dens_orig, color='#17172b', lw=2.5)
+
+            # Imputed KDE
+            ax.fill_between(x_range, dens_imp, alpha=0.35, color='#d6336c', label=f'After {impute_choice}')
+            ax.plot(x_range, dens_imp, color='#d6336c', lw=2.5, linestyle='--')
+
+        ax.set_title(f'KDE: Original vs After {impute_choice}\n({miss_pct_col:.1f}% was missing)', fontsize=13, fontweight='bold', pad=12)
+        ax.set_xlabel(selected_col, fontsize=11)
+        ax.set_ylabel('Density', fontsize=11)
+        ax.legend(fontsize=10)
+        ax.grid(axis='y', alpha=0.3)
+        ax.spines[['top','right']].set_visible(False)
+
+        # Plot 2: Box plots side by side
+        ax2 = axes[1]
+        ax2.set_facecolor('#f8f8f8')
+        bp = ax2.boxplot(
+            [series.values, imputed.values],
+            patch_artist=True,
+            widths=0.5,
+            medianprops=dict(color='#d6336c', linewidth=2.5),
+            flierprops=dict(marker='o', markerfacecolor='#d6336c', markersize=5, alpha=0.5),
+            whiskerprops=dict(linewidth=1.5),
+            capprops=dict(linewidth=1.5),
+        )
+        # Label the two boxes on the axis rather than through boxplot(labels=...):
+        # that keyword was renamed to tick_labels in Matplotlib 3.9 and the old
+        # spelling is deprecated, so this form works on both sides of the rename.
+        ax2.set_xticklabels(['Original\n(non-missing)', f'After\n{impute_choice}'])
+        bp['boxes'][0].set_facecolor('#c8d8f0')
+        bp['boxes'][1].set_facecolor('#f5c6d0')
- st.markdown("### 📥 Download Imputed Dataset")
- df_out = st.session_state.get("df_imputed", df)
- csv_bytes = df_out.to_csv(index=False).encode("utf-8")
- st.download_button(
- label="⬇ Download imputed CSV",
- data=csv_bytes,
- file_name="imputed_dataset.csv",
- mime="text/csv",
+ # Annotate variance change
+ ax2.set_title(
+ f'Spread & Outliers\nVariance Change: {var_pct:.1f}% | New Outliers: +{new_out}',
+ fontsize=13, fontweight='bold', pad=12
)
- st.dataframe(df_out.head(10), use_container_width=True)
+ ax2.set_ylabel('Value', fontsize=11)
+ ax2.grid(axis='y', alpha=0.3)
+ ax2.spines[['top','right']].set_visible(False)
+
+ plt.tight_layout(pad=2.5)
+ st.pyplot(fig, use_container_width=True)
+ plt.close()
+
+ # ── Correlation Details ──
+ st.markdown("---")
+ st.markdown("#### 🔗 Correlation Preservation Details")
+ st.markdown(f'{ICONS[corr_verd]} {feas["correlation"]["msg"]}
', unsafe_allow_html=True)
+ if feas["correlation"]["details"]:
+ rows = [{
+ "Feature": f,
+ "r (before)": r["r_before"],
+ "r (after)": r["r_after"],
+ "Δ (shift)": r["delta"],
+ "Sign Flip?": "🚨 YES" if r["sign_flip"] else "No"
+ } for f, r in feas["correlation"]["details"].items()]
+ corr_df = pd.DataFrame(rows).sort_values("Δ (shift)", ascending=False)
+ def highlight_corr(row):
+ if row["Sign Flip?"] == "🚨 YES": return ["background-color:#fde8e8; color:#900000"] * len(row)
+ if row["Δ (shift)"] > 0.10: return ["background-color:#fff0ed; color:#900000"] * len(row)
+ return [""] * len(row)
+ st.dataframe(corr_df.style.apply(highlight_corr, axis=1), use_container_width=True, hide_index=True)
# ════════════════════════════════════════════════════════════════════
-# STEP 7 — VALIDATION CHECKS
+# STEP 5 — FINAL REPORT
# ════════════════════════════════════════════════════════════════════
+def render_step5():
+ st.markdown('📋 Step 5 — Final Diagnostic Report
', unsafe_allow_html=True)
+
+ df, target = st.session_state.get("df_train"), st.session_state.get("target_col")
+ col_diag = st.session_state.get("col_diagnostics", {})
+ if not col_diag: return st.warning("⚠️ Run diagnostics in Step 3 first.")
+
+ # ── Legend ──
+ with st.expander("📖 How to read the Recommended Strategy column"):
+ st.markdown("""
+ | Label | Meaning |
+ |-------|---------|
+ | **Drop Rows** | MCAR + <5% missing — safe to delete affected rows |
+ | **Drop Column** | >70% missing — too little data to impute reliably |
+ | **Mean Imputation** | Low-skew numeric, variance loss is acceptable |
+ | **Median Imputation** | Skewed numeric; median is more robust than mean |
+ | **Mode Imputation** | Categorical / non-numeric columns |
+ | **KNN Imputer** | Moderate missingness; feature relationships preserved |
+ | **MICE Imputer** | High missingness (>30%); multiple-imputation approach |
+ | **+ Missing Indicator** | Added when mechanism is MNAR, or MAR ≥ 10% missing — add a binary flag column `col_missing` alongside imputed values |
+ """)
+
+    # Rank columns by their NUMERIC missing percentage before it is rendered as
+    # text. Sorting the formatted "12.3%" strings would be lexicographic, so
+    # e.g. "9.0%" would be listed above "12.3%".
+    ranked = sorted(col_diag.items(), key=lambda item: item[1]["miss_pct"], reverse=True)
+
+    table_rows = []
+    for col, res in ranked:
+        rec_string = get_auto_recommendation(df, col, target, res["mechanism"], res["miss_pct"], res["dtype"])
+        table_rows.append({
+            "Column": col,
+            "dtype": res["dtype"],
+            "Missing %": f'{res["miss_pct"]:.1f}%',
+            "Mechanism": res["mechanism"],
+            "Confidence": res["confidence"],
+            "Recommended Strategy": rec_string
+        })
+
+    # Rows are already in descending missing-% order; no string sort needed.
+    report_df = pd.DataFrame(table_rows)
+
+    def color_rows(row):
+        """Return one CSS style per cell, tinting the whole row by its mechanism."""
+        mech_colors = {
+            "MNAR": "background-color:#fff0ed; color:#000",
+            "MAR": "background-color:#fffaeb; color:#000",
+            "MCAR": "background-color:#edfaf3; color:#000"
+        }
+        # Unknown mechanisms fall back to an empty style (no tint).
+        return [mech_colors.get(row["Mechanism"], "")] * len(row)
+
+    st.dataframe(
+        report_df.style.apply(color_rows, axis=1),
+        use_container_width=True,
+        hide_index=True
+    )
-elif step == STEPS[6]:
- st.markdown('✅ Step 7 — Validation Checks
', unsafe_allow_html=True)
- st.markdown('Confirm that imputation preserved statistical properties and did not introduce bias.
', unsafe_allow_html=True)
-
- df_orig = st.session_state.get("df")
- df_imputed = st.session_state.get("df_imputed")
- col_results = st.session_state.get("col_results", {})
-
- if df_orig is None or df_imputed is None:
- st.warning("⚠️ Complete Steps 1–6 first.")
- elif not col_results:
- st.warning("⚠️ Run diagnostics in Step 5 and apply a strategy in Step 6 first.")
- else:
- numeric_cols = [c for c in col_results if pd.api.types.is_numeric_dtype(df_orig[c])]
-
- if not numeric_cols:
- st.info("Validation checks apply to numeric columns only. No numeric columns were diagnosed.")
- else:
- for col in numeric_cols:
- before = df_orig[col].dropna()
- after = df_imputed[col].dropna()
-
- if len(after) == 0 or len(before) == 0:
- continue
-
- st.markdown(f"### `{col}`")
- chk = validation_checks(before, after)
-
- c1, c2, c3 = st.columns(3)
- def chk_icon(ok): return "✅" if ok else "⚠️"
- with c1:
- st.markdown(
- f''
- f'
{chk_icon(chk["mean_ok"])} {chk["mean_shift_pct"]}%
'
- f'
Mean shift (≤5% OK)
'
- f'
', unsafe_allow_html=True)
- with c2:
- st.markdown(
- f''
- f'
{chk_icon(chk["median_ok"])} {chk["median_shift_pct"]}%
'
- f'
Median shift (≤3% OK)
'
- f'
', unsafe_allow_html=True)
- with c3:
- st.markdown(
- f''
- f'
{chk_icon(chk["var_ok"])} {chk["var_change_pct"]}%
'
- f'
Variance change (≤20% OK)
'
- f'
', unsafe_allow_html=True)
-
- fig_v, ax_v = plt.subplots(figsize=(8, 3.5))
- ax_v.hist(before.values, bins=30, alpha=0.55, label="Before imputation", color="#17172b", edgecolor="white")
- ax_v.hist(after.values, bins=30, alpha=0.55, label="After imputation", color="#6020a0", edgecolor="white")
- ax_v.axvline(before.mean(), color="#17172b", linewidth=1.5, linestyle="--", label=f"Mean before: {before.mean():.2f}")
- ax_v.axvline(after.mean(), color="#6020a0", linewidth=1.5, linestyle="--", label=f"Mean after: {after.mean():.2f}")
- ax_v.set_title(f"Distribution: '{col}' before vs after imputation")
- ax_v.legend(fontsize=8)
- plt.tight_layout()
- st.pyplot(fig_v); plt.close()
-
- target_col = st.session_state.get("target_col")
- if target_col and target_col in df_orig.columns and pd.api.types.is_numeric_dtype(df_orig[target_col]):
- corr_before = df_orig[[col, target_col]].dropna().corr().iloc[0, 1]
- corr_after = df_imputed[[col, target_col]].dropna().corr().iloc[0, 1]
- delta = abs(corr_before - corr_after)
- sign_flip = (corr_before * corr_after < 0)
- icon = "✅" if delta <= 0.05 and not sign_flip else "⚠️"
- st.markdown(
- f'{icon} Correlation with target: '
- f'Before = {corr_before:.3f} → After = {corr_after:.3f} | Δ = {delta:.3f}'
- + (" 🚨 Sign flipped!" if sign_flip else "")
- + "
",
- unsafe_allow_html=True,
- )
-
- st.markdown("
", unsafe_allow_html=True)
-
- st.markdown("### ⚠️ Common Pitfalls Checklist")
- pitfalls = [
- "Each column treated independently?",
- "Imputation done AFTER train-test split?",
- "Target variable NOT used as imputation predictor?",
- "Missing indicator created BEFORE imputation for MNAR/MAR ≥10%?",
- "Validation checked beyond just accuracy?",
- ]
- for txt in pitfalls:
- st.checkbox(txt, value=False, key=f"pitfall_{txt[:20]}")
-
- st.markdown(
- ''
- '↻ Repeat Steps 5–6 for every column independently.
'
- 'One column may be MCAR (drop rows), another MAR (KNN), another MNAR (indicator + median). '
- 'Never apply one method to all columns at once.'
- '
',
- unsafe_allow_html=True,
- )
-
-st.markdown("---")
-st.caption("🔬 Missing Value Intelligence Suite · Merged from app.py + app_tanisha.py · Built with Streamlit, pandas, scikit-learn, scipy, seaborn")
\ No newline at end of file
+ # ── Summary counts ──
+ st.markdown("---")
+ c1, c2, c3 = st.columns(3)
+ mcar_n = sum(1 for r in col_diag.values() if r["mechanism"] == "MCAR")
+ mar_n = sum(1 for r in col_diag.values() if r["mechanism"] == "MAR")
+ mnar_n = sum(1 for r in col_diag.values() if r["mechanism"] == "MNAR")
+ c1.markdown(f'', unsafe_allow_html=True)
+ c2.markdown(f'', unsafe_allow_html=True)
+ c3.markdown(f'', unsafe_allow_html=True)
+
+
+# Route the selected sidebar step to its renderer: the first label in STEPS
+# that equals `step` wins, and at most one renderer is called per run.
+for _label, _render in zip(STEPS, (render_step1, render_step2, render_step3, render_step4, render_step5)):
+    if step == _label:
+        _render()
+        break