diff --git "a/src/streamlit_app.py" "b/src/streamlit_app.py" --- "a/src/streamlit_app.py" +++ "b/src/streamlit_app.py" @@ -1,1497 +1,770 @@ """ -Missing Value Intelligence Suite — Merged App -Combines the stepwise pipeline (app.py) with the comprehensive dashboard (app_tanisha.py) -into a unified 7-step workflow. +Missing Value Analyzer — Statistically Rigorous Pipeline +========================================================= +Phases: + 1 Upload CSV & Train/Test Split + 2 Missing Value Overview (train set only) + 3 Per-Column Diagnostics (Tables for all tests) + 4 Imputation Feasibility Gate (KDE plots, Variance %, New Outliers) + 5 Final Report & Recommendations """ import streamlit as st import pandas as pd import numpy as np +import matplotlib +matplotlib.use("Agg") import matplotlib.pyplot as plt -import matplotlib.patches as mpatches import seaborn as sns from scipy import stats -from scipy.stats import chi2_contingency, ks_2samp, shapiro, skew, kurtosis -from sklearn.preprocessing import LabelEncoder, StandardScaler -from sklearn.linear_model import LogisticRegression +from scipy.stats import chi2_contingency, ttest_ind, norm, chi2 from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.impute import KNNImputer +from sklearn.experimental import enable_iterative_imputer +from sklearn.impute import IterativeImputer import warnings warnings.filterwarnings("ignore") # ─────────────────────────── Page config ──────────────────────────── st.set_page_config( - page_title="Missing Value Intelligence Suite", + page_title="Missing Value Analyzer", page_icon="🔬", layout="wide", initial_sidebar_state="expanded", ) -# ─────────────────────────── Custom CSS ───────────────────────────── +# ─────────────────────────── CSS ──────────────────────────────────── st.markdown(""" -""", unsafe_allow_html=True) +section[data-testid="stSidebar"]{background:#17172b;} +section[data-testid="stSidebar"] *{color:#ffffff !important;} +section[data-testid="stSidebar"] hr{border-color:#ffffff33 !important;} +.main-title{font-size:2rem;font-weight:700;color:#17172b;margin-bottom:.2rem;} +.main-sub{font-size:1rem;color:#6060a0;margin-bottom:1.5rem;} -# ════════════════════════════════════════════════════════════════════ -# SHARED HELPER FUNCTIONS -# ════════════════════════════════════════════════════════════════════ +.metric-box{background:#f5f3ee;border-radius:8px;padding:12px 16px;text-align:center;margin-bottom:8px;} +.metric-val{font-size:1.4rem;font-weight:700;color:#17172b !important;} +.metric-lbl{font-size:.78rem;color:#6060a0 !important;margin-top:2px;} -def missing_summary_df(df: pd.DataFrame) -> pd.DataFrame: - total = len(df) - counts = df.isnull().sum() - pct = counts / total * 100 - summary = pd.DataFrame({ - "Missing Count": counts, - "Missing %": pct.round(2), - "Dtype": df.dtypes.astype(str), - }) - return summary[summary["Missing Count"] > 0].sort_values("Missing %", ascending=False) - - -def missing_summary_typed(df, num_cols, cat_cols): - rows = [] - for col in df.columns: - mc = df[col].isnull().sum() - pct = mc / len(df) * 100 - dtype = "Numerical" if col in num_cols else "Categorical" - rows.append({"Column": col, "Data Type": dtype, - "Missing Count": mc, "Missing %": round(pct, 2)}) - result = pd.DataFrame(rows).sort_values("Missing %", ascending=False).reset_index(drop=True) - return result[result["Missing Count"] > 0].reset_index(drop=True) - - -def severity(pct): - if pct < 5: return "Low" - if pct < 20: return "Moderate" - return "High" - - -def identify_columns(df): - num_cols = df.select_dtypes(include=[np.number]).columns.tolist() - cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist() - return num_cols, cat_cols - - -def missingness_risk_level(pct: float) -> tuple: - if pct <= 5: - return "≤5%", "Very low missingness. Low risk of bias.", "#edfaf3", "#0d6b3a" - elif pct <= 15: - return "5–15%", "Moderate. Imputation preferred over dropping.", "#fffaeb", "#7a4d00" - elif pct <= 30: - return "15–30%", "High. Dropping loses too much data. Advanced imputation + missing indicator mandatory.", "#fff0ed", "#9e2210" - else: - return ">30%", "Very high. Consider dropping the column. Re-evaluate column usefulness + domain check.", "#fde8e8", "#7a0000" - - -# ── Statistical Tests (from app.py) ────────────────────────────────── - -def test1_pattern_analysis(df: pd.DataFrame, col: str) -> dict: - indicator = df[col].isnull().astype(int) - miss_pct = indicator.mean() * 100 - runs = (indicator != indicator.shift()).sum() - max_possible_runs = min(len(indicator) * 2, len(indicator[indicator == 1]) * 2 + 1) - cluster_ratio = runs / max(max_possible_runs, 1) - scattered = cluster_ratio > 0.5 - return { - "indicator": indicator, - "miss_pct": miss_pct, - "scattered": scattered, - "cluster_ratio": cluster_ratio, - "signal": "MCAR signal" if scattered else "MAR / MNAR signal (clustered rows)", - } +.big-stat-box{border-radius:12px;padding:20px 24px;text-align:center;margin-bottom:8px;} +.big-stat-val{font-size:2.2rem;font-weight:800;margin-bottom:4px;} +.big-stat-lbl{font-size:.82rem;font-weight:500;opacity:0.8;text-transform:uppercase;letter-spacing:.05em;} +.big-stat-sub{font-size:.78rem;opacity:0.65;margin-top:4px;} +.stat-ok{background:#edfaf3;border:2px solid #89d9ac;} +.stat-ok .big-stat-val{color:#0a5c30;} +.stat-warn{background:#fffaeb;border:2px solid #f0cc7a;} +.stat-warn .big-stat-val{color:#7a4f00;} +.stat-fail{background:#fff0ed;border:2px solid #f5a898;} +.stat-fail .big-stat-val{color:#900000;} -def test2_feature_dependency(df: pd.DataFrame, col: str) -> dict: - missing_mask = df[col].isnull() - if missing_mask.sum() < 5 or (~missing_mask).sum() < 5: - return {"diffs": {}, "max_diff": 0.0, "signal": "Insufficient data"} +.card-mcar{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;} +.card-mar {background:#fffaeb;border:2px solid #f0cc7a;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;} +.card-mnar{background:#fff0ed;border:2px solid #f5a898;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;} +.card-info{background:#eef2ff;border:2px solid #bdc8f5;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;} +.card-warn{background:#fff8e1;border:2px solid #ffe082;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;} +.card-danger{background:#fde8e8;border:2px solid #f5a8a8;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;} +.card-ok{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;} - diffs = {} - for other_col in df.columns: - if other_col == col: - continue - try: - miss_vals = df.loc[missing_mask, other_col].dropna() - obs_vals = df.loc[~missing_mask, other_col].dropna() - if len(miss_vals) < 3 or len(obs_vals) < 3: - continue - if pd.api.types.is_numeric_dtype(df[other_col]): - m1, m2 = miss_vals.mean(), obs_vals.mean() - denom = max(abs(m2), 1e-9) - diff_pct = abs(m1 - m2) / denom * 100 - diffs[other_col] = diff_pct - else: - ct = pd.crosstab( - pd.concat([pd.Series(["missing"] * len(miss_vals)), - pd.Series(["present"] * len(obs_vals))]), - pd.concat([miss_vals, obs_vals]) - ) - chi2, _, _, _ = chi2_contingency(ct) - n = ct.values.sum() - k = min(ct.shape) - 1 - cramers_v = np.sqrt(chi2 / (n * max(k, 1))) * 100 - diffs[other_col] = cramers_v - except Exception: - continue +.card-mcar *, .card-mar *, .card-mnar *, .card-info *, .card-warn *, .card-danger *, .card-ok * {color: #1a1a2e !important;} - if not diffs: - return {"diffs": {}, "max_diff": 0.0, "signal": "No comparable features"} +.verdict-label{font-size:1.1rem;font-weight:700;margin-bottom:4px;} +.verdict-desc{font-size:.88rem;color:#333 !important;} - max_diff = max(diffs.values()) - if max_diff < 5: - signal = "Weak signal — MCAR likely" - elif max_diff < 30: - signal = "Strong MAR signal (feature dependency detected)" - else: - signal = "Very strong dependency — MAR or MNAR" +code{background:#e8e8eb;padding:2px 6px;border-radius:4px;font-size:.85rem; color:#d6336c !important;} +hr.divider{border:none;border-top:2px solid #e0ddd8;margin:1.5rem 0;} - return {"diffs": diffs, "max_diff": max_diff, "signal": signal} +.theory-box {background:#fafafa; border-left:4px solid #4f8ef7; border-radius:4px; padding:12px 18px; margin-bottom:16px;} +.theory-box h4 {color:#17172b; margin-bottom:6px; font-size:1.05rem;} +.theory-box p {color:#444; font-size:0.92rem; line-height:1.5;} +.stat-highlight { font-size: 1.2rem; font-weight: bold; color: #d6336c; background: #ffe4e1; padding: 2px 8px; border-radius: 4px;} -def test3_target_dependency(df: pd.DataFrame, col: str, target_col: str) -> dict: - missing_mask = df[col].isnull() - if missing_mask.sum() < 5 or (~missing_mask).sum() < 5: - return {"diff_pct": None, "signal": "Insufficient data"} +.test-header{font-size:1.05rem;font-weight:700;color:#17172b;margin:18px 0 8px;} + +""", unsafe_allow_html=True) + + +# ════════════════════════════════════════════════════════════════════ +# SESSION STATE INIT +# ═════════════════════════════════════════════════���══════════════════ +defaults = {"df_full": None, "df_train": None, "df_test": None, "target_col": None, "split_ratio": 0.8, "col_diagnostics": {}} +for k, v in defaults.items(): + if k not in st.session_state: st.session_state[k] = v - try: - miss_target = df.loc[missing_mask, target_col].dropna() - obs_target = df.loc[~missing_mask, target_col].dropna() +# ════════════════════════════════════════════════════════════════════ +# STATISTICAL TEST HELPERS +# ════════════════════════════════════════════════════════════════════ +def littles_mcar_test(df: pd.DataFrame, cols_with_missing: list) -> dict: + numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() + chi2_total, df_total = 0.0, 0 + for col in cols_with_missing: + if col not in numeric_cols: continue + missing_mask = df[col].isnull() + if missing_mask.sum() < 5 or (~missing_mask).sum() < 5: continue + for other in numeric_cols: + if other == col: continue + g1, g2 = df.loc[missing_mask, other].dropna(), df.loc[~missing_mask, other].dropna() + if len(g1) < 3 or len(g2) < 3: continue + grand_mean, grand_var = df[other].mean(), df[other].var() + if grand_var < 1e-12: continue + chi2_total += (len(g1)*(g1.mean() - grand_mean)**2 + len(g2)*(g2.mean() - grand_mean)**2) / grand_var + df_total += 1 + if df_total == 0: return {"chi2": None, "p_value": None, "verdict": "Insufficient numeric data"} + p_val = 1 - chi2.cdf(chi2_total, df_total) + verdict = f"Fail to reject MCAR" if p_val >= 0.05 else f"Reject MCAR" + return {"chi2": round(chi2_total, 4), "df": df_total, "p_value": round(p_val, 4), "verdict": verdict, "reject_mcar": p_val < 0.05} + +def feature_dependency_tests(df: pd.DataFrame, col: str) -> dict: + missing_mask = df[col].isnull() + if missing_mask.sum() < 5 or (~missing_mask).sum() < 5: return {"results": {}, "n_significant": 0, "signal": "Insufficient data"} + results = {} + for other in df.columns: + if other == col: continue + g_miss, g_obs = df.loc[missing_mask, other].dropna(), df.loc[~missing_mask, other].dropna() + if len(g_miss) < 3 or len(g_obs) < 3: continue + try: + if pd.api.types.is_numeric_dtype(df[other]): + n1, n2 = len(g_miss), len(g_obs) + if min(n1, n2) >= 30: + se = np.sqrt(g_miss.var()/n1 + g_obs.var()/n2) + if se < 1e-12: continue + z_stat = (g_miss.mean() - g_obs.mean()) / se + p_val = 2 * (1 - norm.cdf(abs(z_stat))) + test_name, stat = "z-test", round(z_stat, 4) + else: + t_stat, p_val = ttest_ind(g_miss, g_obs, equal_var=False) + test_name, stat = "Welch t-test", round(t_stat, 4) + results[other] = {"test": test_name, "stat": stat, "p_value": round(p_val, 4), "significant": p_val < 0.05, "type": "numeric"} + else: + ct = pd.crosstab(missing_mask.astype(int), df[other]) + if ct.shape[0] < 2 or ct.shape[1] < 2: continue + chi2_stat, p_val, _, _ = chi2_contingency(ct) + results[other] = {"test": "chi²", "stat": round(chi2_stat, 4), "p_value": round(p_val, 4), "significant": p_val < 0.05, "type": "categorical"} + except Exception: continue + n_sig = sum(1 for r in results.values() if r["significant"]) + sig_pct = n_sig / max(len(results), 1) * 100 + signal = "No features differ significantly" if sig_pct == 0 else f"{n_sig}/{len(results)} features differ (p<0.05)" + return {"results": results, "n_significant": n_sig, "total_tested": len(results), "sig_pct": round(sig_pct, 1), "signal": signal} + +def target_dependency_test(df: pd.DataFrame, col: str, target_col: str) -> dict: + missing_mask = df[col].isnull() + if missing_mask.sum() < 5 or (~missing_mask).sum() < 5: return {"p_value": None, "signal": "Insufficient data", "significant": False} + try: + g_miss, g_obs = df.loc[missing_mask, target_col].dropna(), df.loc[~missing_mask, target_col].dropna() if pd.api.types.is_numeric_dtype(df[target_col]): - m1, m2 = miss_target.mean(), obs_target.mean() - denom = max(abs(m2), 1e-9) - diff_pct = abs(m1 - m2) / denom * 100 + n1, n2 = len(g_miss), len(g_obs) + if min(n1, n2) >= 30: + se = np.sqrt(g_miss.var()/n1 + g_obs.var()/n2) + if se < 1e-12: return {"p_value": None, "signal": "Zero variance", "significant": False} + z_stat = (g_miss.mean() - g_obs.mean()) / se + p_val = 2 * (1 - norm.cdf(abs(z_stat))) + else: + _, p_val = ttest_ind(g_miss, g_obs, equal_var=False) + diff_pct = abs(g_miss.mean() - g_obs.mean()) / max(abs(g_obs.mean()), 1e-9) * 100 else: - p1 = miss_target.value_counts(normalize=True).iloc[0] * 100 - p2 = obs_target.value_counts(normalize=True).iloc[0] * 100 + ct = pd.crosstab(missing_mask.astype(int), df[target_col]) + _, p_val, _, _ = chi2_contingency(ct) + p1, p2 = g_miss.value_counts(normalize=True).iloc[0]*100, g_obs.value_counts(normalize=True).iloc[0]*100 diff_pct = abs(p1 - p2) - if diff_pct < 5: - signal = "No strong signal (<5% target diff)" - elif diff_pct < 10: - signal = "Moderate target dependency — possible MAR/MNAR" - else: - signal = "Strong target dependency → MNAR likely (>10% target diff)" - - return {"diff_pct": round(diff_pct, 2), "signal": signal} - except Exception as e: - return {"diff_pct": None, "signal": f"Could not compute: {e}"} - + sig = p_val < 0.05 + signal = f"Not significant (p={p_val:.4f})" if not sig else f"Significant — target differs by {diff_pct:.1f}%" + return {"p_value": round(p_val, 4), "significant": sig, "diff_pct": round(diff_pct, 2), "signal": signal} + except Exception as e: return {"p_value": None, "signal": f"Error: {e}", "significant": False} + +def classify_mechanism(t_feat, t_target, little): + tgt_sig, tgt_diff = t_target.get("significant", False), t_target.get("diff_pct", 0) + sig_pct = t_feat.get("sig_pct", 0) + + if tgt_sig and tgt_diff >= 10: return "MNAR", "High", "Missingness strongly correlates with the outcome." + elif tgt_sig and tgt_diff >= 5: return "MNAR", "Moderate", "Moderate dependency on target. Treat conservatively as MNAR." + elif sig_pct > 30: return "MAR", "High", "Strong dependency on observed features detected." + elif sig_pct > 0: return "MAR", "Moderate", "Weak but present dependency on observed features." + elif little.get("reject_mcar"): return "MAR", "Low", "Little's test rejects MCAR, but feature tests show weak dependency." + else: return "MCAR", "High", "No statistical evidence of systematic missingness." + +def run_single_diagnostic(df, col, target_col): + little, t_feat = littles_mcar_test(df, [col]), feature_dependency_tests(df, col) + t_target = {"p_value": None, "significant": False, "signal": "Skipped (Is Target)", "diff_pct": 0} if col == target_col else target_dependency_test(df, col, target_col) + mech, conf, expl = classify_mechanism(t_feat, t_target, little) + st.session_state["col_diagnostics"][col] = { + "mechanism": mech, "confidence": conf, "explanation": expl, + "miss_pct": round(df[col].isnull().mean()*100, 2), + "dtype": str(df[col].dtype), + "little": little, "t_feat": t_feat, "t_target": t_target + } -def classify_mechanism(t1: dict, t2: dict, t3: dict) -> tuple: - feat_dep = t2.get("max_diff", 0) - tgt_dep = t3.get("diff_pct") or 0 - scattered = t1.get("scattered", True) - if tgt_dep > 10: - return "MNAR", "High", ( - f"Target variable differs by {tgt_dep:.1f}% between missing/present rows. " - "The probability of missingness depends on the unobserved value itself." - ) - elif feat_dep >= 10 and not scattered: - return "MAR", "High", ( - f"Feature distributions differ by up to {feat_dep:.1f}% and missing values appear " - "clustered — missingness depends on observed features." - ) - elif feat_dep >= 5: - return "MAR", "Moderate", ( - f"Feature distributions differ by up to {feat_dep:.1f}%. " - "Missingness likely depends on observed features." - ) - elif scattered and feat_dep < 5 and tgt_dep < 5: - return "MCAR", "High", ( - "Values appear randomly scattered, feature distributions are similar across " - "groups, and target shows no dependency — consistent with MCAR." - ) +# ════════════════════════════════════════════════════════════════════ +# IMPUTATION SIMULATION HELPERS +# ════════════════════════════════════════════════════════════════════ +def feasibility_checks(df: pd.DataFrame, col: str, target_col: str, impute_method: str) -> dict: + series = df[col].dropna() + if len(series) < 5 or not pd.api.types.is_numeric_dtype(df[col]): + return {"applicable": False} + + results = {"applicable": True, "escalate_to_knn": False, "reasons": []} + + # ── 1. Impute ── + if impute_method == "Mean": imputed_series = df[col].fillna(series.mean()) + elif impute_method == "Median": imputed_series = df[col].fillna(series.median()) else: - return "MCAR", "Low", ( - "Weak signals across all three tests. Treated as MCAR but verify with domain knowledge." - ) - - -# ── Logistic Regression-based mechanism diagnosis (from app_tanisha.py) ── - -def diagnose_mechanism_lr(df, col, num_cols): - miss_mask = df[col].isnull().astype(int) - predictors = [c for c in df.columns if c != col and df[c].isnull().mean() < 0.9] - if not predictors or miss_mask.sum() < 5: - return "MNAR", "Insufficient data to test; assumed MNAR." - mcar_p_vals = [] - for p in predictors: - if p in num_cols and df[p].dropna().nunique() > 1: - try: - binned = pd.qcut(df[p].fillna(df[p].median()), q=4, duplicates="drop", labels=False) - ct = pd.crosstab(binned, miss_mask) - if ct.shape[0] > 1 and ct.shape[1] > 1: - _, p_val, _, _ = chi2_contingency(ct) - mcar_p_vals.append(p_val) - except Exception: - pass - if mcar_p_vals and np.mean(mcar_p_vals) > 0.05: - return "MCAR", (f"Chi-square tests show no significant dependency " - f"(avg p={np.mean(mcar_p_vals):.3f} > 0.05). Missingness appears random.") - try: - X_pred = df[predictors].copy() - for c in X_pred.select_dtypes(include="object").columns: - X_pred[c] = X_pred[c].astype("category").cat.codes - X_pred = X_pred.fillna(X_pred.median(numeric_only=True)) - scaler = StandardScaler() - X_scaled = scaler.fit_transform(X_pred) - lr = LogisticRegression(max_iter=300, solver="lbfgs") - lr.fit(X_scaled, miss_mask) - score = lr.score(X_scaled, miss_mask) - baseline = max(miss_mask.mean(), 1 - miss_mask.mean()) - if score > baseline + 0.05: - return "MAR", (f"Logistic Regression predicts missingness with accuracy {score:.2%} " - f"(baseline {baseline:.2%}). Missingness is related to observed variables.") - except Exception: - pass - return "MNAR", "Missingness not explained by observed data. Likely related to the missing value itself — assumed MNAR." - - -def recommend_strategy(mechanism: str, miss_pct: float, dtype: str) -> dict: - is_num = "float" in dtype or "int" in dtype - add_indicator = (mechanism == "MNAR") or (mechanism == "MAR" and miss_pct >= 10) - - if mechanism == "MCAR" and miss_pct <= 5: - method = "Drop rows" - reason = "MCAR confirmed and loss is minimal (≤5%). Safe to drop." - adv = "✓ No artificial data introduced" - disadv = "✗ Loses data — only safe at very low %" - elif mechanism in ("MCAR", "MAR") and miss_pct <= 15: - if is_num: - method = "Median imputation" - reason = "Low-moderate missingness. Median is robust to skew and outliers." - adv = "✓ Outlier-resistant; recommended default for numeric" - disadv = "✗ Reduces variance slightly" - else: - method = "Mode imputation" - reason = "Low-moderate missingness on categorical data." - adv = "✓ Preserves category structure" - disadv = "✗ Can over-represent dominant category" - elif mechanism == "MAR" and miss_pct <= 30: - method = "KNN Imputation" if is_num else "Mode / KNN Imputation" - reason = "Moderate MAR missingness. KNN leverages feature relationships." - adv = "✓ Preserves local patterns; captures inter-feature structure" - disadv = "✗ Slow on large datasets; requires scaling" - elif mechanism == "MAR" and miss_pct > 30: - method = "Iterative Imputer (MICE)" - reason = "High MAR missingness. MICE models each column as a function of others." - adv = "✓ Most statistically principled; accounts for all feature relationships" - disadv = "✗ Computationally expensive; risk of instability" - elif mechanism == "MNAR": - method = "Median + Missing Indicator (mandatory)" - reason = "MNAR: the fact of missingness is informative. Indicator must be created BEFORE imputation." - adv = "✓ Preserves MNAR signal; lets model learn from missingness" - disadv = "✗ Imputation may still be biased; domain expertise required" + numeric_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c != target_col] + X_num = df[numeric_cols].copy() + try: + scaler = StandardScaler() + X_scaled = pd.DataFrame(scaler.fit_transform(X_num), columns=X_num.columns) + imputer = KNNImputer(n_neighbors=5) if impute_method == "KNN" else IterativeImputer(random_state=42, max_iter=10) + X_imputed_scaled = pd.DataFrame(imputer.fit_transform(X_scaled), columns=X_num.columns) + X_imputed = pd.DataFrame(scaler.inverse_transform(X_imputed_scaled), columns=X_num.columns) + imputed_series = X_imputed[col] + except Exception: + imputed_series = df[col].fillna(series.median()) + + results["imputed_series"] = imputed_series + + # ��─ 2. Skewness & Outliers ── + skew = series.skew() + Q1_b, Q3_b = series.quantile(0.25), series.quantile(0.75) + IQR_b = Q3_b - Q1_b + outliers_before = ((series < Q1_b - 1.5*IQR_b) | (series > Q3_b + 1.5*IQR_b)).sum() + + Q1_a, Q3_a = imputed_series.quantile(0.25), imputed_series.quantile(0.75) + IQR_a = Q3_a - Q1_a + outliers_after = ((imputed_series < Q1_a - 1.5*IQR_a) | (imputed_series > Q3_a + 1.5*IQR_a)).sum() + new_outliers = max(0, outliers_after - outliers_before) + + if impute_method == "Mean": + skew_verdict = "fail" if abs(skew) > 1 else "ok" + elif impute_method == "Median": + skew_verdict = "warn" if abs(skew) > 3 else "ok" + else: + skew_verdict = "ok" + + results["skewness"] = {"verdict": skew_verdict, "value": skew, "msg": f"Skewness = {skew:.3f}"} + + if new_outliers > (len(series) * 0.05): + out_verdict = "warn" else: - method = "Consider dropping column" - reason = f"Missing > 30% with {mechanism}. Evaluate predictive value vs. cost of imputation." - adv = "✓ Eliminates noise if column is uninformative" - disadv = "✗ Irreversible — verify with domain expert first" - - return { - "method": method, - "reason": reason, - "adv": adv, - "disadv": disadv, - "add_indicator": add_indicator, - } - + out_verdict = "ok" -def strategy_chips_html(mech, miss_pct, col_type): - chips = [] - if mech == "CLEAN": - return '✅ No action needed — column is complete' - if miss_pct > 50: - chips.append(("⚠ Consider Dropping Column (>50% missing)", "chip-red")) - if mech == "MCAR": - if miss_pct < 5: - chips.append(("Listwise Deletion (safe)", "chip-green")) - chips.append(("Median Imputation" if col_type == "Numerical" else "Mode Imputation", "chip-green")) - if mech == "MAR": - chips.append(("KNN Imputation", "chip-blue")) - chips.append(("Iterative Imputer (MICE)", "chip-blue")) - chips.append(("Group-wise Imputation", "chip-blue")) - if miss_pct >= 10: - chips.append(("Create Missing Indicator (≥10% MAR)", "chip-yellow")) - if mech == "MNAR": - chips.append(("⚠ Create Missing Indicator FIRST (mandatory)", "chip-red")) - chips.append(("Constant / Domain-Specific Value", "chip-yellow")) - chips.append(("Sensitivity Analysis Required", "chip-yellow")) - return " ".join(f'{lbl}' for lbl, cls in chips) - - -def validation_checks(df_before: pd.Series, df_after: pd.Series) -> dict: - m_shift = abs(df_before.mean() - df_after.mean()) / max(abs(df_before.mean()), 1e-9) * 100 - med_shift = abs(df_before.median() - df_after.median()) / max(abs(df_before.median()), 1e-9) * 100 - var_change = abs(df_before.var() - df_after.var()) / max(df_before.var(), 1e-9) * 100 - return { - "mean_shift_pct": round(m_shift, 2), - "median_shift_pct": round(med_shift, 2), - "var_change_pct": round(var_change, 2), - "mean_ok": m_shift <= 5, - "median_ok": med_shift <= 3, - "var_ok": var_change <= 20, + results["outliers"] = { + "verdict": out_verdict, + "new_outliers": new_outliers, + "outliers_before": outliers_before, + "outliers_after": outliers_after } + # ── 3. Variance Impact ── + var_before = series.var() + var_after = imputed_series.var() + var_drop_pct = (var_before - var_after) / var_before * 100 if var_before > 1e-12 else 0 -# ── Outlier & Variance helpers (from app_tanisha.py) ────────────────── - -def detect_outliers_iqr(series): - s = series.dropna() - if len(s) < 4: return 0 - Q1, Q3 = s.quantile(0.25), s.quantile(0.75) - IQR = Q3 - Q1 - return int(((s < Q1 - 1.5 * IQR) | (s > Q3 + 1.5 * IQR)).sum()) - - -def variance_impact(series): - s = series.dropna() - if len(s) < 2: return 0.0, 0.0, 0.0 - var_before = float(s.var()) - var_after = float(series.fillna(s.mean()).var()) - return round(var_before, 4), round(var_after, 4), round(var_before - var_after, 4) - - -def stat_card(label, value, color="#1a1a2e"): - return (f'
' - f'
{value}
' - f'
{label}
') - - -# ── Plot helpers ────────────────────────────────────────────────────── - -def plot_missing_heatmap(df): - missing_cols = [c for c in df.columns if df[c].isnull().any()] - if not missing_cols: - return None - sorted_cols = sorted(missing_cols, key=lambda c: df[c].isnull().mean(), reverse=True) - sample_size = min(300, len(df)) - df_s = df[sorted_cols].sample(n=sample_size, random_state=42) if len(df) > sample_size else df[sorted_cols] - mask_df = df_s.isnull().astype(int) - fig, ax = plt.subplots(figsize=(max(10, len(sorted_cols) * 0.7), 5)) - sns.heatmap(mask_df.T, cmap=["#f5f3ee", "#17172b"], cbar=True, - yticklabels=sorted_cols, xticklabels=False, linewidths=0, ax=ax) - ax.set_title(f"Missing Value Heatmap — sample of {sample_size} rows", fontsize=13, fontweight="bold", pad=12) - ax.set_xlabel("Rows (observations)", fontsize=10) - ax.set_ylabel("Columns", fontsize=10) - plt.tight_layout() - return fig - - -def plot_missingness_correlation(df): - missing_cols = [c for c in df.columns if df[c].isnull().any()] - if len(missing_cols) < 2: - return None - miss_bin = df[missing_cols].isnull().astype(int) - corr = miss_bin.corr() - fig, ax = plt.subplots(figsize=(max(7, len(missing_cols) * 0.9), max(6, len(missing_cols) * 0.8))) - mask = np.triu(np.ones_like(corr, dtype=bool)) - sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", center=0, - mask=mask, square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax) - ax.set_title("Missingness Correlation Matrix", fontsize=13, fontweight="bold", pad=12) - plt.tight_layout() - return fig - - -def plot_numerical_column(df, col): - s_original = df[col].dropna() - s_imputed = df[col].fillna(s_original.mean()) - fig, axes = plt.subplots(1, 2, figsize=(16, 6)) - fig.suptitle(f"Deep Distribution Analysis — {col}", fontsize=14, fontweight="bold") - sns.kdeplot(s_original, ax=axes[0], color="#4f8ef7", linewidth=3, - label="Original (Before)", fill=True, alpha=0.2) - sns.kdeplot(s_imputed, ax=axes[0], color="#e07b54", linewidth=3, - label="Mean Imputed (After)", linestyle="--") - axes[0].set_title("Distribution Shift: Original vs. Imputed", fontsize=12) - axes[0].legend() - box_data = pd.DataFrame({ - "Value": pd.concat([s_original, s_imputed]), - "Type": ["Original"] * len(s_original) + ["Imputed"] * len(s_imputed), - }) - sns.boxplot(data=box_data, x="Type", y="Value", ax=axes[1], palette=["#dce3ff", "#fce4d6"]) - axes[1].set_title("Variance & Outlier Comparison", fontsize=12) - plt.tight_layout() - return fig - - -def plot_categorical_column(df, col, top_n=10): - s_original = df[col].dropna() - s_imputed = df[col].fillna(s_original.mode()[0] if not s_original.empty else "N/A") - fig, axes = plt.subplots(1, 2, figsize=(16, 7)) - fig.suptitle(f"Categorical Frequency Analysis — {col}", fontsize=14, fontweight="bold") - orig_counts = s_original.value_counts().head(top_n) - imp_counts = s_imputed.value_counts().head(top_n) - compare_df = pd.DataFrame({"Original": orig_counts, "Imputed (Mode)": imp_counts}).fillna(0) - compare_df.plot(kind="barh", ax=axes[0], color=["#4f8ef7", "#e07b54"], width=0.8) - axes[0].set_title(f"Top {top_n} Categories: Original vs Mode Imputed", fontsize=12) - axes[0].invert_yaxis() - top_pie = imp_counts.head(8) - axes[1].pie(top_pie, labels=top_pie.index.astype(str), autopct="%1.1f%%", - startangle=140, colors=plt.cm.Pastel1.colors, wedgeprops={"edgecolor": "white"}) - axes[1].set_title("Final Proportion (After Imputation)", fontsize=12) - plt.tight_layout() - return fig - - -def plot_missing_vs_features(df, col): - num_others = [c for c in df.select_dtypes(include=[np.number]).columns - if c != col and df[c].isnull().mean() < 0.95] - if not num_others: - return None - means_present = df[df[col].notna()][num_others].mean() - means_missing = df[df[col].isnull()][num_others].mean() - diff_df = pd.DataFrame({"Present": means_present, "Missing": means_missing}).dropna().head(12) - if diff_df.empty: - return None - fig, ax = plt.subplots(figsize=(max(8, len(diff_df) * 0.9), 4)) - x = np.arange(len(diff_df)); w = 0.35 - ax.bar(x - w/2, diff_df["Present"], w, label="Present rows", color="#4f8ef7", alpha=0.85) - ax.bar(x + w/2, diff_df["Missing"], w, label="Missing rows", color="#e07b54", alpha=0.85) - ax.set_xticks(x) - ax.set_xticklabels(diff_df.index, rotation=35, ha="right", fontsize=9) - ax.set_title(f"Feature Means — Rows where '{col}' is Present vs Missing", - fontsize=11, fontweight="bold") - ax.set_ylabel("Mean value") - ax.legend(fontsize=9) - plt.tight_layout() - return fig - - -def render_per_column_deep_analysis(df, col, num_cols, cat_cols, mechanism_results): - miss_count = int(df[col].isnull().sum()) - miss_pct = round(df[col].isnull().mean() * 100, 2) - total_rows = len(df) - present = total_rows - miss_count - col_type = "Numerical" if col in num_cols else "Categorical" - mech_info = mechanism_results.get(col, {}) - mech = mech_info.get("mechanism", "N/A") - mech_reason = mech_info.get("reason", "Run the global diagnosis section above first.") - sev = severity(miss_pct) if miss_pct > 0 else "None" - - miss_color = "#dc2626" if miss_pct >= 20 else "#d97706" if miss_pct >= 5 else "#16a34a" - sev_color = "#dc2626" if sev == "High" else "#d97706" if sev == "Moderate" else "#16a34a" - mech_color = {"MCAR": "#155724", "MAR": "#856404", "MNAR": "#721c24"}.get(mech, "#444") - - st.markdown(f"#### 🔍 Deep Analysis — `{col}`  ·  {col_type}", unsafe_allow_html=True) - m1, m2, m3, m4, m5 = st.columns(5) - with m1: st.markdown(stat_card("Total Rows", f"{total_rows:,}"), unsafe_allow_html=True) - with m2: st.markdown(stat_card("Present", f"{present:,}"), unsafe_allow_html=True) - with m3: st.markdown(stat_card("Missing", f"{miss_pct}%", miss_color), unsafe_allow_html=True) - with m4: st.markdown(stat_card("Severity", sev, sev_color), unsafe_allow_html=True) - with m5: st.markdown(stat_card("Mechanism", mech, mech_color), unsafe_allow_html=True) - st.markdown("") - - if col_type == "Numerical": - s = df[col].dropna() - if len(s) > 1: - col_skew = float(skew(s)) - col_kurt = float(kurtosis(s)) - Q1, Q3 = float(s.quantile(0.25)), float(s.quantile(0.75)) - IQR = Q3 - Q1 - n_out = detect_outliers_iqr(df[col]) - vb, va, vi = variance_impact(df[col]) - out_pct = n_out / max(len(s), 1) - - r1 = st.columns(4) - for (lbl, val), col_ui in zip( - [("Mean", f"{s.mean():.4g}"), ("Median", f"{s.median():.4g}"), - ("Std Dev", f"{s.std():.4g}"), ("Variance", f"{s.var():.4g}")], r1): - with col_ui: st.markdown(stat_card(lbl, val), unsafe_allow_html=True) - st.markdown("") - - r2 = st.columns(4) - for (lbl, val), col_ui in zip( - [("Min", f"{s.min():.4g}"), ("Max", f"{s.max():.4g}"), - ("Skewness", f"{col_skew:.3f}"), ("Kurtosis", f"{col_kurt:.3f}")], r2): - with col_ui: st.markdown(stat_card(lbl, val), unsafe_allow_html=True) - st.markdown("") - - r3 = st.columns(4) - out_color = "#dc2626" if out_pct > 0.15 else "#d97706" if out_pct > 0.05 else "#16a34a" - for (lbl, val, clr), col_ui in zip( - [("Q1", f"{Q1:.4g}", "#1a1a2e"), ("Q3", f"{Q3:.4g}", "#1a1a2e"), - ("IQR", f"{IQR:.4g}", "#1a1a2e"), ("Outliers (IQR)", str(n_out), out_color)], r3): - with col_ui: st.markdown(stat_card(lbl, val, clr), unsafe_allow_html=True) - - if len(s) <= 5000: - try: - _, p_norm = shapiro(s.sample(min(len(s), 5000), random_state=0)) - norm_txt = f"✅ Normal (p={p_norm:.4f})" if p_norm > 0.05 else f"⚠ Not Normal (p={p_norm:.4f})" - st.caption(f"📐 Shapiro-Wilk normality test: {norm_txt}") - except Exception: - pass - - st.markdown("") - fig_dist = plot_numerical_column(df, col) - st.pyplot(fig_dist); plt.close(fig_dist) - - st.markdown("**Variance Impact of Mean Imputation (simulated)**") - vc = st.columns(3) - delta_color = "#dc2626" if abs(vi)/max(vb,1e-9) > 0.3 else "#d97706" if abs(vi)/max(vb,1e-9) > 0.1 else "#16a34a" - with vc[0]: st.markdown(stat_card("Variance (before)", f"{vb:.4g}"), unsafe_allow_html=True) - with vc[1]: st.markdown(stat_card("Variance (after)", f"{va:.4g}"), unsafe_allow_html=True) - with vc[2]: st.markdown(stat_card("Δ Variance", f"{vi:.4g}", delta_color), unsafe_allow_html=True) - - pct_chg = abs(vi) / max(vb, 1e-9) * 100 - if pct_chg >= 30: - st.warning(f"⚠ Variance drops by {pct_chg:.1f}% after mean imputation — over-smoothing risk. Use median or model-based imputation.") - elif pct_chg >= 10: - st.info(f"ℹ Variance drops by {pct_chg:.1f}% — acceptable, but monitor distribution shape.") - else: - st.success(f"✅ Variance change is small ({pct_chg:.1f}%) — mean imputation is statistically safe here.") - else: - s = df[col].dropna() - n_unique = s.nunique() - mode_val = str(s.mode().iloc[0]) if len(s) > 0 else "N/A" - mode_cnt = int((s == s.mode().iloc[0]).sum()) if len(s) > 0 else 0 - mode_pct = round(mode_cnt / max(len(s), 1) * 100, 1) - - r1 = st.columns(4) - for (lbl, val), col_ui in zip( - [("Unique Values", n_unique), ("Mode", mode_val[:12]), - ("Mode Count", f"{mode_cnt:,}"), ("Mode Freq %", f"{mode_pct}%")], r1): - with col_ui: st.markdown(stat_card(lbl, str(val)), unsafe_allow_html=True) - - st.markdown("") - freq_table = s.value_counts().reset_index() - freq_table.columns = ["Value", "Count"] - freq_table["% of Present"] = (freq_table["Count"] / len(s) * 100).round(2) - tab_chart, tab_table = st.tabs(["📊 Frequency Chart", "📋 Frequency Table"]) - with tab_chart: - fig_cat = plot_categorical_column(df, col) - st.pyplot(fig_cat); plt.close(fig_cat) - with tab_table: - st.dataframe(freq_table, use_container_width=True, hide_index=True) - - st.markdown("") - if miss_count > 0: - st.markdown("**How Missingness Relates to Other Features**") - fig_pat = plot_missing_vs_features(df, col) - if fig_pat: - st.pyplot(fig_pat); plt.close(fig_pat) - st.caption("Large differences between blue (present) and orange (missing) bars signal MAR behavior.") - else: - st.info("No other numerical features available for pattern comparison.") - - st.markdown("") - verdict_cls = {"MCAR": "card-mcar", "MAR": "card-mar", "MNAR": "card-mnar"}.get(mech, "card-info") - mech_icon = {"MCAR": "🟢", "MAR": "🟡", "MNAR": "🔴"}.get(mech, "✅") - mech_label = {"MCAR": "Missing Completely At Random (MCAR)", - "MAR": "Missing At Random (MAR)", - "MNAR": "Missing Not At Random (MNAR)", - "N/A": "No Missing Values"}.get(mech, mech) - - st.markdown( - f'
{mech_icon} {mech_label}
' - f'{mech_reason}
', - unsafe_allow_html=True) - - chips_html = strategy_chips_html(mech, miss_pct, col_type) - if chips_html: - st.markdown("") - st.markdown("**Recommended Strategies**") - st.markdown(chips_html, unsafe_allow_html=True) - - pointer = { - "MCAR": ("📍 **MCAR**: Missing% <5% → listwise deletion is safe. 5–15% → median/mode imputation. " - "15–30% → advanced imputation with missing indicator."), - "MAR": ("📍 **MAR**: KNN / MICE preferred. Create a missing indicator if missing% ≥10%."), - "MNAR": ("📍 **MNAR**: **Create the missing indicator FIRST**, then use constant or sensitivity analysis. " - "Domain knowledge is essential."), - "N/A": "📍 No action needed — this column is complete. Proceed to feature engineering.", - }.get(mech, "") - if pointer: - st.markdown("") - st.info(pointer) + if var_drop_pct <= 10: var_verdict, var_msg = "ok", f"Variance Change: {var_drop_pct:.1f}%" + elif var_drop_pct <= 20: var_verdict, var_msg = "warn", f"Variance Change: {var_drop_pct:.1f}%" + else: var_verdict, var_msg = "fail", f"Variance Change: {var_drop_pct:.1f}%" + results["variance"] = {"verdict": var_verdict, "msg": var_msg, "var_drop_pct": var_drop_pct} -# ════════════════════════════════════════════════════════════════════ -# SIDEBAR — NAVIGATION -# ════════════════════════════════════════════════════════════════════ + # ── 4. Correlation Preservation ── + numeric_others = [c for c in df.select_dtypes(include=[np.number]).columns if c != col and c != target_col] + corr_results, max_corr_shift, sign_flip = {}, 0.0, False -STEPS = [ - "1 · Upload CSV", - "2 · Select Target Column", - "3 · Overview & Patterns", - "4 · Mechanism Dashboard", - "5 · Column Diagnostics", - "6 · Strategy & Imputation", - "7 · Validation Checks", -] + for other in numeric_others[:10]: + s_before = df[[col, other]].dropna() + if len(s_before) < 5: continue + r_before = s_before[col].corr(s_before[other]) + r_after = imputed_series.corr(df[other]) + + delta = abs(r_before - r_after) + flipped = (r_before * r_after < 0) and (abs(r_before) > 0.1) + + corr_results[other] = {"r_before": round(r_before, 4), "r_after": round(r_after, 4), "delta": round(delta, 4), "sign_flip": flipped} + max_corr_shift = max(max_corr_shift, delta) + if flipped: sign_flip = True -with st.sidebar: - st.markdown("## 🔬 Missing Value Intelligence Suite") - st.markdown("---") - st.markdown("**Navigation**") - step = st.radio("Go to step:", STEPS, label_visibility="collapsed") - st.markdown("---") - st.markdown( - "Follow the steps in order for a complete analysis pipeline. " - "Steps 3–4 are exploratory; Steps 5–7 form the diagnostic pipeline.", - unsafe_allow_html=True, - ) + if max_corr_shift <= 0.05 and not sign_flip: corr_verdict, corr_msg = "ok", f"Max Δ = {max_corr_shift:.3f} — Correlation well preserved" + elif sign_flip: corr_verdict, corr_msg = "fail", f"Sign flip detected! Correlation direction reversed." + elif max_corr_shift <= 0.10: corr_verdict, corr_msg = "warn", f"Max Δ = {max_corr_shift:.3f} — Moderate correlation shift" + else: corr_verdict, corr_msg = "fail", f"Max Δ = {max_corr_shift:.3f} — Large correlation shift detected" + results["correlation"] = {"details": corr_results, "verdict": corr_verdict, "msg": corr_msg, "max_shift": round(max_corr_shift, 4)} -# ════════════════════════════════════════════════════════════════════ -# SESSION STATE -# ════════════════════════════════════════════════════════════════════ + return results -for key in ["df", "target_col", "col_results", "df_imputed", "mechanism_results_lr"]: - if key not in st.session_state: - st.session_state[key] = None -if st.session_state["col_results"] is None: - st.session_state["col_results"] = {} -if st.session_state["mechanism_results_lr"] is None: - st.session_state["mechanism_results_lr"] = {} +def get_auto_recommendation(df, col, target, mechanism, miss_pct, dtype): + """Determine best imputation strategy with explicit labeling.""" + needs_indicator = (mechanism == "MNAR") or (mechanism == "MAR" and miss_pct >= 10) + indicator_suffix = " + Missing Indicator" if needs_indicator else "" + # High missingness — always flag + if miss_pct > 70: + return f"Drop Column" -# ════════════════════════════════════════════════════════════════════ -# STEP 1 — UPLOAD CSV -# ════════════════════════════════════════════════════════════════════ + if mechanism == "MCAR" and miss_pct <= 5: + return "Drop Rows" -if step == STEPS[0]: - st.markdown('
📂 Step 1 — Upload Your CSV
', unsafe_allow_html=True) - st.markdown('
Upload a CSV file to begin the missing-value analysis pipeline.
', unsafe_allow_html=True) + # Categorical / non-numeric + if not pd.api.types.is_numeric_dtype(df[col]): + return f"Mode Imputation{indicator_suffix}" - uploaded = st.file_uploader("Choose a CSV file", type=["csv"]) + # Numeric: run quick feasibility to decide + feas_med = feasibility_checks(df, col, target, "Median") + if not feas_med.get("applicable"): + return f"Median Imputation{indicator_suffix}" - if uploaded: - try: - df = pd.read_csv(uploaded) - # Auto-remove ID-like columns - id_cols = [c for c in df.columns if c.strip().lower() in ("id", "index", "row", "rowid", "row_id")] - if id_cols: - df.drop(columns=id_cols, inplace=True) - st.toast(f"Auto-removed non-informative column(s): {id_cols}", icon="🗑️") - - st.session_state["df"] = df - st.session_state["col_results"] = {} - st.session_state["mechanism_results_lr"] = {} - st.session_state["df_imputed"] = df.copy() - - st.success(f"✅ File loaded: **{uploaded.name}** — {df.shape[0]} rows × {df.shape[1]} columns") - st.markdown("### Preview (first 10 rows)") - st.dataframe(df.head(10), use_container_width=True) - - c1, c2, c3, c4 = st.columns(4) - with c1: - st.markdown(f'
{df.shape[0]:,}
Rows
', unsafe_allow_html=True) - with c2: - st.markdown(f'
{df.shape[1]}
Columns
', unsafe_allow_html=True) - with c3: - n_miss_cols = df.isnull().any().sum() - st.markdown(f'
{n_miss_cols}
Columns w/ Missings
', unsafe_allow_html=True) - with c4: - total_miss = df.isnull().sum().sum() - pct_miss = round(total_miss / df.size * 100, 1) - st.markdown(f'
{pct_miss}%
Overall Missing Rate
', unsafe_allow_html=True) - - st.markdown("### Column Types & Missingness") - type_df = pd.DataFrame({ - "Column": df.columns, - "Dtype": df.dtypes.astype(str).values, - "Missing": df.isnull().sum().values, - "Missing %": (df.isnull().mean() * 100).round(2).values, - }) - st.dataframe(type_df, use_container_width=True, hide_index=True) - - except Exception as e: - st.error(f"Could not read file: {e}") + var_ok = feas_med["variance"]["var_drop_pct"] <= 20 + corr_ok = feas_med["correlation"]["verdict"] != "fail" + skew_val = abs(feas_med["skewness"].get("value", 0)) + + if var_ok and corr_ok: + if skew_val <= 1: + return f"Mean Imputation{indicator_suffix}" + else: + return f"Median Imputation{indicator_suffix}" else: - st.info("👆 Upload a CSV to get started.") + if miss_pct > 30: + return f"MICE Imputer{indicator_suffix}" + else: + return f"KNN Imputer{indicator_suffix}" # ════════════════════════════════════════════════════════════════════ -# STEP 2 — SELECT TARGET COLUMN +# SIDEBAR NAVIGATION # ════════════════════════════════════════════════════════════════════ +STEPS = ["1 · Upload & Split", "2 · Overview", "3 · Column Diagnostics", "4 · Feasibility Gate", "5 · Final Report"] -elif step == STEPS[1]: - st.markdown('
🎯 Step 2 — Select Target Column
', unsafe_allow_html=True) - st.markdown('
The target column (y) is used in Test 3 to detect MNAR patterns and is excluded from feature analysis.
', unsafe_allow_html=True) - - df = st.session_state.get("df") - if df is None: - st.warning("⚠️ Please upload a CSV in Step 1 first.") - else: - target = st.selectbox( - "Select the output / target column:", - options=df.columns.tolist(), - index=len(df.columns) - 1, - ) - if st.button("✅ Confirm Target Column", type="primary"): - st.session_state["target_col"] = target - st.success(f"Target column set to: **{target}**") - - if st.session_state.get("target_col"): - st.info(f"Current target: **{st.session_state['target_col']}**") - tc = st.session_state["target_col"] - col_data = df[tc] - st.markdown("#### Target Column Distribution") - fig, ax = plt.subplots(figsize=(7, 3)) - if pd.api.types.is_numeric_dtype(col_data): - col_data.dropna().hist(bins=30, ax=ax, color="#17172b", edgecolor="white") - ax.set_xlabel(tc); ax.set_ylabel("Count") - else: - vc = col_data.value_counts().head(15) - vc.plot(kind="bar", ax=ax, color="#17172b") - ax.set_ylabel("Count") - ax.set_title(f"Distribution of '{tc}'") - plt.tight_layout() - st.pyplot(fig) - plt.close() +with st.sidebar: + st.markdown("## 🔬 Missing Value Analyzer") + st.markdown("---") + step = st.radio("Navigate:", STEPS, label_visibility="collapsed") + st.markdown("---") + if st.session_state.get("df_train") is not None: + st.markdown(f"**Train set:** {st.session_state['df_train'].shape[0]} rows × {st.session_state['df_train'].shape[1]} cols") + st.markdown(f"**Diagnosed:** {len(st.session_state['col_diagnostics'])} columns") + st.markdown("Analysis runs on TRAIN SET only to prevent data leakage.", unsafe_allow_html=True) # ════════════════════════════════════════════════════════════════════ -# STEP 3 — OVERVIEW & PATTERNS +# STEP 1 — UPLOAD & SPLIT # ════════════════════════════════════════════════════════════════════ +def render_step1(): + st.markdown('
📂 Step 1 — Upload CSV & Train/Test Split
', unsafe_allow_html=True) + uploaded = st.file_uploader("Choose a CSV file", type=["csv"]) + if not uploaded: return st.info("👆 Upload a CSV file to begin.") -elif step == STEPS[2]: - st.markdown('
📊 Step 3 — Overview & Patterns
', unsafe_allow_html=True) - st.markdown('
Bird\'s-eye view of missingness across the dataset, including heatmaps and co-missingness patterns.
', unsafe_allow_html=True) - - df = st.session_state.get("df") - target_col = st.session_state.get("target_col") - - if df is None: - st.warning("⚠️ Please upload a CSV in Step 1 first.") - else: - X = df.drop(columns=[target_col]) if target_col and target_col in df.columns else df - summary = missing_summary_df(X) + df = pd.read_csv(uploaded) + st.success(f"✅ Loaded **{uploaded.name}**") - if summary.empty: - st.success("🎉 No missing values found in the dataset features!") - else: - st.markdown(f"### {len(summary)} column(s) have missing values") - st.dataframe(summary.style.background_gradient(subset=["Missing %"], cmap="YlOrRd"), - use_container_width=True) - - # ── Missing % bar chart - st.markdown('
📉 Missing % per Column
', unsafe_allow_html=True) - miss_cols = summary.index.tolist() - fig_bar, ax_bar = plt.subplots(figsize=(max(7, len(miss_cols) * 0.9), 4)) - colors = ["#9e2210" if v > 30 else "#7a4d00" if v > 15 else "#0d6b3a" for v in summary["Missing %"]] - ax_bar.barh(summary.index[::-1], summary["Missing %"][::-1], color=colors[::-1], edgecolor="white") - ax_bar.axvline(5, color="#89d9ac", linewidth=1.5, linestyle="--", label="5% threshold") - ax_bar.axvline(15, color="#f0cc7a", linewidth=1.5, linestyle="--", label="15% threshold") - ax_bar.axvline(30, color="#f5a898", linewidth=1.5, linestyle="--", label="30% threshold") - ax_bar.set_xlabel("Missing %"); ax_bar.set_title("Missing % per Column") - ax_bar.legend(loc="lower right", fontsize=8) - plt.tight_layout() - st.pyplot(fig_bar) - plt.close() - - # ── Heatmap + Correlation tabs - st.markdown('
🗺 Missingness Patterns
', unsafe_allow_html=True) - tab_hm, tab_corr = st.tabs(["Missing Heatmap", "Missingness Correlation"]) - with tab_hm: - fig_hm = plot_missing_heatmap(X) - if fig_hm: - st.pyplot(fig_hm); plt.close(fig_hm) - st.caption("Dark = missing, light = present. Each column is a row.") - else: - st.info("No missing values to display.") - with tab_corr: - fig_corr = plot_missingness_correlation(X) - if fig_corr: - st.pyplot(fig_corr); plt.close(fig_corr) - st.caption("Near +1: columns tend to be missing together. Near −1: rarely missing simultaneously.") - else: - st.info("Need at least 2 columns with missing values for this chart.") - - # ── Correlation among numerical features - num_cols_x, _ = identify_columns(X) - if len(num_cols_x) >= 2: - st.markdown('
📈 Feature Correlations (Numerical)
', unsafe_allow_html=True) - valid = [c for c in num_cols_x if X[c].isnull().mean() < 1.0] - if len(valid) >= 2: - corr = X[valid].corr() - strong = (corr.abs() > 0.5) & (corr != 1.0) - if strong.any().any(): - fig_fc, ax_fc = plt.subplots(figsize=(max(8, len(valid) * 0.9), max(7, len(valid) * 0.8))) - mask = np.triu(np.ones_like(corr, dtype=bool)) - display_corr = corr.where(corr.abs() > 0.5) - sns.heatmap(display_corr, annot=False, cmap="RdYlGn", center=0, - mask=mask, square=True, linewidths=0.5, - cbar_kws={"shrink": 0.8}, ax=ax_fc, vmin=-1, vmax=1) - ax_fc.set_title("Strong Correlations (|r| > 0.5) — Numerical Features", - fontsize=13, fontweight="bold", pad=12) - plt.tight_layout() - st.pyplot(fig_fc); plt.close(fig_fc) - - # Correlation pairs table - pairs = [] - seen = set() - for i, c1 in enumerate(corr.columns): - for j, c2 in enumerate(corr.columns): - if i >= j: continue - v = corr.loc[c1, c2] - if abs(v) > 0.5: - key = tuple(sorted([c1, c2])) - if key not in seen: - seen.add(key) - pairs.append({"Column A": c1, "Column B": c2, - "Correlation": round(v, 4), - "Correlation %": f"{round(v * 100, 2)}%"}) - if pairs: - corr_table = pd.DataFrame(pairs).sort_values("Correlation", key=abs, ascending=False) - st.markdown("**Strong Correlation Pairs (|r| > 0.5)**") - st.dataframe(corr_table, use_container_width=True, hide_index=True) - else: - st.info("No strong correlations (|r| > 0.5) found among numerical features.") + col1, col2 = st.columns(2) + target = col1.selectbox("Target column (Y):", df.columns.tolist(), index=len(df.columns)-1) + split_pct = col2.slider("Train size:", 50, 95, 80, 5, format="%d%%") + if st.button("✅ Confirm & Split", type="primary"): + df_train, df_test = train_test_split(df, train_size=split_pct/100.0, random_state=42) + st.session_state.update({"df_full": df, "df_train": df_train.reset_index(drop=True), "df_test": df_test.reset_index(drop=True), "target_col": target, "col_diagnostics": {}}) + st.success("✅ Split complete!") + st.dataframe(df_train.head(), use_container_width=True) # ════════════════════════════════════════════════════════════════════ -# STEP 4 — MECHANISM DASHBOARD (from app_tanisha.py) +# STEP 2 — OVERVIEW # ════════════════════════════════════════════════════════════════════ +def render_step2(): + st.markdown('
📊 Step 2 — Missing Value Overview
', unsafe_allow_html=True) + df = st.session_state.get("df_train") + if df is None: return st.warning("⚠️ Please complete Step 1.") -elif step == STEPS[3]: - st.markdown('
🧪 Step 4 — Mechanism Dashboard
', unsafe_allow_html=True) - st.markdown('
Automated MCAR/MAR/MNAR detection via Chi-square & Logistic Regression, plus outlier/variance analysis and deep per-column exploration.
', unsafe_allow_html=True) - - df = st.session_state.get("df") - target_col = st.session_state.get("target_col") - - if df is None: - st.warning("⚠️ Please upload a CSV in Step 1 first.") - elif target_col is None: - st.warning("⚠️ Please select a target column in Step 2 first.") - else: - X = df.drop(columns=[target_col]) - y = df[target_col] - num_cols, cat_cols = identify_columns(X) - - # ── Train-test split - st.markdown('
✂️ Train-Test Split (80 / 20)
', unsafe_allow_html=True) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - sc1, sc2 = st.columns(2) - with sc1: st.markdown(f"**Training Set** — X_train: `{X_train.shape}` · y_train: `{y_train.shape}`") - with sc2: st.markdown(f"**Test Set** — X_test: `{X_test.shape}` · y_test: `{y_test.shape}`") - - # ── Mechanism diagnosis - st.markdown('
🔬 Missing Data Mechanism Diagnosis (Chi-square + Logistic Regression)
', unsafe_allow_html=True) - missing_feature_cols = [c for c in X.columns if X[c].isnull().any()] - - if not missing_feature_cols: - st.success("No missing values in feature columns — nothing to diagnose.") - mechanism_results = {} - else: - cached = st.session_state.get("mechanism_results_lr", {}) - if not cached: - with st.spinner("Running MCAR (Chi-square) and MAR (Logistic Regression) tests…"): - mechanism_results = {} - for col in missing_feature_cols: - mech, reason = diagnose_mechanism_lr(X, col, num_cols) - mechanism_results[col] = {"mechanism": mech, "reason": reason} - st.session_state["mechanism_results_lr"] = mechanism_results - else: - mechanism_results = cached - - badge_map = {"MCAR": "badge-mcar", "MAR": "badge-mar", "MNAR": "badge-mnar"} - for col, res in mechanism_results.items(): - mech = res["mechanism"] - pct = round(X[col].isnull().mean() * 100, 2) - with st.expander(f"🔎 **{col}** — {mech} | {pct}% missing"): - st.markdown(f'{mech}  {res["reason"]}', - unsafe_allow_html=True) - - # ── Outlier Detection & Variance Impact - st.markdown('
⚡ Outlier Detection & Variance Impact
', unsafe_allow_html=True) - outlier_data = {} - for col in num_cols: - n_out = detect_outliers_iqr(X[col]) - vb, va, vi = variance_impact(X[col]) - outlier_data[col] = { - "Missing %": round(X[col].isnull().mean() * 100, 2), - "Outliers (IQR)": n_out, - "Variance (before impute)": vb, - "Variance (after mean impute)": va, - "Variance Impact (Δ)": vi, - } - if outlier_data: - out_df = (pd.DataFrame(outlier_data).T.reset_index() - .rename(columns={"index": "Column"}) - .sort_values("Outliers (IQR)", ascending=False)) - - def color_outliers(val): - if isinstance(val, (int, float)): - if val > 50: return "background-color: #f8d7da; color: #721c24;" - if val > 10: return "background-color: #fff3cd; color: #856404;" - return "" - st.dataframe(out_df.style.applymap(color_outliers, subset=["Outliers (IQR)"]), - use_container_width=True, hide_index=True) - else: - st.info("No numerical columns available for outlier analysis.") - - # ── Final Diagnosis Table - st.markdown('
📋 Final Diagnosis Table
', unsafe_allow_html=True) - diag_rows = [] - for col in X.columns: - mp = round(X[col].isnull().mean() * 100, 2) - mech = mechanism_results.get(col, {}).get("mechanism", "N/A") if col in missing_feature_cols else "N/A" - diag_rows.append({ - "Column": col, "Missing %": mp, - "Mechanism": mech, "Severity": severity(mp) if mp > 0 else "None", - "Outliers": outlier_data.get(col, {}).get("Outliers (IQR)", "—"), - "Variance Impact (Δ)": outlier_data.get(col, {}).get("Variance Impact (Δ)", "—"), - }) - diag_df = pd.DataFrame(diag_rows).sort_values("Missing %", ascending=False).reset_index(drop=True) - - sev_colors = {"High": "background-color: #f8d7da; color: #721c24;", - "Moderate": "background-color: #fff3cd; color: #856404;", - "Low": "background-color: #d4edda; color: #155724;"} - mech_colors = {"MCAR": "background-color: #d4edda; color: #155724;", - "MAR": "background-color: #fff3cd; color: #856404;", - "MNAR": "background-color: #f8d7da; color: #721c24;"} - - def color_diag_row(row): - mech_style = mech_colors.get(row["Mechanism"], "") - sev_style = sev_colors.get(row["Severity"], "") - return ["", "", mech_style, sev_style, "", ""] - - st.dataframe(diag_df.style.apply(color_diag_row, axis=1), - use_container_width=True, hide_index=True) - - # ── Per-Column Deep Analysis - st.markdown('
🔬 Per-Column Deep Analysis
', unsafe_allow_html=True) - col_label_to_name = {} - for col in X.columns: - mp_l = round(X[col].isnull().mean() * 100, 1) - type_lbl = "Num" if col in num_cols else "Cat" - mech_lbl = mechanism_results.get(col, {}).get("mechanism", "—") if col in missing_feature_cols else "complete" - label = f"{col} [{type_lbl} · {mp_l}% missing · {mech_lbl}]" - col_label_to_name[label] = col - - chosen_label = st.selectbox( - "Select a column to analyse in detail:", - options=["— choose a column —"] + list(col_label_to_name.keys()), - key="deep_col_select" - ) - if chosen_label != "— choose a column —": - chosen_col = col_label_to_name[chosen_label] - with st.spinner(f"Analysing `{chosen_col}`…"): - st.markdown("---") - render_per_column_deep_analysis( - df=X, col=chosen_col, - num_cols=num_cols, cat_cols=cat_cols, - mechanism_results=mechanism_results, - ) - st.markdown("---") - - # ── Insights - st.markdown('
💡 Data Analysis Insights
', unsafe_allow_html=True) - high_miss = diag_df[diag_df["Missing %"] >= 20]["Column"].tolist() - mar_cols = diag_df[diag_df["Mechanism"] == "MAR"]["Column"].tolist() - mnar_cols = diag_df[diag_df["Mechanism"] == "MNAR"]["Column"].tolist() - high_out = [c for c in num_cols if outlier_data.get(c, {}).get("Outliers (IQR)", 0) > 10] - - insights = [ - "Missing data must be understood before any imputation or modeling to avoid biased results.", - (f"{', '.join(high_miss)} have ≥20% missing values — treat with caution or consider dropping." - if high_miss else "No columns have critically high (≥20%) missing rates — dataset quality looks reasonable."), - (f"Columns {', '.join(mar_cols)} show MAR behavior — KNN/MICE imputation is viable." - if mar_cols else "No columns confirmed MAR."), - (f"Columns {', '.join(mnar_cols)} are likely MNAR — create a missing indicator before imputing." - if mnar_cols else "No columns flagged as MNAR."), - (f"Columns {', '.join(high_out)} have many outliers — prefer median over mean imputation." - if high_out else "Outlier counts appear manageable across numerical columns."), - "Correlated missingness indicates data is likely not MCAR — jointly missing due to a common cause.", - "MCAR is rare in real-world datasets. Most missingness in practice is MAR or MNAR.", - "MNAR cannot be confirmed statistically from observed data alone — domain knowledge is essential.", - ] - st.markdown('
", - unsafe_allow_html=True) - - # ── Theory - st.markdown('
📚 Theoretical Background
', unsafe_allow_html=True) - theories = [ - ("🔵 MCAR — Missing Completely At Random", - "The probability of missingness is entirely independent of observed and unobserved data. " - "Listwise deletion is unbiased under MCAR, though it reduces sample size."), - ("🟡 MAR — Missing At Random", - "Missingness depends on observed data but not on the missing value itself. " - "Multiple imputation or FIML methods produce valid estimates under MAR."), - ("🔴 MNAR — Missing Not At Random", - "Missingness depends on the unobserved value itself. Cannot be detected from observed data. " - "Requires sensitivity analysis and domain knowledge. Ignoring MNAR produces biased results."), - ("📐 Why Chi-Square for MCAR Testing?", - "Chi-square tests independence between the binary missingness indicator and binned numeric predictors. " - "No significant association is consistent with MCAR, though this only confirms pairwise independence."), - ("🤖 Why Logistic Regression for MAR Detection?", - "LR models the binary missingness indicator as a function of all observed features. " - "Accuracy substantially above the majority-class baseline indicates MAR."), - ("📉 Why MNAR Cannot Be Confirmed Statistically", - "MNAR depends on unobserved values — data we do not have. No statistical test on observed data " - "can definitively confirm it. Domain reasoning about the data generation process is required."), - ("📦 Outliers and Their Impact on Variance", - "Outliers (>1.5×IQR) inflate variance and distort the mean. Mean imputation artificially collapses " - "variance because all missing cells receive the same central value, masking true data spread."), - ] - for title, body in theories: - st.markdown(f'

{title}

{body}

', unsafe_allow_html=True) + miss_cols = [c for c in df.columns if df[c].isnull().any()] + if not miss_cols: return st.success("🎉 No missing values!") + summary = pd.DataFrame({"Missing Count": df.isnull().sum(), "Missing %": (df.isnull().sum()/len(df)*100).round(2)}) + st.dataframe(summary[summary["Missing Count"] > 0].sort_values("Missing %", ascending=False).style.background_gradient(cmap="YlOrRd"), use_container_width=True) # ════════════════════════════════════════════════════════════════════ -# STEP 5 — COLUMN DIAGNOSTICS (from app.py — 3 statistical tests) +# STEP 3 — DIAGNOSTICS # ════════════════════════════════════════════════════════════════════ +def render_step3(): + st.markdown('
🧪 Step 3 — Per-Column Diagnostics
', unsafe_allow_html=True) + df, target = st.session_state.get("df_train"), st.session_state.get("target_col") + if df is None: return st.warning("⚠️ Please complete Step 1.") + + miss_cols = [c for c in df.columns if df[c].isnull().any()] + if not miss_cols: return st.success("🎉 No missing values.") + + col1, col2 = st.columns([1, 4]) + selected_col = col1.selectbox("Select column to view:", miss_cols) + run_single = col1.button("▶ Run Diagnostics") + run_all = col2.button("▶ Run ALL columns", type="primary") + + if run_single: + run_single_diagnostic(df, selected_col, target) + if run_all: + progress = st.progress(0, text="Running diagnostics...") + for i, c in enumerate(miss_cols): + run_single_diagnostic(df, c, target) + progress.progress((i+1)/len(miss_cols), text=f"Diagnosing: {c}") + progress.empty() + st.success(f"✅ Diagnosed {len(miss_cols)} columns.") + + if selected_col in st.session_state["col_diagnostics"]: + res = st.session_state["col_diagnostics"][selected_col] + little, t_feat, t_target = res["little"], res["t_feat"], res["t_target"] + + st.markdown("---") + + # ── Mechanism verdict card ── + card_class = {"MCAR":"card-mcar","MAR":"card-mar","MNAR":"card-mnar"}[res["mechanism"]] + emoji = {"MCAR":"🟢","MAR":"🟠","MNAR":"🔴"}[res["mechanism"]] + st.markdown( + f'
' + f'
{emoji} Mechanism: {res["mechanism"]} — {res["confidence"]} Confidence
' + f'
{res["explanation"]}
' + f'
Missing: {res["miss_pct"]}%  |  dtype: {res["dtype"]}
' + f'
', + unsafe_allow_html=True + ) -elif step == STEPS[4]: - st.markdown('
🔬 Step 5 — Column Diagnostics
', unsafe_allow_html=True) - st.markdown('
Run three independent statistical tests per column to determine the missing-data mechanism (MCAR / MAR / MNAR).
', unsafe_allow_html=True) - - df = st.session_state.get("df") - target_col = st.session_state.get("target_col") - - if df is None: - st.warning("⚠️ Please upload a CSV in Step 1 first.") - elif target_col is None: - st.warning("⚠️ Please select a target column in Step 2 first.") - else: - summary = missing_summary_df(df) - if summary.empty: - st.success("🎉 No missing values — nothing to diagnose.") - else: - miss_cols = summary.index.tolist() - selected_col = st.selectbox("Select a column to analyse:", miss_cols) - miss_pct = summary.loc[selected_col, "Missing %"] - dtype_str = str(df[selected_col].dtype) - - st.markdown(f"---") - st.markdown(f"### Analysing column: `{selected_col}`") - - lv, risk_txt, risk_bg, risk_fg = missingness_risk_level(miss_pct) - c1, c2, c3 = st.columns(3) - with c1: - st.markdown(f'
{miss_pct:.1f}%
Missing
', unsafe_allow_html=True) - with c2: - st.markdown(f'
{dtype_str}
Data Type
', unsafe_allow_html=True) - with c3: - n_miss = int(summary.loc[selected_col, "Missing Count"]) - st.markdown(f'
{n_miss:,}
Missing Rows
', unsafe_allow_html=True) - - st.markdown( - f'
' - f'{lv} Missingness — {risk_txt}
', - unsafe_allow_html=True, - ) - - # ── Test 1 - st.markdown("#### 🔬 Test 1 — Pattern Analysis (Missingness Map)") - t1 = test1_pattern_analysis(df, selected_col) - fig, axes = plt.subplots(1, 2, figsize=(12, 3)) - sample_size = min(300, len(df)) - idx_sample = df.sample(n=sample_size, random_state=42).index if len(df) > sample_size else df.index - ind_sample = t1["indicator"].loc[idx_sample] - axes[0].scatter(range(len(ind_sample)), ind_sample.values, - c=["#9e2210" if v else "#89d9ac" for v in ind_sample.values], s=8, alpha=0.8) - axes[0].set_yticks([0, 1]); axes[0].set_yticklabels(["Present", "Missing"]) - axes[0].set_title(f"Missingness Pattern ({sample_size} rows)") - axes[0].set_xlabel("Row index") - roll = t1["indicator"].rolling(50, min_periods=1).mean() - axes[1].plot(roll.values, color="#17172b", linewidth=1.2) - axes[1].set_title("Rolling Miss Rate (window=50)") - axes[1].set_xlabel("Row index"); axes[1].set_ylabel("Miss rate") - axes[1].axhline(t1["miss_pct"] / 100, color="#9e2210", linestyle="--", label="Mean miss rate") - axes[1].legend(fontsize=8) - plt.tight_layout() - st.pyplot(fig); plt.close() - - scatter_icon = "🟢" if t1["scattered"] else "🟠" - st.markdown(f'
{scatter_icon} {t1["signal"]}
Cluster ratio: {t1["cluster_ratio"]:.2f} (higher = more scattered = MCAR signal)
', unsafe_allow_html=True) - - # ── Test 2 - st.markdown("#### 🔬 Test 2 — Feature Dependency") - t2 = test2_feature_dependency(df, selected_col) - if t2["diffs"]: - top_diffs = dict(sorted(t2["diffs"].items(), key=lambda x: -x[1])[:15]) - fig2, ax2 = plt.subplots(figsize=(10, max(3, len(top_diffs) * 0.45))) - colors = ["#9e2210" if v >= 30 else "#f0a040" if v >= 10 else "#89d9ac" for v in top_diffs.values()] - ax2.barh(list(top_diffs.keys())[::-1], list(top_diffs.values())[::-1], color=colors[::-1], edgecolor="white") - ax2.axvline(5, color="#89d9ac", linewidth=1.5, linestyle="--", label="5% weak") - ax2.axvline(10, color="#f0cc7a", linewidth=1.5, linestyle="--", label="10% MAR signal") - ax2.axvline(30, color="#f5a898", linewidth=1.5, linestyle="--", label="30% strong") - ax2.set_xlabel("Distribution Difference (%)") - ax2.set_title("Feature Distribution Difference") - ax2.legend(fontsize=8) - plt.tight_layout() - st.pyplot(fig2); plt.close() - dep_icon = "🟢" if t2["max_diff"] < 5 else "🟠" if t2["max_diff"] < 30 else "🔴" - st.markdown(f'
{dep_icon} {t2["signal"]}
Max difference: {t2["max_diff"]:.1f}%
', unsafe_allow_html=True) - else: - st.info("Not enough data to compare feature distributions.") - t2 = {"diffs": {}, "max_diff": 0.0, "signal": "Insufficient data"} - - # ── Test 3 - st.markdown("#### 🔬 Test 3 — Target Dependency") - if selected_col == target_col: - st.warning("⚠️ Selected column IS the target column. Test 3 skipped.") - t3 = {"diff_pct": None, "signal": "Skipped — column is target"} - else: - t3 = test3_target_dependency(df, selected_col, target_col) - if t3["diff_pct"] is not None: - missing_mask = df[selected_col].isnull() - fig3, ax3 = plt.subplots(figsize=(7, 3.5)) - if pd.api.types.is_numeric_dtype(df[target_col]): - miss_target = df.loc[missing_mask, target_col].dropna() - obs_target = df.loc[~missing_mask, target_col].dropna() - ax3.hist(obs_target, bins=25, alpha=0.7, label="Target when present", color="#17172b", edgecolor="white") - ax3.hist(miss_target, bins=25, alpha=0.7, label="Target when missing", color="#9e2210", edgecolor="white") - ax3.set_xlabel(target_col); ax3.set_ylabel("Count") - ax3.legend() - else: - miss_target = df.loc[missing_mask, target_col].value_counts(normalize=True) * 100 - obs_target = df.loc[~missing_mask, target_col].value_counts(normalize=True) * 100 - cats = list(set(miss_target.index) | set(obs_target.index)) - x = np.arange(len(cats)) - ax3.bar(x - 0.2, [obs_target.get(c, 0) for c in cats], 0.4, label="Present", color="#17172b") - ax3.bar(x + 0.2, [miss_target.get(c, 0) for c in cats], 0.4, label="Missing", color="#9e2210") - ax3.set_xticks(x); ax3.set_xticklabels(cats, rotation=30) - ax3.set_ylabel("% of group"); ax3.legend() - ax3.set_title(f"Target ({target_col}) dist: present vs missing in '{selected_col}'") - plt.tight_layout() - st.pyplot(fig3); plt.close() - dep_icon = "🟢" if (t3["diff_pct"] or 0) < 5 else "🟠" if (t3["diff_pct"] or 0) < 10 else "🔴" - st.markdown(f'
{dep_icon} {t3["signal"]}
Target diff: {t3["diff_pct"]}%
', unsafe_allow_html=True) - else: - st.info(t3["signal"]) - - # ── Verdict - st.markdown("---") - st.markdown("### 🏁 Mechanism Verdict") - mechanism, confidence, explanation = classify_mechanism(t1, t2, t3) - card_class = {"MCAR": "card-mcar", "MAR": "card-mar", "MNAR": "card-mnar"}[mechanism] - emoji = {"MCAR": "🟢", "MAR": "🟠", "MNAR": "🔴"}[mechanism] - st.markdown( - f'
' - f'
{emoji} {mechanism} — {confidence} confidence
' - f'
{explanation}
' - f'
', - unsafe_allow_html=True, + # ══ TEST 1: Little's MCAR ══ + st.markdown('
🔬 Test 1 — Little\'s MCAR Test
', unsafe_allow_html=True) + with st.expander("ℹ️ What does this test measure?", expanded=False): + st.markdown(""" + **Little's MCAR test** checks if missingness is completely random. + - **H₀ (null):** Data is Missing Completely At Random (MCAR) + - **p ≥ 0.05:** Fail to reject → data may be MCAR + - **p < 0.05:** Reject → systematic missingness detected + """) + + little_rows = [{ + "Test": "Little's MCAR", + "χ² Statistic": little.get("chi2", "N/A"), + "Degrees of Freedom": little.get("df", "N/A"), + "p-value": little.get("p_value", "N/A"), + "Verdict": little.get("verdict", "N/A"), + "Reject MCAR?": "✅ Yes — systematic" if little.get("reject_mcar") else "❌ No — may be MCAR" + }] + st.dataframe(pd.DataFrame(little_rows), use_container_width=True, hide_index=True) + + # ══ TEST 2: Target Dependency ══ + st.markdown('
🎯 Test 2 — Target Dependency Test
', unsafe_allow_html=True) + with st.expander("ℹ️ What does this test measure?", expanded=False): + st.markdown(""" + Tests if the **target variable** has different values when this column is missing vs. observed. + - **Numeric target:** z-test or Welch t-test + - **Categorical target:** Chi-squared test + - **Significant (p<0.05) + large diff % → MNAR** (missingness depends on outcome) + """) + + tgt_rows = [{ + "Test Applied": "z-test / Welch t-test / Chi²", + "p-value": t_target.get("p_value", "N/A"), + "Target Diff %": f'{t_target.get("diff_pct", 0):.1f}%' if t_target.get("diff_pct") is not None else "N/A", + "Significant (p<0.05)?": "✅ Yes" if t_target.get("significant") else "❌ No", + "Interpretation": t_target.get("signal", "N/A") + }] + st.dataframe(pd.DataFrame(tgt_rows), use_container_width=True, hide_index=True) + + # ══ TEST 3: Feature Dependency ══ + st.markdown('
🔗 Test 3 — Feature Dependency Tests
', unsafe_allow_html=True) + with st.expander("ℹ️ What does this test measure?", expanded=False): + st.markdown(""" + For each other feature, tests if values differ **significantly** between rows where this column is missing vs. observed. + - **Numeric features:** z-test (n≥30) or Welch t-test + - **Categorical features:** Chi-squared test + - **Many significant features (>30%) → MAR** (missingness explained by observed data) + """) + + # Summary row first + summary_cols = st.columns(3) + summary_cols[0].metric("Features Tested", t_feat.get("total_tested", 0)) + summary_cols[1].metric("Significant (p<0.05)", t_feat.get("n_significant", 0)) + summary_cols[2].metric("% Significant", f'{t_feat.get("sig_pct", 0):.1f}%') + + if t_feat["results"]: + rows = [] + for f, r in t_feat["results"].items(): + rows.append({ + "Feature": f, + "Data Type": r["type"].capitalize(), + "Test Used": r["test"], + "Test Statistic": r["stat"], + "p-value": r["p_value"], + "p < 0.05?": "✅ Significant" if r["significant"] else "—" + }) + feat_df = pd.DataFrame(rows).sort_values("p-value") + + def highlight_sig(row): + if row["p < 0.05?"] == "✅ Significant": + return ["background-color:#ffe4e1; color:#900000"] * len(row) + return [""] * len(row) + + st.dataframe( + feat_df.style.apply(highlight_sig, axis=1), + use_container_width=True, + hide_index=True ) - - # Strategy chips - col_type_str = "Numerical" if pd.api.types.is_numeric_dtype(df[selected_col]) else "Categorical" - chips_html = strategy_chips_html(mechanism, miss_pct, col_type_str) - if chips_html: - st.markdown("**Recommended Strategy Options**") - st.markdown(chips_html, unsafe_allow_html=True) - - st.session_state["col_results"][selected_col] = { - "mechanism": mechanism, - "confidence": confidence, - "miss_pct": miss_pct, - "dtype": dtype_str, - "t1": t1, "t2": t2, "t3": t3, - } + else: + st.info("No feature dependency results available (insufficient data or no other columns).") + + # ══ Decision Logic Summary ══ + st.markdown('
🧠 Decision Logic Summary
', unsafe_allow_html=True) + logic_rows = [ + {"Rule Check": "Little's test rejects MCAR?", "Result": "✅ Yes" if little.get("reject_mcar") else "❌ No"}, + {"Rule Check": "Target differs significantly?", "Result": "✅ Yes" if t_target.get("significant") else "❌ No"}, + {"Rule Check": "Target diff magnitude", "Result": f'{t_target.get("diff_pct", 0):.1f}% difference'}, + {"Rule Check": "% of features with significant diff", "Result": f'{t_feat.get("sig_pct", 0):.1f}%'}, + {"Rule Check": "→ Final Mechanism", "Result": f'{res["mechanism"]} ({res["confidence"]} confidence)'}, + ] + st.dataframe(pd.DataFrame(logic_rows), use_container_width=True, hide_index=True) # ════════════════════════════════════════════════════════════════════ -# STEP 6 — STRATEGY & IMPUTATION +# STEP 4 — FEASIBILITY GATE (Interactive) # ════════════════════════════════════════════════════════════════════ +def render_step4(): + st.markdown('
⚖️ Step 4 — Imputation Feasibility Gate
', unsafe_allow_html=True) + + with st.expander("📚 Theory & Guide: Why test imputation mathematically? (Click to expand)"): + st.markdown(""" +
+

Why test imputation mathematically?

+

Single-value imputations (like filling blanks with Mean or Median) are dangerous if overused. They can:

+ +

KNN and MICE solve this by acting like mini machine-learning models — they look at other features to make an educated guess, preserving variance and correlations.

+
+ """, unsafe_allow_html=True) + + df, target = st.session_state.get("df_train"), st.session_state.get("target_col") + col_diag = st.session_state.get("col_diagnostics", {}) + if not col_diag: return st.warning("⚠️ Please run diagnostics in Step 3 first.") + + numeric_diag = {c: v for c, v in col_diag.items() if pd.api.types.is_numeric_dtype(df[c])} + if not numeric_diag: return st.info("No numeric columns available.") + + col1, col2 = st.columns([1, 2]) + selected_col = col1.selectbox("Select numeric column:", list(numeric_diag.keys())) + impute_choice = col2.radio("Simulate impact of:", ["Mean", "Median", "KNN", "MICE"], horizontal=True) + + if st.button(f"▶ Simulate {impute_choice} Imputation", type="primary"): + with st.spinner(f"Running {impute_choice} simulation (may take a moment for KNN/MICE)..."): + feas = feasibility_checks(df, selected_col, target, impute_choice) + + if not feas.get("applicable"): + return st.error("Column not applicable for numeric feasibility checks.") + + ICONS = {"ok": "✅", "warn": "⚠️", "fail": "❌"} + COLORS = {"ok": "stat-ok", "warn": "stat-warn", "fail": "stat-fail"} + + # ── Big Stats Banner ── + st.markdown("### 📊 Imputation Impact — Key Statistics") + m1, m2, m3, m4 = st.columns(4) + + var_pct = feas["variance"]["var_drop_pct"] + var_verd = feas["variance"]["verdict"] + new_out = feas["outliers"]["new_outliers"] + out_verd = feas["outliers"]["verdict"] + corr_verd = feas["correlation"]["verdict"] + corr_max = feas["correlation"]["max_shift"] + skew_val = feas["skewness"]["value"] + skew_verd = feas["skewness"]["verdict"] + + var_color = "#900000" if var_verd == "fail" else ("#7a4f00" if var_verd == "warn" else "#0a5c30") + out_color = "#900000" if out_verd == "fail" else ("#7a4f00" if out_verd == "warn" else "#0a5c30") + corr_color = "#900000" if corr_verd == "fail" else ("#7a4f00" if corr_verd == "warn" else "#0a5c30") + skew_color = "#900000" if skew_verd == "fail" else ("#7a4f00" if skew_verd == "warn" else "#0a5c30") + + m1.markdown( + f'
' + f'
-{var_pct:.1f}%
' + f'
Variance Change
' + f'
{ICONS[var_verd]} {"Safe" if var_verd=="ok" else "Caution" if var_verd=="warn" else "High Risk"}
' + f'
', unsafe_allow_html=True + ) + m2.markdown( + f'
' + f'
+{new_out}
' + f'
New Outliers Created
' + f'
{ICONS[out_verd]} Before: {feas["outliers"]["outliers_before"]} → After: {feas["outliers"]["outliers_after"]}
' + f'
', unsafe_allow_html=True + ) + m3.markdown( + f'
' + f'
Δ{corr_max:.3f}
' + f'
Max Corr. Shift
' + f'
{ICONS[corr_verd]} {corr_verd.capitalize()}
' + f'
', unsafe_allow_html=True + ) + m4.markdown( + f'
' + f'
{skew_val:.3f}
' + f'
Skewness
' + f'
{ICONS[skew_verd]} {"Low" if abs(skew_val)<=1 else "Moderate" if abs(skew_val)<=3 else "High"} skew
' + f'
', unsafe_allow_html=True + ) -elif step == STEPS[5]: - st.markdown('
🛠 Step 6 — Strategy & Imputation
', unsafe_allow_html=True) - st.markdown('
Based on the mechanism and missing %, select and apply the right strategy for each column.
', unsafe_allow_html=True) - - df = st.session_state.get("df") - col_results = st.session_state.get("col_results", {}) - - if df is None: - st.warning("⚠️ Please upload a CSV in Step 1 first.") - elif not col_results: - st.warning("⚠️ Please run diagnostics in Step 5 for at least one column first.") - else: - df_imputed = (df.copy() if st.session_state.get("df_imputed") is None - else st.session_state["df_imputed"].copy()) - - for col, res in col_results.items(): - mechanism = res["mechanism"] - miss_pct = res["miss_pct"] - dtype_str = res["dtype"] - - st.markdown(f"### Column: `{col}`") - st.markdown(f"**Mechanism:** {mechanism} | **Missing:** {miss_pct:.1f}% | **Type:** `{dtype_str}`") - - rec = recommend_strategy(mechanism, miss_pct, dtype_str) - card_class = "card-mcar" if mechanism == "MCAR" else "card-mar" if mechanism == "MAR" else "card-mnar" - st.markdown( - f'
' - f'Recommended: {rec["method"]}
' - f'{rec["reason"]}
' - f'{rec["adv"]}
' - f'{rec["disadv"]}' - f'
', - unsafe_allow_html=True, - ) + st.markdown("---") + + # ── KDE Plots — Two clear separate charts ── + st.markdown("### 📈 Distribution Comparison (KDE)") - if rec["add_indicator"]: - st.markdown( - '
🚩 Missing Indicator will be added BEFORE imputation — ' - "missingness itself carries signal for this column.
", - unsafe_allow_html=True, - ) - - is_num = "float" in dtype_str or "int" in dtype_str - strategy_options = ( - ["Mean", "Median", "Constant (0)", "Drop rows", "Keep as-is"] if is_num - else ["Mode", "Constant ('Unknown')", "Drop rows", "Keep as-is"] - ) - chosen = st.selectbox( - f"Apply strategy for `{col}`:", - options=strategy_options, - key=f"strategy_{col}", - ) + series = df[selected_col].dropna() + imputed = feas["imputed_series"] + miss_pct_col = df[selected_col].isnull().mean() * 100 - if st.button(f"▶ Apply to `{col}`", key=f"apply_{col}"): - if rec["add_indicator"]: - indicator_col = f"{col}_was_missing" - df_imputed[indicator_col] = df[col].isnull().astype(int) - st.info(f"✅ Created indicator column: `{indicator_col}`") - - if chosen == "Mean": - fill_val = df[col].mean() - df_imputed[col] = df_imputed[col].fillna(fill_val) - st.success(f"✅ Imputed with mean = {fill_val:.4f}") - elif chosen == "Median": - fill_val = df[col].median() - df_imputed[col] = df_imputed[col].fillna(fill_val) - st.success(f"✅ Imputed with median = {fill_val:.4f}") - elif chosen == "Mode": - fill_val = df[col].mode().iloc[0] - df_imputed[col] = df_imputed[col].fillna(fill_val) - st.success(f"✅ Imputed with mode = {fill_val}") - elif chosen in ("Constant (0)", "Constant ('Unknown')"): - fill_val = 0 if is_num else "Unknown" - df_imputed[col] = df_imputed[col].fillna(fill_val) - st.success(f"✅ Imputed with constant = {fill_val}") - elif chosen == "Drop rows": - before = len(df_imputed) - df_imputed = df_imputed.dropna(subset=[col]) - after = len(df_imputed) - st.success(f"✅ Dropped {before - after} rows with missing `{col}`") - else: - st.info("No imputation applied.") + fig, axes = plt.subplots(1, 2, figsize=(16, 5)) + fig.patch.set_facecolor('#fafafa') - st.session_state["df_imputed"] = df_imputed - - st.markdown("
", unsafe_allow_html=True) + # Plot 1: Overlapping KDE + ax = axes[0] + ax.set_facecolor('#f8f8f8') + try: + from scipy.stats import gaussian_kde + # Original KDE + kde_orig = gaussian_kde(series.values, bw_method='scott') + x_range = np.linspace(min(series.min(), imputed.min()), max(series.max(), imputed.max()), 300) + ax.fill_between(x_range, kde_orig(x_range), alpha=0.35, color='#17172b', label='Original (observed only)') + ax.plot(x_range, kde_orig(x_range), color='#17172b', lw=2.5) + + # Imputed KDE + kde_imp = gaussian_kde(imputed.values, bw_method='scott') + ax.fill_between(x_range, kde_imp(x_range), alpha=0.35, color='#d6336c', label=f'After {impute_choice}') + ax.plot(x_range, kde_imp(x_range), color='#d6336c', lw=2.5, linestyle='--') + except Exception: + ax.hist(series.values, bins=25, alpha=0.5, color='#17172b', label='Original', density=True) + ax.hist(imputed.values, bins=25, alpha=0.4, color='#d6336c', label=f'After {impute_choice}', density=True) + + ax.set_title(f'KDE: Original vs After {impute_choice}\n({miss_pct_col:.1f}% was missing)', fontsize=13, fontweight='bold', pad=12) + ax.set_xlabel(selected_col, fontsize=11) + ax.set_ylabel('Density', fontsize=11) + ax.legend(fontsize=10) + ax.grid(axis='y', alpha=0.3) + ax.spines[['top','right']].set_visible(False) + + # Plot 2: Box plots side by side + ax2 = axes[1] + ax2.set_facecolor('#f8f8f8') + bp = ax2.boxplot( + [series.values, imputed.values], + labels=['Original\n(non-missing)', f'After\n{impute_choice}'], + patch_artist=True, + widths=0.5, + medianprops=dict(color='#d6336c', linewidth=2.5), + flierprops=dict(marker='o', markerfacecolor='#d6336c', markersize=5, alpha=0.5), + whiskerprops=dict(linewidth=1.5), + capprops=dict(linewidth=1.5), + ) + bp['boxes'][0].set_facecolor('#c8d8f0') + bp['boxes'][1].set_facecolor('#f5c6d0') - st.markdown("### 📥 Download Imputed Dataset") - df_out = st.session_state.get("df_imputed", df) - csv_bytes = df_out.to_csv(index=False).encode("utf-8") - st.download_button( - label="⬇ Download imputed CSV", - data=csv_bytes, - file_name="imputed_dataset.csv", - mime="text/csv", + # Annotate variance change + ax2.set_title( + f'Spread & Outliers\nVariance Change: {var_pct:.1f}% | New Outliers: +{new_out}', + fontsize=13, fontweight='bold', pad=12 ) - st.dataframe(df_out.head(10), use_container_width=True) + ax2.set_ylabel('Value', fontsize=11) + ax2.grid(axis='y', alpha=0.3) + ax2.spines[['top','right']].set_visible(False) + + plt.tight_layout(pad=2.5) + st.pyplot(fig, use_container_width=True) + plt.close() + + # ── Correlation Details ── + st.markdown("---") + st.markdown("#### 🔗 Correlation Preservation Details") + st.markdown(f'
{ICONS[corr_verd]} {feas["correlation"]["msg"]}
', unsafe_allow_html=True) + if feas["correlation"]["details"]: + rows = [{ + "Feature": f, + "r (before)": r["r_before"], + "r (after)": r["r_after"], + "Δ (shift)": r["delta"], + "Sign Flip?": "🚨 YES" if r["sign_flip"] else "No" + } for f, r in feas["correlation"]["details"].items()] + corr_df = pd.DataFrame(rows).sort_values("Δ (shift)", ascending=False) + def highlight_corr(row): + if row["Sign Flip?"] == "🚨 YES": return ["background-color:#fde8e8; color:#900000"] * len(row) + if row["Δ (shift)"] > 0.10: return ["background-color:#fff0ed; color:#900000"] * len(row) + return [""] * len(row) + st.dataframe(corr_df.style.apply(highlight_corr, axis=1), use_container_width=True, hide_index=True) # ════════════════════════════════════════════════════════════════════ -# STEP 7 — VALIDATION CHECKS +# STEP 5 — FINAL REPORT # ════════════════════════════════════════════════════════════════════ +def render_step5(): + st.markdown('
📋 Step 5 — Final Diagnostic Report
', unsafe_allow_html=True) + + df, target = st.session_state.get("df_train"), st.session_state.get("target_col") + col_diag = st.session_state.get("col_diagnostics", {}) + if not col_diag: return st.warning("⚠️ Run diagnostics in Step 3 first.") + + # ── Legend ── + with st.expander("📖 How to read the Recommended Strategy column"): + st.markdown(""" + | Label | Meaning | + |-------|---------| + | **Drop Rows** | MCAR + <5% missing — safe to delete affected rows | + | **Drop Column** | >70% missing — too little data to impute reliably | + | **Mean Imputation** | Low-skew numeric, variance loss is acceptable | + | **Median Imputation** | Skewed numeric; median is more robust than mean | + | **Mode Imputation** | Categorical / non-numeric columns | + | **KNN Imputer** | Moderate missingness; feature relationships preserved | + | **MICE Imputer** | High missingness (>30%); multiple-imputation approach | + | **+ Missing Indicator** | Added when mechanism is MNAR, or MAR ≥ 10% missing — add a binary flag column `col_missing` alongside imputed values | + """) + + table_rows = [] + for col, res in col_diag.items(): + rec_string = get_auto_recommendation(df, col, target, res["mechanism"], res["miss_pct"], res["dtype"]) + table_rows.append({ + "Column": col, + "dtype": res["dtype"], + "Missing %": f'{res["miss_pct"]:.1f}%', + "Mechanism": res["mechanism"], + "Confidence": res["confidence"], + "Recommended Strategy": rec_string + }) + + report_df = pd.DataFrame(table_rows).sort_values("Missing %", ascending=False) + + def color_rows(row): + mech_colors = { + "MNAR": "background-color:#fff0ed; color:#000", + "MAR": "background-color:#fffaeb; color:#000", + "MCAR": "background-color:#edfaf3; color:#000" + } + return [mech_colors.get(row["Mechanism"], "")] * len(row) + + st.dataframe( + report_df.style.apply(color_rows, axis=1), + use_container_width=True, + hide_index=True + ) -elif step == STEPS[6]: - st.markdown('
✅ Step 7 — Validation Checks
', unsafe_allow_html=True) - st.markdown('
Confirm that imputation preserved statistical properties and did not introduce bias.
', unsafe_allow_html=True) - - df_orig = st.session_state.get("df") - df_imputed = st.session_state.get("df_imputed") - col_results = st.session_state.get("col_results", {}) - - if df_orig is None or df_imputed is None: - st.warning("⚠️ Complete Steps 1–6 first.") - elif not col_results: - st.warning("⚠️ Run diagnostics in Step 5 and apply a strategy in Step 6 first.") - else: - numeric_cols = [c for c in col_results if pd.api.types.is_numeric_dtype(df_orig[c])] - - if not numeric_cols: - st.info("Validation checks apply to numeric columns only. No numeric columns were diagnosed.") - else: - for col in numeric_cols: - before = df_orig[col].dropna() - after = df_imputed[col].dropna() - - if len(after) == 0 or len(before) == 0: - continue - - st.markdown(f"### `{col}`") - chk = validation_checks(before, after) - - c1, c2, c3 = st.columns(3) - def chk_icon(ok): return "✅" if ok else "⚠️" - with c1: - st.markdown( - f'
' - f'
{chk_icon(chk["mean_ok"])} {chk["mean_shift_pct"]}%
' - f'
Mean shift (≤5% OK)
' - f'
', unsafe_allow_html=True) - with c2: - st.markdown( - f'
' - f'
{chk_icon(chk["median_ok"])} {chk["median_shift_pct"]}%
' - f'
Median shift (≤3% OK)
' - f'
', unsafe_allow_html=True) - with c3: - st.markdown( - f'
' - f'
{chk_icon(chk["var_ok"])} {chk["var_change_pct"]}%
' - f'
Variance change (≤20% OK)
' - f'
', unsafe_allow_html=True) - - fig_v, ax_v = plt.subplots(figsize=(8, 3.5)) - ax_v.hist(before.values, bins=30, alpha=0.55, label="Before imputation", color="#17172b", edgecolor="white") - ax_v.hist(after.values, bins=30, alpha=0.55, label="After imputation", color="#6020a0", edgecolor="white") - ax_v.axvline(before.mean(), color="#17172b", linewidth=1.5, linestyle="--", label=f"Mean before: {before.mean():.2f}") - ax_v.axvline(after.mean(), color="#6020a0", linewidth=1.5, linestyle="--", label=f"Mean after: {after.mean():.2f}") - ax_v.set_title(f"Distribution: '{col}' before vs after imputation") - ax_v.legend(fontsize=8) - plt.tight_layout() - st.pyplot(fig_v); plt.close() - - target_col = st.session_state.get("target_col") - if target_col and target_col in df_orig.columns and pd.api.types.is_numeric_dtype(df_orig[target_col]): - corr_before = df_orig[[col, target_col]].dropna().corr().iloc[0, 1] - corr_after = df_imputed[[col, target_col]].dropna().corr().iloc[0, 1] - delta = abs(corr_before - corr_after) - sign_flip = (corr_before * corr_after < 0) - icon = "✅" if delta <= 0.05 and not sign_flip else "⚠️" - st.markdown( - f'
{icon} Correlation with target: ' - f'Before = {corr_before:.3f} → After = {corr_after:.3f} | Δ = {delta:.3f}' - + (" 🚨 Sign flipped!" if sign_flip else "") - + "
", - unsafe_allow_html=True, - ) - - st.markdown("
", unsafe_allow_html=True) - - st.markdown("### ⚠️ Common Pitfalls Checklist") - pitfalls = [ - "Each column treated independently?", - "Imputation done AFTER train-test split?", - "Target variable NOT used as imputation predictor?", - "Missing indicator created BEFORE imputation for MNAR/MAR ≥10%?", - "Validation checked beyond just accuracy?", - ] - for txt in pitfalls: - st.checkbox(txt, value=False, key=f"pitfall_{txt[:20]}") - - st.markdown( - '
' - '↻ Repeat Steps 5–6 for every column independently.
' - 'One column may be MCAR (drop rows), another MAR (KNN), another MNAR (indicator + median). ' - 'Never apply one method to all columns at once.' - '
', - unsafe_allow_html=True, - ) - -st.markdown("---") -st.caption("🔬 Missing Value Intelligence Suite · Merged from app.py + app_tanisha.py · Built with Streamlit, pandas, scikit-learn, scipy, seaborn") \ No newline at end of file + # ── Summary counts ── + st.markdown("---") + c1, c2, c3 = st.columns(3) + mcar_n = sum(1 for r in col_diag.values() if r["mechanism"] == "MCAR") + mar_n = sum(1 for r in col_diag.values() if r["mechanism"] == "MAR") + mnar_n = sum(1 for r in col_diag.values() if r["mechanism"] == "MNAR") + c1.markdown(f'
🟢 {mcar_n}
MCAR columns
', unsafe_allow_html=True) + c2.markdown(f'
🟠 {mar_n}
MAR columns
', unsafe_allow_html=True) + c3.markdown(f'
🔴 {mnar_n}
MNAR columns
', unsafe_allow_html=True) + + +if step == STEPS[0]: render_step1() +elif step == STEPS[1]: render_step2() +elif step == STEPS[2]: render_step3() +elif step == STEPS[3]: render_step4() +elif step == STEPS[4]: render_step5()