GS123 committed on
Commit
6dbfc11
·
verified ·
1 Parent(s): 2df2baa

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +110 -19
src/streamlit_app.py CHANGED
@@ -51,17 +51,17 @@ section[data-testid="stSidebar"] hr{border-color:#ffffff33 !important;}
51
  .metric-val{font-size:1.4rem;font-weight:700;color:#17172b !important;}
52
  .metric-lbl{font-size:.78rem;color:#6060a0 !important;margin-top:2px;}
53
 
54
- .big-stat-box{border-radius:12px;padding:20px 24px;text-align:center;margin-bottom:8px;}
55
- .big-stat-val{font-size:2.2rem;font-weight:800;margin-bottom:4px;}
56
- .big-stat-lbl{font-size:.82rem;font-weight:500;opacity:0.8;text-transform:uppercase;letter-spacing:.05em;}
57
- .big-stat-sub{font-size:.78rem;opacity:0.65;margin-top:4px;}
58
 
59
  .stat-ok{background:#edfaf3;border:2px solid #89d9ac;}
60
- .stat-ok .big-stat-val{color:#0a5c30;}
61
  .stat-warn{background:#fffaeb;border:2px solid #f0cc7a;}
62
- .stat-warn .big-stat-val{color:#7a4f00;}
63
  .stat-fail{background:#fff0ed;border:2px solid #f5a898;}
64
- .stat-fail .big-stat-val{color:#900000;}
65
 
66
  .card-mcar{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
67
  .card-mar {background:#fffaeb;border:2px solid #f0cc7a;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
@@ -384,8 +384,104 @@ def render_step2():
384
  miss_cols = [c for c in df.columns if df[c].isnull().any()]
385
  if not miss_cols: return st.success("πŸŽ‰ No missing values!")
386
 
387
- summary = pd.DataFrame({"Missing Count": df.isnull().sum(), "Missing %": (df.isnull().sum()/len(df)*100).round(2)})
388
- st.dataframe(summary[summary["Missing Count"] > 0].sort_values("Missing %", ascending=False).style.background_gradient(cmap="YlOrRd"), use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
 
390
  # ════════════════════════════════════════════════════════════════════
391
  # STEP 3 — DIAGNOSTICS
@@ -578,35 +674,30 @@ def render_step4():
578
  skew_val = feas["skewness"]["value"]
579
  skew_verd = feas["skewness"]["verdict"]
580
 
581
- var_color = "#900000" if var_verd == "fail" else ("#7a4f00" if var_verd == "warn" else "#0a5c30")
582
- out_color = "#900000" if out_verd == "fail" else ("#7a4f00" if out_verd == "warn" else "#0a5c30")
583
- corr_color = "#900000" if corr_verd == "fail" else ("#7a4f00" if corr_verd == "warn" else "#0a5c30")
584
- skew_color = "#900000" if skew_verd == "fail" else ("#7a4f00" if skew_verd == "warn" else "#0a5c30")
585
-
586
  m1.markdown(
587
  f'<div class="big-stat-box {COLORS[var_verd]}">'
588
- f'<div class="big-stat-val" style="color:{var_color}">-{var_pct:.1f}%</div>'
589
  f'<div class="big-stat-lbl">Variance Change</div>'
590
  f'<div class="big-stat-sub">{ICONS[var_verd]} {"Safe" if var_verd=="ok" else "Caution" if var_verd=="warn" else "High Risk"}</div>'
591
  f'</div>', unsafe_allow_html=True
592
  )
593
  m2.markdown(
594
  f'<div class="big-stat-box {COLORS[out_verd]}">'
595
- f'<div class="big-stat-val" style="color:{out_color}">+{new_out}</div>'
596
  f'<div class="big-stat-lbl">New Outliers Created</div>'
597
  f'<div class="big-stat-sub">{ICONS[out_verd]} Before: {feas["outliers"]["outliers_before"]} β†’ After: {feas["outliers"]["outliers_after"]}</div>'
598
  f'</div>', unsafe_allow_html=True
599
  )
600
  m3.markdown(
601
  f'<div class="big-stat-box {COLORS[corr_verd]}">'
602
- f'<div class="big-stat-val" style="color:{corr_color}">Ξ”{corr_max:.3f}</div>'
603
  f'<div class="big-stat-lbl">Max Corr. Shift</div>'
604
  f'<div class="big-stat-sub">{ICONS[corr_verd]} {corr_verd.capitalize()}</div>'
605
  f'</div>', unsafe_allow_html=True
606
  )
607
  m4.markdown(
608
  f'<div class="big-stat-box {COLORS[skew_verd]}">'
609
- f'<div class="big-stat-val" style="color:{skew_color}">{skew_val:.3f}</div>'
610
  f'<div class="big-stat-lbl">Skewness</div>'
611
  f'<div class="big-stat-sub">{ICONS[skew_verd]} {"Low" if abs(skew_val)<=1 else "Moderate" if abs(skew_val)<=3 else "High"} skew</div>'
612
  f'</div>', unsafe_allow_html=True
@@ -767,4 +858,4 @@ if step == STEPS[0]: render_step1()
767
  elif step == STEPS[1]: render_step2()
768
  elif step == STEPS[2]: render_step3()
769
  elif step == STEPS[3]: render_step4()
770
- elif step == STEPS[4]: render_step5()
 
51
  .metric-val{font-size:1.4rem;font-weight:700;color:#17172b !important;}
52
  .metric-lbl{font-size:.78rem;color:#6060a0 !important;margin-top:2px;}
53
 
54
+ .big-stat-box{border-radius:12px;padding:20px 24px;text-align:center;margin-bottom:8px;min-height:110px;}
55
+ .big-stat-val{font-size:2.0rem;font-weight:800;margin-bottom:4px;line-height:1.1;}
56
+ .big-stat-lbl{font-size:.80rem;font-weight:600;opacity:0.85;text-transform:uppercase;letter-spacing:.05em;}
57
+ .big-stat-sub{font-size:.76rem;opacity:0.65;margin-top:6px;}
58
 
59
  .stat-ok{background:#edfaf3;border:2px solid #89d9ac;}
60
+ .stat-ok .big-stat-val,.stat-ok .big-stat-lbl{color:#0a5c30 !important;}
61
  .stat-warn{background:#fffaeb;border:2px solid #f0cc7a;}
62
+ .stat-warn .big-stat-val,.stat-warn .big-stat-lbl{color:#7a4f00 !important;}
63
  .stat-fail{background:#fff0ed;border:2px solid #f5a898;}
64
+ .stat-fail .big-stat-val,.stat-fail .big-stat-lbl{color:#900000 !important;}
65
 
66
  .card-mcar{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
67
  .card-mar {background:#fffaeb;border:2px solid #f0cc7a;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
 
384
  miss_cols = [c for c in df.columns if df[c].isnull().any()]
385
  if not miss_cols: return st.success("πŸŽ‰ No missing values!")
386
 
387
+ # ── Summary table ──
388
+ summary = pd.DataFrame({
389
+ "Missing Count": df[miss_cols].isnull().sum(),
390
+ "Missing %": (df[miss_cols].isnull().sum() / len(df) * 100).round(2)
391
+ }).sort_values("Missing %", ascending=False)
392
+ st.dataframe(
393
+ summary.style.background_gradient(cmap="YlOrRd", subset=["Missing %"]),
394
+ use_container_width=True
395
+ )
396
+
397
+ st.markdown("---")
398
+
399
+ # ── Missingness Heatmap ──
400
+ st.markdown("### πŸ—ΊοΈ Missingness Heatmap")
401
+ st.caption("Each dark stripe = a missing value in that row. Aligned stripes across columns = rows missing together (MAR signal).")
402
+
403
+ fig_h, ax_h = plt.subplots(figsize=(14, max(3, len(miss_cols) * 0.6)))
404
+ fig_h.patch.set_facecolor('#f8f8f8')
405
+ ax_h.set_facecolor('#f0f0f0')
406
+
407
+ miss_matrix = df[miss_cols].isnull().astype(int)
408
+ # Subsample rows for performance if large
409
+ if len(miss_matrix) > 2000:
410
+ miss_matrix = miss_matrix.sample(2000, random_state=42).reset_index(drop=True)
411
+
412
+ ax_h.imshow(
413
+ miss_matrix.T.values,
414
+ aspect='auto',
415
+ cmap=sns.color_palette(["#f0f0f0", "#17172b"], as_cmap=True),
416
+ interpolation='none'
417
+ )
418
+ ax_h.set_yticks(range(len(miss_cols)))
419
+ ax_h.set_yticklabels(miss_cols, fontsize=10)
420
+ ax_h.set_xlabel("Row index (sampled)" if len(df) > 2000 else "Row index", fontsize=10)
421
+ ax_h.set_title("Missing Value Pattern (dark = missing)", fontsize=12, fontweight='bold', pad=10)
422
+ ax_h.spines[['top','right','bottom','left']].set_visible(False)
423
+ plt.tight_layout()
424
+ st.pyplot(fig_h, use_container_width=True)
425
+ plt.close()
426
+
427
+ st.markdown("---")
428
+
429
+ # ── Missingness Correlation Heatmap ──
430
+ st.markdown("### πŸ”— Missingness Correlation")
431
+ st.caption("Correlation between missing patterns of columns. Values near 1.0 = these columns tend to be missing in the same rows β€” strong MAR signal.")
432
+
433
+ if len(miss_cols) >= 2:
434
+ miss_indicator = df[miss_cols].isnull().astype(int)
435
+ corr_matrix = miss_indicator.corr()
436
+
437
+ fig_c, ax_c = plt.subplots(figsize=(max(6, len(miss_cols) * 1.2), max(5, len(miss_cols) * 1.0)))
438
+ fig_c.patch.set_facecolor('#f8f8f8')
439
+
440
+ mask = np.zeros_like(corr_matrix, dtype=bool)
441
+ mask[np.triu_indices_from(mask, k=1)] = True # show lower triangle only
442
+
443
+ sns.heatmap(
444
+ corr_matrix,
445
+ mask=mask,
446
+ annot=True,
447
+ fmt=".2f",
448
+ cmap="RdYlGn",
449
+ vmin=-1, vmax=1,
450
+ center=0,
451
+ ax=ax_c,
452
+ square=True,
453
+ linewidths=0.5,
454
+ linecolor='white',
455
+ annot_kws={"size": 10, "weight": "bold"},
456
+ cbar_kws={"shrink": 0.8}
457
+ )
458
+ ax_c.set_title("Pairwise Missingness Correlation", fontsize=12, fontweight='bold', pad=12)
459
+ ax_c.tick_params(axis='x', rotation=45, labelsize=10)
460
+ ax_c.tick_params(axis='y', rotation=0, labelsize=10)
461
+ plt.tight_layout()
462
+ st.pyplot(fig_c, use_container_width=True)
463
+ plt.close()
464
+
465
+ # Interpretation callout
466
+ max_corr_pair = None
467
+ max_val = 0
468
+ for i in range(len(miss_cols)):
469
+ for j in range(i):
470
+ val = abs(corr_matrix.iloc[i, j])
471
+ if val > max_val:
472
+ max_val = val
473
+ max_corr_pair = (miss_cols[i], miss_cols[j], corr_matrix.iloc[i, j])
474
+
475
+ if max_corr_pair:
476
+ c1, c2, v = max_corr_pair
477
+ if v >= 0.9:
478
+ st.markdown(f'<div class="card-danger">🚨 <b>Very high missingness correlation ({v:.2f})</b> between <code>{c1}</code> and <code>{c2}</code> β€” these rows go missing together. Strong MAR signal; consider joint imputation (KNN/MICE).</div>', unsafe_allow_html=True)
479
+ elif v >= 0.5:
480
+ st.markdown(f'<div class="card-warn">⚠️ <b>Moderate missingness correlation ({v:.2f})</b> between <code>{c1}</code> and <code>{c2}</code> β€” partial co-occurrence of missingness detected.</div>', unsafe_allow_html=True)
481
+ else:
482
+ st.markdown(f'<div class="card-ok">βœ… <b>Low missingness correlation (max {v:.2f})</b> β€” columns appear to be missing independently.</div>', unsafe_allow_html=True)
483
+ else:
484
+ st.info("Only one column with missing values β€” correlation requires at least two.")
485
 
486
  # ════════════════════════════════════════════════════════════════════
487
  # STEP 3 — DIAGNOSTICS
 
674
  skew_val = feas["skewness"]["value"]
675
  skew_verd = feas["skewness"]["verdict"]
676
 
 
 
 
 
 
677
  m1.markdown(
678
  f'<div class="big-stat-box {COLORS[var_verd]}">'
679
+ f'<div class="big-stat-val">-{var_pct:.1f}%</div>'
680
  f'<div class="big-stat-lbl">Variance Change</div>'
681
  f'<div class="big-stat-sub">{ICONS[var_verd]} {"Safe" if var_verd=="ok" else "Caution" if var_verd=="warn" else "High Risk"}</div>'
682
  f'</div>', unsafe_allow_html=True
683
  )
684
  m2.markdown(
685
  f'<div class="big-stat-box {COLORS[out_verd]}">'
686
+ f'<div class="big-stat-val">+{new_out}</div>'
687
  f'<div class="big-stat-lbl">New Outliers Created</div>'
688
  f'<div class="big-stat-sub">{ICONS[out_verd]} Before: {feas["outliers"]["outliers_before"]} β†’ After: {feas["outliers"]["outliers_after"]}</div>'
689
  f'</div>', unsafe_allow_html=True
690
  )
691
  m3.markdown(
692
  f'<div class="big-stat-box {COLORS[corr_verd]}">'
693
+ f'<div class="big-stat-val">Ξ”{corr_max:.3f}</div>'
694
  f'<div class="big-stat-lbl">Max Corr. Shift</div>'
695
  f'<div class="big-stat-sub">{ICONS[corr_verd]} {corr_verd.capitalize()}</div>'
696
  f'</div>', unsafe_allow_html=True
697
  )
698
  m4.markdown(
699
  f'<div class="big-stat-box {COLORS[skew_verd]}">'
700
+ f'<div class="big-stat-val">{skew_val:.3f}</div>'
701
  f'<div class="big-stat-lbl">Skewness</div>'
702
  f'<div class="big-stat-sub">{ICONS[skew_verd]} {"Low" if abs(skew_val)<=1 else "Moderate" if abs(skew_val)<=3 else "High"} skew</div>'
703
  f'</div>', unsafe_allow_html=True
 
858
  elif step == STEPS[1]: render_step2()
859
  elif step == STEPS[2]: render_step3()
860
  elif step == STEPS[3]: render_step4()
861
+ elif step == STEPS[4]: render_step5()