Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +110 -19
src/streamlit_app.py
CHANGED
|
@@ -51,17 +51,17 @@ section[data-testid="stSidebar"] hr{border-color:#ffffff33 !important;}
|
|
| 51 |
.metric-val{font-size:1.4rem;font-weight:700;color:#17172b !important;}
|
| 52 |
.metric-lbl{font-size:.78rem;color:#6060a0 !important;margin-top:2px;}
|
| 53 |
|
| 54 |
-
.big-stat-box{border-radius:12px;padding:20px 24px;text-align:center;margin-bottom:8px;}
|
| 55 |
-
.big-stat-val{font-size:2.
|
| 56 |
-
.big-stat-lbl{font-size:.
|
| 57 |
-
.big-stat-sub{font-size:.
|
| 58 |
|
| 59 |
.stat-ok{background:#edfaf3;border:2px solid #89d9ac;}
|
| 60 |
-
.stat-ok .big-stat-val{color:#0a5c30;}
|
| 61 |
.stat-warn{background:#fffaeb;border:2px solid #f0cc7a;}
|
| 62 |
-
.stat-warn .big-stat-val{color:#7a4f00;}
|
| 63 |
.stat-fail{background:#fff0ed;border:2px solid #f5a898;}
|
| 64 |
-
.stat-fail .big-stat-val{color:#900000;}
|
| 65 |
|
| 66 |
.card-mcar{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
|
| 67 |
.card-mar {background:#fffaeb;border:2px solid #f0cc7a;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
|
|
@@ -384,8 +384,104 @@ def render_step2():
|
|
| 384 |
miss_cols = [c for c in df.columns if df[c].isnull().any()]
|
| 385 |
if not miss_cols: return st.success("π No missing values!")
|
| 386 |
|
| 387 |
-
|
| 388 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
|
| 390 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 391 |
# STEP 3 — DIAGNOSTICS
|
|
@@ -578,35 +674,30 @@ def render_step4():
|
|
| 578 |
skew_val = feas["skewness"]["value"]
|
| 579 |
skew_verd = feas["skewness"]["verdict"]
|
| 580 |
|
| 581 |
-
var_color = "#900000" if var_verd == "fail" else ("#7a4f00" if var_verd == "warn" else "#0a5c30")
|
| 582 |
-
out_color = "#900000" if out_verd == "fail" else ("#7a4f00" if out_verd == "warn" else "#0a5c30")
|
| 583 |
-
corr_color = "#900000" if corr_verd == "fail" else ("#7a4f00" if corr_verd == "warn" else "#0a5c30")
|
| 584 |
-
skew_color = "#900000" if skew_verd == "fail" else ("#7a4f00" if skew_verd == "warn" else "#0a5c30")
|
| 585 |
-
|
| 586 |
m1.markdown(
|
| 587 |
f'<div class="big-stat-box {COLORS[var_verd]}">'
|
| 588 |
-
f'<div class="big-stat-val"
|
| 589 |
f'<div class="big-stat-lbl">Variance Change</div>'
|
| 590 |
f'<div class="big-stat-sub">{ICONS[var_verd]} {"Safe" if var_verd=="ok" else "Caution" if var_verd=="warn" else "High Risk"}</div>'
|
| 591 |
f'</div>', unsafe_allow_html=True
|
| 592 |
)
|
| 593 |
m2.markdown(
|
| 594 |
f'<div class="big-stat-box {COLORS[out_verd]}">'
|
| 595 |
-
f'<div class="big-stat-val"
|
| 596 |
f'<div class="big-stat-lbl">New Outliers Created</div>'
|
| 597 |
f'<div class="big-stat-sub">{ICONS[out_verd]} Before: {feas["outliers"]["outliers_before"]} → After: {feas["outliers"]["outliers_after"]}</div>'
|
| 598 |
f'</div>', unsafe_allow_html=True
|
| 599 |
)
|
| 600 |
m3.markdown(
|
| 601 |
f'<div class="big-stat-box {COLORS[corr_verd]}">'
|
| 602 |
-
f'<div class="big-stat-val"
|
| 603 |
f'<div class="big-stat-lbl">Max Corr. Shift</div>'
|
| 604 |
f'<div class="big-stat-sub">{ICONS[corr_verd]} {corr_verd.capitalize()}</div>'
|
| 605 |
f'</div>', unsafe_allow_html=True
|
| 606 |
)
|
| 607 |
m4.markdown(
|
| 608 |
f'<div class="big-stat-box {COLORS[skew_verd]}">'
|
| 609 |
-
f'<div class="big-stat-val"
|
| 610 |
f'<div class="big-stat-lbl">Skewness</div>'
|
| 611 |
f'<div class="big-stat-sub">{ICONS[skew_verd]} {"Low" if abs(skew_val)<=1 else "Moderate" if abs(skew_val)<=3 else "High"} skew</div>'
|
| 612 |
f'</div>', unsafe_allow_html=True
|
|
@@ -767,4 +858,4 @@ if step == STEPS[0]: render_step1()
|
|
| 767 |
elif step == STEPS[1]: render_step2()
|
| 768 |
elif step == STEPS[2]: render_step3()
|
| 769 |
elif step == STEPS[3]: render_step4()
|
| 770 |
-
elif step == STEPS[4]: render_step5()
|
|
|
|
| 51 |
.metric-val{font-size:1.4rem;font-weight:700;color:#17172b !important;}
|
| 52 |
.metric-lbl{font-size:.78rem;color:#6060a0 !important;margin-top:2px;}
|
| 53 |
|
| 54 |
+
.big-stat-box{border-radius:12px;padding:20px 24px;text-align:center;margin-bottom:8px;min-height:110px;}
|
| 55 |
+
.big-stat-val{font-size:2.0rem;font-weight:800;margin-bottom:4px;line-height:1.1;}
|
| 56 |
+
.big-stat-lbl{font-size:.80rem;font-weight:600;opacity:0.85;text-transform:uppercase;letter-spacing:.05em;}
|
| 57 |
+
.big-stat-sub{font-size:.76rem;opacity:0.65;margin-top:6px;}
|
| 58 |
|
| 59 |
.stat-ok{background:#edfaf3;border:2px solid #89d9ac;}
|
| 60 |
+
.stat-ok .big-stat-val,.stat-ok .big-stat-lbl{color:#0a5c30 !important;}
|
| 61 |
.stat-warn{background:#fffaeb;border:2px solid #f0cc7a;}
|
| 62 |
+
.stat-warn .big-stat-val,.stat-warn .big-stat-lbl{color:#7a4f00 !important;}
|
| 63 |
.stat-fail{background:#fff0ed;border:2px solid #f5a898;}
|
| 64 |
+
.stat-fail .big-stat-val,.stat-fail .big-stat-lbl{color:#900000 !important;}
|
| 65 |
|
| 66 |
.card-mcar{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
|
| 67 |
.card-mar {background:#fffaeb;border:2px solid #f0cc7a;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
|
|
|
|
| 384 |
miss_cols = [c for c in df.columns if df[c].isnull().any()]
|
| 385 |
if not miss_cols: return st.success("π No missing values!")
|
| 386 |
|
| 387 |
+
# ── Summary table ──
|
| 388 |
+
summary = pd.DataFrame({
|
| 389 |
+
"Missing Count": df[miss_cols].isnull().sum(),
|
| 390 |
+
"Missing %": (df[miss_cols].isnull().sum() / len(df) * 100).round(2)
|
| 391 |
+
}).sort_values("Missing %", ascending=False)
|
| 392 |
+
st.dataframe(
|
| 393 |
+
summary.style.background_gradient(cmap="YlOrRd", subset=["Missing %"]),
|
| 394 |
+
use_container_width=True
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
+
st.markdown("---")
|
| 398 |
+
|
| 399 |
+
# ── Missingness Heatmap ──
|
| 400 |
+
st.markdown("### 🗺️ Missingness Heatmap")
|
| 401 |
+
st.caption("Each dark stripe = a missing value in that row. Aligned stripes across columns = rows missing together (MAR signal).")
|
| 402 |
+
|
| 403 |
+
fig_h, ax_h = plt.subplots(figsize=(14, max(3, len(miss_cols) * 0.6)))
|
| 404 |
+
fig_h.patch.set_facecolor('#f8f8f8')
|
| 405 |
+
ax_h.set_facecolor('#f0f0f0')
|
| 406 |
+
|
| 407 |
+
miss_matrix = df[miss_cols].isnull().astype(int)
|
| 408 |
+
# Subsample rows for performance if large
|
| 409 |
+
if len(miss_matrix) > 2000:
|
| 410 |
+
miss_matrix = miss_matrix.sample(2000, random_state=42).reset_index(drop=True)
|
| 411 |
+
|
| 412 |
+
ax_h.imshow(
|
| 413 |
+
miss_matrix.T.values,
|
| 414 |
+
aspect='auto',
|
| 415 |
+
cmap=sns.color_palette(["#f0f0f0", "#17172b"], as_cmap=True),
|
| 416 |
+
interpolation='none'
|
| 417 |
+
)
|
| 418 |
+
ax_h.set_yticks(range(len(miss_cols)))
|
| 419 |
+
ax_h.set_yticklabels(miss_cols, fontsize=10)
|
| 420 |
+
ax_h.set_xlabel("Row index (sampled)" if len(df) > 2000 else "Row index", fontsize=10)
|
| 421 |
+
ax_h.set_title("Missing Value Pattern (dark = missing)", fontsize=12, fontweight='bold', pad=10)
|
| 422 |
+
ax_h.spines[['top','right','bottom','left']].set_visible(False)
|
| 423 |
+
plt.tight_layout()
|
| 424 |
+
st.pyplot(fig_h, use_container_width=True)
|
| 425 |
+
plt.close()
|
| 426 |
+
|
| 427 |
+
st.markdown("---")
|
| 428 |
+
|
| 429 |
+
# ── Missingness Correlation Heatmap ──
|
| 430 |
+
st.markdown("### π Missingness Correlation")
|
| 431 |
+
st.caption("Correlation between missing patterns of columns. Values near 1.0 = these columns tend to be missing in the same rows β strong MAR signal.")
|
| 432 |
+
|
| 433 |
+
if len(miss_cols) >= 2:
|
| 434 |
+
miss_indicator = df[miss_cols].isnull().astype(int)
|
| 435 |
+
corr_matrix = miss_indicator.corr()
|
| 436 |
+
|
| 437 |
+
fig_c, ax_c = plt.subplots(figsize=(max(6, len(miss_cols) * 1.2), max(5, len(miss_cols) * 1.0)))
|
| 438 |
+
fig_c.patch.set_facecolor('#f8f8f8')
|
| 439 |
+
|
| 440 |
+
mask = np.zeros_like(corr_matrix, dtype=bool)
|
| 441 |
+
mask[np.triu_indices_from(mask, k=1)] = True # show lower triangle only
|
| 442 |
+
|
| 443 |
+
sns.heatmap(
|
| 444 |
+
corr_matrix,
|
| 445 |
+
mask=mask,
|
| 446 |
+
annot=True,
|
| 447 |
+
fmt=".2f",
|
| 448 |
+
cmap="RdYlGn",
|
| 449 |
+
vmin=-1, vmax=1,
|
| 450 |
+
center=0,
|
| 451 |
+
ax=ax_c,
|
| 452 |
+
square=True,
|
| 453 |
+
linewidths=0.5,
|
| 454 |
+
linecolor='white',
|
| 455 |
+
annot_kws={"size": 10, "weight": "bold"},
|
| 456 |
+
cbar_kws={"shrink": 0.8}
|
| 457 |
+
)
|
| 458 |
+
ax_c.set_title("Pairwise Missingness Correlation", fontsize=12, fontweight='bold', pad=12)
|
| 459 |
+
ax_c.tick_params(axis='x', rotation=45, labelsize=10)
|
| 460 |
+
ax_c.tick_params(axis='y', rotation=0, labelsize=10)
|
| 461 |
+
plt.tight_layout()
|
| 462 |
+
st.pyplot(fig_c, use_container_width=True)
|
| 463 |
+
plt.close()
|
| 464 |
+
|
| 465 |
+
# Interpretation callout
|
| 466 |
+
max_corr_pair = None
|
| 467 |
+
max_val = 0
|
| 468 |
+
for i in range(len(miss_cols)):
|
| 469 |
+
for j in range(i):
|
| 470 |
+
val = abs(corr_matrix.iloc[i, j])
|
| 471 |
+
if val > max_val:
|
| 472 |
+
max_val = val
|
| 473 |
+
max_corr_pair = (miss_cols[i], miss_cols[j], corr_matrix.iloc[i, j])
|
| 474 |
+
|
| 475 |
+
if max_corr_pair:
|
| 476 |
+
c1, c2, v = max_corr_pair
|
| 477 |
+
if v >= 0.9:
|
| 478 |
+
st.markdown(f'<div class="card-danger">🚨 <b>Very high missingness correlation ({v:.2f})</b> between <code>{c1}</code> and <code>{c2}</code> — these rows go missing together. Strong MAR signal; consider joint imputation (KNN/MICE).</div>', unsafe_allow_html=True)
|
| 479 |
+
elif v >= 0.5:
|
| 480 |
+
st.markdown(f'<div class="card-warn">⚠️ <b>Moderate missingness correlation ({v:.2f})</b> between <code>{c1}</code> and <code>{c2}</code> — partial co-occurrence of missingness detected.</div>', unsafe_allow_html=True)
|
| 481 |
+
else:
|
| 482 |
+
st.markdown(f'<div class="card-ok">✅ <b>Low missingness correlation (max {v:.2f})</b> — columns appear to be missing independently.</div>', unsafe_allow_html=True)
|
| 483 |
+
else:
|
| 484 |
+
st.info("Only one column with missing values β correlation requires at least two.")
|
| 485 |
|
| 486 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 487 |
# STEP 3 — DIAGNOSTICS
|
|
|
|
| 674 |
skew_val = feas["skewness"]["value"]
|
| 675 |
skew_verd = feas["skewness"]["verdict"]
|
| 676 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 677 |
m1.markdown(
|
| 678 |
f'<div class="big-stat-box {COLORS[var_verd]}">'
|
| 679 |
+
f'<div class="big-stat-val">-{var_pct:.1f}%</div>'
|
| 680 |
f'<div class="big-stat-lbl">Variance Change</div>'
|
| 681 |
f'<div class="big-stat-sub">{ICONS[var_verd]} {"Safe" if var_verd=="ok" else "Caution" if var_verd=="warn" else "High Risk"}</div>'
|
| 682 |
f'</div>', unsafe_allow_html=True
|
| 683 |
)
|
| 684 |
m2.markdown(
|
| 685 |
f'<div class="big-stat-box {COLORS[out_verd]}">'
|
| 686 |
+
f'<div class="big-stat-val">+{new_out}</div>'
|
| 687 |
f'<div class="big-stat-lbl">New Outliers Created</div>'
|
| 688 |
f'<div class="big-stat-sub">{ICONS[out_verd]} Before: {feas["outliers"]["outliers_before"]} → After: {feas["outliers"]["outliers_after"]}</div>'
|
| 689 |
f'</div>', unsafe_allow_html=True
|
| 690 |
)
|
| 691 |
m3.markdown(
|
| 692 |
f'<div class="big-stat-box {COLORS[corr_verd]}">'
|
| 693 |
+
f'<div class="big-stat-val">Δ{corr_max:.3f}</div>'
|
| 694 |
f'<div class="big-stat-lbl">Max Corr. Shift</div>'
|
| 695 |
f'<div class="big-stat-sub">{ICONS[corr_verd]} {corr_verd.capitalize()}</div>'
|
| 696 |
f'</div>', unsafe_allow_html=True
|
| 697 |
)
|
| 698 |
m4.markdown(
|
| 699 |
f'<div class="big-stat-box {COLORS[skew_verd]}">'
|
| 700 |
+
f'<div class="big-stat-val">{skew_val:.3f}</div>'
|
| 701 |
f'<div class="big-stat-lbl">Skewness</div>'
|
| 702 |
f'<div class="big-stat-sub">{ICONS[skew_verd]} {"Low" if abs(skew_val)<=1 else "Moderate" if abs(skew_val)<=3 else "High"} skew</div>'
|
| 703 |
f'</div>', unsafe_allow_html=True
|
|
|
|
| 858 |
elif step == STEPS[1]: render_step2()
|
| 859 |
elif step == STEPS[2]: render_step3()
|
| 860 |
elif step == STEPS[3]: render_step4()
|
| 861 |
+
elif step == STEPS[4]: render_step5()
|