Spaces:
Running
Running
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +1492 -35
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,1497 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
| 4 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
indices = np.linspace(0, 1, num_points)
|
| 20 |
-
theta = 2 * np.pi * num_turns * indices
|
| 21 |
-
radius = indices
|
| 22 |
-
|
| 23 |
-
x = radius * np.cos(theta)
|
| 24 |
-
y = radius * np.sin(theta)
|
| 25 |
-
|
| 26 |
-
df = pd.DataFrame({
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Missing Value Intelligence Suite — Merged App
|
| 3 |
+
Combines the stepwise pipeline (app.py) with the comprehensive dashboard (app_tanisha.py)
|
| 4 |
+
into a unified 7-step workflow.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
import streamlit as st
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import numpy as np
|
| 10 |
+
import matplotlib.pyplot as plt
|
| 11 |
+
import matplotlib.patches as mpatches
|
| 12 |
+
import seaborn as sns
|
| 13 |
+
from scipy import stats
|
| 14 |
+
from scipy.stats import chi2_contingency, ks_2samp, shapiro, skew, kurtosis
|
| 15 |
+
from sklearn.preprocessing import LabelEncoder, StandardScaler
|
| 16 |
+
from sklearn.linear_model import LogisticRegression
|
| 17 |
+
from sklearn.model_selection import train_test_split
|
| 18 |
+
import warnings
|
| 19 |
+
warnings.filterwarnings("ignore")
|
| 20 |
|
| 21 |
+
# ─────────────────────────── Page config ────────────────────────────
|
| 22 |
+
# Configure the Streamlit page (title, icon, wide layout, sidebar open).
st.set_page_config(
    page_title="Missing Value Intelligence Suite",
    page_icon="🔬",
    layout="wide",
    initial_sidebar_state="expanded",
)
|
| 28 |
|
| 29 |
+
# ─────────────────────────── Custom CSS ─────────────────────────────
|
| 30 |
+
st.markdown("""
|
| 31 |
+
<style>
|
| 32 |
+
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
|
| 33 |
|
| 34 |
+
html, body, [class*="css"] { font-family: 'Inter', sans-serif; }
|
| 35 |
+
|
| 36 |
+
section[data-testid="stSidebar"] {
|
| 37 |
+
background: #17172b;
|
| 38 |
+
color: #ffffff;
|
| 39 |
+
}
|
| 40 |
+
section[data-testid="stSidebar"] * { color: #ffffff !important; }
|
| 41 |
+
section[data-testid="stSidebar"] .stSelectbox label,
|
| 42 |
+
section[data-testid="stSidebar"] .stRadio label { color: #c0c0e0 !important; }
|
| 43 |
+
|
| 44 |
+
.main-title {
|
| 45 |
+
font-size: 2rem;
|
| 46 |
+
font-weight: 700;
|
| 47 |
+
color: #17172b;
|
| 48 |
+
margin-bottom: 0.2rem;
|
| 49 |
+
}
|
| 50 |
+
.main-sub {
|
| 51 |
+
font-size: 1rem;
|
| 52 |
+
color: #6060a0;
|
| 53 |
+
margin-bottom: 1.5rem;
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
.section-header {
|
| 57 |
+
font-size: 1.3rem; font-weight: 600; color: #1a1a2e;
|
| 58 |
+
background: linear-gradient(90deg, #eef2ff, transparent);
|
| 59 |
+
padding: 10px 16px; border-left: 4px solid #4f8ef7;
|
| 60 |
+
border-radius: 4px; margin: 24px 0 14px 0;
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
.step-badge {
|
| 64 |
+
display: inline-block;
|
| 65 |
+
background: #17172b;
|
| 66 |
+
color: #fff;
|
| 67 |
+
font-size: 0.72rem;
|
| 68 |
+
font-weight: 700;
|
| 69 |
+
padding: 3px 10px;
|
| 70 |
+
border-radius: 20px;
|
| 71 |
+
margin-bottom: 6px;
|
| 72 |
+
letter-spacing: 0.08em;
|
| 73 |
+
text-transform: uppercase;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
.card-mcar { background:#edfaf3; border:2px solid #89d9ac; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
|
| 77 |
+
.card-mar { background:#fffaeb; border:2px solid #f0cc7a; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
|
| 78 |
+
.card-mnar { background:#fff0ed; border:2px solid #f5a898; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
|
| 79 |
+
.card-info { background:#eef2ff; border:2px solid #bdc8f5; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
|
| 80 |
+
.card-warn { background:#fff8e1; border:2px solid #ffe082; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
|
| 81 |
+
.card-strat{ background:#f8f0ff; border:2px solid #c8a0f0; border-radius:10px; padding:14px 18px; margin-bottom:10px; }
|
| 82 |
+
|
| 83 |
+
.verdict-label { font-size: 1.1rem; font-weight: 700; margin-bottom: 4px; }
|
| 84 |
+
.verdict-desc { font-size: 0.88rem; color: #444; }
|
| 85 |
+
|
| 86 |
+
.metric-box {
|
| 87 |
+
background: #f5f3ee;
|
| 88 |
+
border-radius: 8px;
|
| 89 |
+
padding: 12px 16px;
|
| 90 |
+
text-align: center;
|
| 91 |
+
}
|
| 92 |
+
.metric-val { font-size: 1.4rem; font-weight: 700; color: #17172b; }
|
| 93 |
+
.metric-lbl { font-size: 0.78rem; color: #6060a0; margin-top: 2px; }
|
| 94 |
+
|
| 95 |
+
.metric-card {
|
| 96 |
+
background: white; border-radius: 10px; padding: 18px 24px;
|
| 97 |
+
box-shadow: 0 2px 8px rgba(0,0,0,0.08); text-align: center;
|
| 98 |
+
}
|
| 99 |
+
.metric-card .val { font-size: 2rem; font-weight: 700; color: #4f8ef7; }
|
| 100 |
+
.metric-card .lbl { font-size: 0.82rem; color: #666; margin-top: 4px; }
|
| 101 |
+
|
| 102 |
+
.col-stat-card {
|
| 103 |
+
background: white; border-radius: 10px; padding: 14px 18px;
|
| 104 |
+
box-shadow: 0 1px 6px rgba(0,0,0,0.07); text-align: center;
|
| 105 |
+
}
|
| 106 |
+
.col-stat-card .cv { font-size: 1.5rem; font-weight: 700; color: #1a1a2e; }
|
| 107 |
+
.col-stat-card .ck { font-size: 0.75rem; color: #888; margin-top: 3px;
|
| 108 |
+
text-transform: uppercase; letter-spacing: .05em; }
|
| 109 |
+
|
| 110 |
+
.badge-mcar { background:#d4edda; color:#155724; padding:3px 10px; border-radius:12px; font-size:0.82rem; font-weight:600; }
|
| 111 |
+
.badge-mar { background:#fff3cd; color:#856404; padding:3px 10px; border-radius:12px; font-size:0.82rem; font-weight:600; }
|
| 112 |
+
.badge-mnar { background:#f8d7da; color:#721c24; padding:3px 10px; border-radius:12px; font-size:0.82rem; font-weight:600; }
|
| 113 |
+
|
| 114 |
+
.strat-chip { display:inline-block; padding:4px 14px; border-radius:20px;
|
| 115 |
+
font-size:0.82rem; font-weight:600; margin:3px 3px; }
|
| 116 |
+
.chip-green { background:#d4edda; color:#155724; border:1px solid #89d9ac; }
|
| 117 |
+
.chip-yellow { background:#fff3cd; color:#856404; border:1px solid #f0cc7a; }
|
| 118 |
+
.chip-red { background:#f8d7da; color:#721c24; border:1px solid #f5a898; }
|
| 119 |
+
.chip-blue { background:#dce3ff; color:#2a3da0; border:1px solid #bdc8f5; }
|
| 120 |
+
|
| 121 |
+
.insight-box {
|
| 122 |
+
background: #f0f7ff; border: 1px solid #bdd5ff;
|
| 123 |
+
border-radius: 8px; padding: 16px 20px; margin: 12px 0;
|
| 124 |
+
}
|
| 125 |
+
.insight-box li { margin: 6px 0; color: #1a3a6e; font-size: 0.92rem; }
|
| 126 |
+
.theory-box {
|
| 127 |
+
background: #fafafa; border: 1px solid #e0e0e0;
|
| 128 |
+
border-radius: 8px; padding: 16px 20px; margin: 12px 0;
|
| 129 |
+
}
|
| 130 |
+
.theory-box h4 { color: #333; margin-bottom: 8px; }
|
| 131 |
+
.theory-box p { color: #555; font-size: 0.91rem; line-height: 1.6; }
|
| 132 |
+
|
| 133 |
+
code { background: #f0f0f8; padding: 2px 6px; border-radius: 4px; font-size: 0.85rem; }
|
| 134 |
+
hr.divider { border: none; border-top: 2px solid #e0ddd8; margin: 1.5rem 0; }
|
| 135 |
+
</style>
|
| 136 |
+
""", unsafe_allow_html=True)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
# ════════════════════════════════════════════════════════════════════
|
| 140 |
+
# SHARED HELPER FUNCTIONS
|
| 141 |
+
# ════════════════════════════════════════════════════════════════════
|
| 142 |
+
|
| 143 |
+
def missing_summary_df(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the columns of *df* that contain missing values.

    The result is indexed by column name and has three columns:
    ``Missing Count``, ``Missing %`` (rounded to two decimals) and ``Dtype``
    (the dtype rendered as a string). Only columns with at least one null are
    kept, ordered by descending ``Missing %``.
    """
    null_counts = df.isnull().sum()
    n_rows = len(df)
    table = pd.DataFrame({
        "Missing Count": null_counts,
        "Missing %": (null_counts / n_rows * 100).round(2),
        "Dtype": df.dtypes.astype(str),
    })
    has_missing = table["Missing Count"] > 0
    return table[has_missing].sort_values("Missing %", ascending=False)
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def missing_summary_typed(df, num_cols, cat_cols):
    """Tabulate missing values per column, labelled by broad data type.

    Args:
        df: Frame to inspect.
        num_cols: Column names to label "Numerical".
        cat_cols: Unused by the implementation; every column not listed in
            ``num_cols`` is labelled "Categorical". Kept for interface
            compatibility.

    Returns:
        A frame with columns ``Column``, ``Data Type``, ``Missing Count`` and
        ``Missing %`` (two decimals), containing only columns that have at
        least one null, sorted by descending ``Missing %`` with a fresh
        0..n-1 index. An input with no columns or no rows yields an empty
        frame that still carries the four columns.
    """
    fields = ["Column", "Data Type", "Missing Count", "Missing %"]
    n_rows = len(df)
    rows = []
    for col in df.columns:
        mc = df[col].isnull().sum()
        # Guard the empty-frame case instead of dividing by zero.
        pct = mc / n_rows * 100 if n_rows else 0.0
        dtype = "Numerical" if col in num_cols else "Categorical"
        rows.append({"Column": col, "Data Type": dtype,
                     "Missing Count": mc, "Missing %": round(pct, 2)})
    # Passing ``columns`` keeps the schema when ``rows`` is empty; without it
    # the sort below raises KeyError on a frame that has no columns.
    result = pd.DataFrame(rows, columns=fields)
    result = result.sort_values("Missing %", ascending=False)
    return result[result["Missing Count"] > 0].reset_index(drop=True)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def severity(pct):
    """Bucket a missing percentage into "Low" (<5), "Moderate" (<20) or "High"."""
    for upper_bound, label in ((5, "Low"), (20, "Moderate")):
        if pct < upper_bound:
            return label
    return "High"
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def identify_columns(df):
    """Split the column names of *df* into (numeric, non-numeric) lists.

    A column counts as numeric when its dtype is a NumPy number type; every
    other column lands in the second list.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    other_part = df.select_dtypes(exclude=[np.number])
    return numeric_part.columns.tolist(), other_part.columns.tolist()
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def missingness_risk_level(pct: float) -> tuple:
    """Map a missing-value percentage to a risk tier.

    Returns a 4-tuple ``(label, guidance, colour_1, colour_2)`` where the two
    colours are hex strings (presumably background and text colour; confirm
    against the caller). Tier upper bounds are inclusive: up to 5, up to 15,
    up to 30, and everything above 30.
    """
    tiers = (
        (5, ("≤5%", "Very low missingness. Low risk of bias.",
             "#edfaf3", "#0d6b3a")),
        (15, ("5–15%", "Moderate. Imputation preferred over dropping.",
              "#fffaeb", "#7a4d00")),
        (30, ("15–30%",
              "High. Dropping loses too much data. Advanced imputation + missing indicator mandatory.",
              "#fff0ed", "#9e2210")),
    )
    for upper_bound, tier in tiers:
        if pct <= upper_bound:
            return tier
    return (">30%",
            "Very high. Consider dropping the column. Re-evaluate column usefulness + domain check.",
            "#fde8e8", "#7a0000")
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# ── Statistical Tests (from app.py) ──────────────────────────────────
|
| 191 |
+
|
| 192 |
+
def test1_pattern_analysis(df: pd.DataFrame, col: str) -> dict:
|
| 193 |
+
indicator = df[col].isnull().astype(int)
|
| 194 |
+
miss_pct = indicator.mean() * 100
|
| 195 |
+
runs = (indicator != indicator.shift()).sum()
|
| 196 |
+
max_possible_runs = min(len(indicator) * 2, len(indicator[indicator == 1]) * 2 + 1)
|
| 197 |
+
cluster_ratio = runs / max(max_possible_runs, 1)
|
| 198 |
+
scattered = cluster_ratio > 0.5
|
| 199 |
+
return {
|
| 200 |
+
"indicator": indicator,
|
| 201 |
+
"miss_pct": miss_pct,
|
| 202 |
+
"scattered": scattered,
|
| 203 |
+
"cluster_ratio": cluster_ratio,
|
| 204 |
+
"signal": "MCAR signal" if scattered else "MAR / MNAR signal (clustered rows)",
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def test2_feature_dependency(df: pd.DataFrame, col: str) -> dict:
|
| 209 |
+
missing_mask = df[col].isnull()
|
| 210 |
+
if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
|
| 211 |
+
return {"diffs": {}, "max_diff": 0.0, "signal": "Insufficient data"}
|
| 212 |
+
|
| 213 |
+
diffs = {}
|
| 214 |
+
for other_col in df.columns:
|
| 215 |
+
if other_col == col:
|
| 216 |
+
continue
|
| 217 |
+
try:
|
| 218 |
+
miss_vals = df.loc[missing_mask, other_col].dropna()
|
| 219 |
+
obs_vals = df.loc[~missing_mask, other_col].dropna()
|
| 220 |
+
if len(miss_vals) < 3 or len(obs_vals) < 3:
|
| 221 |
+
continue
|
| 222 |
+
if pd.api.types.is_numeric_dtype(df[other_col]):
|
| 223 |
+
m1, m2 = miss_vals.mean(), obs_vals.mean()
|
| 224 |
+
denom = max(abs(m2), 1e-9)
|
| 225 |
+
diff_pct = abs(m1 - m2) / denom * 100
|
| 226 |
+
diffs[other_col] = diff_pct
|
| 227 |
+
else:
|
| 228 |
+
ct = pd.crosstab(
|
| 229 |
+
pd.concat([pd.Series(["missing"] * len(miss_vals)),
|
| 230 |
+
pd.Series(["present"] * len(obs_vals))]),
|
| 231 |
+
pd.concat([miss_vals, obs_vals])
|
| 232 |
+
)
|
| 233 |
+
chi2, _, _, _ = chi2_contingency(ct)
|
| 234 |
+
n = ct.values.sum()
|
| 235 |
+
k = min(ct.shape) - 1
|
| 236 |
+
cramers_v = np.sqrt(chi2 / (n * max(k, 1))) * 100
|
| 237 |
+
diffs[other_col] = cramers_v
|
| 238 |
+
except Exception:
|
| 239 |
+
continue
|
| 240 |
+
|
| 241 |
+
if not diffs:
|
| 242 |
+
return {"diffs": {}, "max_diff": 0.0, "signal": "No comparable features"}
|
| 243 |
+
|
| 244 |
+
max_diff = max(diffs.values())
|
| 245 |
+
if max_diff < 5:
|
| 246 |
+
signal = "Weak signal β MCAR likely"
|
| 247 |
+
elif max_diff < 30:
|
| 248 |
+
signal = "Strong MAR signal (feature dependency detected)"
|
| 249 |
+
else:
|
| 250 |
+
signal = "Very strong dependency β MAR or MNAR"
|
| 251 |
+
|
| 252 |
+
return {"diffs": diffs, "max_diff": max_diff, "signal": signal}
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
def test3_target_dependency(df: pd.DataFrame, col: str, target_col: str) -> dict:
|
| 256 |
+
missing_mask = df[col].isnull()
|
| 257 |
+
if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
|
| 258 |
+
return {"diff_pct": None, "signal": "Insufficient data"}
|
| 259 |
+
|
| 260 |
+
try:
|
| 261 |
+
miss_target = df.loc[missing_mask, target_col].dropna()
|
| 262 |
+
obs_target = df.loc[~missing_mask, target_col].dropna()
|
| 263 |
+
|
| 264 |
+
if pd.api.types.is_numeric_dtype(df[target_col]):
|
| 265 |
+
m1, m2 = miss_target.mean(), obs_target.mean()
|
| 266 |
+
denom = max(abs(m2), 1e-9)
|
| 267 |
+
diff_pct = abs(m1 - m2) / denom * 100
|
| 268 |
+
else:
|
| 269 |
+
p1 = miss_target.value_counts(normalize=True).iloc[0] * 100
|
| 270 |
+
p2 = obs_target.value_counts(normalize=True).iloc[0] * 100
|
| 271 |
+
diff_pct = abs(p1 - p2)
|
| 272 |
+
|
| 273 |
+
if diff_pct < 5:
|
| 274 |
+
signal = "No strong signal (<5% target diff)"
|
| 275 |
+
elif diff_pct < 10:
|
| 276 |
+
signal = "Moderate target dependency β possible MAR/MNAR"
|
| 277 |
+
else:
|
| 278 |
+
signal = "Strong target dependency β MNAR likely (>10% target diff)"
|
| 279 |
+
|
| 280 |
+
return {"diff_pct": round(diff_pct, 2), "signal": signal}
|
| 281 |
+
except Exception as e:
|
| 282 |
+
return {"diff_pct": None, "signal": f"Could not compute: {e}"}
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def classify_mechanism(t1: dict, t2: dict, t3: dict) -> tuple:
    """Combine the three test results into a missingness mechanism verdict.

    Args:
        t1: Result of the row-pattern test; only ``scattered`` is read
            (defaults to True).
        t2: Result of the feature-dependency test; only ``max_diff`` is read.
        t3: Result of the target-dependency test; only ``diff_pct`` is read.

    Returns:
        ``(mechanism, confidence, explanation)`` where mechanism is one of
        "MNAR", "MAR", "MCAR" and confidence one of "High", "Moderate", "Low".
        Rules are evaluated in order: target dependency above 10 wins, then
        feature dependency, then the MCAR fallbacks.
    """
    def _score(result, key):
        # None (test skipped / not computable) and NaN both mean "no
        # evidence". NaN must be mapped explicitly: it is truthy, so a plain
        # ``or 0`` lets it through and every comparison below turns False.
        value = (result or {}).get(key)
        return 0 if value is None or pd.isna(value) else value

    feat_dep = _score(t2, "max_diff")
    tgt_dep = _score(t3, "diff_pct")
    scattered = (t1 or {}).get("scattered", True)

    if tgt_dep > 10:
        return "MNAR", "High", (
            f"Target variable differs by {tgt_dep:.1f}% between missing/present rows. "
            "The probability of missingness depends on the unobserved value itself."
        )
    if feat_dep >= 10 and not scattered:
        return "MAR", "High", (
            f"Feature distributions differ by up to {feat_dep:.1f}% and missing values appear "
            "clustered — missingness depends on observed features."
        )
    if feat_dep >= 5:
        return "MAR", "Moderate", (
            f"Feature distributions differ by up to {feat_dep:.1f}%. "
            "Missingness likely depends on observed features."
        )
    if scattered and feat_dep < 5 and tgt_dep < 5:
        return "MCAR", "High", (
            "Values appear randomly scattered, feature distributions are similar across "
            "groups, and target shows no dependency — consistent with MCAR."
        )
    return "MCAR", "Low", (
        "Weak signals across all three tests. Treated as MCAR but verify with domain knowledge."
    )
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
# ── Logistic Regression-based mechanism diagnosis (from app_tanisha.py) ──
|
| 317 |
+
|
| 318 |
+
def diagnose_mechanism_lr(df, col, num_cols):
    """Diagnose the missingness mechanism of *col* with chi-square tests and a classifier.

    Steps:
      1. Each numeric predictor with more than one distinct value is
         median-filled, cut into quartile bins and chi-square tested against
         the missing flag of *col*. If no predictor is significant after
         Bonferroni correction, the verdict is "MCAR".
      2. Otherwise a logistic regression is fitted on all predictors. If its
         in-sample accuracy beats the majority-class baseline by more than
         5 points, the verdict is "MAR".
      3. Anything else is assumed "MNAR".

    Predictors are all other columns with less than 90% missing values.

    Returns:
        ``(mechanism, explanation)``.
    """
    miss_mask = df[col].isnull().astype(int)
    predictors = [c for c in df.columns if c != col and df[c].isnull().mean() < 0.9]
    if not predictors or miss_mask.sum() < 5:
        return "MNAR", "Insufficient data to test; assumed MNAR."

    p_values = []
    for p in predictors:
        if p not in num_cols or df[p].dropna().nunique() <= 1:
            continue
        try:
            binned = pd.qcut(df[p].fillna(df[p].median()), q=4, duplicates="drop", labels=False)
            ct = pd.crosstab(binned, miss_mask)
            if ct.shape[0] > 1 and ct.shape[1] > 1:
                _, p_val, _, _ = chi2_contingency(ct)
                p_values.append(p_val)
        except (TypeError, ValueError):
            # Binning or the test is not applicable to this predictor.
            continue

    if p_values:
        # Judge by the strongest dependency, corrected for the number of
        # tests. Averaging p-values lets a single clearly related predictor
        # be diluted by unrelated ones.
        adjusted_p = min(1.0, min(p_values) * len(p_values))
        if adjusted_p > 0.05:
            return "MCAR", (f"Chi-square tests show no significant dependency "
                            f"(smallest Bonferroni-adjusted p={adjusted_p:.3f} > 0.05). "
                            f"Missingness appears random.")

    try:
        X_pred = df[predictors].copy()
        # Integer-encode every non-numeric predictor (not only ``object``
        # ones) so category/bool/string columns do not break the scaler.
        for c in X_pred.select_dtypes(exclude=[np.number]).columns:
            X_pred[c] = X_pred[c].astype("category").cat.codes
        X_pred = X_pred.fillna(X_pred.median(numeric_only=True))
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_pred)
        lr = LogisticRegression(max_iter=300, solver="lbfgs")
        lr.fit(X_scaled, miss_mask)
        score = lr.score(X_scaled, miss_mask)
        baseline = max(miss_mask.mean(), 1 - miss_mask.mean())
        if score > baseline + 0.05:
            return "MAR", (f"Logistic Regression predicts missingness with accuracy {score:.2%} "
                           f"(baseline {baseline:.2%}). Missingness is related to observed variables.")
    except Exception:
        # Deliberate best effort: if the model cannot be fitted, fall through
        # to the conservative MNAR verdict below.
        pass
    return "MNAR", "Missingness not explained by observed data. Likely related to the missing value itself — assumed MNAR."
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
def recommend_strategy(mechanism: str, miss_pct: float, dtype: str) -> dict:
    """Recommend a missing-value treatment for one column.

    Args:
        mechanism: "MCAR", "MAR" or "MNAR"; any other value falls through to
            the "consider dropping" recommendation.
        miss_pct: Percentage of missing values in the column.
        dtype: Dtype name of the column; treated as numeric when it contains
            "float" or "int" (case-insensitive).

    Returns:
        dict with ``method``, ``reason``, ``adv``, ``disadv`` and
        ``add_indicator`` (True for MNAR, or MAR with at least 10% missing).
    """
    # pandas nullable dtypes are spelled "Int64" / "Float64"; compare
    # case-insensitively so they are not mistaken for categorical data.
    dtype_name = str(dtype).lower()
    is_num = "float" in dtype_name or "int" in dtype_name
    add_indicator = (mechanism == "MNAR") or (mechanism == "MAR" and miss_pct >= 10)

    if mechanism == "MCAR" and miss_pct <= 5:
        method = "Drop rows"
        reason = "MCAR confirmed and loss is minimal (≤5%). Safe to drop."
        adv = "✓ No artificial data introduced"
        disadv = "✗ Loses data — only safe at very low %"
    elif mechanism in ("MCAR", "MAR") and miss_pct <= 15:
        if is_num:
            method = "Median imputation"
            reason = "Low-moderate missingness. Median is robust to skew and outliers."
            adv = "✓ Outlier-resistant; recommended default for numeric"
            disadv = "✗ Reduces variance slightly"
        else:
            method = "Mode imputation"
            reason = "Low-moderate missingness on categorical data."
            adv = "✓ Preserves category structure"
            disadv = "✗ Can over-represent dominant category"
    elif mechanism == "MAR" and miss_pct <= 30:
        method = "KNN Imputation" if is_num else "Mode / KNN Imputation"
        reason = "Moderate MAR missingness. KNN leverages feature relationships."
        adv = "✓ Preserves local patterns; captures inter-feature structure"
        disadv = "✗ Slow on large datasets; requires scaling"
    elif mechanism == "MAR" and miss_pct > 30:
        method = "Iterative Imputer (MICE)"
        reason = "High MAR missingness. MICE models each column as a function of others."
        adv = "✓ Most statistically principled; accounts for all feature relationships"
        disadv = "✗ Computationally expensive; risk of instability"
    elif mechanism == "MNAR":
        method = "Median + Missing Indicator (mandatory)"
        reason = "MNAR: the fact of missingness is informative. Indicator must be created BEFORE imputation."
        adv = "✓ Preserves MNAR signal; lets model learn from missingness"
        disadv = "✗ Imputation may still be biased; domain expertise required"
    else:
        # Reached for MCAR above 15 % (and for unrecognised mechanisms), so
        # report the actual percentage rather than a fixed "> 30%".
        method = "Consider dropping column"
        reason = (f"Missing {miss_pct:.1f}% with {mechanism}. "
                  "Evaluate predictive value vs. cost of imputation.")
        adv = "✓ Eliminates noise if column is uninformative"
        disadv = "✗ Irreversible — verify with domain expert first"

    return {
        "method": method,
        "reason": reason,
        "adv": adv,
        "disadv": disadv,
        "add_indicator": add_indicator,
    }
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
def strategy_chips_html(mech, miss_pct, col_type):
    """Render the suggested strategies for a column as HTML "chip" spans.

    Args:
        mech: "CLEAN", "MCAR", "MAR" or "MNAR". "CLEAN" yields a single
            "no action" chip; an unrecognised value yields only the
            drop-column chip (if applicable) or an empty string.
        miss_pct: Percentage of missing values; above 50 adds a
            "consider dropping" chip, and it tunes the MCAR/MAR chips.
        col_type: "Numerical" selects median imputation for MCAR; anything
            else selects mode imputation.

    Returns:
        The chips as ``<span class="strat-chip ...">`` elements joined by
        single spaces.
    """
    if mech == "CLEAN":
        return '<span class="strat-chip chip-green">✅ No action needed — column is complete</span>'

    chips = []
    if miss_pct > 50:
        chips.append(("⚠ Consider Dropping Column (>50% missing)", "chip-red"))
    if mech == "MCAR":
        if miss_pct < 5:
            chips.append(("Listwise Deletion (safe)", "chip-green"))
        chips.append(("Median Imputation" if col_type == "Numerical" else "Mode Imputation", "chip-green"))
    if mech == "MAR":
        chips.append(("KNN Imputation", "chip-blue"))
        chips.append(("Iterative Imputer (MICE)", "chip-blue"))
        chips.append(("Group-wise Imputation", "chip-blue"))
        if miss_pct >= 10:
            chips.append(("Create Missing Indicator (≥10% MAR)", "chip-yellow"))
    if mech == "MNAR":
        chips.append(("⚠ Create Missing Indicator FIRST (mandatory)", "chip-red"))
        chips.append(("Constant / Domain-Specific Value", "chip-yellow"))
        chips.append(("Sensitivity Analysis Required", "chip-yellow"))
    return " ".join(f'<span class="strat-chip {cls}">{lbl}</span>' for lbl, cls in chips)
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
def validation_checks(df_before: pd.Series, df_after: pd.Series) -> dict:
    """Compare a column before and after imputation.

    Computes the relative change (in percent of the "before" value) of the
    mean, median and variance, and flags each as acceptable when it stays
    within 5%, 3% and 20% respectively. Percentages are rounded to two
    decimals; the flags are evaluated on the unrounded values.
    """
    tiny = 1e-9  # floor for the denominators

    mean_ref = df_before.mean()
    mean_delta = abs(mean_ref - df_after.mean()) / max(abs(mean_ref), tiny) * 100

    median_ref = df_before.median()
    median_delta = abs(median_ref - df_after.median()) / max(abs(median_ref), tiny) * 100

    var_ref = df_before.var()
    var_delta = abs(var_ref - df_after.var()) / max(var_ref, tiny) * 100

    report = {
        "mean_shift_pct": round(mean_delta, 2),
        "median_shift_pct": round(median_delta, 2),
        "var_change_pct": round(var_delta, 2),
    }
    report["mean_ok"] = mean_delta <= 5
    report["median_ok"] = median_delta <= 3
    report["var_ok"] = var_delta <= 20
    return report
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
# ── Outlier & Variance helpers (from app_tanisha.py) ──────────────────
|
| 444 |
+
|
| 445 |
+
def detect_outliers_iqr(series):
    """Count the non-null values of *series* lying outside the 1.5 x IQR fences.

    Returns 0 when fewer than four non-null values are available.
    """
    values = series.dropna()
    if len(values) < 4:
        return 0
    lower_q = values.quantile(0.25)
    upper_q = values.quantile(0.75)
    spread = upper_q - lower_q
    below = values < lower_q - 1.5 * spread
    above = values > upper_q + 1.5 * spread
    return int((below | above).sum())
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
def variance_impact(series):
    """Simulate mean imputation and report its effect on the variance.

    Args:
        series: pandas Series of numeric values (may contain NaN).

    Returns:
        Tuple ``(variance_before, variance_after, difference)``, each rounded
        to 4 decimals, where "before" is the variance of the non-missing
        values, "after" is the variance once gaps are filled with their mean,
        and the difference is before minus after. ``(0.0, 0.0, 0.0)`` when
        fewer than two non-missing values exist.
    """
    observed = series.dropna()
    if len(observed) < 2:
        return 0.0, 0.0, 0.0

    before = float(observed.var())
    mean_filled = series.fillna(observed.mean())
    after = float(mean_filled.var())
    delta = before - after
    return round(before, 4), round(after, 4), round(delta, 4)
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
def stat_card(label, value, color="#1a1a2e"):
    """Build the HTML snippet for one small statistic card.

    The card is a ``col-stat-card`` div holding the value (class ``cv``,
    coloured with ``color``) above the label (class ``ck``).

    ``label``, ``value`` and ``color`` are HTML-escaped before being
    interpolated: callers render the result with ``unsafe_allow_html=True``
    and some values originate from the uploaded data, so raw markup must not
    reach the page.

    Args:
        label: Caption shown under the value.
        value: Text shown as the main figure; converted with ``str``.
        color: CSS colour for the value text.

    Returns:
        The card markup as a single string.
    """
    from html import escape  # local import keeps this helper self-contained

    safe_label = escape(str(label))
    safe_value = escape(str(value))
    safe_color = escape(str(color), quote=True)
    return (f'<div class="col-stat-card">'
            f'<div class="cv" style="color:{safe_color};">{safe_value}</div>'
            f'<div class="ck">{safe_label}</div></div>')
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
# ── Plot helpers ──────────────────────────────────────────────────────
|
| 468 |
+
|
| 469 |
+
def plot_missing_heatmap(df):
    """Draw a heatmap of missing cells for the columns that contain nulls.

    Columns are ordered from highest to lowest missing share and drawn as
    heatmap rows. At most 300 rows of ``df`` are shown; larger frames are
    sampled with a fixed seed.

    Args:
        df: DataFrame to inspect.

    Returns:
        The matplotlib figure, or ``None`` when no column has missing values.
    """
    null_share = {
        name: df[name].isnull().mean()
        for name in df.columns
        if df[name].isnull().any()
    }
    if not null_share:
        return None

    # Most-missing columns first; ties keep their original column order.
    sorted_cols = sorted(null_share, key=null_share.get, reverse=True)

    sample_size = min(300, len(df))
    subset = df[sorted_cols]
    if len(df) > sample_size:
        subset = subset.sample(n=sample_size, random_state=42)
    mask_df = subset.isnull().astype(int)

    fig_width = max(10, len(sorted_cols) * 0.7)
    fig, ax = plt.subplots(figsize=(fig_width, 5))
    sns.heatmap(
        mask_df.T,
        cmap=["#f5f3ee", "#17172b"],
        cbar=True,
        yticklabels=sorted_cols,
        xticklabels=False,
        linewidths=0,
        ax=ax,
    )
    ax.set_title(f"Missing Value Heatmap β sample of {sample_size} rows", fontsize=13, fontweight="bold", pad=12)
    ax.set_xlabel("Rows (observations)", fontsize=10)
    ax.set_ylabel("Columns", fontsize=10)
    plt.tight_layout()
    return fig
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
def plot_missingness_correlation(df):
    """Plot how strongly the missingness of columns co-occurs.

    Each column with nulls is turned into a 0/1 missing indicator and the
    correlation matrix of those indicators is drawn as an annotated heatmap
    with the upper triangle (diagonal included) masked out.

    Args:
        df: DataFrame to inspect.

    Returns:
        The matplotlib figure, or ``None`` when fewer than two columns have
        missing values.
    """
    with_nulls = [name for name in df.columns if df[name].isnull().any()]
    n_cols = len(with_nulls)
    if n_cols < 2:
        return None

    indicators = df[with_nulls].isnull().astype(int)
    corr = indicators.corr()

    fig_size = (max(7, n_cols * 0.9), max(6, n_cols * 0.8))
    fig, ax = plt.subplots(figsize=fig_size)

    upper_triangle = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(
        corr,
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        center=0,
        mask=upper_triangle,
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": 0.8},
        ax=ax,
    )
    ax.set_title("Missingness Correlation Matrix", fontsize=13, fontweight="bold", pad=12)
    plt.tight_layout()
    return fig
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
def plot_numerical_column(df, col):
    """Compare a numerical column before and after simulated mean imputation.

    The left panel overlays the density of the observed values with the
    density of the mean-imputed column; the right panel shows a box plot of
    both versions side by side.

    Args:
        df: DataFrame holding the column.
        col: Name of the numerical column to plot.

    Returns:
        The matplotlib figure with the two panels.
    """
    s_original = df[col].dropna()
    s_imputed = df[col].fillna(s_original.mean())

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    fig.suptitle(f"Deep Distribution Analysis β {col}", fontsize=14, fontweight="bold")

    density_ax, box_ax = axes[0], axes[1]
    sns.kdeplot(s_original, ax=density_ax, color="#4f8ef7", linewidth=3,
                label="Original (Before)", fill=True, alpha=0.2)
    sns.kdeplot(s_imputed, ax=density_ax, color="#e07b54", linewidth=3,
                label="Mean Imputed (After)", linestyle="--")
    density_ax.set_title("Distribution Shift: Original vs. Imputed", fontsize=12)
    density_ax.legend()

    # Both series carry the same row labels, so a plain concat would give the
    # long-form frame duplicate index labels. Rebuilding the index keeps the
    # frame safe for any later label-based alignment inside the plotting call.
    stacked_values = pd.concat([s_original, s_imputed], ignore_index=True)
    box_data = pd.DataFrame({
        "Value": stacked_values,
        "Type": ["Original"] * len(s_original) + ["Imputed"] * len(s_imputed),
    })
    sns.boxplot(data=box_data, x="Type", y="Value", ax=box_ax, palette=["#dce3ff", "#fce4d6"])
    box_ax.set_title("Variance & Outlier Comparison", fontsize=12)

    plt.tight_layout()
    return fig
|
| 521 |
+
|
| 522 |
+
|
| 523 |
+
def plot_categorical_column(df, col, top_n=10):
    """Compare category frequencies before and after simulated mode imputation.

    The left panel is a horizontal bar chart of the ``top_n`` most frequent
    categories in the observed and in the mode-imputed column; the right
    panel is a pie chart of the eight most frequent categories after
    imputation.

    Args:
        df: DataFrame holding the column.
        col: Name of the categorical column to plot.
        top_n: Number of categories kept for the bar chart.

    Returns:
        The matplotlib figure with the two panels.
    """
    s_original = df[col].dropna()
    # With no observed value there is no mode; fall back to a placeholder.
    fill_value = "N/A" if s_original.empty else s_original.mode()[0]
    s_imputed = df[col].fillna(fill_value)

    fig, axes = plt.subplots(1, 2, figsize=(16, 7))
    fig.suptitle(f"Categorical Frequency Analysis β {col}", fontsize=14, fontweight="bold")
    bar_ax, pie_ax = axes[0], axes[1]

    orig_counts = s_original.value_counts().head(top_n)
    imp_counts = s_imputed.value_counts().head(top_n)
    # Categories present on only one side get a zero count on the other.
    compare_df = pd.DataFrame({"Original": orig_counts, "Imputed (Mode)": imp_counts}).fillna(0)
    compare_df.plot(kind="barh", ax=bar_ax, color=["#4f8ef7", "#e07b54"], width=0.8)
    bar_ax.set_title(f"Top {top_n} Categories: Original vs Mode Imputed", fontsize=12)
    bar_ax.invert_yaxis()

    top_pie = imp_counts.head(8)
    pie_ax.pie(
        top_pie,
        labels=top_pie.index.astype(str),
        autopct="%1.1f%%",
        startangle=140,
        colors=plt.cm.Pastel1.colors,
        wedgeprops={"edgecolor": "white"},
    )
    pie_ax.set_title("Final Proportion (After Imputation)", fontsize=12)

    plt.tight_layout()
    return fig
|
| 540 |
+
|
| 541 |
+
|
| 542 |
+
def plot_missing_vs_features(df, col):
    """Plot feature means for rows where ``col`` is present vs. missing.

    Every other numerical column that is less than 95% missing is averaged
    separately over the rows where ``col`` has a value and the rows where it
    is null. Up to twelve features with both means defined are drawn as
    grouped bars.

    Args:
        df: DataFrame to inspect.
        col: Column whose missingness splits the rows.

    Returns:
        The matplotlib figure, or ``None`` when there is nothing to compare.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns
    num_others = [
        name for name in numeric_names
        if name != col and df[name].isnull().mean() < 0.95
    ]
    if not num_others:
        return None

    has_value = df[col].notna()
    means_present = df[has_value][num_others].mean()
    means_missing = df[~has_value][num_others].mean()

    diff_df = pd.DataFrame({"Present": means_present, "Missing": means_missing})
    diff_df = diff_df.dropna().head(12)
    if diff_df.empty:
        return None

    n_features = len(diff_df)
    fig, ax = plt.subplots(figsize=(max(8, n_features * 0.9), 4))

    positions = np.arange(n_features)
    bar_width = 0.35
    half = bar_width / 2
    ax.bar(positions - half, diff_df["Present"], bar_width,
           label="Present rows", color="#4f8ef7", alpha=0.85)
    ax.bar(positions + half, diff_df["Missing"], bar_width,
           label="Missing rows", color="#e07b54", alpha=0.85)

    ax.set_xticks(positions)
    ax.set_xticklabels(diff_df.index, rotation=35, ha="right", fontsize=9)
    ax.set_title(f"Feature Means β Rows where '{col}' is Present vs Missing",
                 fontsize=11, fontweight="bold")
    ax.set_ylabel("Mean value")
    ax.legend(fontsize=9)
    plt.tight_layout()
    return fig
|
| 564 |
+
|
| 565 |
+
|
| 566 |
+
def _render_stat_row(cards):
    """Render one row of stat cards, one Streamlit column per card.

    Each card is a ``(label, value)`` or ``(label, value, color)`` tuple that
    is forwarded positionally to ``stat_card``.
    """
    for card, slot in zip(cards, st.columns(len(cards))):
        with slot:
            st.markdown(stat_card(*card), unsafe_allow_html=True)


def _render_numerical_details(df, col):
    """Render descriptive stats, normality note, plots and variance impact."""
    s = df[col].dropna()
    if len(s) <= 1:
        # Not enough observed values for any of the statistics below.
        return

    col_skew = float(skew(s))
    col_kurt = float(kurtosis(s))
    Q1, Q3 = float(s.quantile(0.25)), float(s.quantile(0.75))
    IQR = Q3 - Q1
    n_out = detect_outliers_iqr(df[col])
    vb, va, vi = variance_impact(df[col])
    out_pct = n_out / max(len(s), 1)

    _render_stat_row([
        ("Mean", f"{s.mean():.4g}"), ("Median", f"{s.median():.4g}"),
        ("Std Dev", f"{s.std():.4g}"), ("Variance", f"{s.var():.4g}"),
    ])
    st.markdown("")

    _render_stat_row([
        ("Min", f"{s.min():.4g}"), ("Max", f"{s.max():.4g}"),
        ("Skewness", f"{col_skew:.3f}"), ("Kurtosis", f"{col_kurt:.3f}"),
    ])
    st.markdown("")

    out_color = "#dc2626" if out_pct > 0.15 else "#d97706" if out_pct > 0.05 else "#16a34a"
    _render_stat_row([
        ("Q1", f"{Q1:.4g}", "#1a1a2e"), ("Q3", f"{Q3:.4g}", "#1a1a2e"),
        ("IQR", f"{IQR:.4g}", "#1a1a2e"), ("Outliers (IQR)", str(n_out), out_color),
    ])

    # The normality note is optional: any failure of the test (for example
    # too few points) simply leaves the caption out instead of breaking the
    # page. Columns longer than 5000 values are tested on a fixed-seed
    # subsample of 5000 rather than being skipped.
    try:
        shapiro_input = s if len(s) <= 5000 else s.sample(5000, random_state=0)
        _, p_norm = shapiro(shapiro_input)
        norm_txt = f"β Normal (p={p_norm:.4f})" if p_norm > 0.05 else f"β Not Normal (p={p_norm:.4f})"
        st.caption(f"π Shapiro-Wilk normality test: {norm_txt}")
    except Exception:
        pass

    st.markdown("")
    fig_dist = plot_numerical_column(df, col)
    st.pyplot(fig_dist)
    plt.close(fig_dist)

    st.markdown("**Variance Impact of Mean Imputation (simulated)**")
    change_ratio = abs(vi) / max(vb, 1e-9)
    delta_color = "#dc2626" if change_ratio > 0.3 else "#d97706" if change_ratio > 0.1 else "#16a34a"
    _render_stat_row([
        ("Variance (before)", f"{vb:.4g}"),
        ("Variance (after)", f"{va:.4g}"),
        ("Ξ Variance", f"{vi:.4g}", delta_color),
    ])

    pct_chg = change_ratio * 100
    if pct_chg >= 30:
        st.warning(f"β Variance drops by {pct_chg:.1f}% after mean imputation β over-smoothing risk. Use median or model-based imputation.")
    elif pct_chg >= 10:
        st.info(f"βΉ Variance drops by {pct_chg:.1f}% β acceptable, but monitor distribution shape.")
    else:
        st.success(f"β Variance change is small ({pct_chg:.1f}%) β mean imputation is statistically safe here.")


def _render_categorical_details(df, col):
    """Render mode statistics plus a frequency chart and table."""
    s = df[col].dropna()
    n_unique = s.nunique()
    if len(s) > 0:
        top_value = s.mode().iloc[0]  # computed once, reused for the count
        mode_val = str(top_value)
        mode_cnt = int((s == top_value).sum())
    else:
        mode_val, mode_cnt = "N/A", 0
    mode_pct = round(mode_cnt / max(len(s), 1) * 100, 1)

    _render_stat_row([
        ("Unique Values", str(n_unique)), ("Mode", mode_val[:12]),
        ("Mode Count", f"{mode_cnt:,}"), ("Mode Freq %", f"{mode_pct}%"),
    ])

    st.markdown("")
    freq_table = s.value_counts().reset_index()
    freq_table.columns = ["Value", "Count"]
    freq_table["% of Present"] = (freq_table["Count"] / len(s) * 100).round(2)
    tab_chart, tab_table = st.tabs(["π Frequency Chart", "π Frequency Table"])
    with tab_chart:
        fig_cat = plot_categorical_column(df, col)
        st.pyplot(fig_cat)
        plt.close(fig_cat)
    with tab_table:
        st.dataframe(freq_table, use_container_width=True, hide_index=True)


def render_per_column_deep_analysis(df, col, num_cols, cat_cols, mechanism_results):
    """Render the full Streamlit "deep analysis" panel for one column.

    The panel shows, in order: a header with five summary cards, type-specific
    statistics and plots, a comparison of other features between rows where
    the column is present and missing (only when it has missing values), the
    diagnosed missingness mechanism with its reason, recommended strategy
    chips, and a short guidance note.

    Args:
        df: DataFrame containing ``col``.
        col: Name of the column to analyse.
        num_cols: Collection of numerical column names; ``col`` is treated as
            numerical when it is in here, otherwise as categorical.
        cat_cols: Not read by this function; kept so existing callers keep
            working.
        mechanism_results: Mapping of column name to a dict with optional
            ``"mechanism"`` and ``"reason"`` entries.

    Returns:
        None. All output is written to the Streamlit page.
    """
    miss_count = int(df[col].isnull().sum())
    miss_pct = round(df[col].isnull().mean() * 100, 2)
    total_rows = len(df)
    present = total_rows - miss_count
    col_type = "Numerical" if col in num_cols else "Categorical"
    mech_info = mechanism_results.get(col, {})
    mech = mech_info.get("mechanism", "N/A")
    mech_reason = mech_info.get("reason", "Run the global diagnosis section above first.")
    sev = severity(miss_pct) if miss_pct > 0 else "None"

    miss_color = "#dc2626" if miss_pct >= 20 else "#d97706" if miss_pct >= 5 else "#16a34a"
    sev_color = "#dc2626" if sev == "High" else "#d97706" if sev == "Moderate" else "#16a34a"
    mech_color = {"MCAR": "#155724", "MAR": "#856404", "MNAR": "#721c24"}.get(mech, "#444")

    st.markdown(f"#### π Deep Analysis β `{col}` Β· {col_type}", unsafe_allow_html=True)
    _render_stat_row([
        ("Total Rows", f"{total_rows:,}"),
        ("Present", f"{present:,}"),
        ("Missing", f"{miss_pct}%", miss_color),
        ("Severity", sev, sev_color),
        ("Mechanism", mech, mech_color),
    ])
    st.markdown("")

    if col_type == "Numerical":
        _render_numerical_details(df, col)
    else:
        _render_categorical_details(df, col)

    st.markdown("")
    if miss_count > 0:
        st.markdown("**How Missingness Relates to Other Features**")
        fig_pat = plot_missing_vs_features(df, col)
        if fig_pat:
            st.pyplot(fig_pat)
            plt.close(fig_pat)
            st.caption("Large differences between blue (present) and orange (missing) bars signal MAR behavior.")
        else:
            st.info("No other numerical features available for pattern comparison.")

    st.markdown("")
    verdict_cls = {"MCAR": "card-mcar", "MAR": "card-mar", "MNAR": "card-mnar"}.get(mech, "card-info")
    mech_icon = {"MCAR": "π’", "MAR": "π‘", "MNAR": "π΄"}.get(mech, "β")
    mech_label = {"MCAR": "Missing Completely At Random (MCAR)",
                  "MAR": "Missing At Random (MAR)",
                  "MNAR": "Missing Not At Random (MNAR)",
                  "N/A": "No Missing Values"}.get(mech, mech)

    st.markdown(
        f'<div class="{verdict_cls}"><strong>{mech_icon} {mech_label}</strong><br>'
        f'<span style="font-size:0.9rem;color:#444;">{mech_reason}</span></div>',
        unsafe_allow_html=True)

    chips_html = strategy_chips_html(mech, miss_pct, col_type)
    if chips_html:
        st.markdown("")
        st.markdown("**Recommended Strategies**")
        st.markdown(chips_html, unsafe_allow_html=True)

    pointer = {
        "MCAR": ("π **MCAR**: Missing% <5% β listwise deletion is safe. 5β15% β median/mode imputation. "
                 "15β30% β advanced imputation with missing indicator."),
        "MAR": ("π **MAR**: KNN / MICE preferred. Create a missing indicator if missing% β₯10%."),
        "MNAR": ("π **MNAR**: **Create the missing indicator FIRST**, then use constant or sensitivity analysis. "
                 "Domain knowledge is essential."),
        "N/A": "π No action needed β this column is complete. Proceed to feature engineering.",
    }.get(mech, "")
    if pointer:
        st.markdown("")
        st.info(pointer)
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
# ════════════════════════════════════════════════════════════════════
|
| 715 |
+
# SIDEBAR — NAVIGATION
|
| 716 |
+
# ════════════════════════════════════════════════════════════════════
|
| 717 |
+
|
| 718 |
+
# Sidebar navigation labels, in pipeline order. Each label is its 1-based
# position, the separator, then the step title.
STEPS = [
    f"{position} Β· {title}"
    for position, title in enumerate(
        (
            "Upload CSV",
            "Select Target Column",
            "Overview & Patterns",
            "Mechanism Dashboard",
            "Column Diagnostics",
            "Strategy & Imputation",
            "Validation Checks",
        ),
        start=1,
    )
]
|
| 727 |
+
|
| 728 |
+
# Sidebar: app title, the step selector (its choice is stored in ``step``)
# and a short usage hint.
_sidebar_hint = (
    "<small style='color:#9090c0'>Follow the steps in order for a complete analysis pipeline. "
    "Steps 3β4 are exploratory; Steps 5β7 form the diagnostic pipeline.</small>"
)

with st.sidebar:
    st.markdown("## π¬ Missing Value Intelligence Suite")
    st.markdown("---")
    st.markdown("**Navigation**")
    # The visible label is collapsed; the heading above serves as the label.
    step = st.radio("Go to step:", STEPS, label_visibility="collapsed")
    st.markdown("---")
    st.markdown(_sidebar_hint, unsafe_allow_html=True)
|
| 739 |
+
|
| 740 |
+
|
| 741 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 742 |
+
# SESSION STATE
|
| 743 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 744 |
+
|
| 745 |
+
# Seed the session state. Plain slots start as None when absent; the two
# result caches are (re)set to an empty dict whenever they are absent or None.
for key in ("df", "target_col", "df_imputed"):
    if key not in st.session_state:
        st.session_state[key] = None

for key in ("col_results", "mechanism_results_lr"):
    if key not in st.session_state or st.session_state[key] is None:
        st.session_state[key] = {}
|
| 752 |
+
|
| 753 |
+
|
| 754 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 755 |
+
# STEP 1 β UPLOAD CSV
|
| 756 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 757 |
+
|
| 758 |
+
if step == STEPS[0]:
|
| 759 |
+
st.markdown('<div class="main-title">π Step 1 β Upload Your CSV</div>', unsafe_allow_html=True)
|
| 760 |
+
st.markdown('<div class="main-sub">Upload a CSV file to begin the missing-value analysis pipeline.</div>', unsafe_allow_html=True)
|
| 761 |
+
|
| 762 |
+
uploaded = st.file_uploader("Choose a CSV file", type=["csv"])
|
| 763 |
+
|
| 764 |
+
if uploaded:
|
| 765 |
+
try:
|
| 766 |
+
df = pd.read_csv(uploaded)
|
| 767 |
+
# Auto-remove ID-like columns
|
| 768 |
+
id_cols = [c for c in df.columns if c.strip().lower() in ("id", "index", "row", "rowid", "row_id")]
|
| 769 |
+
if id_cols:
|
| 770 |
+
df.drop(columns=id_cols, inplace=True)
|
| 771 |
+
st.toast(f"Auto-removed non-informative column(s): {id_cols}", icon="ποΈ")
|
| 772 |
+
|
| 773 |
+
st.session_state["df"] = df
|
| 774 |
+
st.session_state["col_results"] = {}
|
| 775 |
+
st.session_state["mechanism_results_lr"] = {}
|
| 776 |
+
st.session_state["df_imputed"] = df.copy()
|
| 777 |
+
|
| 778 |
+
st.success(f"β
File loaded: **{uploaded.name}** β {df.shape[0]} rows Γ {df.shape[1]} columns")
|
| 779 |
+
st.markdown("### Preview (first 10 rows)")
|
| 780 |
+
st.dataframe(df.head(10), use_container_width=True)
|
| 781 |
+
|
| 782 |
+
c1, c2, c3, c4 = st.columns(4)
|
| 783 |
+
with c1:
|
| 784 |
+
st.markdown(f'<div class="metric-box"><div class="metric-val">{df.shape[0]:,}</div><div class="metric-lbl">Rows</div></div>', unsafe_allow_html=True)
|
| 785 |
+
with c2:
|
| 786 |
+
st.markdown(f'<div class="metric-box"><div class="metric-val">{df.shape[1]}</div><div class="metric-lbl">Columns</div></div>', unsafe_allow_html=True)
|
| 787 |
+
with c3:
|
| 788 |
+
n_miss_cols = df.isnull().any().sum()
|
| 789 |
+
st.markdown(f'<div class="metric-box"><div class="metric-val">{n_miss_cols}</div><div class="metric-lbl">Columns w/ Missings</div></div>', unsafe_allow_html=True)
|
| 790 |
+
with c4:
|
| 791 |
+
total_miss = df.isnull().sum().sum()
|
| 792 |
+
pct_miss = round(total_miss / df.size * 100, 1)
|
| 793 |
+
st.markdown(f'<div class="metric-box"><div class="metric-val">{pct_miss}%</div><div class="metric-lbl">Overall Missing Rate</div></div>', unsafe_allow_html=True)
|
| 794 |
+
|
| 795 |
+
st.markdown("### Column Types & Missingness")
|
| 796 |
+
type_df = pd.DataFrame({
|
| 797 |
+
"Column": df.columns,
|
| 798 |
+
"Dtype": df.dtypes.astype(str).values,
|
| 799 |
+
"Missing": df.isnull().sum().values,
|
| 800 |
+
"Missing %": (df.isnull().mean() * 100).round(2).values,
|
| 801 |
+
})
|
| 802 |
+
st.dataframe(type_df, use_container_width=True, hide_index=True)
|
| 803 |
+
|
| 804 |
+
except Exception as e:
|
| 805 |
+
st.error(f"Could not read file: {e}")
|
| 806 |
+
else:
|
| 807 |
+
st.info("π Upload a CSV to get started.")
|
| 808 |
+
|
| 809 |
+
|
| 810 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 811 |
+
# STEP 2 β SELECT TARGET COLUMN
|
| 812 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 813 |
+
|
| 814 |
+
elif step == STEPS[1]:
|
| 815 |
+
st.markdown('<div class="main-title">π― Step 2 β Select Target Column</div>', unsafe_allow_html=True)
|
| 816 |
+
st.markdown('<div class="main-sub">The target column (y) is used in Test 3 to detect MNAR patterns and is excluded from feature analysis.</div>', unsafe_allow_html=True)
|
| 817 |
+
|
| 818 |
+
df = st.session_state.get("df")
|
| 819 |
+
if df is None:
|
| 820 |
+
st.warning("β οΈ Please upload a CSV in Step 1 first.")
|
| 821 |
+
else:
|
| 822 |
+
target = st.selectbox(
|
| 823 |
+
"Select the output / target column:",
|
| 824 |
+
options=df.columns.tolist(),
|
| 825 |
+
index=len(df.columns) - 1,
|
| 826 |
+
)
|
| 827 |
+
if st.button("β
Confirm Target Column", type="primary"):
|
| 828 |
+
st.session_state["target_col"] = target
|
| 829 |
+
st.success(f"Target column set to: **{target}**")
|
| 830 |
+
|
| 831 |
+
if st.session_state.get("target_col"):
|
| 832 |
+
st.info(f"Current target: **{st.session_state['target_col']}**")
|
| 833 |
+
tc = st.session_state["target_col"]
|
| 834 |
+
col_data = df[tc]
|
| 835 |
+
st.markdown("#### Target Column Distribution")
|
| 836 |
+
fig, ax = plt.subplots(figsize=(7, 3))
|
| 837 |
+
if pd.api.types.is_numeric_dtype(col_data):
|
| 838 |
+
col_data.dropna().hist(bins=30, ax=ax, color="#17172b", edgecolor="white")
|
| 839 |
+
ax.set_xlabel(tc); ax.set_ylabel("Count")
|
| 840 |
+
else:
|
| 841 |
+
vc = col_data.value_counts().head(15)
|
| 842 |
+
vc.plot(kind="bar", ax=ax, color="#17172b")
|
| 843 |
+
ax.set_ylabel("Count")
|
| 844 |
+
ax.set_title(f"Distribution of '{tc}'")
|
| 845 |
+
plt.tight_layout()
|
| 846 |
+
st.pyplot(fig)
|
| 847 |
+
plt.close()
|
| 848 |
+
|
| 849 |
+
|
| 850 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 851 |
+
# STEP 3 β OVERVIEW & PATTERNS
|
| 852 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 853 |
+
|
| 854 |
+
elif step == STEPS[2]:
|
| 855 |
+
st.markdown('<div class="main-title">π Step 3 β Overview & Patterns</div>', unsafe_allow_html=True)
|
| 856 |
+
st.markdown('<div class="main-sub">Bird\'s-eye view of missingness across the dataset, including heatmaps and co-missingness patterns.</div>', unsafe_allow_html=True)
|
| 857 |
+
|
| 858 |
+
df = st.session_state.get("df")
|
| 859 |
+
target_col = st.session_state.get("target_col")
|
| 860 |
+
|
| 861 |
+
if df is None:
|
| 862 |
+
st.warning("β οΈ Please upload a CSV in Step 1 first.")
|
| 863 |
+
else:
|
| 864 |
+
X = df.drop(columns=[target_col]) if target_col and target_col in df.columns else df
|
| 865 |
+
summary = missing_summary_df(X)
|
| 866 |
+
|
| 867 |
+
if summary.empty:
|
| 868 |
+
st.success("π No missing values found in the dataset features!")
|
| 869 |
+
else:
|
| 870 |
+
st.markdown(f"### {len(summary)} column(s) have missing values")
|
| 871 |
+
st.dataframe(summary.style.background_gradient(subset=["Missing %"], cmap="YlOrRd"),
|
| 872 |
+
use_container_width=True)
|
| 873 |
+
|
| 874 |
+
# ββ Missing % bar chart
|
| 875 |
+
st.markdown('<div class="section-header">π Missing % per Column</div>', unsafe_allow_html=True)
|
| 876 |
+
miss_cols = summary.index.tolist()
|
| 877 |
+
fig_bar, ax_bar = plt.subplots(figsize=(max(7, len(miss_cols) * 0.9), 4))
|
| 878 |
+
colors = ["#9e2210" if v > 30 else "#7a4d00" if v > 15 else "#0d6b3a" for v in summary["Missing %"]]
|
| 879 |
+
ax_bar.barh(summary.index[::-1], summary["Missing %"][::-1], color=colors[::-1], edgecolor="white")
|
| 880 |
+
ax_bar.axvline(5, color="#89d9ac", linewidth=1.5, linestyle="--", label="5% threshold")
|
| 881 |
+
ax_bar.axvline(15, color="#f0cc7a", linewidth=1.5, linestyle="--", label="15% threshold")
|
| 882 |
+
ax_bar.axvline(30, color="#f5a898", linewidth=1.5, linestyle="--", label="30% threshold")
|
| 883 |
+
ax_bar.set_xlabel("Missing %"); ax_bar.set_title("Missing % per Column")
|
| 884 |
+
ax_bar.legend(loc="lower right", fontsize=8)
|
| 885 |
+
plt.tight_layout()
|
| 886 |
+
st.pyplot(fig_bar)
|
| 887 |
+
plt.close()
|
| 888 |
+
|
| 889 |
+
# ββ Heatmap + Correlation tabs
|
| 890 |
+
st.markdown('<div class="section-header">πΊ Missingness Patterns</div>', unsafe_allow_html=True)
|
| 891 |
+
tab_hm, tab_corr = st.tabs(["Missing Heatmap", "Missingness Correlation"])
|
| 892 |
+
with tab_hm:
|
| 893 |
+
fig_hm = plot_missing_heatmap(X)
|
| 894 |
+
if fig_hm:
|
| 895 |
+
st.pyplot(fig_hm); plt.close(fig_hm)
|
| 896 |
+
st.caption("Dark = missing, light = present. Each column is a row.")
|
| 897 |
+
else:
|
| 898 |
+
st.info("No missing values to display.")
|
| 899 |
+
with tab_corr:
|
| 900 |
+
fig_corr = plot_missingness_correlation(X)
|
| 901 |
+
if fig_corr:
|
| 902 |
+
st.pyplot(fig_corr); plt.close(fig_corr)
|
| 903 |
+
st.caption("Near +1: columns tend to be missing together. Near β1: rarely missing simultaneously.")
|
| 904 |
+
else:
|
| 905 |
+
st.info("Need at least 2 columns with missing values for this chart.")
|
| 906 |
+
|
| 907 |
+
# ββ Correlation among numerical features
|
| 908 |
+
num_cols_x, _ = identify_columns(X)
|
| 909 |
+
if len(num_cols_x) >= 2:
|
| 910 |
+
st.markdown('<div class="section-header">π Feature Correlations (Numerical)</div>', unsafe_allow_html=True)
|
| 911 |
+
valid = [c for c in num_cols_x if X[c].isnull().mean() < 1.0]
|
| 912 |
+
if len(valid) >= 2:
|
| 913 |
+
corr = X[valid].corr()
|
| 914 |
+
strong = (corr.abs() > 0.5) & (corr != 1.0)
|
| 915 |
+
if strong.any().any():
|
| 916 |
+
fig_fc, ax_fc = plt.subplots(figsize=(max(8, len(valid) * 0.9), max(7, len(valid) * 0.8)))
|
| 917 |
+
mask = np.triu(np.ones_like(corr, dtype=bool))
|
| 918 |
+
display_corr = corr.where(corr.abs() > 0.5)
|
| 919 |
+
sns.heatmap(display_corr, annot=False, cmap="RdYlGn", center=0,
|
| 920 |
+
mask=mask, square=True, linewidths=0.5,
|
| 921 |
+
cbar_kws={"shrink": 0.8}, ax=ax_fc, vmin=-1, vmax=1)
|
| 922 |
+
ax_fc.set_title("Strong Correlations (|r| > 0.5) β Numerical Features",
|
| 923 |
+
fontsize=13, fontweight="bold", pad=12)
|
| 924 |
+
plt.tight_layout()
|
| 925 |
+
st.pyplot(fig_fc); plt.close(fig_fc)
|
| 926 |
+
|
| 927 |
+
# Correlation pairs table
|
| 928 |
+
pairs = []
|
| 929 |
+
seen = set()
|
| 930 |
+
for i, c1 in enumerate(corr.columns):
|
| 931 |
+
for j, c2 in enumerate(corr.columns):
|
| 932 |
+
if i >= j: continue
|
| 933 |
+
v = corr.loc[c1, c2]
|
| 934 |
+
if abs(v) > 0.5:
|
| 935 |
+
key = tuple(sorted([c1, c2]))
|
| 936 |
+
if key not in seen:
|
| 937 |
+
seen.add(key)
|
| 938 |
+
pairs.append({"Column A": c1, "Column B": c2,
|
| 939 |
+
"Correlation": round(v, 4),
|
| 940 |
+
"Correlation %": f"{round(v * 100, 2)}%"})
|
| 941 |
+
if pairs:
|
| 942 |
+
corr_table = pd.DataFrame(pairs).sort_values("Correlation", key=abs, ascending=False)
|
| 943 |
+
st.markdown("**Strong Correlation Pairs (|r| > 0.5)**")
|
| 944 |
+
st.dataframe(corr_table, use_container_width=True, hide_index=True)
|
| 945 |
+
else:
|
| 946 |
+
st.info("No strong correlations (|r| > 0.5) found among numerical features.")
|
| 947 |
+
|
| 948 |
+
|
| 949 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 950 |
+
# STEP 4 β MECHANISM DASHBOARD (from app_tanisha.py)
|
| 951 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 952 |
+
|
| 953 |
+
elif step == STEPS[3]:
|
| 954 |
+
st.markdown('<div class="main-title">π§ͺ Step 4 β Mechanism Dashboard</div>', unsafe_allow_html=True)
|
| 955 |
+
st.markdown('<div class="main-sub">Automated MCAR/MAR/MNAR detection via Chi-square & Logistic Regression, plus outlier/variance analysis and deep per-column exploration.</div>', unsafe_allow_html=True)
|
| 956 |
+
|
| 957 |
+
df = st.session_state.get("df")
|
| 958 |
+
target_col = st.session_state.get("target_col")
|
| 959 |
+
|
| 960 |
+
if df is None:
|
| 961 |
+
st.warning("β οΈ Please upload a CSV in Step 1 first.")
|
| 962 |
+
elif target_col is None:
|
| 963 |
+
st.warning("β οΈ Please select a target column in Step 2 first.")
|
| 964 |
+
else:
|
| 965 |
+
X = df.drop(columns=[target_col])
|
| 966 |
+
y = df[target_col]
|
| 967 |
+
num_cols, cat_cols = identify_columns(X)
|
| 968 |
+
|
| 969 |
+
# ββ Train-test split
|
| 970 |
+
st.markdown('<div class="section-header">βοΈ Train-Test Split (80 / 20)</div>', unsafe_allow_html=True)
|
| 971 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
| 972 |
+
sc1, sc2 = st.columns(2)
|
| 973 |
+
with sc1: st.markdown(f"**Training Set** β X_train: `{X_train.shape}` Β· y_train: `{y_train.shape}`")
|
| 974 |
+
with sc2: st.markdown(f"**Test Set** β X_test: `{X_test.shape}` Β· y_test: `{y_test.shape}`")
|
| 975 |
+
|
| 976 |
+
# ββ Mechanism diagnosis
|
| 977 |
+
st.markdown('<div class="section-header">π¬ Missing Data Mechanism Diagnosis (Chi-square + Logistic Regression)</div>', unsafe_allow_html=True)
|
| 978 |
+
missing_feature_cols = [c for c in X.columns if X[c].isnull().any()]
|
| 979 |
+
|
| 980 |
+
if not missing_feature_cols:
|
| 981 |
+
st.success("No missing values in feature columns β nothing to diagnose.")
|
| 982 |
+
mechanism_results = {}
|
| 983 |
+
else:
|
| 984 |
+
cached = st.session_state.get("mechanism_results_lr", {})
|
| 985 |
+
if not cached:
|
| 986 |
+
with st.spinner("Running MCAR (Chi-square) and MAR (Logistic Regression) testsβ¦"):
|
| 987 |
+
mechanism_results = {}
|
| 988 |
+
for col in missing_feature_cols:
|
| 989 |
+
mech, reason = diagnose_mechanism_lr(X, col, num_cols)
|
| 990 |
+
mechanism_results[col] = {"mechanism": mech, "reason": reason}
|
| 991 |
+
st.session_state["mechanism_results_lr"] = mechanism_results
|
| 992 |
+
else:
|
| 993 |
+
mechanism_results = cached
|
| 994 |
+
|
| 995 |
+
badge_map = {"MCAR": "badge-mcar", "MAR": "badge-mar", "MNAR": "badge-mnar"}
|
| 996 |
+
for col, res in mechanism_results.items():
|
| 997 |
+
mech = res["mechanism"]
|
| 998 |
+
pct = round(X[col].isnull().mean() * 100, 2)
|
| 999 |
+
with st.expander(f"π **{col}** β {mech} | {pct}% missing"):
|
| 1000 |
+
st.markdown(f'<span class="{badge_map[mech]}">{mech}</span> {res["reason"]}',
|
| 1001 |
+
unsafe_allow_html=True)
|
| 1002 |
+
|
| 1003 |
+
# ββ Outlier Detection & Variance Impact
|
| 1004 |
+
st.markdown('<div class="section-header">β‘ Outlier Detection & Variance Impact</div>', unsafe_allow_html=True)
|
| 1005 |
+
outlier_data = {}
|
| 1006 |
+
for col in num_cols:
|
| 1007 |
+
n_out = detect_outliers_iqr(X[col])
|
| 1008 |
+
vb, va, vi = variance_impact(X[col])
|
| 1009 |
+
outlier_data[col] = {
|
| 1010 |
+
"Missing %": round(X[col].isnull().mean() * 100, 2),
|
| 1011 |
+
"Outliers (IQR)": n_out,
|
| 1012 |
+
"Variance (before impute)": vb,
|
| 1013 |
+
"Variance (after mean impute)": va,
|
| 1014 |
+
"Variance Impact (Ξ)": vi,
|
| 1015 |
+
}
|
| 1016 |
+
if outlier_data:
|
| 1017 |
+
out_df = (pd.DataFrame(outlier_data).T.reset_index()
|
| 1018 |
+
.rename(columns={"index": "Column"})
|
| 1019 |
+
.sort_values("Outliers (IQR)", ascending=False))
|
| 1020 |
+
|
| 1021 |
+
def color_outliers(val):
|
| 1022 |
+
if isinstance(val, (int, float)):
|
| 1023 |
+
if val > 50: return "background-color: #f8d7da; color: #721c24;"
|
| 1024 |
+
if val > 10: return "background-color: #fff3cd; color: #856404;"
|
| 1025 |
+
return ""
|
| 1026 |
+
st.dataframe(out_df.style.applymap(color_outliers, subset=["Outliers (IQR)"]),
|
| 1027 |
+
use_container_width=True, hide_index=True)
|
| 1028 |
+
else:
|
| 1029 |
+
st.info("No numerical columns available for outlier analysis.")
|
| 1030 |
+
|
| 1031 |
+
# ββ Final Diagnosis Table
|
| 1032 |
+
st.markdown('<div class="section-header">π Final Diagnosis Table</div>', unsafe_allow_html=True)
|
| 1033 |
+
diag_rows = []
|
| 1034 |
+
for col in X.columns:
|
| 1035 |
+
mp = round(X[col].isnull().mean() * 100, 2)
|
| 1036 |
+
mech = mechanism_results.get(col, {}).get("mechanism", "N/A") if col in missing_feature_cols else "N/A"
|
| 1037 |
+
diag_rows.append({
|
| 1038 |
+
"Column": col, "Missing %": mp,
|
| 1039 |
+
"Mechanism": mech, "Severity": severity(mp) if mp > 0 else "None",
|
| 1040 |
+
"Outliers": outlier_data.get(col, {}).get("Outliers (IQR)", "β"),
|
| 1041 |
+
"Variance Impact (Ξ)": outlier_data.get(col, {}).get("Variance Impact (Ξ)", "β"),
|
| 1042 |
+
})
|
| 1043 |
+
diag_df = pd.DataFrame(diag_rows).sort_values("Missing %", ascending=False).reset_index(drop=True)
|
| 1044 |
+
|
| 1045 |
+
# CSS applied to the "Severity" cell, keyed by severity label.
sev_colors = {
    "High": "background-color: #f8d7da; color: #721c24;",
    "Moderate": "background-color: #fff3cd; color: #856404;",
    "Low": "background-color: #d4edda; color: #155724;",
}
# CSS applied to the "Mechanism" cell, keyed by missingness mechanism.
mech_colors = {
    "MCAR": "background-color: #d4edda; color: #155724;",
    "MAR": "background-color: #fff3cd; color: #856404;",
    "MNAR": "background-color: #f8d7da; color: #721c24;",
}


def color_diag_row(row):
    """Return one CSS string per cell of a diagnosis-table row.

    Meant to be used as a row-wise Styler callback (``axis=1``).

    Args:
        row: A pandas Series for one table row, indexed by column name. It
            must contain the "Mechanism" and "Severity" labels.

    Returns:
        A list with the same length and order as ``row.index``. Only the
        "Mechanism" and "Severity" positions carry a style; every other
        position, and any label missing from the colour tables, gets ``""``.
    """
    styled = {
        "Mechanism": mech_colors.get(row["Mechanism"], ""),
        "Severity": sev_colors.get(row["Severity"], ""),
    }
    # Build the result from the row's own labels instead of a fixed
    # six-element list, so adding, removing or reordering table columns can
    # neither colour the wrong cell nor return a list of the wrong length.
    return [styled.get(label, "") for label in row.index]
|
| 1056 |
+
|
| 1057 |
+
st.dataframe(diag_df.style.apply(color_diag_row, axis=1),
|
| 1058 |
+
use_container_width=True, hide_index=True)
|
| 1059 |
+
|
| 1060 |
+
# ββ Per-Column Deep Analysis
|
| 1061 |
+
st.markdown('<div class="section-header">π¬ Per-Column Deep Analysis</div>', unsafe_allow_html=True)
|
| 1062 |
+
col_label_to_name = {}
|
| 1063 |
+
for col in X.columns:
|
| 1064 |
+
mp_l = round(X[col].isnull().mean() * 100, 1)
|
| 1065 |
+
type_lbl = "Num" if col in num_cols else "Cat"
|
| 1066 |
+
mech_lbl = mechanism_results.get(col, {}).get("mechanism", "β") if col in missing_feature_cols else "complete"
|
| 1067 |
+
label = f"{col} [{type_lbl} Β· {mp_l}% missing Β· {mech_lbl}]"
|
| 1068 |
+
col_label_to_name[label] = col
|
| 1069 |
+
|
| 1070 |
+
chosen_label = st.selectbox(
|
| 1071 |
+
"Select a column to analyse in detail:",
|
| 1072 |
+
options=["β choose a column β"] + list(col_label_to_name.keys()),
|
| 1073 |
+
key="deep_col_select"
|
| 1074 |
+
)
|
| 1075 |
+
if chosen_label != "β choose a column β":
|
| 1076 |
+
chosen_col = col_label_to_name[chosen_label]
|
| 1077 |
+
with st.spinner(f"Analysing `{chosen_col}`β¦"):
|
| 1078 |
+
st.markdown("---")
|
| 1079 |
+
render_per_column_deep_analysis(
|
| 1080 |
+
df=X, col=chosen_col,
|
| 1081 |
+
num_cols=num_cols, cat_cols=cat_cols,
|
| 1082 |
+
mechanism_results=mechanism_results,
|
| 1083 |
+
)
|
| 1084 |
+
st.markdown("---")
|
| 1085 |
+
|
| 1086 |
+
# ββ Insights
|
| 1087 |
+
st.markdown('<div class="section-header">π‘ Data Analysis Insights</div>', unsafe_allow_html=True)
|
| 1088 |
+
high_miss = diag_df[diag_df["Missing %"] >= 20]["Column"].tolist()
|
| 1089 |
+
mar_cols = diag_df[diag_df["Mechanism"] == "MAR"]["Column"].tolist()
|
| 1090 |
+
mnar_cols = diag_df[diag_df["Mechanism"] == "MNAR"]["Column"].tolist()
|
| 1091 |
+
high_out = [c for c in num_cols if outlier_data.get(c, {}).get("Outliers (IQR)", 0) > 10]
|
| 1092 |
+
|
| 1093 |
+
insights = [
|
| 1094 |
+
"Missing data must be understood <b>before</b> any imputation or modeling to avoid biased results.",
|
| 1095 |
+
(f"<b>{', '.join(high_miss)}</b> have β₯20% missing values β treat with caution or consider dropping."
|
| 1096 |
+
if high_miss else "No columns have critically high (β₯20%) missing rates β dataset quality looks reasonable."),
|
| 1097 |
+
(f"Columns <b>{', '.join(mar_cols)}</b> show MAR behavior β KNN/MICE imputation is viable."
|
| 1098 |
+
if mar_cols else "No columns confirmed MAR."),
|
| 1099 |
+
(f"Columns <b>{', '.join(mnar_cols)}</b> are likely MNAR β create a missing indicator before imputing."
|
| 1100 |
+
if mnar_cols else "No columns flagged as MNAR."),
|
| 1101 |
+
(f"Columns <b>{', '.join(high_out)}</b> have many outliers β prefer median over mean imputation."
|
| 1102 |
+
if high_out else "Outlier counts appear manageable across numerical columns."),
|
| 1103 |
+
"Correlated missingness indicates data is likely <b>not MCAR</b> β jointly missing due to a common cause.",
|
| 1104 |
+
"MCAR is rare in real-world datasets. Most missingness in practice is MAR or MNAR.",
|
| 1105 |
+
"MNAR <b>cannot be confirmed statistically</b> from observed data alone β domain knowledge is essential.",
|
| 1106 |
+
]
|
| 1107 |
+
st.markdown('<div class="insight-box"><ul>' + "".join(f"<li>{i}</li>" for i in insights) + "</ul></div>",
|
| 1108 |
+
unsafe_allow_html=True)
|
| 1109 |
+
|
| 1110 |
+
# ββ Theory
|
| 1111 |
+
st.markdown('<div class="section-header">π Theoretical Background</div>', unsafe_allow_html=True)
|
| 1112 |
+
theories = [
|
| 1113 |
+
("π΅ MCAR β Missing Completely At Random",
|
| 1114 |
+
"The probability of missingness is entirely independent of observed and unobserved data. "
|
| 1115 |
+
"Listwise deletion is unbiased under MCAR, though it reduces sample size."),
|
| 1116 |
+
("π‘ MAR β Missing At Random",
|
| 1117 |
+
"Missingness depends on <i>observed</i> data but not on the missing value itself. "
|
| 1118 |
+
"Multiple imputation or FIML methods produce valid estimates under MAR."),
|
| 1119 |
+
("π΄ MNAR β Missing Not At Random",
|
| 1120 |
+
"Missingness depends on the <i>unobserved value itself</i>. Cannot be detected from observed data. "
|
| 1121 |
+
"Requires sensitivity analysis and domain knowledge. Ignoring MNAR produces biased results."),
|
| 1122 |
+
("π Why Chi-Square for MCAR Testing?",
|
| 1123 |
+
"Chi-square tests independence between the binary missingness indicator and binned numeric predictors. "
|
| 1124 |
+
"No significant association is consistent with MCAR, though this only confirms pairwise independence."),
|
| 1125 |
+
("π€ Why Logistic Regression for MAR Detection?",
|
| 1126 |
+
"LR models the binary missingness indicator as a function of all observed features. "
|
| 1127 |
+
"Accuracy substantially above the majority-class baseline indicates MAR."),
|
| 1128 |
+
("π Why MNAR Cannot Be Confirmed Statistically",
|
| 1129 |
+
"MNAR depends on unobserved values β data we do not have. No statistical test on observed data "
|
| 1130 |
+
"can definitively confirm it. Domain reasoning about the data generation process is required."),
|
| 1131 |
+
("π¦ Outliers and Their Impact on Variance",
|
| 1132 |
+
"Outliers (>1.5ΓIQR) inflate variance and distort the mean. Mean imputation artificially collapses "
|
| 1133 |
+
"variance because all missing cells receive the same central value, masking true data spread."),
|
| 1134 |
+
]
|
| 1135 |
+
for title, body in theories:
|
| 1136 |
+
st.markdown(f'<div class="theory-box"><h4>{title}</h4><p>{body}</p></div>', unsafe_allow_html=True)
|
| 1137 |
+
|
| 1138 |
+
|
| 1139 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1140 |
+
# STEP 5 — COLUMN DIAGNOSTICS (from app.py — 3 statistical tests)
|
| 1141 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1142 |
+
|
| 1143 |
+
elif step == STEPS[4]:
|
| 1144 |
+
st.markdown('<div class="main-title">π¬ Step 5 β Column Diagnostics</div>', unsafe_allow_html=True)
|
| 1145 |
+
st.markdown('<div class="main-sub">Run three independent statistical tests per column to determine the missing-data mechanism (MCAR / MAR / MNAR).</div>', unsafe_allow_html=True)
|
| 1146 |
+
|
| 1147 |
+
df = st.session_state.get("df")
|
| 1148 |
+
target_col = st.session_state.get("target_col")
|
| 1149 |
+
|
| 1150 |
+
if df is None:
|
| 1151 |
+
st.warning("β οΈ Please upload a CSV in Step 1 first.")
|
| 1152 |
+
elif target_col is None:
|
| 1153 |
+
st.warning("β οΈ Please select a target column in Step 2 first.")
|
| 1154 |
+
else:
|
| 1155 |
+
summary = missing_summary_df(df)
|
| 1156 |
+
if summary.empty:
|
| 1157 |
+
st.success("π No missing values β nothing to diagnose.")
|
| 1158 |
+
else:
|
| 1159 |
+
miss_cols = summary.index.tolist()
|
| 1160 |
+
selected_col = st.selectbox("Select a column to analyse:", miss_cols)
|
| 1161 |
+
miss_pct = summary.loc[selected_col, "Missing %"]
|
| 1162 |
+
dtype_str = str(df[selected_col].dtype)
|
| 1163 |
+
|
| 1164 |
+
st.markdown(f"---")
|
| 1165 |
+
st.markdown(f"### Analysing column: `{selected_col}`")
|
| 1166 |
+
|
| 1167 |
+
lv, risk_txt, risk_bg, risk_fg = missingness_risk_level(miss_pct)
|
| 1168 |
+
c1, c2, c3 = st.columns(3)
|
| 1169 |
+
with c1:
|
| 1170 |
+
st.markdown(f'<div class="metric-box"><div class="metric-val" style="color:#9e2210">{miss_pct:.1f}%</div><div class="metric-lbl">Missing</div></div>', unsafe_allow_html=True)
|
| 1171 |
+
with c2:
|
| 1172 |
+
st.markdown(f'<div class="metric-box"><div class="metric-val">{dtype_str}</div><div class="metric-lbl">Data Type</div></div>', unsafe_allow_html=True)
|
| 1173 |
+
with c3:
|
| 1174 |
+
n_miss = int(summary.loc[selected_col, "Missing Count"])
|
| 1175 |
+
st.markdown(f'<div class="metric-box"><div class="metric-val">{n_miss:,}</div><div class="metric-lbl">Missing Rows</div></div>', unsafe_allow_html=True)
|
| 1176 |
+
|
| 1177 |
+
st.markdown(
|
| 1178 |
+
f'<div style="background:{risk_bg};border:1.5px solid {risk_fg};border-radius:8px;padding:10px 16px;margin:12px 0;">'
|
| 1179 |
+
f'<b style="color:{risk_fg}">{lv} Missingness</b> β {risk_txt}</div>',
|
| 1180 |
+
unsafe_allow_html=True,
|
| 1181 |
+
)
|
| 1182 |
+
|
| 1183 |
+
# ββ Test 1
|
| 1184 |
+
st.markdown("#### π¬ Test 1 β Pattern Analysis (Missingness Map)")
|
| 1185 |
+
t1 = test1_pattern_analysis(df, selected_col)
|
| 1186 |
+
fig, axes = plt.subplots(1, 2, figsize=(12, 3))
|
| 1187 |
+
sample_size = min(300, len(df))
|
| 1188 |
+
idx_sample = df.sample(n=sample_size, random_state=42).index if len(df) > sample_size else df.index
|
| 1189 |
+
ind_sample = t1["indicator"].loc[idx_sample]
|
| 1190 |
+
axes[0].scatter(range(len(ind_sample)), ind_sample.values,
|
| 1191 |
+
c=["#9e2210" if v else "#89d9ac" for v in ind_sample.values], s=8, alpha=0.8)
|
| 1192 |
+
axes[0].set_yticks([0, 1]); axes[0].set_yticklabels(["Present", "Missing"])
|
| 1193 |
+
axes[0].set_title(f"Missingness Pattern ({sample_size} rows)")
|
| 1194 |
+
axes[0].set_xlabel("Row index")
|
| 1195 |
+
roll = t1["indicator"].rolling(50, min_periods=1).mean()
|
| 1196 |
+
axes[1].plot(roll.values, color="#17172b", linewidth=1.2)
|
| 1197 |
+
axes[1].set_title("Rolling Miss Rate (window=50)")
|
| 1198 |
+
axes[1].set_xlabel("Row index"); axes[1].set_ylabel("Miss rate")
|
| 1199 |
+
axes[1].axhline(t1["miss_pct"] / 100, color="#9e2210", linestyle="--", label="Mean miss rate")
|
| 1200 |
+
axes[1].legend(fontsize=8)
|
| 1201 |
+
plt.tight_layout()
|
| 1202 |
+
st.pyplot(fig); plt.close()
|
| 1203 |
+
|
| 1204 |
+
scatter_icon = "π’" if t1["scattered"] else "π "
|
| 1205 |
+
st.markdown(f'<div class="card-info"><b>{scatter_icon} {t1["signal"]}</b><br><small>Cluster ratio: {t1["cluster_ratio"]:.2f} (higher = more scattered = MCAR signal)</small></div>', unsafe_allow_html=True)
|
| 1206 |
+
|
| 1207 |
+
# ββ Test 2
|
| 1208 |
+
st.markdown("#### π¬ Test 2 β Feature Dependency")
|
| 1209 |
+
t2 = test2_feature_dependency(df, selected_col)
|
| 1210 |
+
if t2["diffs"]:
|
| 1211 |
+
top_diffs = dict(sorted(t2["diffs"].items(), key=lambda x: -x[1])[:15])
|
| 1212 |
+
fig2, ax2 = plt.subplots(figsize=(10, max(3, len(top_diffs) * 0.45)))
|
| 1213 |
+
colors = ["#9e2210" if v >= 30 else "#f0a040" if v >= 10 else "#89d9ac" for v in top_diffs.values()]
|
| 1214 |
+
ax2.barh(list(top_diffs.keys())[::-1], list(top_diffs.values())[::-1], color=colors[::-1], edgecolor="white")
|
| 1215 |
+
ax2.axvline(5, color="#89d9ac", linewidth=1.5, linestyle="--", label="5% weak")
|
| 1216 |
+
ax2.axvline(10, color="#f0cc7a", linewidth=1.5, linestyle="--", label="10% MAR signal")
|
| 1217 |
+
ax2.axvline(30, color="#f5a898", linewidth=1.5, linestyle="--", label="30% strong")
|
| 1218 |
+
ax2.set_xlabel("Distribution Difference (%)")
|
| 1219 |
+
ax2.set_title("Feature Distribution Difference")
|
| 1220 |
+
ax2.legend(fontsize=8)
|
| 1221 |
+
plt.tight_layout()
|
| 1222 |
+
st.pyplot(fig2); plt.close()
|
| 1223 |
+
dep_icon = "π’" if t2["max_diff"] < 5 else "π " if t2["max_diff"] < 30 else "π΄"
|
| 1224 |
+
st.markdown(f'<div class="card-info"><b>{dep_icon} {t2["signal"]}</b><br><small>Max difference: {t2["max_diff"]:.1f}%</small></div>', unsafe_allow_html=True)
|
| 1225 |
+
else:
|
| 1226 |
+
st.info("Not enough data to compare feature distributions.")
|
| 1227 |
+
t2 = {"diffs": {}, "max_diff": 0.0, "signal": "Insufficient data"}
|
| 1228 |
+
|
| 1229 |
+
# ββ Test 3
|
| 1230 |
+
st.markdown("#### π¬ Test 3 β Target Dependency")
|
| 1231 |
+
if selected_col == target_col:
|
| 1232 |
+
st.warning("β οΈ Selected column IS the target column. Test 3 skipped.")
|
| 1233 |
+
t3 = {"diff_pct": None, "signal": "Skipped β column is target"}
|
| 1234 |
+
else:
|
| 1235 |
+
t3 = test3_target_dependency(df, selected_col, target_col)
|
| 1236 |
+
if t3["diff_pct"] is not None:
|
| 1237 |
+
missing_mask = df[selected_col].isnull()
|
| 1238 |
+
fig3, ax3 = plt.subplots(figsize=(7, 3.5))
|
| 1239 |
+
if pd.api.types.is_numeric_dtype(df[target_col]):
|
| 1240 |
+
miss_target = df.loc[missing_mask, target_col].dropna()
|
| 1241 |
+
obs_target = df.loc[~missing_mask, target_col].dropna()
|
| 1242 |
+
ax3.hist(obs_target, bins=25, alpha=0.7, label="Target when present", color="#17172b", edgecolor="white")
|
| 1243 |
+
ax3.hist(miss_target, bins=25, alpha=0.7, label="Target when missing", color="#9e2210", edgecolor="white")
|
| 1244 |
+
ax3.set_xlabel(target_col); ax3.set_ylabel("Count")
|
| 1245 |
+
ax3.legend()
|
| 1246 |
+
else:
|
| 1247 |
+
miss_target = df.loc[missing_mask, target_col].value_counts(normalize=True) * 100
|
| 1248 |
+
obs_target = df.loc[~missing_mask, target_col].value_counts(normalize=True) * 100
|
| 1249 |
+
cats = list(set(miss_target.index) | set(obs_target.index))
|
| 1250 |
+
x = np.arange(len(cats))
|
| 1251 |
+
ax3.bar(x - 0.2, [obs_target.get(c, 0) for c in cats], 0.4, label="Present", color="#17172b")
|
| 1252 |
+
ax3.bar(x + 0.2, [miss_target.get(c, 0) for c in cats], 0.4, label="Missing", color="#9e2210")
|
| 1253 |
+
ax3.set_xticks(x); ax3.set_xticklabels(cats, rotation=30)
|
| 1254 |
+
ax3.set_ylabel("% of group"); ax3.legend()
|
| 1255 |
+
ax3.set_title(f"Target ({target_col}) dist: present vs missing in '{selected_col}'")
|
| 1256 |
+
plt.tight_layout()
|
| 1257 |
+
st.pyplot(fig3); plt.close()
|
| 1258 |
+
dep_icon = "π’" if (t3["diff_pct"] or 0) < 5 else "π " if (t3["diff_pct"] or 0) < 10 else "π΄"
|
| 1259 |
+
st.markdown(f'<div class="card-info"><b>{dep_icon} {t3["signal"]}</b><br><small>Target diff: {t3["diff_pct"]}%</small></div>', unsafe_allow_html=True)
|
| 1260 |
+
else:
|
| 1261 |
+
st.info(t3["signal"])
|
| 1262 |
+
|
| 1263 |
+
# ββ Verdict
|
| 1264 |
+
st.markdown("---")
|
| 1265 |
+
st.markdown("### π Mechanism Verdict")
|
| 1266 |
+
mechanism, confidence, explanation = classify_mechanism(t1, t2, t3)
|
| 1267 |
+
card_class = {"MCAR": "card-mcar", "MAR": "card-mar", "MNAR": "card-mnar"}[mechanism]
|
| 1268 |
+
emoji = {"MCAR": "π’", "MAR": "π ", "MNAR": "π΄"}[mechanism]
|
| 1269 |
+
st.markdown(
|
| 1270 |
+
f'<div class="{card_class}">'
|
| 1271 |
+
f'<div class="verdict-label">{emoji} {mechanism} β {confidence} confidence</div>'
|
| 1272 |
+
f'<div class="verdict-desc">{explanation}</div>'
|
| 1273 |
+
f'</div>',
|
| 1274 |
+
unsafe_allow_html=True,
|
| 1275 |
+
)
|
| 1276 |
+
|
| 1277 |
+
# Strategy chips
|
| 1278 |
+
col_type_str = "Numerical" if pd.api.types.is_numeric_dtype(df[selected_col]) else "Categorical"
|
| 1279 |
+
chips_html = strategy_chips_html(mechanism, miss_pct, col_type_str)
|
| 1280 |
+
if chips_html:
|
| 1281 |
+
st.markdown("**Recommended Strategy Options**")
|
| 1282 |
+
st.markdown(chips_html, unsafe_allow_html=True)
|
| 1283 |
+
|
| 1284 |
+
st.session_state["col_results"][selected_col] = {
|
| 1285 |
+
"mechanism": mechanism,
|
| 1286 |
+
"confidence": confidence,
|
| 1287 |
+
"miss_pct": miss_pct,
|
| 1288 |
+
"dtype": dtype_str,
|
| 1289 |
+
"t1": t1, "t2": t2, "t3": t3,
|
| 1290 |
+
}
|
| 1291 |
+
|
| 1292 |
+
|
| 1293 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1294 |
+
# STEP 6 — STRATEGY & IMPUTATION
|
| 1295 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1296 |
+
|
| 1297 |
+
elif step == STEPS[5]:
|
| 1298 |
+
st.markdown('<div class="main-title">π Step 6 β Strategy & Imputation</div>', unsafe_allow_html=True)
|
| 1299 |
+
st.markdown('<div class="main-sub">Based on the mechanism and missing %, select and apply the right strategy for each column.</div>', unsafe_allow_html=True)
|
| 1300 |
+
|
| 1301 |
+
df = st.session_state.get("df")
|
| 1302 |
+
col_results = st.session_state.get("col_results", {})
|
| 1303 |
+
|
| 1304 |
+
if df is None:
|
| 1305 |
+
st.warning("β οΈ Please upload a CSV in Step 1 first.")
|
| 1306 |
+
elif not col_results:
|
| 1307 |
+
st.warning("β οΈ Please run diagnostics in Step 5 for at least one column first.")
|
| 1308 |
+
else:
|
| 1309 |
+
df_imputed = (df.copy() if st.session_state.get("df_imputed") is None
|
| 1310 |
+
else st.session_state["df_imputed"].copy())
|
| 1311 |
+
|
| 1312 |
+
for col, res in col_results.items():
|
| 1313 |
+
mechanism = res["mechanism"]
|
| 1314 |
+
miss_pct = res["miss_pct"]
|
| 1315 |
+
dtype_str = res["dtype"]
|
| 1316 |
+
|
| 1317 |
+
st.markdown(f"### Column: `{col}`")
|
| 1318 |
+
st.markdown(f"**Mechanism:** {mechanism} | **Missing:** {miss_pct:.1f}% | **Type:** `{dtype_str}`")
|
| 1319 |
+
|
| 1320 |
+
rec = recommend_strategy(mechanism, miss_pct, dtype_str)
|
| 1321 |
+
card_class = "card-mcar" if mechanism == "MCAR" else "card-mar" if mechanism == "MAR" else "card-mnar"
|
| 1322 |
+
st.markdown(
|
| 1323 |
+
f'<div class="{card_class}">'
|
| 1324 |
+
f'<b>Recommended: {rec["method"]}</b><br>'
|
| 1325 |
+
f'<small>{rec["reason"]}</small><br>'
|
| 1326 |
+
f'<small>{rec["adv"]}</small><br>'
|
| 1327 |
+
f'<small style="color:#888">{rec["disadv"]}</small>'
|
| 1328 |
+
f'</div>',
|
| 1329 |
+
unsafe_allow_html=True,
|
| 1330 |
+
)
|
| 1331 |
+
|
| 1332 |
+
if rec["add_indicator"]:
|
| 1333 |
+
st.markdown(
|
| 1334 |
+
'<div class="card-warn">π© <b>Missing Indicator will be added BEFORE imputation</b> β '
|
| 1335 |
+
"missingness itself carries signal for this column.</div>",
|
| 1336 |
+
unsafe_allow_html=True,
|
| 1337 |
+
)
|
| 1338 |
+
|
| 1339 |
+
is_num = "float" in dtype_str or "int" in dtype_str
|
| 1340 |
+
strategy_options = (
|
| 1341 |
+
["Mean", "Median", "Constant (0)", "Drop rows", "Keep as-is"] if is_num
|
| 1342 |
+
else ["Mode", "Constant ('Unknown')", "Drop rows", "Keep as-is"]
|
| 1343 |
+
)
|
| 1344 |
+
chosen = st.selectbox(
|
| 1345 |
+
f"Apply strategy for `{col}`:",
|
| 1346 |
+
options=strategy_options,
|
| 1347 |
+
key=f"strategy_{col}",
|
| 1348 |
+
)
|
| 1349 |
+
|
| 1350 |
+
if st.button(f"βΆ Apply to `{col}`", key=f"apply_{col}"):
|
| 1351 |
+
if rec["add_indicator"]:
|
| 1352 |
+
indicator_col = f"{col}_was_missing"
|
| 1353 |
+
df_imputed[indicator_col] = df[col].isnull().astype(int)
|
| 1354 |
+
st.info(f"β
Created indicator column: `{indicator_col}`")
|
| 1355 |
+
|
| 1356 |
+
if chosen == "Mean":
|
| 1357 |
+
fill_val = df[col].mean()
|
| 1358 |
+
df_imputed[col] = df_imputed[col].fillna(fill_val)
|
| 1359 |
+
st.success(f"β
Imputed with mean = {fill_val:.4f}")
|
| 1360 |
+
elif chosen == "Median":
|
| 1361 |
+
fill_val = df[col].median()
|
| 1362 |
+
df_imputed[col] = df_imputed[col].fillna(fill_val)
|
| 1363 |
+
st.success(f"β
Imputed with median = {fill_val:.4f}")
|
| 1364 |
+
elif chosen == "Mode":
|
| 1365 |
+
fill_val = df[col].mode().iloc[0]
|
| 1366 |
+
df_imputed[col] = df_imputed[col].fillna(fill_val)
|
| 1367 |
+
st.success(f"β
Imputed with mode = {fill_val}")
|
| 1368 |
+
elif chosen in ("Constant (0)", "Constant ('Unknown')"):
|
| 1369 |
+
fill_val = 0 if is_num else "Unknown"
|
| 1370 |
+
df_imputed[col] = df_imputed[col].fillna(fill_val)
|
| 1371 |
+
st.success(f"β
Imputed with constant = {fill_val}")
|
| 1372 |
+
elif chosen == "Drop rows":
|
| 1373 |
+
before = len(df_imputed)
|
| 1374 |
+
df_imputed = df_imputed.dropna(subset=[col])
|
| 1375 |
+
after = len(df_imputed)
|
| 1376 |
+
st.success(f"β
Dropped {before - after} rows with missing `{col}`")
|
| 1377 |
+
else:
|
| 1378 |
+
st.info("No imputation applied.")
|
| 1379 |
+
|
| 1380 |
+
st.session_state["df_imputed"] = df_imputed
|
| 1381 |
+
|
| 1382 |
+
st.markdown("<hr class='divider'>", unsafe_allow_html=True)
|
| 1383 |
+
|
| 1384 |
+
st.markdown("### π₯ Download Imputed Dataset")
|
| 1385 |
+
# Fall back to the original frame when no imputed frame is available.
# dict-style .get(key, default) only uses the default when the key is ABSENT;
# a stored None would be returned as-is and crash the to_csv call below.
# NOTE(review): other steps test `df_imputed is None`, which suggests the key
# can hold None — confirm against the session-state initialisation.
df_out = st.session_state.get("df_imputed")
df_out = df if df_out is None else df_out
|
| 1386 |
+
csv_bytes = df_out.to_csv(index=False).encode("utf-8")
|
| 1387 |
+
st.download_button(
|
| 1388 |
+
label="β¬ Download imputed CSV",
|
| 1389 |
+
data=csv_bytes,
|
| 1390 |
+
file_name="imputed_dataset.csv",
|
| 1391 |
+
mime="text/csv",
|
| 1392 |
+
)
|
| 1393 |
+
st.dataframe(df_out.head(10), use_container_width=True)
|
| 1394 |
+
|
| 1395 |
+
|
| 1396 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1397 |
+
# STEP 7 — VALIDATION CHECKS
|
| 1398 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1399 |
+
|
| 1400 |
+
elif step == STEPS[6]:
|
| 1401 |
+
st.markdown('<div class="main-title">β
Step 7 β Validation Checks</div>', unsafe_allow_html=True)
|
| 1402 |
+
st.markdown('<div class="main-sub">Confirm that imputation preserved statistical properties and did not introduce bias.</div>', unsafe_allow_html=True)
|
| 1403 |
+
|
| 1404 |
+
df_orig = st.session_state.get("df")
|
| 1405 |
+
df_imputed = st.session_state.get("df_imputed")
|
| 1406 |
+
col_results = st.session_state.get("col_results", {})
|
| 1407 |
+
|
| 1408 |
+
if df_orig is None or df_imputed is None:
|
| 1409 |
+
st.warning("β οΈ Complete Steps 1β6 first.")
|
| 1410 |
+
elif not col_results:
|
| 1411 |
+
st.warning("β οΈ Run diagnostics in Step 5 and apply a strategy in Step 6 first.")
|
| 1412 |
+
else:
|
| 1413 |
+
numeric_cols = [c for c in col_results if pd.api.types.is_numeric_dtype(df_orig[c])]
|
| 1414 |
+
|
| 1415 |
+
if not numeric_cols:
|
| 1416 |
+
st.info("Validation checks apply to numeric columns only. No numeric columns were diagnosed.")
|
| 1417 |
+
else:
|
| 1418 |
+
for col in numeric_cols:
|
| 1419 |
+
before = df_orig[col].dropna()
|
| 1420 |
+
after = df_imputed[col].dropna()
|
| 1421 |
+
|
| 1422 |
+
if len(after) == 0 or len(before) == 0:
|
| 1423 |
+
continue
|
| 1424 |
+
|
| 1425 |
+
st.markdown(f"### `{col}`")
|
| 1426 |
+
chk = validation_checks(before, after)
|
| 1427 |
+
|
| 1428 |
+
c1, c2, c3 = st.columns(3)
|
| 1429 |
+
def chk_icon(ok):
    """Return the status emoji for a validation check.

    Args:
        ok: Truthy when the check passed.

    Returns:
        A check-mark emoji when ``ok`` is truthy, otherwise a warning-sign
        emoji.
    """
    # Written as escapes so the icons survive a wrong-codec round trip:
    # U+2705 = white heavy check mark, U+26A0 U+FE0F = warning sign (emoji
    # presentation). The literals here were mis-decoded and the first one
    # was split across two lines, leaving an unterminated string.
    return "\u2705" if ok else "\u26a0\ufe0f"
|
| 1430 |
+
with c1:
|
| 1431 |
+
st.markdown(
|
| 1432 |
+
f'<div class="metric-box">'
|
| 1433 |
+
f'<div class="metric-val">{chk_icon(chk["mean_ok"])} {chk["mean_shift_pct"]}%</div>'
|
| 1434 |
+
f'<div class="metric-lbl">Mean shift (β€5% OK)</div>'
|
| 1435 |
+
f'</div>', unsafe_allow_html=True)
|
| 1436 |
+
with c2:
|
| 1437 |
+
st.markdown(
|
| 1438 |
+
f'<div class="metric-box">'
|
| 1439 |
+
f'<div class="metric-val">{chk_icon(chk["median_ok"])} {chk["median_shift_pct"]}%</div>'
|
| 1440 |
+
f'<div class="metric-lbl">Median shift (β€3% OK)</div>'
|
| 1441 |
+
f'</div>', unsafe_allow_html=True)
|
| 1442 |
+
with c3:
|
| 1443 |
+
st.markdown(
|
| 1444 |
+
f'<div class="metric-box">'
|
| 1445 |
+
f'<div class="metric-val">{chk_icon(chk["var_ok"])} {chk["var_change_pct"]}%</div>'
|
| 1446 |
+
f'<div class="metric-lbl">Variance change (β€20% OK)</div>'
|
| 1447 |
+
f'</div>', unsafe_allow_html=True)
|
| 1448 |
+
|
| 1449 |
+
fig_v, ax_v = plt.subplots(figsize=(8, 3.5))
|
| 1450 |
+
ax_v.hist(before.values, bins=30, alpha=0.55, label="Before imputation", color="#17172b", edgecolor="white")
|
| 1451 |
+
ax_v.hist(after.values, bins=30, alpha=0.55, label="After imputation", color="#6020a0", edgecolor="white")
|
| 1452 |
+
ax_v.axvline(before.mean(), color="#17172b", linewidth=1.5, linestyle="--", label=f"Mean before: {before.mean():.2f}")
|
| 1453 |
+
ax_v.axvline(after.mean(), color="#6020a0", linewidth=1.5, linestyle="--", label=f"Mean after: {after.mean():.2f}")
|
| 1454 |
+
ax_v.set_title(f"Distribution: '{col}' before vs after imputation")
|
| 1455 |
+
ax_v.legend(fontsize=8)
|
| 1456 |
+
plt.tight_layout()
|
| 1457 |
+
st.pyplot(fig_v); plt.close()
|
| 1458 |
+
|
| 1459 |
+
target_col = st.session_state.get("target_col")
|
| 1460 |
+
if target_col and target_col in df_orig.columns and pd.api.types.is_numeric_dtype(df_orig[target_col]):
|
| 1461 |
+
corr_before = df_orig[[col, target_col]].dropna().corr().iloc[0, 1]
|
| 1462 |
+
corr_after = df_imputed[[col, target_col]].dropna().corr().iloc[0, 1]
|
| 1463 |
+
delta = abs(corr_before - corr_after)
|
| 1464 |
+
sign_flip = (corr_before * corr_after < 0)
|
| 1465 |
+
icon = "β
" if delta <= 0.05 and not sign_flip else "β οΈ"
|
| 1466 |
+
st.markdown(
|
| 1467 |
+
f'<div class="card-info">{icon} <b>Correlation with target:</b> '
|
| 1468 |
+
f'Before = {corr_before:.3f} β After = {corr_after:.3f} | Ξ = {delta:.3f}'
|
| 1469 |
+
+ (" π¨ Sign flipped!" if sign_flip else "")
|
| 1470 |
+
+ "</div>",
|
| 1471 |
+
unsafe_allow_html=True,
|
| 1472 |
+
)
|
| 1473 |
+
|
| 1474 |
+
st.markdown("<hr class='divider'>", unsafe_allow_html=True)
|
| 1475 |
+
|
| 1476 |
+
st.markdown("### β οΈ Common Pitfalls Checklist")
|
| 1477 |
+
pitfalls = [
|
| 1478 |
+
"Each column treated independently?",
|
| 1479 |
+
"Imputation done AFTER train-test split?",
|
| 1480 |
+
"Target variable NOT used as imputation predictor?",
|
| 1481 |
+
"Missing indicator created BEFORE imputation for MNAR/MAR β₯10%?",
|
| 1482 |
+
"Validation checked beyond just accuracy?",
|
| 1483 |
+
]
|
| 1484 |
+
for txt in pitfalls:
|
| 1485 |
+
st.checkbox(txt, value=False, key=f"pitfall_{txt[:20]}")
|
| 1486 |
+
|
| 1487 |
+
st.markdown(
|
| 1488 |
+
'<div class="card-warn">'
|
| 1489 |
+
'<b>β» Repeat Steps 5β6 for every column independently.</b><br>'
|
| 1490 |
+
'One column may be MCAR (drop rows), another MAR (KNN), another MNAR (indicator + median). '
|
| 1491 |
+
'Never apply one method to all columns at once.'
|
| 1492 |
+
'</div>',
|
| 1493 |
+
unsafe_allow_html=True,
|
| 1494 |
+
)
|
| 1495 |
|
| 1496 |
+
st.markdown("---")
|
| 1497 |
+
st.caption("π¬ Missing Value Intelligence Suite Β· Merged from app.py + app_tanisha.py Β· Built with Streamlit, pandas, scikit-learn, scipy, seaborn")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|