# dwmk's picture
# Update src/eda.py
# e8a3427 verified
# eda.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.impute import SimpleImputer
def run_eda() -> None:
    """Render the exploratory-data-analysis page of the Streamlit app.

    Reads the working DataFrame from ``st.session_state["processed_df"]`` and
    draws four sections in order: a raw-data/statistics overview, the
    target/feature selectors, missing-value imputation, and interactive
    Plotly charts.

    Side effects on ``st.session_state``:
        * ``target_col`` is set when a target other than ``"None"`` is chosen.
        * ``feature_cols`` is always set to the current feature selection.
        * ``processed_df`` is replaced by an imputed copy when the
          "Auto-Impute Missing Values" button is pressed, after which the
          script is rerun.

    Returns:
        None. Returns right after the header when no processed DataFrame is
        available.
    """
    st.header("📊 Exploratory Data Analysis")

    # .get() rather than attribute access: attribute access raises when the
    # key was never initialised, which would bypass the None guard below.
    df = st.session_state.get("processed_df")
    if df is None:
        return

    _render_overview(df)
    target, features = _render_column_config(df)
    _render_cleaning(df)
    _render_visualization(df, target, features)


# Session-state flag used to show the imputation confirmation after the rerun.
_IMPUTE_NOTICE_KEY = "_eda_impute_notice"


def _render_overview(df: pd.DataFrame) -> None:
    """Show the first 100 rows and summary statistics in a collapsed expander."""
    with st.expander("Show Raw Data & Statistics", expanded=False):
        data_col, stats_col = st.columns([2, 1])
        data_col.dataframe(df.head(100), use_container_width=True)
        stats_col.write(df.describe(include="all"))


def _render_column_config(df: pd.DataFrame) -> tuple:
    """Render the target/feature selectors and persist the selection.

    Returns:
        ``(target, features)`` where ``target`` is the chosen column name or
        the literal string ``"None"``, and ``features`` is the list of
        selected feature column names.
    """
    st.subheader("🛠 Data Configuration")

    all_cols = df.columns.tolist()
    target_col, feature_col = st.columns(2)

    with target_col:
        target = st.selectbox(
            "Target Variable (Label)",
            options=["None"] + all_cols,
            index=0,
            help="Select the variable you want to predict."
        )

    with feature_col:
        # The target is never offered as a feature.
        available_feats = [c for c in all_cols if c != target]
        features = st.multiselect(
            "Feature Variables", available_feats, default=available_feats[:5]
        )

    # NOTE(review): when "None" is selected, a previously stored target_col is
    # left untouched -- confirm downstream pages expect that.
    if target != "None":
        st.session_state.target_col = target
    st.session_state.feature_cols = features

    return target, features


def _impute_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values filled.

    Numeric columns are filled with the column mean, all other columns with
    the most frequent value. Only columns that are *partially* missing are
    passed to the imputer: complete columns need no work (and keep their
    dtype), and columns with no observed value at all would be dropped by
    ``SimpleImputer``, making the assignment back into the frame fail.
    """
    imputed = df.copy()
    strategies = (
        (imputed.select_dtypes(include=np.number).columns, "mean"),
        (imputed.select_dtypes(exclude=np.number).columns, "most_frequent"),
    )
    for cols, strategy in strategies:
        if len(cols) == 0:
            continue
        na_mask = imputed[cols].isna()
        partially_missing = (na_mask.any() & ~na_mask.all()).to_numpy()
        fillable = cols[partially_missing]
        if len(fillable) > 0:
            imputer = SimpleImputer(strategy=strategy)
            imputed[fillable] = imputer.fit_transform(imputed[fillable])
    return imputed


def _render_cleaning(df: pd.DataFrame) -> None:
    """Show missing-value counts and the one-click imputation button."""
    st.subheader("🧹 Smart Cleaning")

    info_col, action_col = st.columns(2)

    with info_col:
        missing_num = df.select_dtypes(include=np.number).isnull().sum().sum()
        missing_cat = df.select_dtypes(exclude=np.number).isnull().sum().sum()
        st.info(f"Missing Values - Numeric: {missing_num} | Categorical: {missing_cat}")

    with action_col:
        if st.button("Auto-Impute Missing Values"):
            # Work on a copy so the stored frame is only swapped once
            # imputation has succeeded.
            st.session_state.processed_df = _impute_missing(df)
            # st.rerun() halts the script immediately, so a message rendered
            # here would never be seen; defer it to the next run instead.
            st.session_state[_IMPUTE_NOTICE_KEY] = True
            st.rerun()

    if st.session_state.pop(_IMPUTE_NOTICE_KEY, False):
        st.success("Imputation Applied! Data refreshed.")


def _render_visualization(df: pd.DataFrame, target: str, features: list) -> None:
    """Render the chart-type selector and the chosen Plotly chart.

    Args:
        df: Frame to plot.
        target: Selected target column, or ``"None"`` for no colouring.
        features: Selected feature columns (used by the scatter matrix).
    """
    st.subheader("📈 Interactive Visualization")

    viz_type = st.selectbox(
        "Chart Type",
        ["Correlation Heatmap", "Distribution Plot", "Scatter Matrix", "Box Plot"]
    )

    all_cols = df.columns.tolist()
    color = target if target != "None" else None

    if viz_type == "Correlation Heatmap":
        numeric_df = df.select_dtypes(include=np.number)
        if numeric_df.empty:
            st.warning("No numeric columns for correlation.")
            return
        fig = px.imshow(
            numeric_df.corr(),
            text_auto=True,
            aspect="auto",
            color_continuous_scale="RdBu_r",
            title="Feature Correlation Matrix"
        )
        st.plotly_chart(fig, use_container_width=True)

    elif viz_type == "Distribution Plot":
        col_to_plot = st.selectbox("Select Column", all_cols)
        fig = px.histogram(df, x=col_to_plot, color=color, marginal="box")
        st.plotly_chart(fig, use_container_width=True)

    elif viz_type == "Scatter Matrix":
        if not features:
            st.warning("Select at least one feature to draw the scatter matrix.")
            return
        fig = px.scatter_matrix(
            df,
            dimensions=features[:4],  # Limit to 4 for performance
            color=color,
            title="Scatter Matrix (First 4 Features)"
        )
        st.plotly_chart(fig, use_container_width=True)

    elif viz_type == "Box Plot":
        numeric_cols = df.select_dtypes(include=np.number).columns
        if len(numeric_cols) == 0:
            st.warning("No numeric columns for box plot.")
            return
        y_col = st.selectbox("Y Axis (Numeric)", numeric_cols)
        # Default the X axis to the second column when there is one.
        x_col = st.selectbox(
            "X Axis (Categorical)", all_cols, index=min(len(all_cols) - 1, 1)
        )
        fig = px.box(df, x=x_col, y=y_col, color=color)
        st.plotly_chart(fig, use_container_width=True)