dwmk's picture
Create eda.py
e0ff2bb verified
raw
history blame
1.79 kB
# eda.py
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
def _impute_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with missing values filled per column type.

    Numeric columns are filled with the column mean; every other column is
    filled with its most frequent value. Only columns that contain both
    missing and observed values are touched, so complete columns keep their
    dtype and all-missing columns are left as they are.
    """
    imputed = df.copy()

    # SimpleImputer discards columns without any observed value by default,
    # which would make the column assignment below fail on a shape mismatch,
    # so those columns are excluded up front.
    fillable_mask = (imputed.isna().any() & imputed.notna().any()).to_numpy()
    fillable = imputed.loc[:, fillable_mask]

    num_cols = fillable.select_dtypes(include=np.number).columns
    cat_cols = fillable.select_dtypes(exclude=np.number).columns

    if len(num_cols):
        imputed[num_cols] = SimpleImputer(strategy="mean").fit_transform(
            imputed[num_cols]
        )
    if len(cat_cols):
        imputed[cat_cols] = SimpleImputer(
            strategy="most_frequent"
        ).fit_transform(imputed[cat_cols])
    return imputed


def run_eda() -> None:
    """Render the exploratory-data-analysis page.

    Reads ``original_df`` and ``processed_df`` from ``st.session_state`` and
    shows, in order: side-by-side previews of both frames, target/feature
    selection widgets (stored as ``target_col`` and ``feature_cols`` in the
    session state), an optional imputation step that replaces
    ``processed_df`` in the session state, and a plot chosen by the user.
    """
    st.header("📊 Exploratory Data Analysis")

    if (
        "original_df" not in st.session_state
        or "processed_df" not in st.session_state
    ):
        st.warning("No dataset loaded yet. Load data before running the EDA.")
        return

    df_raw = st.session_state.original_df
    df = st.session_state.processed_df

    c1, c2 = st.columns(2)
    with c1:
        st.subheader("Raw Data")
        st.dataframe(df_raw.head(10))
        st.write(df_raw.describe(include="all"))
    with c2:
        st.subheader("Processed Data")
        st.dataframe(df.head(10))

    st.subheader("🎯 Feature & Target Selection")
    cols = df.columns.tolist()
    target = st.selectbox("Target", cols)
    features = st.multiselect("Features", [c for c in cols if c != target])
    st.session_state.target_col = target
    st.session_state.feature_cols = features

    st.subheader("🧹 Cleaning")
    if st.checkbox("Apply smart imputation"):
        missing_before = int(df.isna().sum().sum())
        if missing_before:
            df = _impute_missing(df)
            st.session_state.processed_df = df
            if int(df.isna().sum().sum()) < missing_before:
                # Rerun once so the preview above shows the imputed data.
                # The checkbox stays ticked across reruns, so the rerun must
                # be conditional on progress or the script would loop forever.
                st.rerun()
        missing_after = int(df.isna().sum().sum())
        if missing_after:
            st.warning(
                f"{missing_after} missing values could not be imputed"
            )
        else:
            st.success("Imputation complete")

    st.subheader("📈 Visuals")
    plot = st.selectbox(
        "Plot type",
        ["Correlation Heatmap", "Target Distribution"],
    )

    numeric = df.select_dtypes(include=np.number)
    if plot == "Correlation Heatmap" and numeric.empty:
        st.info("No numeric data available for a correlation heatmap.")
        return
    if plot == "Target Distribution" and target is None:
        st.info("Select a target column to plot its distribution.")
        return

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        if plot == "Correlation Heatmap":
            sns.heatmap(numeric.corr(), annot=True, ax=ax)
        elif plot == "Target Distribution":
            series = df[target]
            # A density estimate is only meaningful for numeric, non-boolean
            # data; other targets get the plain histogram.
            use_kde = pd.api.types.is_numeric_dtype(
                series
            ) and not pd.api.types.is_bool_dtype(series)
            sns.histplot(series, kde=use_kde, ax=ax)
        st.pyplot(fig)
    finally:
        # Always release the figure, even when plotting or rendering fails.
        plt.close(fig)