# eda.py
"""Streamlit page for exploratory data analysis of the session dataframe."""

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.impute import SimpleImputer

# Sentinel option in the target selector meaning "no target chosen".
_NO_TARGET = "None"
# Session key for a one-shot notice that must survive the st.rerun() issued
# right after imputation (anything rendered before a rerun is discarded).
_IMPUTE_NOTICE_KEY = "_eda_impute_notice"


def _impute_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values filled.

    Numeric columns are filled with the column mean, every other column with
    its most frequent value. Only columns that have at least one missing AND
    at least one observed value are touched; all other columns keep their
    values and dtypes unchanged. The input frame is not modified.
    """
    filled = df.copy()

    has_gap = filled.isnull().any()
    has_value = filled.notna().any()
    # Entirely-empty columns are skipped: SimpleImputer drops them from its
    # output, which would make the assignment below fail on a shape mismatch.
    fillable = filled.loc[:, (has_gap & has_value).to_numpy()]

    num_cols = fillable.select_dtypes(include=np.number).columns
    cat_cols = fillable.select_dtypes(exclude=np.number).columns

    if len(num_cols) > 0:
        mean_imputer = SimpleImputer(strategy="mean")
        filled[num_cols] = mean_imputer.fit_transform(filled[num_cols])

    # Fill non-numeric columns one at a time with pandas so that each column
    # keeps its own dtype (bool, datetime, category, ...) and every value
    # pandas counts as missing (NaN, None, NaT) is replaced.
    for col in cat_cols:
        modes = filled[col].mode(dropna=True)
        if not modes.empty:
            filled[col] = filled[col].fillna(modes.iloc[0])

    return filled


def _render_visualization(df, all_cols, features, color):
    """Render the chart selector and the chosen chart.

    Args:
        df: Dataframe to plot.
        all_cols: List of every column label in ``df``.
        features: Feature columns currently selected by the user.
        color: Column used to color the plots, or ``None`` for no coloring.
    """
    st.subheader("📈 Interactive Visualization")
    viz_type = st.selectbox(
        "Chart Type",
        ["Correlation Heatmap", "Distribution Plot", "Scatter Matrix", "Box Plot"],
    )

    if viz_type == "Correlation Heatmap":
        numeric_df = df.select_dtypes(include=np.number)
        if numeric_df.empty:
            st.warning("No numeric columns for correlation.")
            return
        fig = px.imshow(
            numeric_df.corr(),
            text_auto=True,
            aspect="auto",
            color_continuous_scale="RdBu_r",
            title="Feature Correlation Matrix",
        )
        st.plotly_chart(fig, use_container_width=True)

    elif viz_type == "Distribution Plot":
        col_to_plot = st.selectbox("Select Column", all_cols)
        fig = px.histogram(df, x=col_to_plot, color=color, marginal="box")
        st.plotly_chart(fig, use_container_width=True)

    elif viz_type == "Scatter Matrix":
        if not features:
            st.info("Select at least one feature variable to draw the scatter matrix.")
            return
        dims = features[:4]  # Limit to 4 for performance
        fig = px.scatter_matrix(
            df,
            dimensions=dims,
            color=color,
            title="Scatter Matrix (First 4 Features)",
        )
        st.plotly_chart(fig, use_container_width=True)

    elif viz_type == "Box Plot":
        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
        if not numeric_cols:
            st.warning("No numeric columns available for a box plot.")
            return
        y_col = st.selectbox("Y Axis (Numeric)", numeric_cols)
        # Default the X axis to the second column when there is one; all_cols
        # is non-empty here because at least one numeric column exists.
        x_col = st.selectbox(
            "X Axis (Categorical)", all_cols, index=min(len(all_cols) - 1, 1)
        )
        fig = px.box(df, x=x_col, y=y_col, color=color)
        st.plotly_chart(fig, use_container_width=True)


def run_eda() -> None:
    """Render the EDA page for ``st.session_state.processed_df``.

    Shows a data preview with summary statistics, lets the user pick a target
    and feature columns (stored in ``st.session_state.target_col`` and
    ``st.session_state.feature_cols``), offers one-click imputation of missing
    values, and draws the selected chart. Returns without rendering anything
    past the header when no dataframe is available.
    """
    st.header("📊 Exploratory Data Analysis")

    # .get() so that a session where the key was never set is handled by the
    # same guard as an explicit None instead of raising AttributeError.
    df = st.session_state.get("processed_df")
    if df is None:
        return

    # Layout: Overview
    with st.expander("Show Raw Data & Statistics", expanded=False):
        preview_pane, stats_pane = st.columns([2, 1])
        preview_pane.dataframe(df.head(100), use_container_width=True)
        stats_pane.write(df.describe(include="all"))

    st.subheader("🛠 Data Configuration")

    # Column Selector
    all_cols = df.columns.tolist()
    target_pane, feature_pane = st.columns(2)
    with target_pane:
        target = st.selectbox(
            "Target Variable (Label)",
            options=[_NO_TARGET] + all_cols,
            index=0,
            help="Select the variable you want to predict.",
        )
    with feature_pane:
        # Auto-exclude target from features
        available_feats = [c for c in all_cols if c != target]
        features = st.multiselect(
            "Feature Variables", available_feats, default=available_feats[:5]
        )

    # Persist selection. A previously stored target_col is left as-is when the
    # user switches back to "None".
    if target != _NO_TARGET:
        st.session_state.target_col = target
    # NOTE(review): the original indentation was lost; this assumes features
    # are persisted regardless of whether a target is chosen - confirm.
    st.session_state.feature_cols = features

    # ---------------- Preprocessing ----------------
    st.subheader("🧹 Smart Cleaning")
    info_pane, action_pane = st.columns(2)
    with info_pane:
        missing_num = df.select_dtypes(include=np.number).isnull().sum().sum()
        missing_cat = df.select_dtypes(exclude=np.number).isnull().sum().sum()
        st.info(
            f"Missing Values - Numeric: {missing_num} | Categorical: {missing_cat}"
        )
    with action_pane:
        if st.button("Auto-Impute Missing Values"):
            # Numeric -> Mean, Categorical -> Mode
            st.session_state.processed_df = _impute_missing(df)
            # Defer the confirmation to the next run; a message rendered here
            # would be thrown away by the rerun below.
            st.session_state[_IMPUTE_NOTICE_KEY] = True
            st.rerun()
        if st.session_state.pop(_IMPUTE_NOTICE_KEY, False):
            st.success("Imputation Applied! Data refreshed.")

    # ---------------- Visualization ----------------
    color = target if target != _NO_TARGET else None
    _render_visualization(df, all_cols, features, color)