Spaces:
Sleeping
Sleeping
| # eda.py | |
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.express as px | |
| import plotly.figure_factory as ff | |
| from sklearn.impute import SimpleImputer | |
def _impute_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with missing values filled column by column.

    Numeric columns (as selected by ``select_dtypes(include=np.number)``) are
    filled with their mean; every other column is filled with its most
    frequent value (the smallest one on ties, since ``Series.mode`` returns
    its results sorted).

    This is done with pandas instead of assigning an imputer's output array
    back into the frame, so that a column with no observed values cannot
    cause a shape mismatch and non-numeric columns keep their dtype.
    Columns that contain no observed values are left unchanged because there
    is nothing to derive a fill value from. The input frame is not modified.
    """
    filled = df.copy()

    for col in filled.select_dtypes(include=np.number).columns:
        series = filled[col]
        if not series.isnull().any():
            continue
        mean_value = series.mean()
        if pd.isna(mean_value):
            # Entirely empty column: no mean to fill with.
            continue
        if pd.api.types.is_integer_dtype(series.dtype):
            # Only nullable integer columns can reach this point with missing
            # values, and they cannot store a fractional mean.
            series = series.astype("float64")
        filled[col] = series.fillna(mean_value)

    for col in filled.select_dtypes(exclude=np.number).columns:
        series = filled[col]
        if not series.isnull().any():
            continue
        modes = series.mode(dropna=True)
        if modes.empty:
            # Entirely empty column: no mode to fill with.
            continue
        filled[col] = series.fillna(modes.iloc[0])

    return filled


def run_eda():
    """Render the exploratory-data-analysis page.

    Reads the working DataFrame from ``st.session_state`` under the key
    ``processed_df`` and returns without drawing anything further when it is
    absent or ``None``. Otherwise the page shows, in order:

    1. a collapsed expander with the first 100 rows and ``describe()`` output,
    2. target / feature selectors, whose values are written to
       ``st.session_state.target_col`` and ``st.session_state.feature_cols``,
    3. missing-value counts and a button that imputes them and reruns,
    4. one of four Plotly charts chosen from a select box.

    NOTE(review): the leading "π" glyphs in the headings look like
    mis-decoded emoji; they are reproduced as found -- confirm against the
    original file.
    """
    st.header("π Exploratory Data Analysis")

    # .get() so a session where the key was never initialised is treated the
    # same as an explicit None instead of raising AttributeError.
    df = st.session_state.get("processed_df")
    if df is None:
        return

    # Layout: Overview
    with st.expander("Show Raw Data & Statistics", expanded=False):
        c1, c2 = st.columns([2, 1])
        c1.dataframe(df.head(100), use_container_width=True)
        c2.write(df.describe(include="all"))

    st.subheader("π Data Configuration")

    # Column Selector
    all_cols = df.columns.tolist()
    c1, c2 = st.columns(2)
    with c1:
        target = st.selectbox(
            "Target Variable (Label)",
            options=["None"] + all_cols,
            index=0,
            help="Select the variable you want to predict."
        )
    with c2:
        # Auto-exclude target from features
        available_feats = [c for c in all_cols if c != target]
        features = st.multiselect(
            "Feature Variables", available_feats, default=available_feats[:5]
        )

    # Persist selection. The target is only overwritten when a real column is
    # chosen, so a previously stored target survives a switch back to "None".
    # NOTE(review): confirm that keeping the old target is intended.
    if target != "None":
        st.session_state.target_col = target
    st.session_state.feature_cols = features

    # "None" is the select box's placeholder, not a column; Plotly expects
    # a real None when no colour grouping is wanted.
    color_col = target if target != "None" else None

    # Computed once; reused by the missing-value summary and the charts.
    numeric_df = df.select_dtypes(include=np.number)

    # ---------------- Preprocessing ----------------
    st.subheader("π§Ή Smart Cleaning")
    col1, col2 = st.columns(2)
    with col1:
        missing_num = numeric_df.isnull().sum().sum()
        missing_cat = df.select_dtypes(exclude=np.number).isnull().sum().sum()
        st.info(f"Missing Values - Numeric: {missing_num} | Categorical: {missing_cat}")
    with col2:
        if st.button("Auto-Impute Missing Values"):
            # Numeric -> Mean, Categorical -> Mode
            st.session_state.processed_df = _impute_missing(df)
            # NOTE(review): the rerun on the next line restarts the script, so
            # this message is presumably visible only briefly -- confirm.
            st.success("Imputation Applied! Data refreshed.")
            st.rerun()

    # ---------------- Visualization ----------------
    st.subheader("π Interactive Visualization")
    viz_type = st.selectbox(
        "Chart Type",
        ["Correlation Heatmap", "Distribution Plot", "Scatter Matrix", "Box Plot"]
    )

    if viz_type == "Correlation Heatmap":
        if not numeric_df.empty:
            corr = numeric_df.corr()
            fig = px.imshow(
                corr,
                text_auto=True,
                aspect="auto",
                color_continuous_scale="RdBu_r",
                title="Feature Correlation Matrix"
            )
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("No numeric columns for correlation.")

    elif viz_type == "Distribution Plot":
        col_to_plot = st.selectbox("Select Column", all_cols)
        fig = px.histogram(df, x=col_to_plot, color=color_col, marginal="box")
        st.plotly_chart(fig, use_container_width=True)

    elif viz_type == "Scatter Matrix":
        if len(features) > 0:
            dims = features[:4]  # Limit to 4 for performance
            fig = px.scatter_matrix(
                df,
                dimensions=dims,
                color=color_col,
                title="Scatter Matrix (First 4 Features)"
            )
            st.plotly_chart(fig, use_container_width=True)
        else:
            # Tell the user why nothing is drawn instead of leaving a blank area.
            st.warning("Select at least one feature for the scatter matrix.")

    elif viz_type == "Box Plot":
        numeric_cols = numeric_df.columns
        if len(numeric_cols) > 0:
            y_col = st.selectbox("Y Axis (Numeric)", numeric_cols)
            # Defaults to the second column when there is one, else the first.
            x_col = st.selectbox(
                "X Axis (Categorical)", all_cols, index=min(len(all_cols) - 1, 1)
            )
            fig = px.box(df, x=x_col, y=y_col, color=color_col)
            st.plotly_chart(fig, use_container_width=True)
        else:
            # Without a numeric column there is nothing to put on the Y axis.
            st.warning("No numeric columns for box plot.")