Spaces:

asenturisk
/

Benchmark-Kit-26

Sleeping

App Files Files Community

dwmk committed 26 days ago

Commit

e8a3427

verified ·

1 Parent(s): c771784

Update src/eda.py

Browse files

Files changed (1) hide show

src/eda.py +98 -47

src/eda.py CHANGED Viewed

@@ -2,64 +2,115 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
-import seaborn as sns
-import matplotlib.pyplot as plt
 from sklearn.impute import SimpleImputer
 def run_eda():
     """Render the exploratory-data-analysis page.

     Reads ``original_df`` and ``processed_df`` from ``st.session_state`` and
     returns early if either is ``None``. Otherwise it previews both frames,
     stores the chosen columns in ``st.session_state.target_col`` and
     ``st.session_state.feature_cols``, optionally imputes missing values in
     the processed frame, and draws the selected matplotlib/seaborn plot.
     """
     st.header("📊 Exploratory Data Analysis")
     df_raw = st.session_state.original_df
     df = st.session_state.processed_df
     if df_raw is None or df is None:
         # Without both frames there is nothing to preview or plot.
         return
     c1, c2 = st.columns(2)
     with c1:
         st.subheader("Raw Data")
         st.dataframe(df_raw.head(10))
         st.write(df_raw.describe(include="all"))
     with c2:
         st.subheader("Processed Data")
         st.dataframe(df.head(10))
     st.subheader("🎯 Feature & Target Selection")
     cols = df.columns.tolist()
     target = st.selectbox("Target", cols)
     features = st.multiselect("Features", [c for c in cols if c != target])
     st.session_state.target_col = target
     st.session_state.feature_cols = features
     st.subheader("🧹 Cleaning")
     if st.checkbox("Apply smart imputation"):
         imputed = _impute_partially_missing(df)
         if imputed is not None:
             # Rerun so the preview above shows the imputed frame. The checkbox
             # stays ticked across reruns, so rerunning unconditionally would
             # loop forever; we only rerun while values are still being filled.
             st.session_state.processed_df = imputed
             st.rerun()
         st.success("Imputation complete")
     st.subheader("📈 Visuals")
     plot = st.selectbox(
         "Plot type",
         ["Correlation Heatmap", "Target Distribution"],
     )
     numeric_df = df.select_dtypes(np.number)
     if plot == "Correlation Heatmap" and numeric_df.empty:
         # A heatmap of an empty correlation matrix cannot be drawn.
         st.warning("No numeric columns for correlation.")
         return
     fig, ax = plt.subplots(figsize=(8, 6))
     try:
         if plot == "Correlation Heatmap":
             sns.heatmap(numeric_df.corr(), annot=True, ax=ax)
         elif plot == "Target Distribution":
             sns.histplot(df[target], kde=True, ax=ax)
         st.pyplot(fig)
     finally:
         # Always release the figure, even when plotting raises.
         plt.close(fig)


 def _impute_partially_missing(df):
     """Return an imputed copy of *df*, or ``None`` if nothing was filled.

     Numeric columns are filled with the column mean, all other columns with
     the most frequent value. Only columns that contain both missing and
     present values are passed to ``SimpleImputer``: it discards columns that
     are entirely missing, so its output could not be assigned back to them.
     """
     gaps = df.isna()
     fillable = (gaps.any() & ~gaps.all()).to_numpy()
     if not fillable.any():
         return None
     partial = df.loc[:, fillable]
     num_cols = partial.select_dtypes(include=np.number).columns
     cat_cols = partial.select_dtypes(exclude=np.number).columns
     # Work on a copy so the caller's frame is never left half-imputed.
     result = df.copy()
     if len(num_cols) > 0:
         result[num_cols] = SimpleImputer(strategy="mean").fit_transform(result[num_cols])
     if len(cat_cols) > 0:
         result[cat_cols] = SimpleImputer(strategy="most_frequent").fit_transform(result[cat_cols])
     # Report progress only when the number of missing cells actually dropped.
     if int(result.isna().sum().sum()) >= int(gaps.sum().sum()):
         return None
     return result

 import streamlit as st
 import pandas as pd
 import numpy as np
+import plotly.express as px
+import plotly.figure_factory as ff
 from sklearn.impute import SimpleImputer
 def run_eda():
     """Render the interactive exploratory-data-analysis page.

     Reads ``st.session_state.processed_df`` and returns immediately when it is
     ``None``. Otherwise it shows a data preview, lets the user pick a target
     and feature columns (stored in ``st.session_state.target_col`` and
     ``st.session_state.feature_cols``), offers one-click imputation of missing
     values, and draws the selected Plotly chart.
     """
     st.header("📊 Exploratory Data Analysis")
     df = st.session_state.processed_df
     if df is None:
         return
     # Layout: Overview
     with st.expander("Show Raw Data & Statistics", expanded=False):
         c1, c2 = st.columns([2, 1])
         c1.dataframe(df.head(100), use_container_width=True)
         c2.write(df.describe(include="all"))
     st.subheader("🛠 Data Configuration")
     # Column Selector
     all_cols = df.columns.tolist()
     no_target = "None"  # sentinel option meaning "no target chosen"
     c1, c2 = st.columns(2)
     with c1:
         target = st.selectbox(
             "Target Variable (Label)",
             options=[no_target] + all_cols,
             index=0,
             help="Select the variable you want to predict.",
         )
     has_target = target != no_target
     with c2:
         # Auto-exclude target from features
         available_feats = [c for c in all_cols if c != target]
         features = st.multiselect("Feature Variables", available_feats, default=available_feats[:5])
     # Persist selection
     # NOTE(review): when "None" is chosen, a previously stored target_col is
     # left untouched -- confirm that downstream pages expect this.
     if has_target:
         st.session_state.target_col = target
     st.session_state.feature_cols = features
     # Colour charts by the target only when one has been picked.
     color_col = target if has_target else None
     # ---------------- Preprocessing ----------------
     st.subheader("🧹 Smart Cleaning")
     col1, col2 = st.columns(2)
     with col1:
         missing_num = df.select_dtypes(include=np.number).isnull().sum().sum()
         missing_cat = df.select_dtypes(exclude=np.number).isnull().sum().sum()
         st.info(f"Missing Values - Numeric: {missing_num} | Categorical: {missing_cat}")
     with col2:
         if st.button("Auto-Impute Missing Values"):
             # Numeric -> Mean, Categorical -> Mode
             st.session_state.processed_df = _impute_partially_missing(df)
             st.success("Imputation Applied! Data refreshed.")
             st.rerun()
     # ---------------- Visualization ----------------
     st.subheader("📈 Interactive Visualization")
     viz_type = st.selectbox(
         "Chart Type",
         ["Correlation Heatmap", "Distribution Plot", "Scatter Matrix", "Box Plot"],
     )
     numeric_df = df.select_dtypes(include=np.number)
     if viz_type == "Correlation Heatmap":
         if numeric_df.empty:
             st.warning("No numeric columns for correlation.")
             return
         fig = px.imshow(
             numeric_df.corr(),
             text_auto=True,
             aspect="auto",
             color_continuous_scale="RdBu_r",
             title="Feature Correlation Matrix",
         )
         st.plotly_chart(fig, use_container_width=True)
     elif viz_type == "Distribution Plot":
         col_to_plot = st.selectbox("Select Column", all_cols)
         fig = px.histogram(df, x=col_to_plot, color=color_col, marginal="box")
         st.plotly_chart(fig, use_container_width=True)
     elif viz_type == "Scatter Matrix":
         if not features:
             # Tell the user why nothing is drawn instead of leaving a blank page.
             st.warning("Select at least one feature to draw the scatter matrix.")
             return
         dims = features[:4]  # Limit to 4 for performance
         fig = px.scatter_matrix(
             df,
             dimensions=dims,
             color=color_col,
             title="Scatter Matrix (First 4 Features)",
         )
         st.plotly_chart(fig, use_container_width=True)
     elif viz_type == "Box Plot":
         if len(numeric_df.columns) == 0:
             # The Y axis needs a numeric column; none exist in this frame.
             st.warning("No numeric columns available for a box plot.")
             return
         y_col = st.selectbox("Y Axis (Numeric)", numeric_df.columns)
         x_col = st.selectbox("X Axis (Categorical)", all_cols, index=min(len(all_cols) - 1, 1))
         fig = px.box(df, x=x_col, y=y_col, color=color_col)
         st.plotly_chart(fig, use_container_width=True)


 def _impute_partially_missing(df):
     """Return a copy of *df* with missing values filled where possible.

     Numeric columns are filled with the column mean, all other columns with
     the most frequent value. Only columns that contain both missing and
     present values are passed to ``SimpleImputer``: it discards columns that
     are entirely missing, so its output could not be assigned back to them.
     Such columns, and columns with no gaps, are returned unchanged.
     """
     # Work on a copy so the caller's frame is never left half-imputed.
     result = df.copy()
     gaps = df.isna()
     fillable = (gaps.any() & ~gaps.all()).to_numpy()
     if not fillable.any():
         return result
     partial = df.loc[:, fillable]
     num_cols = partial.select_dtypes(include=np.number).columns
     cat_cols = partial.select_dtypes(exclude=np.number).columns
     if len(num_cols) > 0:
         result[num_cols] = SimpleImputer(strategy="mean").fit_transform(result[num_cols])
     if len(cat_cols) > 0:
         # fit_transform returns an object array here; pandas stores it as object columns.
         result[cat_cols] = SimpleImputer(strategy="most_frequent").fit_transform(result[cat_cols])
     return result