File size: 4,278 Bytes
e0ff2bb
 
 
 
e8a3427
 
e0ff2bb
 
 
 
 
 
e8a3427
 
 
 
 
 
 
 
 
 
 
 
 
e0ff2bb
e8a3427
e0ff2bb
e8a3427
 
 
 
 
 
 
e0ff2bb
e8a3427
 
 
e0ff2bb
e8a3427
 
 
e0ff2bb
 
e8a3427
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e0ff2bb
 
e8a3427
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# eda.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.impute import SimpleImputer

def _impute_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with missing values filled in.

    Numeric columns are filled with their column mean; every other column is
    filled with its most frequent value. A column with no observed values has
    neither a mean nor a mode and is left unchanged rather than raising.
    """
    filled = df.copy()
    num_cols = filled.select_dtypes(include=np.number).columns
    other_cols = filled.select_dtypes(exclude=np.number).columns

    if len(num_cols) > 0:
        # The mean of an all-NaN column is NaN, so fillna is a no-op there.
        filled[num_cols] = filled[num_cols].fillna(filled[num_cols].mean())

    for col in other_cols:
        modes = filled[col].mode(dropna=True)
        if not modes.empty:
            # mode() is sorted, so ties resolve to the smallest value.
            filled[col] = filled[col].fillna(modes.iloc[0])

    return filled


def _render_visualization(df: pd.DataFrame, features: list, color_col) -> None:
    """Draw the chart chosen in the "Chart Type" selector.

    Args:
        df: Data to plot.
        features: Columns picked in the feature multiselect; only the first
            four are used by the scatter matrix.
        color_col: Column used to color the marks, or ``None`` for no coloring.
    """
    all_cols = df.columns.tolist()

    viz_type = st.selectbox(
        "Chart Type",
        ["Correlation Heatmap", "Distribution Plot", "Scatter Matrix", "Box Plot"],
    )

    if viz_type == "Correlation Heatmap":
        numeric_df = df.select_dtypes(include=np.number)
        if numeric_df.empty:
            st.warning("No numeric columns for correlation.")
            return
        fig = px.imshow(
            numeric_df.corr(),
            text_auto=True,
            aspect="auto",
            color_continuous_scale="RdBu_r",
            title="Feature Correlation Matrix",
        )
        st.plotly_chart(fig, use_container_width=True)

    elif viz_type == "Distribution Plot":
        col_to_plot = st.selectbox("Select Column", all_cols)
        fig = px.histogram(df, x=col_to_plot, color=color_col, marginal="box")
        st.plotly_chart(fig, use_container_width=True)

    elif viz_type == "Scatter Matrix":
        if not features:
            st.warning("Select at least one feature to draw the scatter matrix.")
            return
        fig = px.scatter_matrix(
            df,
            dimensions=features[:4],  # Limit to 4 for performance
            color=color_col,
            title="Scatter Matrix (First 4 Features)",
        )
        st.plotly_chart(fig, use_container_width=True)

    elif viz_type == "Box Plot":
        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
        if not numeric_cols:
            st.warning("No numeric columns for box plot.")
            return
        y_col = st.selectbox("Y Axis (Numeric)", numeric_cols)
        # Default to the second column when there is one, else the first.
        x_col = st.selectbox(
            "X Axis (Categorical)", all_cols, index=min(len(all_cols) - 1, 1)
        )
        fig = px.box(df, x=x_col, y=y_col, color=color_col)
        st.plotly_chart(fig, use_container_width=True)


def run_eda():
    """Render the exploratory-data-analysis page.

    Reads the working DataFrame from ``st.session_state["processed_df"]`` and
    returns right after the header when it is absent or ``None``. Otherwise it
    shows a data preview, lets the user pick a target and feature columns
    (stored in ``st.session_state.target_col`` / ``feature_cols``), offers
    one-click imputation of missing values, and draws the selected chart.
    """
    st.header("📊 Exploratory Data Analysis")

    # .get() avoids an AttributeError when no dataset has been stored yet.
    df = st.session_state.get("processed_df")
    if df is None:
        return

    all_cols = df.columns.tolist()
    if not all_cols:
        # describe() and the column selectors below cannot handle zero columns.
        st.warning("The dataset has no columns to analyze.")
        return

    # Layout: Overview
    with st.expander("Show Raw Data & Statistics", expanded=False):
        preview_col, stats_col = st.columns([2, 1])
        preview_col.dataframe(df.head(100), use_container_width=True)
        stats_col.write(df.describe(include="all"))

    st.subheader("🛠 Data Configuration")

    # Column Selector
    target_box, feature_box = st.columns(2)

    with target_box:
        target = st.selectbox(
            "Target Variable (Label)",
            options=["None"] + all_cols,
            index=0,
            help="Select the variable you want to predict.",
        )

    with feature_box:
        # Auto-exclude target from features
        available_feats = [c for c in all_cols if c != target]
        features = st.multiselect(
            "Feature Variables", available_feats, default=available_feats[:5]
        )

    # Persist selection; a previously stored target is kept when "None" is chosen.
    if target != "None":
        st.session_state.target_col = target
    st.session_state.feature_cols = features

    # ---------------- Preprocessing ----------------
    st.subheader("🧹 Smart Cleaning")

    info_box, action_box = st.columns(2)
    with info_box:
        missing_num = df.select_dtypes(include=np.number).isnull().sum().sum()
        missing_cat = df.select_dtypes(exclude=np.number).isnull().sum().sum()
        st.info(f"Missing Values - Numeric: {missing_num} | Categorical: {missing_cat}")

    with action_box:
        if st.button("Auto-Impute Missing Values"):
            # Numeric -> Mean, Categorical -> Mode
            st.session_state.processed_df = _impute_missing(df)
            # NOTE(review): the rerun follows immediately, so this message is
            # probably visible only briefly - confirm whether that is intended.
            st.success("Imputation Applied! Data refreshed.")
            st.rerun()

    # ---------------- Visualization ----------------
    st.subheader("📈 Interactive Visualization")

    color_col = target if target != "None" else None
    _render_visualization(df, features, color_col)