dwmk committed on
Commit
e8a3427
·
verified ·
1 Parent(s): c771784

Update src/eda.py

Browse files
Files changed (1) hide show
  1. src/eda.py +98 -47
src/eda.py CHANGED
@@ -2,64 +2,115 @@
2
  import streamlit as st
3
  import pandas as pd
4
  import numpy as np
5
- import seaborn as sns
6
- import matplotlib.pyplot as plt
7
  from sklearn.impute import SimpleImputer
8
 
9
  def run_eda():
10
  st.header("πŸ“Š Exploratory Data Analysis")
11
 
12
- df_raw = st.session_state.original_df
13
  df = st.session_state.processed_df
14
-
 
 
 
 
 
 
 
 
 
 
 
 
15
  c1, c2 = st.columns(2)
 
16
  with c1:
17
- st.subheader("Raw Data")
18
- st.dataframe(df_raw.head(10))
19
- st.write(df_raw.describe(include="all"))
20
-
 
 
 
21
  with c2:
22
- st.subheader("Processed Data")
23
- st.dataframe(df.head(10))
24
-
25
- st.subheader("🎯 Feature & Target Selection")
26
 
27
- cols = df.columns.tolist()
28
- target = st.selectbox("Target", cols)
29
- features = st.multiselect("Features", [c for c in cols if c != target])
30
-
31
- st.session_state.target_col = target
32
  st.session_state.feature_cols = features
33
 
34
- st.subheader("🧹 Cleaning")
35
-
36
- if st.checkbox("Apply smart imputation"):
37
- num = df.select_dtypes(include=np.number).columns
38
- cat = df.select_dtypes(exclude=np.number).columns
39
-
40
- if len(num):
41
- df[num] = SimpleImputer(strategy="mean").fit_transform(df[num])
42
- if len(cat):
43
- df[cat] = SimpleImputer(strategy="most_frequent").fit_transform(df[cat])
44
-
45
- st.session_state.processed_df = df
46
- st.success("Imputation complete")
47
- st.rerun()
48
-
49
- st.subheader("πŸ“ˆ Visuals")
50
-
51
- plot = st.selectbox(
52
- "Plot type",
53
- ["Correlation Heatmap", "Target Distribution"]
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  )
55
 
56
- fig, ax = plt.subplots(figsize=(8,6))
57
-
58
- if plot == "Correlation Heatmap":
59
- sns.heatmap(df.select_dtypes(np.number).corr(), annot=True, ax=ax)
60
-
61
- elif plot == "Target Distribution":
62
- sns.histplot(df[target], kde=True, ax=ax)
63
-
64
- st.pyplot(fig)
65
- plt.close(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import streamlit as st
3
  import pandas as pd
4
  import numpy as np
5
+ import plotly.express as px
6
+ import plotly.figure_factory as ff
7
  from sklearn.impute import SimpleImputer
8
 
9
def run_eda():
    """Render the Streamlit exploratory-data-analysis page.

    Reads ``processed_df`` from ``st.session_state`` and returns right after
    the page header when it is missing or ``None``. Otherwise the page shows,
    in order: a collapsible data preview with summary statistics, the
    target/feature selectors, a missing-value summary with an auto-impute
    button, and an interactive Plotly chart chosen by the user.

    Session-state writes:
        feature_cols: always set to the current feature selection.
        target_col: set only when a real column (not the "None" placeholder)
            is selected; a previously stored value is otherwise left as is.
        processed_df: replaced by the imputed copy when the impute button is
            pressed, after which the script is rerun.
    """
    st.header("📊 Exploratory Data Analysis")

    # .get() tolerates the key never having been initialised, not just None.
    df = st.session_state.get("processed_df")
    if df is None:
        return

    # describe(include="all") raises on a frame without columns, and every
    # widget below needs at least one column to offer.
    if df.columns.empty:
        st.warning("The dataset has no columns to analyse.")
        return

    # ---------------- Overview ----------------
    with st.expander("Show Raw Data & Statistics", expanded=False):
        preview_col, stats_col = st.columns([2, 1])
        preview_col.dataframe(df.head(100), use_container_width=True)
        stats_col.write(df.describe(include="all"))

    # ---------------- Column selection ----------------
    st.subheader("🛠 Data Configuration")

    all_cols = df.columns.tolist()
    c1, c2 = st.columns(2)

    with c1:
        target = st.selectbox(
            "Target Variable (Label)",
            options=["None"] + all_cols,
            index=0,
            help="Select the variable you want to predict.",
        )

    with c2:
        # The chosen target is never offered as a feature.
        available_feats = [c for c in all_cols if c != target]
        features = st.multiselect(
            "Feature Variables",
            available_feats,
            default=available_feats[:5],
        )

    # Persist selection; the "None" placeholder never overwrites target_col.
    if target != "None":
        st.session_state.target_col = target
    st.session_state.feature_cols = features

    # Plotly expects ``None`` (not the placeholder string) for "no colouring".
    color_col = target if target != "None" else None

    # ---------------- Preprocessing ----------------
    st.subheader("🧹 Smart Cleaning")

    numeric_cols = df.select_dtypes(include=np.number).columns

    col1, col2 = st.columns(2)
    with col1:
        missing_num = df.select_dtypes(include=np.number).isnull().sum().sum()
        missing_cat = df.select_dtypes(exclude=np.number).isnull().sum().sum()
        st.info(f"Missing Values - Numeric: {missing_num} | Categorical: {missing_cat}")

    with col2:
        if st.button("Auto-Impute Missing Values"):
            # Numeric -> mean, everything else -> most frequent value.
            # The helper works on a copy, so the frame currently held in
            # session state is only swapped out once imputation succeeded.
            st.session_state.processed_df = _impute_missing(df)
            st.success("Imputation Applied! Data refreshed.")
            st.rerun()

    # ---------------- Visualization ----------------
    st.subheader("📈 Interactive Visualization")

    viz_type = st.selectbox(
        "Chart Type",
        ["Correlation Heatmap", "Distribution Plot", "Scatter Matrix", "Box Plot"],
    )

    if viz_type == "Correlation Heatmap":
        numeric_df = df[numeric_cols]
        if not numeric_df.empty:
            fig = px.imshow(
                numeric_df.corr(),
                text_auto=True,
                aspect="auto",
                color_continuous_scale="RdBu_r",
                title="Feature Correlation Matrix",
            )
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("No numeric columns for correlation.")

    elif viz_type == "Distribution Plot":
        col_to_plot = st.selectbox("Select Column", all_cols)
        fig = px.histogram(df, x=col_to_plot, color=color_col, marginal="box")
        st.plotly_chart(fig, use_container_width=True)

    elif viz_type == "Scatter Matrix":
        if len(features) > 0:
            dims = features[:4]  # Limit to 4 for performance
            fig = px.scatter_matrix(
                df,
                dimensions=dims,
                color=color_col,
                title="Scatter Matrix (First 4 Features)",
            )
            st.plotly_chart(fig, use_container_width=True)
        else:
            # Tell the user why nothing is drawn instead of leaving a blank.
            st.warning("Select at least one feature to draw the scatter matrix.")

    elif viz_type == "Box Plot":
        if len(numeric_cols) > 0:
            y_col = st.selectbox("Y Axis (Numeric)", numeric_cols)
            # Default the X axis to the second column when there is one.
            x_col = st.selectbox(
                "X Axis (Categorical)",
                all_cols,
                index=min(len(all_cols) - 1, 1),
            )
            fig = px.box(df, x=x_col, y=y_col, color=color_col)
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("No numeric columns available for a box plot.")


def _impute_missing(df):
    """Return a copy of ``df`` with missing values filled column by column.

    Columns selected by ``select_dtypes(include=np.number)`` are filled with
    their mean; all other columns are filled with their most frequent value
    (the first entry of ``Series.mode()``). Columns without missing values
    are left untouched, and columns with no observed value at all are kept
    as they are, so the result always has the same columns as the input.
    The input frame is not modified.
    """
    filled = df.copy()
    numeric_names = set(filled.select_dtypes(include=np.number).columns)

    for name in filled.columns:
        column = filled[name]
        if not column.isna().any():
            continue

        if name in numeric_names:
            fill_value = column.mean()
        else:
            modes = column.mode(dropna=True)
            fill_value = modes.iloc[0] if not modes.empty else None

        # A column with no observed values has nothing to impute from.
        if fill_value is None or pd.isna(fill_value):
            continue

        filled[name] = column.fillna(fill_value)

    return filled