dwmk committed on
Commit
e0ff2bb
·
verified ·
1 Parent(s): 56290d3

Create eda.py

Browse files
Files changed (1) hide show
  1. src/eda.py +65 -0
src/eda.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # eda.py
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ import seaborn as sns
6
+ import matplotlib.pyplot as plt
7
+ from sklearn.impute import SimpleImputer
8
+
9
def run_eda() -> None:
    """Render the exploratory-data-analysis page of the Streamlit app.

    Reads ``original_df`` and ``processed_df`` from ``st.session_state``,
    shows a preview of both, lets the user choose target/feature columns
    (written back to ``st.session_state.target_col`` and
    ``st.session_state.feature_cols``), optionally imputes missing values in
    the processed frame, and draws the selected plot.
    """
    st.header("📊 Exploratory Data Analysis")

    df_raw = st.session_state.get("original_df")
    df = st.session_state.get("processed_df")
    if df_raw is None or df is None:
        # Nothing has been stored yet; show a hint instead of raising.
        st.warning("No data available yet. Load a dataset first.")
        return

    c1, c2 = st.columns(2)
    with c1:
        st.subheader("Raw Data")
        st.dataframe(df_raw.head(10))
        st.write(df_raw.describe(include="all"))

    with c2:
        st.subheader("Processed Data")
        st.dataframe(df.head(10))

    st.subheader("🎯 Feature & Target Selection")

    cols = df.columns.tolist()
    target = st.selectbox("Target", cols)
    features = st.multiselect("Features", [c for c in cols if c != target])

    st.session_state.target_col = target
    st.session_state.feature_cols = features

    st.subheader("🧹 Cleaning")

    if st.checkbox("Apply smart imputation"):
        imputed = _impute_missing(df)
        if imputed is not None:
            # Rerun only when something was actually filled. The checkbox
            # stays checked across reruns, so an unconditional st.rerun()
            # here would restart the script forever.
            st.session_state.processed_df = imputed
            st.rerun()
        # Reached on the run after the rerun (or when nothing needed filling),
        # so the message is actually displayed.
        st.success("Imputation complete")

    st.subheader("📈 Visuals")

    plot = st.selectbox(
        "Plot type",
        ["Correlation Heatmap", "Target Distribution"],
    )
    _render_plot(df, target, plot)


def _impute_missing(df: pd.DataFrame):
    """Return an imputed copy of ``df``, or ``None`` if nothing was filled.

    Numeric columns are filled with the column mean and all other columns
    with the most frequent value. Only columns that have missing values and
    at least one observed value are passed to the imputer, so untouched
    columns keep their dtype and entirely-empty columns are left alone.
    """
    missing_before = int(df.isna().sum().sum())
    if missing_before == 0:
        return None

    fillable_mask = (df.isna().any() & df.notna().any()).to_numpy()
    fillable = df.loc[:, fillable_mask]
    num = fillable.select_dtypes(include=np.number).columns
    cat = fillable.select_dtypes(exclude=np.number).columns

    # Work on a copy so the frame held in session state is never modified
    # in place.
    result = df.copy()
    if len(num):
        result[num] = SimpleImputer(strategy="mean").fit_transform(result[num])
    if len(cat):
        result[cat] = SimpleImputer(strategy="most_frequent").fit_transform(
            result[cat]
        )

    # Report "nothing changed" when the imputer made no progress, so the
    # caller does not keep triggering reruns.
    if int(result.isna().sum().sum()) >= missing_before:
        return None
    return result


def _render_plot(df: pd.DataFrame, target, plot: str) -> None:
    """Draw the plot named by ``plot`` for ``df`` and pass it to Streamlit."""
    numeric = None
    if plot == "Correlation Heatmap":
        numeric = df.select_dtypes(np.number)
        if numeric.shape[1] == 0:
            st.info("No numeric columns available for a correlation heatmap.")
            return
    elif plot == "Target Distribution" and target is None:
        st.info("Select a target column to plot its distribution.")
        return

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        if plot == "Correlation Heatmap":
            sns.heatmap(numeric.corr(), annot=True, ax=ax)
        elif plot == "Target Distribution":
            # A KDE overlay is only meaningful for numeric data.
            show_kde = pd.api.types.is_numeric_dtype(df[target])
            sns.histplot(df[target], kde=show_kde, ax=ax)
        st.pyplot(fig)
    finally:
        # Always release the figure, even if plotting raised.
        plt.close(fig)