Spaces:

asenturisk
/

Benchmark-Kit-26

Sleeping

App Files Files Community

Benchmark-Kit-26 / src /eda.py

dwmk

Update src/eda.py

e8a3427 verified 14 days ago

raw

history blame contribute delete

4.28 kB

	# eda.py
	import streamlit as st
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.figure_factory as ff
	from sklearn.impute import SimpleImputer

	def _impute_partial_columns(df: pd.DataFrame, columns, strategy: str) -> list:
	    """Fill missing values in ``columns`` of ``df`` in place via ``SimpleImputer``.

	    Only columns that have some, but not all, of their values missing are
	    imputed:

	    * Complete columns are left untouched, so their dtype is not rewritten
	      by the plain array the imputer returns.
	    * Columns without a single observed value are skipped. For the
	      ``"mean"`` and ``"most_frequent"`` strategies ``SimpleImputer`` drops
	      such columns from its output, which would make the assignment back
	      into ``df`` fail on a shape mismatch.

	    Args:
	        df: DataFrame to modify in place.
	        columns: Labels of the candidate columns (may be empty).
	        strategy: Strategy name forwarded to ``SimpleImputer``.

	    Returns:
	        Labels of the columns that were skipped because they contain no
	        observed values.
	    """
	    if len(columns) == 0 or df.empty:
	        return []

	    missing_per_col = df[columns].isnull().sum()
	    fully_missing = missing_per_col == len(df)
	    fillable = missing_per_col.index[(missing_per_col > 0) & ~fully_missing]

	    if len(fillable) > 0:
	        imputer = SimpleImputer(strategy=strategy)
	        df[fillable] = imputer.fit_transform(df[fillable])

	    return missing_per_col.index[fully_missing].tolist()


	def _render_chart(df: pd.DataFrame, chart_kind: str, all_cols: list, features: list, color_col) -> None:
	    """Draw the chart chosen in the visualization section.

	    Args:
	        df: Data to plot.
	        chart_kind: One of the labels offered by the "Chart Type" selectbox.
	        all_cols: Every column label of ``df``.
	        features: Feature columns picked by the user.
	        color_col: Column used to colour the marks, or ``None`` for no
	            colouring.
	    """
	    if chart_kind == "Correlation Heatmap":
	        numeric_df = df.select_dtypes(include=np.number)
	        if numeric_df.empty:
	            st.warning("No numeric columns for correlation.")
	            return
	        fig = px.imshow(
	            numeric_df.corr(),
	            text_auto=True,
	            aspect="auto",
	            color_continuous_scale="RdBu_r",
	            title="Feature Correlation Matrix",
	        )

	    elif chart_kind == "Distribution Plot":
	        col_to_plot = st.selectbox("Select Column", all_cols)
	        fig = px.histogram(df, x=col_to_plot, color=color_col, marginal="box")

	    elif chart_kind == "Scatter Matrix":
	        if not features:
	            # Tell the user why nothing is drawn instead of leaving a blank page.
	            st.warning("Select at least one feature variable to draw the scatter matrix.")
	            return
	        fig = px.scatter_matrix(
	            df,
	            dimensions=features[:4],  # Limit to 4 for performance
	            color=color_col,
	            title="Scatter Matrix (First 4 Features)",
	        )

	    elif chart_kind == "Box Plot":
	        numeric_cols = df.select_dtypes(include=np.number).columns
	        if len(numeric_cols) == 0:
	            # Without this guard the Y selectbox yields None and plotly gets y=None.
	            st.warning("No numeric columns available for a box plot.")
	            return
	        y_col = st.selectbox("Y Axis (Numeric)", numeric_cols)
	        # Default the X axis to the second column when there is one.
	        x_col = st.selectbox("X Axis (Categorical)", all_cols, index=min(len(all_cols) - 1, 1))
	        fig = px.box(df, x=x_col, y=y_col, color=color_col)

	    else:
	        return

	    st.plotly_chart(fig, use_container_width=True)


	def run_eda() -> None:
	    """Render the exploratory data analysis page.

	    Reads the working DataFrame from ``st.session_state["processed_df"]`` and
	    returns right after the header when it is missing or ``None``. Otherwise
	    the page shows, in order:

	    1. a collapsed preview (first 100 rows) with ``describe`` statistics,
	    2. target / feature selectors, stored in ``st.session_state.target_col``
	       and ``st.session_state.feature_cols`` once a target is chosen,
	    3. missing-value counts and an "Auto-Impute" button (numeric -> mean,
	       non-numeric -> most frequent value) that writes the cleaned frame
	       back to ``st.session_state.processed_df`` and reruns the script,
	    4. an interactive chart picked from a fixed list of chart types.
	    """
	    st.header("📊 Exploratory Data Analysis")

	    # .get() keeps the guard below reachable when the key was never set;
	    # attribute access would raise AttributeError in that case.
	    df = st.session_state.get("processed_df")
	    if df is None:
	        return
	    if df.shape[1] == 0:
	        # describe() raises on a frame without columns, so stop early.
	        st.warning("The dataset has no columns to analyse.")
	        return

	    # Layout: Overview
	    with st.expander("Show Raw Data & Statistics", expanded=False):
	        preview_col, stats_col = st.columns([2, 1])
	        preview_col.dataframe(df.head(100), use_container_width=True)
	        stats_col.write(df.describe(include="all"))

	    st.subheader("🛠 Data Configuration")

	    # Column Selector
	    all_cols = df.columns.tolist()
	    c1, c2 = st.columns(2)

	    with c1:
	        target = st.selectbox(
	            "Target Variable (Label)",
	            options=["None"] + all_cols,
	            index=0,
	            help="Select the variable you want to predict."
	        )

	    with c2:
	        # Auto-exclude target from features
	        available_feats = [c for c in all_cols if c != target]
	        features = st.multiselect("Feature Variables", available_feats, default=available_feats[:5])

	    # Persist selection
	    if target != "None":
	        st.session_state.target_col = target
	        st.session_state.feature_cols = features

	    # "None" is the selectbox placeholder meaning "no target chosen".
	    color_col = target if target != "None" else None

	    # ---------------- Preprocessing ----------------
	    st.subheader("🧹 Smart Cleaning")

	    # Session-state slot that carries the success notice across st.rerun().
	    notice_key = "eda_imputation_notice"

	    col1, col2 = st.columns(2)
	    with col1:
	        missing_num = df.select_dtypes(include=np.number).isnull().sum().sum()
	        missing_cat = df.select_dtypes(exclude=np.number).isnull().sum().sum()
	        st.info(f"Missing Values - Numeric: {missing_num} | Categorical: {missing_cat}")

	    with col2:
	        if st.button("Auto-Impute Missing Values"):
	            # Work on a copy so a failure cannot leave the session frame
	            # half-imputed.
	            cleaned = df.copy()

	            # Numeric -> Mean, Categorical -> Mode
	            num_cols = cleaned.select_dtypes(include=np.number).columns
	            cat_cols = cleaned.select_dtypes(exclude=np.number).columns
	            skipped = _impute_partial_columns(cleaned, num_cols, "mean")
	            skipped += _impute_partial_columns(cleaned, cat_cols, "most_frequent")

	            notice = "Imputation Applied! Data refreshed."
	            if skipped:
	                skipped_names = ", ".join(str(name) for name in skipped)
	                notice += f" Skipped {len(skipped)} column(s) with no observed values: {skipped_names}"

	            st.session_state.processed_df = cleaned
	            # st.rerun() stops this run immediately, so a message drawn here
	            # would vanish; store it and show it on the next run instead.
	            st.session_state[notice_key] = notice
	            st.rerun()

	        pending_notice = st.session_state.pop(notice_key, None)
	        if pending_notice:
	            st.success(pending_notice)

	    # ---------------- Visualization ----------------
	    st.subheader("📈 Interactive Visualization")

	    viz_type = st.selectbox(
	        "Chart Type",
	        ["Correlation Heatmap", "Distribution Plot", "Scatter Matrix", "Box Plot"]
	    )
	    _render_chart(df, viz_type, all_cols, features, color_col)