Spaces:

Harika22
/

ML_Automate_Hub

Runtime error

App Files Files Community

ML_Automate_Hub / pages /2_Simple_EDA.py

Harika22

Update pages/2_Simple_EDA.py

2d76a15 verified 11 months ago

raw

history blame contribute delete

5.35 kB

	import streamlit as st
	import pandas as pd
	import io

	# Page header: centered title and subtitle rendered as raw HTML.
	# The title's closing tag now matches its opening <h2> (it was </h1>).
	st.markdown("""
	<div style="text-align: center; margin-bottom: 20px;">
	<h2 style="color: #c71585; font-size: 36px;">Simple EDA: Understanding Your Data🔍</h2>
	<h3 style="color: #4F4F4F; font-size: 20px;">
	This helps us understand the quality of the data and see how the data looks.
	</h3>
	</div>
	""", unsafe_allow_html=True)

	# Render the EDA report only when st.session_state holds a non-None "df"
	# entry; otherwise prompt the user to upload a dataset.
	if "df" in st.session_state and st.session_state.df is not None:
	    df = st.session_state.df

	    # Dataset Preview
	    st.markdown("<h3 style='color: #2a52be;'>Dataset Preview📌</h3>", unsafe_allow_html=True)
	    st.dataframe(df.head())

	    # Shape of the Data
	    st.markdown("<h3 style='color: #843f5b;'>Dataset Shape</h3>", unsafe_allow_html=True)
	    st.write(f"🔹 The dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")

	    # Column Names & Data Types
	    st.markdown("<h3 style='color: #e25822;'>Column Names & Data Types</h3>", unsafe_allow_html=True)
	    st.write(df.dtypes)

	    # 📝 Dataset Information (Equivalent to df.info())
	    st.markdown("<h3 style='color: #9400d3;'>Dataset Information📝</h3>", unsafe_allow_html=True)

	    # df.info() prints instead of returning a string, so capture it in a buffer.
	    buffer = io.StringIO()
	    df.info(buf=buffer)
	    info_str = buffer.getvalue()
	    # Rendered once, as plain text: st.text does not interpret HTML, so column
	    # names coming from the dataset cannot inject markup into the page.
	    st.text(info_str)

	    # Numerical and categorical Columns
	    st.markdown("<h3 style='color: #9400d3;'>Numerical and Categorical Columns</h3>", unsafe_allow_html=True)

	    # One dtype list drives both the column split and the categorical summary
	    # below, so the two can never disagree about what "categorical" means.
	    categorical_dtypes = ["object", "category"]
	    # "number" matches every numeric dtype (int32, float32, nullable ints, ...),
	    # not only int64/float64.
	    numerical_cols = df.select_dtypes(include="number").columns.tolist()
	    categorical_cols = df.select_dtypes(include=categorical_dtypes).columns.tolist()

	    # Column labels are not guaranteed to be strings (e.g. integer headers),
	    # so convert them before joining.
	    numerical_names = ", ".join(map(str, numerical_cols)) if numerical_cols else "None"
	    categorical_names = ", ".join(map(str, categorical_cols)) if categorical_cols else "None"

	    st.write(f"🔹 Numerical Columns ({len(numerical_cols)}): {numerical_names}")
	    st.write(f"🔹 Categorical Columns ({len(categorical_cols)}): {categorical_names}")

	    # Unique Values in Categorical Columns
	    st.markdown("<h3 style='color: #e25822;'>Unique Values in Categorical Columns</h3>", unsafe_allow_html=True)

	    if categorical_cols:
	        for col in categorical_cols:
	            unique_count = df[col].nunique()
	            st.write(f"{col}: {unique_count} unique values")
	    else:
	        st.info("No categorical columns detectedℹ️.")

	    # Value Counts in Categorical Columns
	    st.markdown("<h3 style='color: #9400d3;'>Value Counts in Categorical Columns</h3>", unsafe_allow_html=True)

	    if categorical_cols:
	        for col in categorical_cols:
	            st.write(f"🔹 {col} Value Distribution:")
	            st.write(df[col].value_counts().head(10))  # Show top 10 categories
	    else:
	        st.info("No categorical columns detectedℹ️.")

	    # Summary Statistics
	    st.markdown("<h3 style='color: #843f5b;'>Summary Statistics for Numerical Columns</h3>", unsafe_allow_html=True)

	    if numerical_cols:
	        st.write("🔹 Basic statistical insights into the dataset:")
	        # Restrict to the numeric columns explicitly: a bare df.describe()
	        # silently describes non-numeric columns when no numeric ones exist.
	        st.write(df[numerical_cols].describe())
	    else:
	        st.info("No numerical columns detectedℹ️.")

	    st.markdown("<h3 style='color: #2a52be;'>Summary Statistics for Categorical Columns</h3>", unsafe_allow_html=True)

	    if categorical_cols:
	        # Include "category" as well as "object"; with include='object' alone a
	        # frame holding only category-dtype columns has nothing to describe
	        # and pandas raises.
	        st.write(df[categorical_cols].describe(include=categorical_dtypes))
	    else:
	        st.info("No categorical columns detectedℹ️.")

	    # Checking for Missing Values
	    st.markdown("<h3 style='color: #9400d3;'>Missing Values in the Dataset⚠️</h3>", unsafe_allow_html=True)
	    missing_values = df.isnull().sum()

	    if missing_values.sum() == 0:
	        st.success("No missing values found!")
	    else:
	        st.warning("Found missing values in the dataset.")
	        st.write("🔹 Columns with Missing Values:")
	        st.write(missing_values[missing_values > 0])

	    # Checking for Duplicate Records
	    st.markdown("<h3 style='color: #2a52be;'>Duplicate Records</h3>", unsafe_allow_html=True)
	    duplicate_count = df.duplicated().sum()

	    if duplicate_count == 0:
	        st.success("No duplicate records found!")
	    else:
	        st.warning(f"Found {duplicate_count} duplicate rows in the dataset.")
	        st.write("🔹 Example Duplicate Rows:")
	        st.dataframe(df[df.duplicated()].head())

	    # 📊 Outlier Detection
	    st.markdown("<h3 style='color: #e25822;'>Outlier Detection</h3>", unsafe_allow_html=True)

	    if numerical_cols:
	        # Maps column label -> number of values outside the 1.5 * IQR fences.
	        outlier_info = {}

	        for col in numerical_cols:
	            q1 = df[col].quantile(0.25)
	            q3 = df[col].quantile(0.75)
	            iqr = q3 - q1
	            lower_bound = q1 - 1.5 * iqr
	            upper_bound = q3 + 1.5 * iqr
	            # Element-wise OR of the two boolean masks, then count the True values.
	            outliers = int(((df[col] < lower_bound) | (df[col] > upper_bound)).sum())

	            if outliers > 0:
	                outlier_info[col] = outliers

	        if outlier_info:
	            st.warning("Outliers detected:")
	            for col, count in outlier_info.items():
	                st.write(f"🔹 {col}: {count} outliers")
	        else:
	            st.success("No significant outliers detected!")
	    else:
	        st.info("No numerical columns detectedℹ️.")

	else:
	    st.warning("No dataset found! Please upload a dataset first⚠️.")