Spaces:
Runtime error
Runtime error
File size: 5,351 Bytes
import streamlit as st
import pandas as pd
import io
# Simple EDA page.
#
# Reads the DataFrame stored in ``st.session_state.df`` (presumably set by an
# upload step elsewhere in the app -- confirm against the other pages) and
# renders a series of data-quality sections: preview, shape, dtypes, info,
# column kinds, categorical summaries, descriptive statistics, missing values,
# duplicate rows and IQR-based outlier counts.


def _section_header(title, color):
    """Render a coloured ``<h3>`` section heading.

    Args:
        title: Heading text (trusted, hard-coded in this script).
        color: CSS colour value for the heading.
    """
    st.markdown(f"<h3 style='color: {color};'>{title}</h3>", unsafe_allow_html=True)


def _count_iqr_outliers(frame, columns):
    """Count values lying outside the 1.5 * IQR fences for each column.

    Args:
        frame: DataFrame holding the data.
        columns: Names of the numeric columns to inspect.

    Returns:
        Dict mapping column name to outlier count; columns with zero
        outliers are omitted.
    """
    counts = {}
    for col in columns:
        q1 = frame[col].quantile(0.25)
        q3 = frame[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        n_outliers = int(((frame[col] < lower_bound) | (frame[col] > upper_bound)).sum())
        if n_outliers > 0:
            counts[col] = n_outliers
    return counts


# Page banner. The heading is an <h2>, so it must be closed with </h2>
# (the closing tag used to be </h1>).
st.markdown("""
<div style="text-align: center; margin-bottom: 20px;">
<h2 style="color: #c71585; font-size: 36px;">Simple EDA: Understanding Your Data🔍</h2>
<h3 style="color: #4F4F4F; font-size: 20px;">
This helps us understand the quality of the data and see how the data looks.
</h3>
</div>
""", unsafe_allow_html=True)

if "df" in st.session_state and st.session_state.df is not None:
    df = st.session_state.df

    # Dataset Preview
    _section_header("Dataset Preview📌", "#2a52be")
    st.dataframe(df.head())

    # Shape of the Data
    _section_header("Dataset Shape", "#843f5b")
    st.write(f"🔹 The dataset contains **{df.shape[0]} rows** and **{df.shape[1]} columns**.")

    # Column Names & Data Types
    _section_header("Column Names & Data Types", "#e25822")
    st.write(df.dtypes)

    # Dataset Information (equivalent to df.info()).
    # df.info() prints to a stream, so capture it in a buffer. It is rendered
    # once, as plain text: the report contains user-controlled column names,
    # so it must not be interpolated into raw HTML.
    _section_header("Dataset Information📝", "#9400d3")
    buffer = io.StringIO()
    df.info(buf=buffer)
    st.text(buffer.getvalue())

    # Numerical and categorical columns.
    # 'number' matches every numeric dtype (int32, float32, nullable Int64, ...)
    # instead of only int64/float64, which keeps this list in line with what
    # df.describe() treats as numeric.
    _section_header("Numerical and Categorical Columns", "#9400d3")
    numerical_cols = df.select_dtypes(include="number").columns.tolist()
    categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    # Column labels are not guaranteed to be strings (e.g. integer headers),
    # so convert them before joining.
    numerical_label = ", ".join(map(str, numerical_cols)) if numerical_cols else "None"
    categorical_label = ", ".join(map(str, categorical_cols)) if categorical_cols else "None"
    st.write(f"🔹 **Numerical Columns ({len(numerical_cols)}):** {numerical_label}")
    st.write(f"🔹 **Categorical Columns ({len(categorical_cols)}):** {categorical_label}")

    # Unique Values in Categorical Columns
    _section_header("Unique Values in Categorical Columns", "#e25822")
    if categorical_cols:
        for col in categorical_cols:
            unique_count = df[col].nunique()
            st.write(f"**{col}:** {unique_count} unique values")
    else:
        st.info("No categorical columns detectedℹ️.")

    # Value Counts in Categorical Columns
    _section_header("Value Counts in Categorical Columns", "#9400d3")
    if categorical_cols:
        for col in categorical_cols:
            st.write(f"🔹 **{col} Value Distribution:**")
            st.write(df[col].value_counts().head(10))  # Show top 10 categories
    else:
        st.info("No categorical columns detectedℹ️.")

    # Summary Statistics (numerical).
    # Guarded so that a dataset without numeric columns does not show object
    # statistics under a "Numerical Columns" heading.
    _section_header("Summary Statistics for Numerical Columns", "#843f5b")
    if numerical_cols:
        st.write("🔹 **Basic statistical insights into the dataset:**")
        st.write(df.describe())
    else:
        st.info("No numerical columns detectedℹ️.")

    # Summary Statistics (categorical).
    # Include 'category' as well as 'object': with include='object' alone,
    # describe() raises when every categorical column has the category dtype.
    _section_header("Summary Statistics for Categorical Columns", "#2a52be")
    if categorical_cols:
        st.write(df[categorical_cols].describe(include=["object", "category"]))
    else:
        st.info("No categorical columns detectedℹ️.")

    # Checking for Missing Values
    _section_header("Missing Values in the Dataset⚠️", "#9400d3")
    missing_values = df.isnull().sum()
    if missing_values.sum() == 0:
        st.success("No missing values found!")
    else:
        st.warning("Found missing values in the dataset.")
        st.write("🔹 **Columns with Missing Values:**")
        st.write(missing_values[missing_values > 0])

    # Checking for Duplicate Records
    _section_header("Duplicate Records", "#2a52be")
    duplicate_mask = df.duplicated()
    duplicate_count = duplicate_mask.sum()
    if duplicate_count == 0:
        st.success("No duplicate records found!")
    else:
        st.warning(f"Found {duplicate_count} duplicate rows in the dataset.")
        st.write("🔹 **Example Duplicate Rows:**")
        st.dataframe(df[duplicate_mask].head())

    # Outlier Detection (values outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR)
    _section_header("Outlier Detection", "#e25822")
    if numerical_cols:
        outlier_info = _count_iqr_outliers(df, numerical_cols)
        if outlier_info:
            st.warning("Outliers detected:")
            for col, count in outlier_info.items():
                st.write(f"🔹 **{col}:** {count} outliers")
        else:
            st.success("No significant outliers detected!")
    else:
        st.info("No numerical columns detectedℹ️.")
else:
    st.warning("No dataset found! Please upload a dataset first⚠️.")