dataanalyst / dataset_overview.py
RamAi2026's picture
Upload 13 files
da8e446 verified
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def eda_analysis(df):
"""
Comprehensive Exploratory Data Analysis (EDA) with visual insights
"""
st.markdown("""
<div style='text-align: center; margin-bottom: 2rem;'>
<h2>πŸ” Exploratory Data Analysis (EDA)</h2>
<p style='color: gray;'>Discover patterns, relationships, and insights through visual exploration</p>
</div>
""", unsafe_allow_html=True)
# Error handling
if df.empty:
st.error("❌ The dataset is empty. Please upload a valid dataset.")
return
try:
# Create tabs for different EDA aspects
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
"πŸ“‹ Data Overview",
"πŸ” Missing Data Analysis",
"πŸ“Š Univariate Analysis",
"πŸ”„ Bivariate Analysis",
"πŸ“ˆ Multivariate Analysis",
"🎯 Pattern Discovery"
])
with tab1:
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
st.subheader("πŸ“‹ Dataset Overview")
try:
# Key metrics in cards
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Rows", f"{df.shape[0]:,}")
with col2:
st.metric("Total Columns", df.shape[1])
with col3:
memory_usage = df.memory_usage(deep=True).sum() / 1024**2
st.metric("Memory Usage", f"{memory_usage:.2f} MB")
with col4:
missing_total = df.isnull().sum().sum()
st.metric("Missing Values", f"{missing_total:,}")
# Data preview with interactive controls
st.subheader("πŸ” Data Preview")
col1, col2 = st.columns(2)
with col1:
preview_rows = st.slider("Number of rows to display", 5, 50, 10, key="preview_rows")
with col2:
preview_type = st.radio("Preview type", ["Head", "Tail", "Random Sample"],
horizontal=True, key="preview_type")
if preview_type == "Head":
st.dataframe(df.head(preview_rows), use_container_width=True)
elif preview_type == "Tail":
st.dataframe(df.tail(preview_rows), use_container_width=True)
else:
if len(df) > preview_rows:
st.dataframe(df.sample(preview_rows), use_container_width=True)
else:
st.warning("⚠️ Sample size larger than dataset. Showing all rows.")
st.dataframe(df, use_container_width=True)
# Column information with visual indicators
st.subheader("πŸ“‹ Column Information")
col_info = pd.DataFrame({
'Column': df.columns,
'Data Type': df.dtypes.astype(str),
'Non-Null Count': df.count().values,
'Null Count': df.isnull().sum().values,
'Null %': (df.isnull().sum().values / len(df) * 100).round(2),
'Unique Values': [df[col].nunique() for col in df.columns],
'Sample Values': [str(df[col].dropna().iloc[:3].tolist()) if len(df[col].dropna()) > 0 else "All null" for col in df.columns]
})
# Add color coding for data types
def color_data_type(val):
if 'int' in val or 'float' in val:
return 'background-color: #e3f2fd'
elif 'object' in val:
return 'background-color: #f1f8e9'
elif 'datetime' in val:
return 'background-color: #fff3e0'
return ''
st.dataframe(col_info.style.applymap(color_data_type, subset=['Data Type']),
use_container_width=True)
# Data type distribution
st.subheader("πŸ“Š Data Type Distribution")
dtype_counts = df.dtypes.value_counts()
if len(dtype_counts) > 0:
fig = make_subplots(rows=1, cols=2,
specs=[[{"type": "pie"}, {"type": "bar"}]],
subplot_titles=("Pie Chart", "Bar Chart"))
fig.add_trace(go.Pie(labels=dtype_counts.index.astype(str),
values=dtype_counts.values,
hole=0.3), row=1, col=1)
fig.add_trace(go.Bar(x=dtype_counts.index.astype(str),
y=dtype_counts.values,
marker_color=['#42a5f5', '#66bb6a', '#ffa726'][:len(dtype_counts)]),
row=1, col=2)
fig.update_layout(height=400, title_text="Column Types Distribution")
st.plotly_chart(fig, use_container_width=True)
else:
st.warning("⚠️ No data type information available")
# Dataset statistics
st.subheader("πŸ“ˆ Dataset Statistics")
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
bool_cols = df.select_dtypes(include=['bool']).columns.tolist()
col1, col2, col3, col4 = st.columns(4)
with col1:
st.info(f"**Numeric:** {len(numeric_cols)} columns")
with col2:
st.info(f"**Categorical:** {len(categorical_cols)} columns")
with col3:
st.info(f"**Datetime:** {len(datetime_cols)} columns")
with col4:
st.info(f"**Boolean:** {len(bool_cols)} columns")
except Exception as e:
st.error(f"❌ Error in data overview: {str(e)}")
st.info("πŸ’‘ Tip: Check if your dataset contains valid data types")
st.markdown('</div>', unsafe_allow_html=True)
with tab2:
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
st.subheader("πŸ” Missing Data Analysis")
try:
if df.isnull().sum().sum() > 0:
# Missing data overview
missing_df = pd.DataFrame({
'Column': df.columns,
'Missing Count': df.isnull().sum().values,
'Missing %': (df.isnull().sum().values / len(df) * 100).round(2)
}).sort_values('Missing %', ascending=False)
missing_df = missing_df[missing_df['Missing Count'] > 0]
if len(missing_df) > 0:
# Visualize missing data
fig = make_subplots(rows=2, cols=2,
subplot_titles=("Missing Values Heatmap",
"Missing Values by Column",
"Missing Data Patterns",
"Missing Data Matrix"),
specs=[[{"type": "heatmap"}, {"type": "bar"}],
[{"type": "scatter"}, {"type": "heatmap"}]])
# Heatmap of missing values
missing_matrix = df.isnull().astype(int).T
fig.add_trace(go.Heatmap(z=missing_matrix.values,
y=missing_matrix.index,
colorscale='Reds',
showscale=False), row=1, col=1)
# Bar chart of missing values
fig.add_trace(go.Bar(x=missing_df['Column'].head(20),
y=missing_df['Missing Count'].head(20),
marker_color='#ef5350',
name="Missing Count"), row=1, col=2)
# Missing data patterns (rows with missing data)
missing_rows = df[df.isnull().any(axis=1)]
if len(missing_rows) > 0:
pattern_df = missing_rows.isnull().sum(axis=1).value_counts().reset_index()
pattern_df.columns = ['Missing Count per Row', 'Number of Rows']
pattern_df = pattern_df.sort_values('Missing Count per Row')
fig.add_trace(go.Scatter(x=pattern_df['Missing Count per Row'],
y=pattern_df['Number of Rows'],
mode='lines+markers',
name="Patterns"), row=2, col=1)
# Missing data matrix for first 50 rows
sample_missing = df.head(min(50, len(df))).isnull().astype(int).T
fig.add_trace(go.Heatmap(z=sample_missing.values,
y=sample_missing.index,
colorscale='Reds',
showscale=False,
name="Matrix"), row=2, col=2)
fig.update_layout(height=800, title_text="Missing Data Analysis",
showlegend=False)
st.plotly_chart(fig, use_container_width=True)
# Detailed missing data table
st.subheader("πŸ“‹ Missing Data Details")
# Add severity classification
def classify_severity(pct):
if pct == 0:
return "βœ… None"
elif pct < 5:
return "🟒 Low"
elif pct < 20:
return "🟑 Medium"
else:
return "πŸ”΄ High"
missing_df['Severity'] = missing_df['Missing %'].apply(classify_severity)
missing_df['Recommendation'] = missing_df['Missing %'].apply(
lambda x: "No action needed" if x == 0 else
"Consider imputation" if x < 5 else
"Imputation recommended" if x < 20 else
"Consider dropping column"
)
st.dataframe(missing_df, use_container_width=True)
# Missing data patterns
if len(missing_df) > 1:
st.subheader("πŸ”„ Missing Data Patterns")
# Find columns with similar missing patterns
missing_corr = df[missing_df['Column'].tolist()].isnull().corr()
if len(missing_corr) > 1:
fig = px.imshow(missing_corr,
text_auto=True,
aspect="auto",
color_continuous_scale='RdBu_r',
title="Missing Value Correlation Matrix")
st.plotly_chart(fig, use_container_width=True)
# Find highly correlated missing patterns
high_corr = []
for i in range(len(missing_corr.columns)):
for j in range(i+1, len(missing_corr.columns)):
if abs(missing_corr.iloc[i, j]) > 0.7:
high_corr.append({
'Column 1': missing_corr.columns[i],
'Column 2': missing_corr.columns[j],
'Correlation': missing_corr.iloc[i, j]
})
if high_corr:
st.info("πŸ” **Columns with similar missing patterns:**")
for item in high_corr[:5]: # Show top 5
st.write(f"β€’ {item['Column 1']} & {item['Column 2']}: {item['Correlation']:.2f}")
else:
st.success("βœ… No missing values found in the dataset!")
else:
st.success("βœ… No missing values found in the dataset!")
# Show complete data visualization
fig = go.Figure()
fig.add_trace(go.Indicator(
mode="number+gauge",
value=100,
title={'text': "Data Completeness"},
gauge={'axis': {'range': [0, 100]},
'bar': {'color': "green"},
'steps': [{'range': [0, 100], 'color': "lightgreen"}]}
))
st.plotly_chart(fig, use_container_width=True)
except Exception as e:
st.error(f"❌ Error in missing data analysis: {str(e)}")
st.info("πŸ’‘ Tip: Ensure your dataset has valid data for missing value analysis")
st.markdown('</div>', unsafe_allow_html=True)
with tab3:
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
st.subheader("πŸ“Š Univariate Analysis")
try:
col_type = st.radio("Select column type", ["Numeric", "Categorical", "Datetime"],
horizontal=True, key="univariate_type")
if col_type == "Numeric":
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if numeric_cols:
selected_col = st.selectbox("Select numeric column", numeric_cols, key="univariate_num")
data = df[selected_col].dropna()
if len(data) > 0:
# Create comprehensive visualization
fig = make_subplots(rows=2, cols=3,
subplot_titles=("Histogram", "Box Plot", "Violin Plot",
"ECDF", "QQ Plot", "Summary Stats"),
specs=[[{"type": "xy"}, {"type": "xy"}, {"type": "xy"}],
[{"type": "xy"}, {"type": "xy"}, {"type": "domain"}]])
# Histogram
fig.add_trace(go.Histogram(x=data, nbinsx=30, name="Histogram",
marker_color='#42a5f5'), row=1, col=1)
# Box plot
fig.add_trace(go.Box(y=data, name="Box Plot", boxpoints='outliers',
marker_color='#66bb6a'), row=1, col=2)
# Violin plot
fig.add_trace(go.Violin(y=data, name="Violin Plot", box_visible=True,
line_color='black', fillcolor='#ffa726',
opacity=0.6), row=1, col=3)
# ECDF
sorted_data = np.sort(data)
ecdf = np.arange(1, len(sorted_data)+1) / len(sorted_data)
fig.add_trace(go.Scatter(x=sorted_data, y=ecdf, mode='lines',
name="ECDF", line=dict(color='#ab47bc')),
row=2, col=1)
# QQ plot
theoretical_q = np.random.normal(data.mean(), data.std(), len(data))
theoretical_q.sort()
fig.add_trace(go.Scatter(x=theoretical_q, y=sorted_data,
mode='markers', name="QQ Plot",
marker=dict(color='#7e57c2', size=3)),
row=2, col=2)
# Add reference line to QQ plot
min_val = min(theoretical_q.min(), sorted_data.min())
max_val = max(theoretical_q.max(), sorted_data.max())
fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
mode='lines', line=dict(color='red', dash='dash'),
showlegend=False), row=2, col=2)
# Summary statistics as table
stats_text = f"""
<b>Summary Statistics</b><br>
Count: {len(data):,}<br>
Mean: {data.mean():.4f}<br>
Std: {data.std():.4f}<br>
Min: {data.min():.4f}<br>
Q1: {data.quantile(0.25):.4f}<br>
Median: {data.median():.4f}<br>
Q3: {data.quantile(0.75):.4f}<br>
Max: {data.max():.4f}<br>
IQR: {data.quantile(0.75) - data.quantile(0.25):.4f}<br>
Skewness: {data.skew():.4f}<br>
Kurtosis: {data.kurtosis():.4f}
"""
fig.add_annotation(x=0.5, y=0.5, text=stats_text,
showarrow=False, font=dict(size=10),
row=2, col=3, align='left')
fig.update_layout(height=800, title_text=f"Univariate Analysis: {selected_col}")
st.plotly_chart(fig, use_container_width=True)
# Outlier detection
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
outliers = data[(data < Q1 - 1.5 * IQR) | (data > Q3 + 1.5 * IQR)]
col1, col2 = st.columns(2)
with col1:
st.metric("Outliers Count", len(outliers))
with col2:
st.metric("Outliers %", f"{len(outliers)/len(data)*100:.2f}%")
if len(outliers) > 0:
with st.expander("View outlier values"):
st.write(outliers.tolist()[:20]) # Show first 20 outliers
if len(outliers) > 20:
st.info(f"... and {len(outliers) - 20} more outliers")
else:
st.warning("⚠️ No numeric columns available for analysis")
elif col_type == "Categorical":
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
if categorical_cols:
selected_col = st.selectbox("Select categorical column", categorical_cols,
key="univariate_cat")
# Get value counts
value_counts = df[selected_col].value_counts().reset_index()
value_counts.columns = [selected_col, 'count']
value_counts['percentage'] = (value_counts['count'] / len(df) * 100).round(2)
if len(value_counts) > 0:
# Create visualizations
fig = make_subplots(rows=2, cols=2,
subplot_titles=("Bar Chart (Top 20)", "Pie Chart (Top 10)",
"Treemap (Top 10)", "Frequency Table"),
specs=[[{"type": "xy"}, {"type": "domain"}],
[{"type": "domain"}, {"type": "table"}]])
# Bar chart (top 20)
top20 = value_counts.head(20)
fig.add_trace(go.Bar(x=top20[selected_col],
y=top20['count'],
marker_color='#42a5f5',
name="Count"), row=1, col=1)
# Pie chart (top 10)
top10 = value_counts.head(10)
fig.add_trace(go.Pie(labels=top10[selected_col],
values=top10['count'],
hole=0.3,
textinfo='percent+label',
name="Proportion"), row=1, col=2)
# Treemap (top 10)
fig.add_trace(go.Treemap(labels=top10[selected_col],
parents=['']*len(top10),
values=top10['count'],
textinfo='label+value',
name="Treemap"), row=2, col=1)
# Frequency table (top 10)
fig.add_trace(go.Table(header=dict(values=[selected_col, 'Count', 'Percentage']),
cells=dict(values=[top10[selected_col].tolist(),
top10['count'].tolist(),
top10['percentage'].tolist()]),
name="Table"), row=2, col=2)
fig.update_layout(height=800, title_text=f"Categorical Analysis: {selected_col}")
st.plotly_chart(fig, use_container_width=True)
# Summary statistics for categorical
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Unique Values", f"{value_counts.shape[0]:,}")
with col2:
st.metric("Most Frequent", f"{value_counts.iloc[0, 0]}")
with col3:
st.metric("Frequency", f"{value_counts.iloc[0, 1]:,} ({value_counts.iloc[0, 2]}%)")
# Cardinality warning
if value_counts.shape[0] > 50:
st.warning(f"⚠️ High cardinality detected: {value_counts.shape[0]} unique values. Consider grouping rare categories.")
else:
st.warning("⚠️ No categorical columns available for analysis")
elif col_type == "Datetime":
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
if datetime_cols:
selected_col = st.selectbox("Select datetime column", datetime_cols,
key="univariate_datetime")
# Extract temporal features
df_temp = df[selected_col].dropna()
if len(df_temp) > 0:
# Create temporal distributions
fig = make_subplots(rows=2, cols=2,
subplot_titles=("Year Distribution", "Month Distribution",
"Day of Week Distribution", "Hour Distribution"),
specs=[[{"type": "xy"}, {"type": "xy"}],
[{"type": "xy"}, {"type": "xy"}]])
# Year distribution
years = df_temp.dt.year.value_counts().sort_index()
if len(years) > 0:
fig.add_trace(go.Bar(x=years.index.astype(str), y=years.values,
marker_color='#42a5f5', name="Year"), row=1, col=1)
# Month distribution
months = df_temp.dt.month.value_counts().sort_index()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
if len(months) > 0:
fig.add_trace(go.Bar(x=[month_names[i-1] for i in months.index],
y=months.values, marker_color='#66bb6a',
name="Month"), row=1, col=2)
# Day of week distribution
days = df_temp.dt.dayofweek.value_counts().sort_index()
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
if len(days) > 0:
fig.add_trace(go.Bar(x=[day_names[i] for i in days.index],
y=days.values, marker_color='#ffa726',
name="Day of Week"), row=2, col=1)
# Hour distribution (if time component exists)
if df_temp.dt.hour.nunique() > 1:
hours = df_temp.dt.hour.value_counts().sort_index()
fig.add_trace(go.Bar(x=hours.index.astype(str), y=hours.values,
marker_color='#ab47bc', name="Hour"), row=2, col=2)
fig.update_layout(height=800, title_text=f"Temporal Analysis: {selected_col}")
st.plotly_chart(fig, use_container_width=True)
# Date range information
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Start Date", df_temp.min().strftime('%Y-%m-%d'))
with col2:
st.metric("End Date", df_temp.max().strftime('%Y-%m-%d'))
with col3:
date_range = (df_temp.max() - df_temp.min()).days
st.metric("Date Range", f"{date_range} days")
else:
st.warning("⚠️ No datetime columns available for analysis")
except Exception as e:
st.error(f"❌ Error in univariate analysis: {str(e)}")
st.info("πŸ’‘ Tip: Ensure the selected column contains valid data for analysis")
st.markdown('</div>', unsafe_allow_html=True)
with tab4:
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
st.subheader("πŸ”„ Bivariate Analysis")
try:
analysis_type = st.radio("Select analysis type",
["Numeric vs Numeric", "Numeric vs Categorical",
"Categorical vs Categorical"],
horizontal=True, key="bivariate_type")
if analysis_type == "Numeric vs Numeric":
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_cols) >= 2:
col1, col2 = st.columns(2)
with col1:
x_col = st.selectbox("Select X axis", numeric_cols, key="bi_x")
with col2:
y_col = st.selectbox("Select Y axis", [c for c in numeric_cols if c != x_col],
key="bi_y")
# Clean data for analysis
plot_df = df[[x_col, y_col]].dropna()
if len(plot_df) > 0:
# Create comprehensive visualization
fig = make_subplots(rows=2, cols=3,
subplot_titles=("Scatter Plot", "Hexbin Plot", "Density Contour",
"Marginal Distributions", "Residuals", "Statistics"),
specs=[[{"type": "xy"}, {"type": "xy"}, {"type": "xy"}],
[{"type": "xy"}, {"type": "xy"}, {"type": "domain"}]])
# Scatter plot with trendline
fig.add_trace(go.Scatter(x=plot_df[x_col], y=plot_df[y_col],
mode='markers', name="Scatter",
marker=dict(size=5, opacity=0.6, color='#42a5f5')),
row=1, col=1)
# Add trendline
try:
z = np.polyfit(plot_df[x_col], plot_df[y_col], 1)
p = np.poly1d(z)
x_range = np.linspace(plot_df[x_col].min(), plot_df[x_col].max(), 100)
fig.add_trace(go.Scatter(x=x_range, y=p(x_range),
mode='lines', name="Trend",
line=dict(color='red', width=2)), row=1, col=1)
except:
pass
# Hexbin plot
fig.add_trace(go.Histogram2d(x=plot_df[x_col], y=plot_df[y_col],
colorscale='Viridis',
name="Hexbin"), row=1, col=2)
# Density contour
fig.add_trace(go.Histogram2dContour(x=plot_df[x_col], y=plot_df[y_col],
colorscale='Viridis',
name="Contour"), row=1, col=3)
# Marginal distributions
fig.add_trace(go.Histogram(x=plot_df[x_col], name=f"{x_col}",
marker_color='#66bb6a'), row=2, col=1)
fig.add_trace(go.Histogram(y=plot_df[y_col], name=f"{y_col}",
marker_color='#ffa726', orientation='h'),
row=2, col=1)
# Residuals
try:
residuals = plot_df[y_col] - p(plot_df[x_col])
fig.add_trace(go.Scatter(x=plot_df[x_col], y=residuals,
mode='markers', name="Residuals",
marker=dict(size=3, opacity=0.5, color='#ab47bc')),
row=2, col=2)
fig.add_hline(y=0, line_dash="dash", line_color="red", row=2, col=2)
except:
pass
# Statistics
corr = plot_df[x_col].corr(plot_df[y_col])
stats_text = f"""
<b>Statistics</b><br>
Correlation: {corr:.4f}<br>
RΒ²: {corr**2:.4f}<br>
Covariance: {plot_df[x_col].cov(plot_df[y_col]):.4f}<br>
Sample Size: {len(plot_df)}<br>
"""
fig.add_annotation(x=0.5, y=0.5, text=stats_text,
showarrow=False, font=dict(size=10),
row=2, col=3, align='left')
fig.update_layout(height=800, title_text=f"Bivariate Analysis: {x_col} vs {y_col}")
st.plotly_chart(fig, use_container_width=True)
# Correlation interpretation
if abs(corr) > 0.7:
st.success(f"βœ… Strong {'positive' if corr > 0 else 'negative'} correlation detected")
elif abs(corr) > 0.3:
st.info(f"ℹ️ Moderate {'positive' if corr > 0 else 'negative'} correlation detected")
else:
st.warning(f"⚠️ Weak or no correlation detected")
else:
st.warning("⚠️ Need at least 2 numeric columns for this analysis")
elif analysis_type == "Numeric vs Categorical":
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
if numeric_cols and categorical_cols:
col1, col2 = st.columns(2)
with col1:
num_col = st.selectbox("Select numeric column", numeric_cols, key="bi_num")
with col2:
cat_col = st.selectbox("Select categorical column", categorical_cols, key="bi_cat")
# Clean data
plot_df = df[[num_col, cat_col]].dropna()
if len(plot_df) > 0 and plot_df[cat_col].nunique() <= 30:
# Create visualizations
fig = make_subplots(rows=2, cols=2,
subplot_titles=("Box Plot", "Violin Plot",
"Strip Plot", "Bar Chart (Means Β± SD)"),
specs=[[{"type": "xy"}, {"type": "xy"}],
[{"type": "xy"}, {"type": "xy"}]])
# Box plot
fig.add_trace(go.Box(x=plot_df[cat_col], y=plot_df[num_col],
name="Box Plot", marker_color='#42a5f5'), row=1, col=1)
# Violin plot
fig.add_trace(go.Violin(x=plot_df[cat_col], y=plot_df[num_col],
box_visible=True, line_color='black',
fillcolor='#66bb6a', opacity=0.6,
name="Violin Plot"), row=1, col=2)
# Strip plot
fig.add_trace(go.Scatter(x=plot_df[cat_col], y=plot_df[num_col],
mode='markers', name="Strip Plot",
marker=dict(size=3, opacity=0.3, color='#ffa726')),
row=2, col=1)
# Bar chart with error bars
stats_by_cat = plot_df.groupby(cat_col)[num_col].agg(['mean', 'std', 'count']).reset_index()
stats_by_cat = stats_by_cat.sort_values('mean', ascending=False).head(15)
fig.add_trace(go.Bar(x=stats_by_cat[cat_col], y=stats_by_cat['mean'],
error_y=dict(type='data', array=stats_by_cat['std']),
name="Mean Β± SD", marker_color='#ab47bc'),
row=2, col=2)
fig.update_layout(height=800, title_text=f"{num_col} by {cat_col}")
st.plotly_chart(fig, use_container_width=True)
# ANOVA test for groups with >2 categories
if plot_df[cat_col].nunique() >= 2:
groups = [group[num_col].values for name, group in plot_df.groupby(cat_col)]
if all(len(g) > 0 for g in groups):
f_stat, p_val = stats.f_oneway(*groups)
st.write(f"**One-way ANOVA Results:** F-statistic = {f_stat:.4f}, p-value = {p_val:.4f}")
if p_val < 0.05:
st.success("βœ… Significant differences exist between groups")
else:
st.info("ℹ️ No significant differences found between groups")
elif plot_df[cat_col].nunique() > 30:
st.warning(f"⚠️ Categorical column has {plot_df[cat_col].nunique()} unique values. Consider grouping or selecting another column.")
else:
st.warning("⚠️ Need both numeric and categorical columns for this analysis")
elif analysis_type == "Categorical vs Categorical":
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
if len(categorical_cols) >= 2:
col1, col2 = st.columns(2)
with col1:
cat1 = st.selectbox("Select first categorical column", categorical_cols, key="bi_cat1")
with col2:
cat2 = st.selectbox("Select second categorical column",
[c for c in categorical_cols if c != cat1], key="bi_cat2")
# Create contingency table
contingency = pd.crosstab(df[cat1], df[cat2])
if contingency.size > 0:
fig = make_subplots(rows=1, cols=2,
subplot_titles=("Stacked Bar Chart", "Heatmap"),
specs=[[{"type": "xy"}, {"type": "heatmap"}]])
# Stacked bar chart
for col in contingency.columns[:10]: # Limit to 10 categories
fig.add_trace(go.Bar(x=contingency.index[:10], y=contingency[col][:10],
name=str(col)), row=1, col=1)
# Heatmap
fig.add_trace(go.Heatmap(z=contingency.values[:10, :10],
x=contingency.columns[:10].astype(str),
y=contingency.index[:10].astype(str),
colorscale='Viridis',
text=contingency.values[:10, :10],
texttemplate="%{text}"), row=1, col=2)
fig.update_layout(height=600, title_text=f"Relationship: {cat1} vs {cat2}",
barmode='stack')
st.plotly_chart(fig, use_container_width=True)
# Chi-square test
from scipy.stats import chi2_contingency
chi2, p_val, dof, expected = chi2_contingency(contingency)
st.write(f"**Chi-square Test Results:**")
st.write(f"χ² = {chi2:.4f}, df = {dof}, p-value = {p_val:.4f}")
if p_val < 0.05:
st.success("βœ… Significant association found between variables")
# Cramer's V for effect size
n = contingency.sum().sum()
cramer_v = np.sqrt(chi2 / (n * (min(contingency.shape) - 1)))
st.write(f"**Cramer's V (effect size):** {cramer_v:.4f}")
else:
st.info("ℹ️ No significant association found")
else:
st.warning("⚠️ Need at least 2 categorical columns for this analysis")
except Exception as e:
st.error(f"❌ Error in bivariate analysis: {str(e)}")
st.info("πŸ’‘ Tip: Check if selected columns have sufficient data for analysis")
st.markdown('</div>', unsafe_allow_html=True)
with tab5:
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
st.subheader("πŸ“ˆ Multivariate Analysis")
try:
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_cols) >= 3:
analysis_type = st.radio("Select analysis type",
["Correlation Matrix", "Parallel Coordinates",
"3D Scatter", "Radar Chart"],
horizontal=True, key="multivariate_type")
if analysis_type == "Correlation Matrix":
corr_matrix = df[numeric_cols].corr()
fig = px.imshow(corr_matrix,
text_auto=True,
aspect="auto",
color_continuous_scale='RdBu_r',
title="Correlation Matrix Heatmap",
zmin=-1, zmax=1)
fig.update_layout(height=700)
st.plotly_chart(fig, use_container_width=True)
# Find highly correlated pairs
high_corr = []
for i in range(len(numeric_cols)):
for j in range(i+1, len(numeric_cols)):
if abs(corr_matrix.iloc[i, j]) > 0.7:
high_corr.append({
'Feature 1': numeric_cols[i],
'Feature 2': numeric_cols[j],
'Correlation': corr_matrix.iloc[i, j]
})
if high_corr:
st.subheader("πŸ” Highly Correlated Pairs (|r| > 0.7)")
for item in high_corr:
st.write(f"β€’ **{item['Feature 1']}** & **{item['Feature 2']}**: {item['Correlation']:.4f}")
elif analysis_type == "Parallel Coordinates":
# Select dimensions
selected_dims = st.multiselect("Select dimensions (columns)",
numeric_cols,
default=numeric_cols[:min(4, len(numeric_cols))])
if len(selected_dims) >= 2:
# Optional color dimension
color_dim = st.selectbox("Color by", ["None"] + numeric_cols +
df.select_dtypes(include=['object', 'category']).columns.tolist())
plot_df = df[selected_dims].dropna()
if len(plot_df) > 0:
if color_dim == "None":
fig = px.parallel_coordinates(plot_df,
dimensions=selected_dims,
title="Parallel Coordinates Plot")
else:
if color_dim in numeric_cols:
fig = px.parallel_coordinates(plot_df,
dimensions=selected_dims,
color=color_dim,
color_continuous_scale=px.colors.diverging.RdBu,
title=f"Parallel Coordinates colored by {color_dim}")
else:
# Categorical color
temp_df = df[selected_dims + [color_dim]].dropna()
fig = px.parallel_coordinates(temp_df,
dimensions=selected_dims,
color=color_dim,
title=f"Parallel Coordinates colored by {color_dim}")
fig.update_layout(height=600)
st.plotly_chart(fig, use_container_width=True)
elif analysis_type == "3D Scatter":
if len(numeric_cols) >= 3:
col1, col2, col3 = st.columns(3)
with col1:
x_3d = st.selectbox("X axis", numeric_cols, key="3d_x")
with col2:
y_3d = st.selectbox("Y axis", [c for c in numeric_cols if c != x_3d], key="3d_y")
with col3:
z_3d = st.selectbox("Z axis", [c for c in numeric_cols if c not in [x_3d, y_3d]],
key="3d_z")
color_3d = st.selectbox("Color by", ["None"] +
df.select_dtypes(include=['object', 'category']).columns.tolist())
plot_df = df[[x_3d, y_3d, z_3d]].dropna()
if len(plot_df) > 0:
if color_3d == "None":
fig = px.scatter_3d(plot_df, x=x_3d, y=y_3d, z=z_3d,
title=f"3D Scatter Plot",
opacity=0.7)
else:
temp_df = df[[x_3d, y_3d, z_3d, color_3d]].dropna()
fig = px.scatter_3d(temp_df, x=x_3d, y=y_3d, z=z_3d,
color=color_3d,
title=f"3D Scatter colored by {color_3d}",
opacity=0.7)
fig.update_layout(height=700)
st.plotly_chart(fig, use_container_width=True)
elif analysis_type == "Radar Chart":
# Select features for radar
radar_features = st.multiselect("Select features for radar chart",
numeric_cols,
default=numeric_cols[:min(5, len(numeric_cols))])
if len(radar_features) >= 3:
# Select how many samples to show
n_samples = st.slider("Number of samples to show", 1, min(10, len(df)), 3)
fig = go.Figure()
for i in range(n_samples):
sample = df.iloc[i][radar_features].values
fig.add_trace(go.Scatterpolar(
r=sample,
theta=radar_features,
fill='toself',
name=f'Sample {i}'
))
fig.update_layout(
polar=dict(
radialaxis=dict(
visible=True,
range=[df[radar_features].min().min(), df[radar_features].max().max()]
)),
title=f"Radar Chart - First {n_samples} Samples",
height=600
)
st.plotly_chart(fig, use_container_width=True)
else:
st.warning("⚠️ Need at least 3 numeric columns for multivariate analysis")
except Exception as e:
st.error(f"❌ Error in multivariate analysis: {str(e)}")
st.info("πŸ’‘ Tip: Ensure you have enough numeric columns for multivariate analysis")
st.markdown('</div>', unsafe_allow_html=True)
with tab6:
st.markdown('<div class="custom-card">', unsafe_allow_html=True)
st.subheader("🎯 Pattern Discovery")
try:
analysis_type = st.radio("Select pattern discovery method",
["Clustering Visualization", "Outlier Detection",
"Trend Detection", "Seasonal Patterns"],
horizontal=True, key="pattern_type")
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if analysis_type == "Clustering Visualization":
if len(numeric_cols) >= 2:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Select features for clustering
cluster_features = st.multiselect("Select features for clustering",
numeric_cols,
default=numeric_cols[:min(3, len(numeric_cols))])
if len(cluster_features) >= 2:
n_clusters = st.slider("Number of clusters", 2, 8, 3)
# Prepare data
X = df[cluster_features].dropna()
if len(X) > 0:
# Scale data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Perform clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)
# Create visualization
if len(cluster_features) == 2:
fig = px.scatter(x=X[cluster_features[0]], y=X[cluster_features[1]],
color=clusters.astype(str),
title=f"K-Means Clustering (k={n_clusters})",
labels={'x': cluster_features[0], 'y': cluster_features[1],
'color': 'Cluster'})
elif len(cluster_features) >= 3:
fig = px.scatter_3d(x=X[cluster_features[0]], y=X[cluster_features[1]],
z=X[cluster_features[2]], color=clusters.astype(str),
title=f"K-Means Clustering (k={n_clusters})",
labels={cluster_features[0]: cluster_features[0],
cluster_features[1]: cluster_features[1],
cluster_features[2]: cluster_features[2],
'color': 'Cluster'})
fig.update_layout(height=600)
st.plotly_chart(fig, use_container_width=True)
# Cluster statistics
st.subheader("πŸ“Š Cluster Statistics")
X['Cluster'] = clusters
cluster_stats = X.groupby('Cluster')[cluster_features].mean()
st.dataframe(cluster_stats.style.format("{:.4f}"))
elif analysis_type == "Outlier Detection":
if len(numeric_cols) >= 2:
from sklearn.ensemble import IsolationForest
# Select features for outlier detection
outlier_features = st.multiselect("Select features for outlier detection",
numeric_cols,
default=numeric_cols[:min(3, len(numeric_cols))])
if len(outlier_features) >= 2:
contamination = st.slider("Expected outlier proportion", 0.01, 0.5, 0.1, 0.01)
# Prepare data
X = df[outlier_features].dropna()
if len(X) > 0:
# Detect outliers
iso_forest = IsolationForest(contamination=contamination, random_state=42)
outliers = iso_forest.fit_predict(X)
# Create visualization
if len(outlier_features) == 2:
fig = px.scatter(x=X[outlier_features[0]], y=X[outlier_features[1]],
color=outliers,
color_continuous_scale=['blue', 'red'],
title=f"Outlier Detection (contamination={contamination})",
labels={'x': outlier_features[0], 'y': outlier_features[1],
'color': 'Outlier'})
elif len(outlier_features) >= 3:
fig = px.scatter_3d(x=X[outlier_features[0]], y=X[outlier_features[1]],
z=X[outlier_features[2]], color=outliers,
color_continuous_scale=['blue', 'red'],
title=f"Outlier Detection (contamination={contamination})",
labels={outlier_features[0]: outlier_features[0],
outlier_features[1]: outlier_features[1],
outlier_features[2]: outlier_features[2],
'color': 'Outlier'})
fig.update_layout(height=600)
st.plotly_chart(fig, use_container_width=True)
# Outlier statistics
n_outliers = (outliers == -1).sum()
st.write(f"**Outliers detected:** {n_outliers} ({n_outliers/len(X)*100:.2f}%)")
elif analysis_type == "Trend Detection":
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
if datetime_cols and numeric_cols:
date_col = st.selectbox("Select date column", datetime_cols)
value_col = st.selectbox("Select value column", numeric_cols)
# Prepare time series data
ts_df = df[[date_col, value_col]].dropna().sort_values(date_col)
if len(ts_df) > 10:
# Calculate moving averages
window = st.slider("Moving average window", 2, 30, 7)
ts_df['MA'] = ts_df[value_col].rolling(window=window).mean()
# Detect trend using linear regression
from sklearn.linear_model import LinearRegression
X = np.arange(len(ts_df)).reshape(-1, 1)
y = ts_df[value_col].values
model = LinearRegression()
model.fit(X, y)
trend = model.predict(X)
# Create visualization
fig = go.Figure()
fig.add_trace(go.Scatter(x=ts_df[date_col], y=ts_df[value_col],
mode='lines', name='Original'))
fig.add_trace(go.Scatter(x=ts_df[date_col], y=ts_df['MA'],
mode='lines', name=f'{window}-period MA',
line=dict(color='orange')))
fig.add_trace(go.Scatter(x=ts_df[date_col], y=trend,
mode='lines', name='Linear Trend',
line=dict(color='red', dash='dash')))
fig.update_layout(title="Trend Detection",
xaxis_title="Date",
yaxis_title=value_col,
height=500)
st.plotly_chart(fig, use_container_width=True)
# Trend statistics
slope = model.coef_[0]
st.write(f"**Trend slope:** {slope:.4f} units per time step")
if slope > 0:
st.success("βœ… Upward trend detected")
elif slope < 0:
st.warning("⚠️ Downward trend detected")
else:
st.info("ℹ️ No clear trend detected")
elif analysis_type == "Seasonal Patterns":
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
if datetime_cols and numeric_cols:
date_col = st.selectbox("Select date column", datetime_cols, key="seasonal_date")
value_col = st.selectbox("Select value column", numeric_cols, key="seasonal_value")
# Extract seasonal components
df_temp = df[[date_col, value_col]].dropna()
df_temp['year'] = pd.DatetimeIndex(df_temp[date_col]).year
df_temp['month'] = pd.DatetimeIndex(df_temp[date_col]).month
df_temp['quarter'] = pd.DatetimeIndex(df_temp[date_col]).quarter
df_temp['dayofweek'] = pd.DatetimeIndex(df_temp[date_col]).dayofweek
# Create seasonal visualizations
fig = make_subplots(rows=2, cols=2,
subplot_titles=("Year-over-Year", "Monthly Pattern",
"Quarterly Pattern", "Day of Week Pattern"),
specs=[[{"type": "xy"}, {"type": "xy"}],
[{"type": "xy"}, {"type": "xy"}]])
# Year-over-Year
yearly_avg = df_temp.groupby('year')[value_col].mean().reset_index()
fig.add_trace(go.Scatter(x=yearly_avg['year'], y=yearly_avg[value_col],
mode='lines+markers', name="Yearly Avg"), row=1, col=1)
# Monthly pattern
monthly_avg = df_temp.groupby('month')[value_col].mean().reset_index()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
fig.add_trace(go.Bar(x=[month_names[m-1] for m in monthly_avg['month']],
y=monthly_avg[value_col], name="Monthly Avg"), row=1, col=2)
# Quarterly pattern
quarterly_avg = df_temp.groupby('quarter')[value_col].mean().reset_index()
quarter_names = ['Q1', 'Q2', 'Q3', 'Q4']
fig.add_trace(go.Bar(x=[quarter_names[q-1] for q in quarterly_avg['quarter']],
y=quarterly_avg[value_col], name="Quarterly Avg"), row=2, col=1)
# Day of week pattern
dow_avg = df_temp.groupby('dayofweek')[value_col].mean().reset_index()
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
fig.add_trace(go.Bar(x=[day_names[d] for d in dow_avg['dayofweek']],
y=dow_avg[value_col], name="Day of Week Avg"), row=2, col=2)
fig.update_layout(height=800, title_text="Seasonal Pattern Analysis")
st.plotly_chart(fig, use_container_width=True)
except Exception as e:
st.error(f"❌ Error in pattern discovery: {str(e)}")
st.info("πŸ’‘ Tip: Ensure you have sufficient data for pattern detection")
st.markdown('</div>', unsafe_allow_html=True)
except Exception as e:
st.error(f"❌ Critical error in EDA: {str(e)}")
st.info("πŸ’‘ Please check your dataset and try again")
# Export options
st.markdown("---")
st.markdown("### πŸ“₯ Export EDA Report")
try:
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
report_text = f"""
EXPLORATORY DATA ANALYSIS REPORT
=================================
Dataset Information:
β€’ Total Rows: {df.shape[0]:,}
β€’ Total Columns: {df.shape[1]}
β€’ Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB
Column Types:
β€’ Numeric: {len(numeric_cols)}
β€’ Categorical: {len(df.select_dtypes(include=['object', 'category']).columns)}
β€’ Datetime: {len(df.select_dtypes(include=['datetime64']).columns)}
Data Quality:
β€’ Missing Values: {df.isnull().sum().sum():,}
β€’ Complete Cases: {df.dropna().shape[0]:,}
β€’ Duplicate Rows: {df.duplicated().sum():,}
Analysis Performed:
β€’ Data Overview
β€’ Missing Data Analysis
β€’ Univariate Analysis
β€’ Bivariate Analysis
β€’ Multivariate Analysis
β€’ Pattern Discovery
"""
st.download_button(
label="πŸ“₯ Download EDA Report",
data=report_text,
file_name="eda_report.txt",
mime="text/plain",
use_container_width=True
)
except Exception as e:
st.error(f"❌ Error generating report: {str(e)}")