# Source: Hugging Face Space "anycoder-ee07a46b", file streamlit_app.py
# Uploaded by Mimi1782 with huggingface_hub (commit 5913dd6, verified)
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from io import StringIO
import time
# Page configuration: browser-tab title and icon, full-width layout, and a
# sidebar that starts opened.
st.set_page_config(
    page_title="Debunker - Data Quality Validator",
    # The icon is a real emoji character; a mis-decoded byte sequence here
    # would show up as garbage in the browser tab.
    page_icon="🔍",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Stylesheet injected into the page: header typography, card containers,
# gradient metric cards and the three status text colours.
_CUSTOM_CSS = """
<style>
.main-header {
font-size: 3rem;
font-weight: 700;
color: #1f77b4;
margin-bottom: 1rem;
}
.card {
background-color: #f8f9fa;
border-radius: 10px;
padding: 1.5rem;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
margin-bottom: 1rem;
}
.metric-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 10px;
padding: 1.5rem;
color: white;
text-align: center;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
}
.success-text { color: #2ecc71; }
.warning-text { color: #f1c40f; }
.error-text { color: #e74c3c; }
</style>
"""
# unsafe_allow_html is required so the <style> element is emitted as HTML.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Page header: centred title, tagline and a "Built with anycoder" link that
# opens in a new tab. Rendered as raw HTML, hence unsafe_allow_html.
st.markdown("""
<div style="text-align: center; margin-bottom: 2rem;">
<h1 class="main-header">🔍 Debunker</h1>
<p style="font-size: 1.2rem; color: #666;">
Advanced Data Quality Validator & Anomaly Detector
</p>
<a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank"
style="color: #1f77b4; font-weight: bold; text-decoration: none; margin-top: 10px; display: inline-block;">
Built with anycoder
</a>
</div>
""", unsafe_allow_html=True)
# Sidebar configuration: input-method picker, validation-rule toggles and
# chart-type picker.
with st.sidebar:
    st.header("⚙️ Configuration")
    st.markdown("---")

    st.subheader("📊 Data Input Method")
    # Module-level selection; main() branches on this value.
    input_method = st.radio(
        "Choose input method:",
        ["Upload CSV File", "Paste Data", "Generate Sample Data"],
        label_visibility="collapsed",
    )
    st.markdown("---")

    st.subheader("🎯 Validation Rules")
    # NOTE(review): the return values of these four checkboxes are discarded
    # and no widget key is set, so nothing in this file reads them. They are
    # currently display-only and do not switch any check on or off.
    st.checkbox("Detect Missing Values", value=True, help="Check for NaN or empty cells")
    st.checkbox("Detect Duplicates", value=True, help="Identify duplicate rows")
    st.checkbox("Detect Outliers (IQR)", value=True, help="Flag values beyond statistical bounds")
    st.checkbox("Detect Empty Strings", value=True, help="Find rows with empty string values")
    st.markdown("---")

    st.subheader("📈 Visualization Options")
    # Module-level selection; main() uses it to decide which charts to draw.
    plot_type = st.selectbox(
        "Chart Type:",
        ["Bar Chart", "Scatter Plot", "Distribution Plot", "Heatmap"],
        label_visibility="collapsed",
    )
# Seed the session-state slots used by the app with None on first run;
# values already present from an earlier rerun are left untouched.
for _state_key in ('df', 'analysis_results'):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None
# Main Application Logic
def load_sample_data(n_rows: int = 100, seed: int = 42) -> pd.DataFrame:
    """Generate a reproducible demo dataset of customer purchases.

    Args:
        n_rows: Number of rows to generate (default 100).
        seed: Seed for the random generator (default 42).

    Returns:
        A frame with the columns ``Customer_ID`` (1..n_rows), ``Name``,
        ``Age`` (18-69), ``Purchase_Amount`` (uniform in [10, 500)),
        ``Rating`` (1-5) and ``Date`` (consecutive days from 2023-01-01,
        formatted ``YYYY-MM-DD``).
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by the module-level functions, but leaves NumPy's global
    # random state untouched for the rest of the process.
    rng = np.random.RandomState(seed)
    data = {
        'Customer_ID': range(1, n_rows + 1),
        'Name': rng.choice(['Alice', 'Bob', 'Charlie', 'David', 'Eve'], n_rows),
        'Age': rng.randint(18, 70, n_rows),
        'Purchase_Amount': rng.uniform(10, 500, n_rows),
        'Rating': rng.randint(1, 6, n_rows),
        'Date': pd.date_range(start='2023-01-01', periods=n_rows).strftime('%Y-%m-%d'),
    }
    return pd.DataFrame(data)
def detect_anomalies(df: pd.DataFrame) -> dict:
    """Run the data-quality checks on *df* and collect the findings.

    Args:
        df: Frame to inspect. It is not modified.

    Returns:
        A dict with four entries:

        * ``missing_values`` - ``{column: count}`` for every column that
          contains at least one NaN/None cell.
        * ``duplicates`` - number of rows that repeat an earlier row.
        * ``outliers`` - ``{column: {'count', 'percentage', 'values'}}`` for
          numeric columns with values outside the 1.5 * IQR fences;
          ``percentage`` is relative to the total row count.
        * ``empty_strings`` - ``{column: count}`` of cells equal to ``''``
          in text columns (object or pandas string dtype).

        All counts are plain Python ints.
    """
    results = {
        'missing_values': {},
        'duplicates': 0,
        'outliers': {},
        'empty_strings': {},
    }

    # Missing values: one vectorised pass, keeping only columns with gaps.
    na_counts = df.isna().sum()
    results['missing_values'] = {
        col: int(count) for col, count in na_counts.items() if count > 0
    }

    # Duplicates: rows identical to an earlier row.
    results['duplicates'] = int(df.duplicated().sum())

    # Outliers using the IQR method on numeric columns.
    for col in df.select_dtypes(include=[np.number]).columns:
        series = df[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # NaN cells compare False on both sides, so they are never flagged.
        is_outlier = (series < lower_bound) | (series > upper_bound)
        count = int(is_outlier.sum())
        if count > 0:
            results['outliers'][col] = {
                'count': count,
                'percentage': round((count / len(df)) * 100, 2),
                'values': series[is_outlier].tolist(),
            }

    # Empty strings: text may be stored as object dtype or as a dedicated
    # pandas string dtype, so both are checked.
    for col in df.columns:
        series = df[col]
        is_text = (
            pd.api.types.is_object_dtype(series)
            or isinstance(series.dtype, pd.StringDtype)
        )
        if not is_text:
            continue
        empty_count = int((series == '').sum())
        if empty_count > 0:
            results['empty_strings'][col] = empty_count

    return results
def _count_table(counts):
    """Return a one-column ``Count`` frame indexed by the keys of *counts*."""
    return pd.DataFrame.from_dict(counts, orient='index', columns=['Count'])


def _render_input():
    """Render the widget for the selected input method and load its data.

    Branches on the module-level ``input_method``. A successfully parsed
    frame is stored in ``st.session_state.df``; a failure is reported with
    ``st.error`` and leaves whatever was stored before in place.
    """
    if input_method == "Upload CSV File":
        uploaded_file = st.file_uploader("Upload your CSV file", type=['csv'])
        if uploaded_file is not None:
            try:
                st.session_state.df = pd.read_csv(uploaded_file)
            except Exception as e:  # UI boundary: show any parse failure
                st.error(f"Error loading file: {e}")
            else:
                st.success(f"Successfully loaded: {uploaded_file.name}")
    elif input_method == "Paste Data":
        st.info("Paste your CSV data below:")
        csv_text = st.text_area(
            "CSV Data",
            height=200,
            placeholder="column1,column2,column3\nvalue1,value2,value3",
        )
        if st.button("Process Data", type="primary"):
            try:
                st.session_state.df = pd.read_csv(StringIO(csv_text))
            except Exception as e:  # UI boundary: show any parse failure
                st.error(f"Error processing data: {e}")
            else:
                st.success("Data processed successfully!")
    elif input_method == "Generate Sample Data":
        if st.button("Generate Sample Data", type="primary"):
            st.session_state.df = load_sample_data()
            st.success("Sample data generated!")


def _render_overview(df):
    """Show row/column/memory metrics and a preview of the first ten rows."""
    st.markdown("---")
    st.header("📊 Data Overview")
    rows_col, cols_col, mem_col = st.columns(3)
    with rows_col:
        st.metric("Total Rows", df.shape[0])
    with cols_col:
        st.metric("Total Columns", df.shape[1])
    with mem_col:
        st.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024:.2f} KB")
    with st.expander("View Data Preview", expanded=True):
        st.dataframe(df.head(10))


def _render_findings(results):
    """Render one section per check: missing, duplicates, outliers, empties."""
    st.markdown("---")
    st.header("🔍 Analysis Results")

    # Missing values
    missing = results['missing_values']
    if missing:
        st.subheader("⚠️ Missing Values Detected")
        st.dataframe(_count_table(missing), use_container_width=True)
        st.caption(f"Total missing values: {sum(missing.values())}")
    else:
        st.success("✅ No missing values detected in the dataset.")

    # Duplicates
    if results['duplicates'] > 0:
        st.warning(f"⚠️ {results['duplicates']} duplicate rows detected.")
    else:
        st.success("✅ No duplicate rows detected.")

    # Outliers
    if results['outliers']:
        st.subheader("🚨 Outliers Detected (IQR Method)")
        outlier_df = _count_table(
            {col: info['count'] for col, info in results['outliers'].items()}
        )
        st.dataframe(outlier_df, use_container_width=True)
        # The per-column chart is drawn only for these two chart types.
        if plot_type in ("Bar Chart", "Distribution Plot"):
            fig = px.bar(
                outlier_df,
                x=outlier_df.index,
                y='Count',
                title="Outliers by Column",
                color='Count',
                color_continuous_scale='Reds',
            )
            st.plotly_chart(fig, use_container_width=True)
    else:
        st.success("✅ No outliers detected in numerical columns.")

    # Empty strings
    if results['empty_strings']:
        st.subheader("📝 Empty Strings Detected")
        st.dataframe(_count_table(results['empty_strings']), use_container_width=True)
    else:
        st.success("✅ No empty strings detected in text columns.")


def _render_summary(df, results):
    """Show the headline metrics and, when issues exist, a summary chart."""
    st.markdown("---")
    st.header("📈 Detailed Analysis")

    # Totals per issue type; cast to int so the values are plain Python
    # numbers regardless of how the counts were produced.
    issue_totals = {
        'Missing Values': int(sum(results['missing_values'].values())),
        'Duplicates': int(results['duplicates']),
        'Outliers': int(sum(info['count'] for info in results['outliers'].values())),
        'Empty Strings': int(sum(results['empty_strings'].values())),
    }
    total_issues = sum(issue_totals.values())

    # A frame with zero rows or zero columns has no cells, so the score is
    # undefined there; report "N/A" instead of dividing by zero.
    total_cells = df.shape[0] * df.shape[1]
    if total_cells:
        quality_score = f"{max(0, 100 - (total_issues / total_cells * 100)):.1f}%"
    else:
        quality_score = "N/A"

    issues_col, score_col, columns_col, rows_col = st.columns(4)
    with issues_col:
        st.metric("Total Issues Found", total_issues, delta_color="inverse")
    with score_col:
        st.metric("Data Quality Score", quality_score)
    with columns_col:
        st.metric("Columns Analyzed", df.shape[1])
    with rows_col:
        st.metric("Rows Analyzed", df.shape[0])

    # Draw the chart whenever any issue type is non-zero, so datasets whose
    # only problems are duplicates or empty strings are summarised too.
    if total_issues > 0:
        st.subheader("Visual Summary")
        issue_names = list(issue_totals.keys())
        fig = px.bar(
            x=issue_names,
            y=list(issue_totals.values()),
            title="Data Quality Issues Summary",
            labels={'x': 'Issue Type', 'y': 'Count'},
            color=issue_names,
            color_discrete_sequence=px.colors.qualitative.Set2,
        )
        st.plotly_chart(fig, use_container_width=True)


def _render_distributions(df):
    """Plot histograms of the first three numeric columns on request.

    Only drawn when the module-level ``plot_type`` is "Distribution Plot".
    """
    num_cols = df.select_dtypes(include=[np.number]).columns
    if len(num_cols) > 0 and plot_type == "Distribution Plot":
        with st.expander("Distribution Plots"):
            for col in num_cols[:3]:  # Show first 3 numerical columns
                fig_dist = px.histogram(
                    df,
                    x=col,
                    title=f"Distribution of {col}",
                    nbins=30,
                )
                st.plotly_chart(fig_dist, use_container_width=True)


def main():
    """Drive the page: load data, analyse it and render the report.

    The input widget for the selected method is rendered first. If a frame
    is available in ``st.session_state.df`` it is analysed with
    ``detect_anomalies``; the findings are stored in
    ``st.session_state.analysis_results`` and rendered as overview, per-check
    sections, summary metrics and optional distribution plots.
    """
    _render_input()

    df = st.session_state.df
    if df is None:
        return

    _render_overview(df)

    with st.spinner("Analyzing data quality..."):
        results = detect_anomalies(df)
        st.session_state.analysis_results = results

    _render_findings(results)
    _render_summary(df, results)
    _render_distributions(df)
# Script entry point: build the page only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()