"""Debunker: a Streamlit app that runs basic data-quality checks on a table.

The script loads a DataFrame (CSV upload, pasted CSV text, or generated
sample data), looks for missing values, duplicate rows, IQR outliers and
empty strings, and renders the findings with Plotly charts.
"""

import time
from io import StringIO

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st

# Set page configuration
st.set_page_config(
    page_title="Debunker - Data Quality Validator",
    page_icon="🔍",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS for modern styling (the stylesheet body is currently blank)
st.markdown(""" """, unsafe_allow_html=True)

# Header with Built with anycoder
st.markdown("""

🔍 Debunker

Advanced Data Quality Validator & Anomaly Detector

Built with anycoder
""", unsafe_allow_html=True)

# Sidebar Configuration
with st.sidebar:
    st.header("⚙️ Configuration")
    st.markdown("---")

    st.subheader("📊 Data Input Method")
    input_method = st.radio(
        "Choose input method:",
        ["Upload CSV File", "Paste Data", "Generate Sample Data"],
        label_visibility="collapsed",
    )
    st.markdown("---")

    st.subheader("🎯 Validation Rules")
    # Keep each checkbox's state so the selected rules actually control
    # which checks detect_anomalies() runs.
    check_missing = st.checkbox(
        "Detect Missing Values", value=True, help="Check for NaN or empty cells"
    )
    check_duplicates = st.checkbox(
        "Detect Duplicates", value=True, help="Identify duplicate rows"
    )
    check_outliers = st.checkbox(
        "Detect Outliers (IQR)",
        value=True,
        help="Flag values beyond statistical bounds",
    )
    check_empty_strings = st.checkbox(
        "Detect Empty Strings",
        value=True,
        help="Find rows with empty string values",
    )
    st.markdown("---")

    st.subheader("📈 Visualization Options")
    plot_type = st.selectbox(
        "Chart Type:",
        ["Bar Chart", "Scatter Plot", "Distribution Plot", "Heatmap"],
        label_visibility="collapsed",
    )

# Initialize session state
if 'df' not in st.session_state:
    st.session_state.df = None
if 'analysis_results' not in st.session_state:
    st.session_state.analysis_results = None


# Main Application Logic
def load_sample_data():
    """Generate a 100-row sample dataset for demonstration.

    The NumPy global seed is fixed at 42 so repeated calls produce the same
    random columns.

    Returns:
        A DataFrame with the columns Customer_ID, Name, Age,
        Purchase_Amount, Rating and Date.
    """
    np.random.seed(42)
    data = {
        'Customer_ID': range(1, 101),
        'Name': np.random.choice(['Alice', 'Bob', 'Charlie', 'David', 'Eve'], 100),
        'Age': np.random.randint(18, 70, 100),
        'Purchase_Amount': np.random.uniform(10, 500, 100),
        'Rating': np.random.randint(1, 6, 100),
        'Date': pd.date_range(start='2023-01-01', periods=100).strftime('%Y-%m-%d'),
    }
    return pd.DataFrame(data)


def detect_anomalies(
    df,
    *,
    check_missing=True,
    check_duplicates=True,
    check_outliers=True,
    check_empty_strings=True,
):
    """Perform data quality checks on ``df``.

    Args:
        df: The DataFrame to inspect.
        check_missing: Count NaN cells per column.
        check_duplicates: Count fully duplicated rows.
        check_outliers: Flag numeric values outside 1.5 * IQR of the quartiles.
        check_empty_strings: Count ``''`` cells in object-dtype columns.

    Returns:
        A dict with the keys ``missing_values`` (column -> count),
        ``duplicates`` (int), ``outliers`` (column -> dict with ``count``,
        ``percentage`` and ``values``) and ``empty_strings``
        (column -> count). A disabled check leaves its entry empty / zero.
        Counts are plain Python ints.
    """
    results = {
        'missing_values': {},
        'duplicates': 0,
        'outliers': {},
        'empty_strings': {},
    }

    # Missing Values
    if check_missing:
        for col in df.columns:
            missing_count = int(df[col].isna().sum())
            if missing_count > 0:
                results['missing_values'][col] = missing_count

    # Duplicates
    if check_duplicates:
        results['duplicates'] = int(df.duplicated().sum())

    # Outliers using IQR method
    if check_outliers:
        for col in df.select_dtypes(include=[np.number]).columns:
            series = df[col]
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            # Comparisons against NaN are False, so missing cells are never
            # flagged as outliers.
            is_outlier = (series < lower_bound) | (series > upper_bound)
            outlier_count = int(is_outlier.sum())
            if outlier_count > 0:
                results['outliers'][col] = {
                    'count': outlier_count,
                    'percentage': round((outlier_count / len(df)) * 100, 2),
                    'values': series[is_outlier].tolist(),
                }

    # Empty Strings
    if check_empty_strings:
        for col in df.select_dtypes(include=['object']).columns:
            empty_count = int((df[col] == '').sum())
            if empty_count > 0:
                results['empty_strings'][col] = empty_count

    return results


def _count_table(counts):
    """Turn a ``{column: count}`` mapping into a one-column 'Count' DataFrame."""
    return pd.DataFrame.from_dict(counts, orient='index', columns=['Count'])


def _issue_totals(results):
    """Return the total number of findings per issue category, keyed by label."""
    return {
        'Missing Values': sum(results['missing_values'].values()),
        'Duplicates': results['duplicates'],
        'Outliers': sum(v['count'] for v in results['outliers'].values()),
        'Empty Strings': sum(results['empty_strings'].values()),
    }


def _load_input():
    """Fill ``st.session_state.df`` from the input method chosen in the sidebar."""
    if input_method == "Upload CSV File":
        uploaded_file = st.file_uploader("Upload your CSV file", type=['csv'])
        if uploaded_file is not None:
            # Broad catch is deliberate: this is the UI boundary and any
            # parse failure is reported to the user instead of crashing.
            try:
                st.session_state.df = pd.read_csv(uploaded_file)
                st.success(f"Successfully loaded: {uploaded_file.name}")
            except Exception as e:
                st.error(f"Error loading file: {str(e)}")

    elif input_method == "Paste Data":
        st.info("Paste your CSV data below:")
        csv_text = st.text_area(
            "CSV Data",
            height=200,
            placeholder="column1,column2,column3\nvalue1,value2,value3",
        )
        if st.button("Process Data", type="primary"):
            try:
                st.session_state.df = pd.read_csv(StringIO(csv_text))
                st.success("Data processed successfully!")
            except Exception as e:
                st.error(f"Error processing data: {str(e)}")

    elif input_method == "Generate Sample Data":
        if st.button("Generate Sample Data", type="primary"):
            st.session_state.df = load_sample_data()
            st.success("Sample data generated!")


def _render_overview(df):
    """Show headline size metrics and a preview of the first ten rows."""
    st.markdown("---")
    st.header("📊 Data Overview")

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Rows", df.shape[0])
    with col2:
        st.metric("Total Columns", df.shape[1])
    with col3:
        st.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024:.2f} KB")

    with st.expander("View Data Preview", expanded=True):
        st.dataframe(df.head(10))


def _render_findings(results):
    """Render one section per check; disabled checks are reported as skipped."""
    st.markdown("---")
    st.header("🔍 Analysis Results")

    # Missing Values Section
    if not check_missing:
        st.info("⏭️ Missing value check skipped (disabled in the sidebar).")
    elif results['missing_values']:
        st.subheader("⚠️ Missing Values Detected")
        st.dataframe(_count_table(results['missing_values']), use_container_width=True)
        st.caption(f"Total missing values: {sum(results['missing_values'].values())}")
    else:
        st.success("✅ No missing values detected in the dataset.")

    # Duplicates Section
    if not check_duplicates:
        st.info("⏭️ Duplicate check skipped (disabled in the sidebar).")
    elif results['duplicates'] > 0:
        st.warning(f"⚠️ {results['duplicates']} duplicate rows detected.")
    else:
        st.success("✅ No duplicate rows detected.")

    # Outliers Section
    if not check_outliers:
        st.info("⏭️ Outlier check skipped (disabled in the sidebar).")
    elif results['outliers']:
        st.subheader("🚨 Outliers Detected (IQR Method)")
        outlier_df = _count_table(
            {col: info['count'] for col, info in results['outliers'].items()}
        )
        st.dataframe(outlier_df, use_container_width=True)

        # The per-column outlier chart is drawn only for these two chart
        # types; the other sidebar options render no chart here.
        if plot_type in ["Bar Chart", "Distribution Plot"]:
            fig = px.bar(
                outlier_df,
                x=outlier_df.index,
                y='Count',
                title="Outliers by Column",
                color='Count',
                color_continuous_scale='Reds',
            )
            st.plotly_chart(fig, use_container_width=True)
    else:
        st.success("✅ No outliers detected in numerical columns.")

    # Empty Strings Section
    if not check_empty_strings:
        st.info("⏭️ Empty string check skipped (disabled in the sidebar).")
    elif results['empty_strings']:
        st.subheader("📝 Empty Strings Detected")
        st.dataframe(_count_table(results['empty_strings']), use_container_width=True)
    else:
        st.success("✅ No empty strings detected in text columns.")


def _render_summary(df, results):
    """Render summary metrics, the issue summary chart and distribution plots."""
    st.markdown("---")
    st.header("📈 Detailed Analysis")

    totals = _issue_totals(results)
    total_issues = sum(totals.values())
    n_rows, n_cols = df.shape
    total_cells = n_rows * n_cols

    # A header-only CSV has zero cells; avoid dividing by zero in that case.
    if total_cells > 0:
        score = max(0, 100 - (total_issues / total_cells * 100))
        score_text = f"{score:.1f}%"
    else:
        score_text = "N/A"

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Issues Found", total_issues, delta_color="inverse")
    with col2:
        st.metric("Data Quality Score", score_text)
    with col3:
        st.metric("Columns Analyzed", n_cols)
    with col4:
        st.metric("Rows Analyzed", n_rows)

    # Show the summary chart whenever any category has findings, including
    # datasets whose only issues are duplicates or empty strings.
    if total_issues > 0:
        st.subheader("Visual Summary")
        labels = list(totals.keys())
        fig = px.bar(
            x=labels,
            y=list(totals.values()),
            title="Data Quality Issues Summary",
            labels={'x': 'Issue Type', 'y': 'Count'},
            color=labels,
            color_discrete_sequence=px.colors.qualitative.Set2,
        )
        st.plotly_chart(fig, use_container_width=True)

    # Distribution Plots for numerical columns
    num_cols = df.select_dtypes(include=[np.number]).columns
    if len(num_cols) > 0 and plot_type == "Distribution Plot":
        with st.expander("Distribution Plots"):
            for col in num_cols[:3]:  # Show first 3 numerical columns
                fig_dist = px.histogram(
                    df,
                    x=col,
                    title=f"Distribution of {col}",
                    nbins=30,
                )
                st.plotly_chart(fig_dist, use_container_width=True)


def main():
    """Run the app: load data, analyse it and render the report.

    Reads the module-level sidebar selections (``input_method``,
    ``plot_type`` and the ``check_*`` flags) and stores the DataFrame and
    analysis results in ``st.session_state``.
    """
    _load_input()

    df = st.session_state.df
    if df is None:
        return

    _render_overview(df)

    # Run Analysis
    with st.spinner("Analyzing data quality..."):
        results = detect_anomalies(
            df,
            check_missing=check_missing,
            check_duplicates=check_duplicates,
            check_outliers=check_outliers,
            check_empty_strings=check_empty_strings,
        )
        st.session_state.analysis_results = results

    _render_findings(results)
    _render_summary(df, results)


if __name__ == "__main__":
    main()