Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from io import StringIO | |
| import time | |
# --- Page configuration and static page chrome ------------------------------
# The set_page_config call below is the first Streamlit command in this script.
# NOTE(review): the "π" glyphs here and in later labels look like mis-decoded
# emoji from the capture of this file — confirm against the deployed source.
_PAGE_OPTIONS = {
    "page_title": "Debunker - Data Quality Validator",
    "page_icon": "π",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_OPTIONS)

# Stylesheet for the header, cards and status colours; injected as raw HTML,
# which is why unsafe_allow_html is required.
_CUSTOM_CSS = """
<style>
.main-header {
font-size: 3rem;
font-weight: 700;
color: #1f77b4;
margin-bottom: 1rem;
}
.card {
background-color: #f8f9fa;
border-radius: 10px;
padding: 1.5rem;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
margin-bottom: 1rem;
}
.metric-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 10px;
padding: 1.5rem;
color: white;
text-align: center;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
}
.success-text { color: #2ecc71; }
.warning-text { color: #f1c40f; }
.error-text { color: #e74c3c; }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# Centered title banner, tagline and the "Built with anycoder" attribution link.
_HEADER_HTML = """
<div style="text-align: center; margin-bottom: 2rem;">
<h1 class="main-header">π Debunker</h1>
<p style="font-size: 1.2rem; color: #666;">
Advanced Data Quality Validator & Anomaly Detector
</p>
<a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank"
style="color: #1f77b4; font-weight: bold; text-decoration: none; margin-top: 10px; display: inline-block;">
Built with anycoder
</a>
</div>
"""
st.markdown(_HEADER_HTML, unsafe_allow_html=True)
# --- Sidebar controls -------------------------------------------------------
# Elements are attached through the `st.sidebar` object; this places them in
# the sidebar exactly as a `with st.sidebar:` block would.
st.sidebar.header("βοΈ Configuration")
st.sidebar.markdown("---")

# Input source selector; main() branches on this value.
st.sidebar.subheader("π Data Input Method")
input_method = st.sidebar.radio(
    "Choose input method:",
    ["Upload CSV File", "Paste Data", "Generate Sample Data"],
    label_visibility="collapsed",
)
st.sidebar.markdown("---")

# NOTE(review): the checkbox return values are discarded, so these toggles do
# not currently influence which checks the analysis runs — confirm whether
# they were meant to be wired into detect_anomalies().
st.sidebar.subheader("π― Validation Rules")
for _rule_label, _rule_help in (
    ("Detect Missing Values", "Check for NaN or empty cells"),
    ("Detect Duplicates", "Identify duplicate rows"),
    ("Detect Outliers (IQR)", "Flag values beyond statistical bounds"),
    ("Detect Empty Strings", "Find rows with empty string values"),
):
    st.sidebar.checkbox(_rule_label, value=True, help=_rule_help)
st.sidebar.markdown("---")

# Chart selector; main() reads this value when deciding which plots to draw.
st.sidebar.subheader("π Visualization Options")
plot_type = st.sidebar.selectbox(
    "Chart Type:",
    ["Bar Chart", "Scatter Plot", "Distribution Plot", "Heatmap"],
    label_visibility="collapsed",
)
# Make sure the two session slots used by main() exist before it reads them;
# values already present from an earlier run of the script are left untouched.
for _slot in ("df", "analysis_results"):
    if _slot not in st.session_state:
        st.session_state[_slot] = None
| # Main Application Logic | |
def load_sample_data(n_rows: int = 100, seed: int = 42) -> pd.DataFrame:
    """Generate a synthetic customer dataset for demonstration.

    Args:
        n_rows: Number of rows to generate (default 100).
        seed: Seed for the private random generator (default 42).

    Returns:
        A DataFrame with the columns ``Customer_ID`` (1..n_rows), ``Name``,
        ``Age`` (18-69), ``Purchase_Amount`` (uniform in [10, 500)),
        ``Rating`` (1-5) and ``Date`` (consecutive days from 2023-01-01,
        formatted as ``YYYY-MM-DD`` strings).
    """
    # A private RandomState replays the same stream np.random.seed(seed) would
    # produce, but leaves the process-wide NumPy generator untouched.
    rng = np.random.RandomState(seed)

    # The draw order (Name, Age, Purchase_Amount, Rating) determines the
    # generated values, so it must not be rearranged.
    names = rng.choice(['Alice', 'Bob', 'Charlie', 'David', 'Eve'], n_rows)
    ages = rng.randint(18, 70, n_rows)
    amounts = rng.uniform(10, 500, n_rows)
    ratings = rng.randint(1, 6, n_rows)
    dates = pd.date_range(start='2023-01-01', periods=n_rows).strftime('%Y-%m-%d')

    return pd.DataFrame({
        'Customer_ID': range(1, n_rows + 1),
        'Name': names,
        'Age': ages,
        'Purchase_Amount': amounts,
        'Rating': ratings,
        'Date': dates,
    })
def detect_anomalies(df, iqr_multiplier=1.5):
    """Run the data-quality checks on ``df`` and collect the findings.

    Args:
        df: The pandas DataFrame to inspect.
        iqr_multiplier: Width of the outlier fences in units of the
            inter-quartile range; the default 1.5 is the conventional rule.

    Returns:
        A dict with four entries:
          * ``'missing_values'``: ``{column: NaN count}`` for columns with
            at least one missing value.
          * ``'duplicates'``: number of rows flagged by ``df.duplicated()``.
          * ``'outliers'``: ``{column: {'count', 'percentage', 'values'}}``
            for numeric columns with values outside the IQR fences.
          * ``'empty_strings'``: ``{column: count}`` of empty or
            whitespace-only strings in object-dtype columns.
    """
    results = {
        'missing_values': {},
        'duplicates': 0,
        'outliers': {},
        'empty_strings': {}
    }

    # Missing values: one vectorised pass, keeping only columns that have any.
    results['missing_values'] = {
        col: count for col, count in df.isna().sum().items() if count > 0
    }

    # Duplicates: rows identical to an earlier row.
    results['duplicates'] = df.duplicated().sum()

    # Outliers using the IQR method on numeric columns.
    for col in df.select_dtypes(include=[np.number]).columns:
        series = df[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr
        flagged = series[(series < lower_bound) | (series > upper_bound)]
        if len(flagged) > 0:
            results['outliers'][col] = {
                'count': len(flagged),
                'percentage': round((len(flagged) / len(df)) * 100, 2),
                'values': flagged.tolist()
            }

    # Empty strings: whitespace-only cells count as empty too. The isinstance
    # guard keeps non-string objects (None, numbers) in object columns from
    # being stripped or counted.
    for col in df.select_dtypes(include=['object']).columns:
        blank = df[col].map(lambda v: isinstance(v, str) and not v.strip())
        empty_count = blank.sum()
        if empty_count > 0:
            results['empty_strings'][col] = empty_count

    return results
def _load_input():
    """Render the widgets for the selected input method.

    A successfully loaded frame is stored in ``st.session_state.df``; load
    failures are reported in the page instead of being raised.
    """
    if input_method == "Upload CSV File":
        uploaded_file = st.file_uploader("Upload your CSV file", type=['csv'])
        if uploaded_file is not None:
            # Broad catch is deliberate at this UI boundary: any parse failure
            # is shown to the user rather than crashing the app.
            try:
                st.session_state.df = pd.read_csv(uploaded_file)
                st.success(f"Successfully loaded: {uploaded_file.name}")
            except Exception as e:
                st.error(f"Error loading file: {str(e)}")
    elif input_method == "Paste Data":
        st.info("Paste your CSV data below:")
        csv_text = st.text_area("CSV Data", height=200, placeholder="column1,column2,column3\nvalue1,value2,value3")
        if st.button("Process Data", type="primary"):
            try:
                st.session_state.df = pd.read_csv(StringIO(csv_text))
                st.success("Data processed successfully!")
            except Exception as e:
                st.error(f"Error processing data: {str(e)}")
    elif input_method == "Generate Sample Data":
        if st.button("Generate Sample Data", type="primary"):
            st.session_state.df = load_sample_data()
            st.success("Sample data generated!")


def _render_overview(df):
    """Show row/column/memory metrics and a preview of the first ten rows."""
    rows_col, cols_col, mem_col = st.columns(3)
    with rows_col:
        st.metric("Total Rows", df.shape[0])
    with cols_col:
        st.metric("Total Columns", df.shape[1])
    with mem_col:
        st.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024:.2f} KB")
    with st.expander("View Data Preview", expanded=True):
        st.dataframe(df.head(10))


def _count_table(counts):
    """Turn a ``{column: count}`` mapping into a one-column 'Count' frame."""
    return pd.DataFrame.from_dict(counts, orient='index', columns=['Count'])


def _render_findings(results):
    """Render one section per check from the ``detect_anomalies`` results."""
    # Missing values
    if results['missing_values']:
        st.subheader("β οΈ Missing Values Detected")
        st.dataframe(_count_table(results['missing_values']), use_container_width=True)
        st.caption(f"Total missing values: {sum(results['missing_values'].values())}")
    else:
        st.success("β No missing values detected in the dataset.")

    # Duplicates
    if results['duplicates'] > 0:
        st.warning(f"β οΈ {results['duplicates']} duplicate rows detected.")
    else:
        st.success("β No duplicate rows detected.")

    # Outliers
    if results['outliers']:
        st.subheader("π¨ Outliers Detected (IQR Method)")
        outlier_df = _count_table({k: v['count'] for k, v in results['outliers'].items()})
        st.dataframe(outlier_df, use_container_width=True)
        # The per-column bar chart is drawn for these two chart types only.
        if plot_type in ["Bar Chart", "Distribution Plot"]:
            fig = px.bar(
                outlier_df,
                x=outlier_df.index,
                y='Count',
                title="Outliers by Column",
                color='Count',
                color_continuous_scale='Reds'
            )
            st.plotly_chart(fig, use_container_width=True)
    else:
        st.success("β No outliers detected in numerical columns.")

    # Empty strings
    if results['empty_strings']:
        st.subheader("π Empty Strings Detected")
        st.dataframe(_count_table(results['empty_strings']), use_container_width=True)
    else:
        st.success("β No empty strings detected in text columns.")


def _render_summary(df, results):
    """Render the headline metrics, the issue summary chart and histograms."""
    # Per-category totals, keyed by the labels used on the summary chart.
    chart_data = {
        'Missing Values': sum(results['missing_values'].values()),
        'Duplicates': results['duplicates'],
        'Outliers': sum(v['count'] for v in results['outliers'].values()),
        'Empty Strings': sum(results['empty_strings'].values())
    }
    total_issues = int(sum(chart_data.values()))

    # A header-only CSV has zero cells; report "N/A" instead of dividing by 0.
    total_cells = df.shape[0] * df.shape[1]
    if total_cells:
        quality_score = f"{max(0, 100 - (total_issues / total_cells * 100)):.1f}%"
    else:
        quality_score = "N/A"

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Issues Found", total_issues, delta_color="inverse")
    with col2:
        st.metric("Data Quality Score", quality_score)
    with col3:
        st.metric("Columns Analyzed", df.shape[1])
    with col4:
        st.metric("Rows Analyzed", df.shape[0])

    # The chart covers all four categories, so show it whenever any of them
    # is non-zero (not only for missing values / outliers).
    if total_issues > 0:
        st.subheader("Visual Summary")
        fig = px.bar(
            x=list(chart_data.keys()),
            y=list(chart_data.values()),
            title="Data Quality Issues Summary",
            labels={'x': 'Issue Type', 'y': 'Count'},
            color=list(chart_data.keys()),
            color_discrete_sequence=px.colors.qualitative.Set2
        )
        st.plotly_chart(fig, use_container_width=True)

    # Histograms of the first three numeric columns, on request only.
    num_cols = df.select_dtypes(include=[np.number]).columns
    if len(num_cols) > 0 and plot_type == "Distribution Plot":
        with st.expander("Distribution Plots"):
            for col in num_cols[:3]:
                fig_dist = px.histogram(
                    df,
                    x=col,
                    title=f"Distribution of {col}",
                    nbins=30
                )
                st.plotly_chart(fig_dist, use_container_width=True)


def main():
    """Drive the page: load data, analyse it and render the report.

    Reads the module-level ``input_method`` and ``plot_type`` sidebar
    selections, keeps the loaded frame in ``st.session_state.df`` and the
    latest ``detect_anomalies`` output in
    ``st.session_state.analysis_results``. Nothing below the input widgets is
    rendered until a dataset has been loaded.
    """
    _load_input()

    df = st.session_state.df
    if df is None:
        return

    st.markdown("---")
    st.header("π Data Overview")
    _render_overview(df)

    # No artificial delay here: the analysis reruns with the script, so a
    # fixed sleep would be added to every pass.
    with st.spinner("Analyzing data quality..."):
        st.session_state.analysis_results = detect_anomalies(df)
    results = st.session_state.analysis_results

    st.markdown("---")
    st.header("π Analysis Results")
    _render_findings(results)

    st.markdown("---")
    st.header("π Detailed Analysis")
    _render_summary(df, results)


if __name__ == "__main__":
    main()