| | import streamlit as st |
| | import pandas as pd |
| | import plotly.express as px |
| | import plotly.graph_objects as go |
| | from plotly.subplots import make_subplots |
| | from advanced_validator import AdvancedDataValidator |
| | from typing import Dict, Any |
| | import json |
| |
|
def render_advanced_validation_tab(df: pd.DataFrame):
    """
    Render the advanced validation tab in Streamlit.

    Shows an introduction, a collapsed configuration expander and a button
    that runs ``AdvancedDataValidator.validate_dataset`` on ``df``. The
    results and the validator instance are stored in ``st.session_state``
    so the report is rendered again on later script reruns, not only on
    the rerun triggered by the button click.

    Args:
        df: DataFrame to validate
    """
    st.header("π Advanced Data Quality Validation")
    st.markdown("""
    This advanced validation focuses on **logical inconsistencies** and **business rule violations**
    rather than basic technical data issues. It performs deep analysis to detect:

    - **Duplicate Identity Detection**: Email and phone number duplicates with normalization
    - **Data Pattern Anomalies**: Suspicious clustering and artificial standardization
    - **Business Logic Violations**: Chronological inconsistencies and employment logic errors
    - **Contextual Integrity Issues**: Bulk import patterns and unrealistic data ranges
    """)

    with st.expander("βοΈ Validation Configuration", expanded=False):
        col1, col2 = st.columns(2)

        with col1:
            company_founding_year = st.number_input(
                "Company Founding Year",
                min_value=1800,
                # Upper bound follows the calendar instead of a fixed year so
                # the input does not silently go stale.
                max_value=pd.Timestamp.now().year,
                value=1990,
                help="Used to validate join dates aren't before company founding"
            )

        with col2:
            # NOTE(review): the selection is collected but never passed to the
            # validator, so every category always runs — wire this through
            # once AdvancedDataValidator exposes a way to restrict the scope.
            validation_scope = st.multiselect(
                "Validation Scope",
                ["Duplicate Detection", "Pattern Analysis", "Business Logic", "Contextual Integrity"],
                default=["Duplicate Detection", "Pattern Analysis", "Business Logic", "Contextual Integrity"],
                help="Select which validation categories to run"
            )

    if st.button("π Run Advanced Validation", type="primary"):
        with st.spinner("Performing advanced data quality validation..."):
            validator = AdvancedDataValidator(company_founding_year=company_founding_year)
            validation_results = validator.validate_dataset(df)

            # Persist across reruns: st.button is only True on the rerun
            # caused by the click itself.
            st.session_state['validation_results'] = validation_results
            st.session_state['validator'] = validator

    if 'validation_results' in st.session_state:
        display_validation_results(st.session_state['validation_results'], df)
| |
|
def display_validation_results(results: Dict[str, Any], df: pd.DataFrame):
    """Display comprehensive validation results.

    Renders, in order: summary metrics, a pass/fail banner, a bar chart of
    issue counts per severity, one tab per severity that actually has
    issues, the recommendation groups, and two export buttons (JSON report
    and CSV issue summary).

    Args:
        results: Validation report. The keys read here are ``summary``,
            ``validation_passed``, ``detailed_issues`` and
            ``recommendations``.
        df: The validated DataFrame; it is only forwarded to
            ``display_severity_issues``.
    """
    st.subheader("π Validation Summary")

    summary = results['summary']
    high_count = summary['high_severity_issues']
    affected = summary['total_affected_records']
    total_records = summary['total_records']
    # An empty dataset has zero records; avoid ZeroDivisionError and show 0%.
    affected_pct = affected / total_records * 100 if total_records else 0.0

    col1, col2, col3, col4, col5 = st.columns(5)

    with col1:
        st.metric("Data Quality Score", f"{summary['data_quality_score']}%")

    with col2:
        st.metric("Total Issues", summary['total_issues_found'])

    with col3:
        st.metric("High Severity", high_count,
                  delta=f"-{high_count}" if high_count > 0 else None)

    with col4:
        st.metric("Medium Severity", summary['medium_severity_issues'])

    with col5:
        st.metric("Affected Records", f"{affected} ({affected_pct:.1f}%)")

    if results['validation_passed']:
        # The literal must stay on a single line; a line break inside a
        # single-quoted string is a syntax error.
        st.success("β **Validation Passed**: No critical issues found!")
    else:
        st.error("β **Validation Failed**: Critical issues require immediate attention!")

    if summary['total_issues_found'] > 0:
        st.subheader("π Issues by Severity")

        severity_data = {
            'Severity': ['High', 'Medium', 'Low'],
            'Count': [high_count, summary['medium_severity_issues'], summary['low_severity_issues']],
            'Color': ['#FF4B4B', '#FFA500', '#32CD32']
        }

        fig = px.bar(
            severity_data,
            x='Severity',
            y='Count',
            color='Color',
            # Identity map: each value in the 'Color' column is itself the colour.
            color_discrete_map={color: color for color in severity_data['Color']},
            title="Distribution of Issues by Severity Level"
        )
        fig.update_layout(showlegend=False, height=400)
        st.plotly_chart(fig, use_container_width=True)

    st.subheader("π Detailed Issue Analysis")

    detailed_issues = results['detailed_issues']
    if detailed_issues:
        severity_labels = [
            ("HIGH", "π΄ High Severity"),
            ("MEDIUM", "π‘ Medium Severity"),
            ("LOW", "π’ Low Severity"),
        ]
        # Build the tabs from the issues themselves rather than from the
        # summary counters, so a tab label can never be paired with issues
        # of a different severity if the two sources disagree.
        populated = []
        for severity, label in severity_labels:
            matching = [issue for issue in detailed_issues if issue['severity'] == severity]
            if matching:
                populated.append((severity, label, matching))

        if populated:
            tabs = st.tabs([label for _, label, _ in populated])
            for tab, (severity, _, matching) in zip(tabs, populated):
                with tab:
                    display_severity_issues(matching, severity, df)
    else:
        st.info("π No data quality issues detected! Your dataset appears to be logically consistent.")

    if results['recommendations']:
        st.subheader("π‘ Recommendations")

        priority_color = {
            'HIGH': 'π΄',
            'MEDIUM': 'π‘',
            'LOW': 'π’'
        }

        for rec_group in results['recommendations']:
            priority = rec_group['priority']
            # Unknown priorities render without an icon instead of raising KeyError.
            title = f"{priority_color.get(priority, '')} {rec_group['title']}".strip()
            with st.expander(title, expanded=priority == 'HIGH'):
                for i, item in enumerate(rec_group['items'], 1):
                    st.markdown(f"{i}. {item}")

    st.subheader("π₯ Export Validation Results")

    col1, col2 = st.columns(2)

    with col1:
        if st.button("π Download Detailed Report"):
            # default=str stringifies any value json cannot serialise natively.
            report_json = json.dumps(results, indent=2, default=str)
            st.download_button(
                label="Download JSON Report",
                data=report_json,
                file_name=f"advanced_validation_report_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.json",
                mime="application/json"
            )

    with col2:
        if st.button("π Download Issue Summary"):
            summary_data = [
                {
                    'Category': issue['category'],
                    'Severity': issue['severity'],
                    'Description': issue['description'],
                    'Affected Records': issue['count'],
                    'Percentage': f"{issue['affected_percentage']}%",
                    'Recommendation': issue['recommendation']
                }
                for issue in detailed_issues
            ]

            # Fixed column list keeps the CSV header present even when there
            # are no issues to export.
            summary_df = pd.DataFrame(
                summary_data,
                columns=['Category', 'Severity', 'Description',
                         'Affected Records', 'Percentage', 'Recommendation']
            )
            csv = summary_df.to_csv(index=False)

            st.download_button(
                label="Download CSV Summary",
                data=csv,
                file_name=f"validation_summary_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime="text/csv"
            )
| |
|
def display_severity_issues(issues: list, severity: str, df: pd.DataFrame):
    """Display issues for a specific severity level.

    Each issue gets a numbered heading, its description and recommendation,
    two metrics (affected record count and percentage) and, when examples
    are present, a collapsed expander showing them.

    Args:
        issues: Issue dicts; the keys read are ``category``, ``description``,
            ``recommendation``, ``count``, ``affected_percentage`` and
            ``examples``.
        severity: Severity of the group (not used by this function).
        df: The validated DataFrame (not used by this function).
    """

    def show_table(rows):
        # Render a list of records as a full-width table.
        st.dataframe(pd.DataFrame(rows), use_container_width=True)

    # Known example shapes, probed in this order against the first example:
    # (marker key, headline builder, key that holds the records to tabulate).
    known_layouts = (
        (
            'email',
            lambda ex: f"**Email:** `{ex['email']}` (appears {ex['count']} times)",
            'records',
        ),
        (
            'normalized_phone',
            lambda ex: f"**Phone:** `{ex['normalized_phone']}` (appears {ex['count']} times)",
            'records',
        ),
        (
            'period',
            lambda ex: f"**Period:** {ex['period']} ({ex['count']} records, {ex['percentage_of_total']}% of total)",
            'sample_records',
        ),
    )

    for position, issue in enumerate(issues, start=1):
        with st.container():
            st.markdown(f"### {position}. {issue['category']}")

            text_col, metric_col = st.columns([2, 1])

            with text_col:
                st.markdown(f"**Description:** {issue['description']}")
                st.markdown(f"**Recommendation:** {issue['recommendation']}")

            with metric_col:
                st.metric("Affected Records", issue['count'])
                st.metric("Percentage", f"{issue['affected_percentage']}%")

            samples = issue['examples']
            if samples:
                with st.expander(f"π View Examples ({len(samples)} shown)", expanded=False):
                    first = samples[0]

                    if not isinstance(first, dict):
                        # Non-dict examples are tabulated as a whole.
                        show_table(samples)
                    else:
                        layout = next(
                            (entry for entry in known_layouts if entry[0] in first),
                            None,
                        )

                        if layout is None:
                            # Unrecognised dict shape: dump each example as JSON.
                            for number, sample in enumerate(samples, start=1):
                                st.markdown(f"**Example {number}:**")
                                st.json(sample)
                        else:
                            _, headline, rows_key = layout
                            for sample in samples:
                                st.markdown(headline(sample))
                                if rows_key in sample:
                                    show_table(sample[rows_key])

            st.markdown("---")
| |
|
def create_validation_visualization(results: Dict[str, Any]) -> go.Figure:
    """Create comprehensive validation visualization.

    Builds a 2x2 dashboard figure: a bar chart of issue counts per severity,
    a pie of clean versus affected records, a gauge for the data quality
    score and, when ``results['detailed_issues']`` is non-empty, a bar chart
    of each issue's ``count`` keyed by its ``category``.

    Args:
        results: Validation report; the keys read are ``summary`` and
            ``detailed_issues``.

    Returns:
        The assembled Plotly figure.
    """
    summary = results['summary']
    issue_list = results['detailed_issues']

    dashboard = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Issues by Severity', 'Affected Records Distribution',
                        'Data Quality Score', 'Issue Categories'),
        specs=[[{"type": "bar"}, {"type": "pie"}],
               [{"type": "indicator"}, {"type": "bar"}]]
    )

    # Top-left: number of issues per severity level.
    severity_bar = go.Bar(
        x=['High', 'Medium', 'Low'],
        y=[summary['high_severity_issues'],
           summary['medium_severity_issues'],
           summary['low_severity_issues']],
        marker_color=['#FF4B4B', '#FFA500', '#32CD32'],
        name='Issues by Severity'
    )

    # Top-right: share of records touched by at least one issue.
    affected_count = summary['total_affected_records']
    clean_count = summary['total_records'] - affected_count
    record_pie = go.Pie(
        labels=['Clean Records', 'Affected Records'],
        values=[clean_count, affected_count],
        marker_colors=['#32CD32', '#FF4B4B'],
        name='Record Distribution'
    )

    # Bottom-left: quality score gauge with coloured bands and a marker at 90.
    gauge_style = {
        'axis': {'range': [None, 100]},
        'bar': {'color': "darkblue"},
        'steps': [
            {'range': [0, 50], 'color': "lightgray"},
            {'range': [50, 80], 'color': "yellow"},
            {'range': [80, 100], 'color': "lightgreen"}
        ],
        'threshold': {
            'line': {'color': "red", 'width': 4},
            'thickness': 0.75,
            'value': 90
        }
    }
    score_gauge = go.Indicator(
        mode="gauge+number",
        value=summary['data_quality_score'],
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "Data Quality Score"},
        gauge=gauge_style
    )

    # (trace, row, col) in the order the traces are added to the figure.
    placements = [
        (severity_bar, 1, 1),
        (record_pie, 1, 2),
        (score_gauge, 2, 1),
    ]

    # Bottom-right: only drawn when there is at least one detailed issue.
    if issue_list:
        category_bar = go.Bar(
            x=[entry['category'] for entry in issue_list],
            y=[entry['count'] for entry in issue_list],
            marker_color='#1f77b4',
            name='Issues by Category'
        )
        placements.append((category_bar, 2, 2))

    for trace, row_no, col_no in placements:
        dashboard.add_trace(trace, row=row_no, col=col_no)

    dashboard.update_layout(
        height=800,
        showlegend=False,
        title_text="Advanced Data Quality Validation Dashboard"
    )

    return dashboard
| |
|