AI-Based-Data-Cleaner / src /advanced_validation_ui.py
midlajvalappil's picture
Upload 14 files
6aa09c0 verified
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from advanced_validator import AdvancedDataValidator
from typing import Dict, Any
import json
def render_advanced_validation_tab(df: pd.DataFrame):
"""
Render the advanced validation tab in Streamlit
Args:
df: DataFrame to validate
"""
st.header("πŸ” Advanced Data Quality Validation")
st.markdown("""
This advanced validation focuses on **logical inconsistencies** and **business rule violations**
rather than basic technical data issues. It performs deep analysis to detect:
- **Duplicate Identity Detection**: Email and phone number duplicates with normalization
- **Data Pattern Anomalies**: Suspicious clustering and artificial standardization
- **Business Logic Violations**: Chronological inconsistencies and employment logic errors
- **Contextual Integrity Issues**: Bulk import patterns and unrealistic data ranges
""")
# Configuration section
with st.expander("βš™οΈ Validation Configuration", expanded=False):
col1, col2 = st.columns(2)
with col1:
company_founding_year = st.number_input(
"Company Founding Year",
min_value=1800,
max_value=2024,
value=1990,
help="Used to validate join dates aren't before company founding"
)
with col2:
validation_scope = st.multiselect(
"Validation Scope",
["Duplicate Detection", "Pattern Analysis", "Business Logic", "Contextual Integrity"],
default=["Duplicate Detection", "Pattern Analysis", "Business Logic", "Contextual Integrity"],
help="Select which validation categories to run"
)
# Run validation button
if st.button("πŸš€ Run Advanced Validation", type="primary"):
with st.spinner("Performing advanced data quality validation..."):
# Initialize validator
validator = AdvancedDataValidator(company_founding_year=company_founding_year)
# Run validation
validation_results = validator.validate_dataset(df)
# Store results in session state
st.session_state['validation_results'] = validation_results
st.session_state['validator'] = validator
# Display results if available
if 'validation_results' in st.session_state:
display_validation_results(st.session_state['validation_results'], df)
def display_validation_results(results: Dict[str, Any], df: pd.DataFrame):
"""Display comprehensive validation results"""
# Summary metrics
st.subheader("πŸ“Š Validation Summary")
summary = results['summary']
# Create metrics columns
col1, col2, col3, col4, col5 = st.columns(5)
with col1:
st.metric("Data Quality Score", f"{summary['data_quality_score']}%")
with col2:
st.metric("Total Issues", summary['total_issues_found'])
with col3:
st.metric("High Severity", summary['high_severity_issues'],
delta=f"-{summary['high_severity_issues']}" if summary['high_severity_issues'] > 0 else None)
with col4:
st.metric("Medium Severity", summary['medium_severity_issues'])
with col5:
st.metric("Affected Records", f"{summary['total_affected_records']} ({summary['total_affected_records']/summary['total_records']*100:.1f}%)")
# Overall status
if results['validation_passed']:
st.success("βœ… **Validation Passed**: No critical issues found!")
else:
st.error("❌ **Validation Failed**: Critical issues require immediate attention!")
# Severity distribution chart
if summary['total_issues_found'] > 0:
st.subheader("πŸ“ˆ Issues by Severity")
severity_data = {
'Severity': ['High', 'Medium', 'Low'],
'Count': [summary['high_severity_issues'], summary['medium_severity_issues'], summary['low_severity_issues']],
'Color': ['#FF4B4B', '#FFA500', '#32CD32']
}
fig = px.bar(
severity_data,
x='Severity',
y='Count',
color='Color',
color_discrete_map={color: color for color in severity_data['Color']},
title="Distribution of Issues by Severity Level"
)
fig.update_layout(showlegend=False, height=400)
st.plotly_chart(fig, use_container_width=True)
# Detailed issues
st.subheader("πŸ” Detailed Issue Analysis")
if results['detailed_issues']:
# Create tabs for each severity level
severity_tabs = []
if summary['high_severity_issues'] > 0:
severity_tabs.append("πŸ”΄ High Severity")
if summary['medium_severity_issues'] > 0:
severity_tabs.append("🟑 Medium Severity")
if summary['low_severity_issues'] > 0:
severity_tabs.append("🟒 Low Severity")
if severity_tabs:
tabs = st.tabs(severity_tabs)
tab_index = 0
for severity, color in [("HIGH", "πŸ”΄"), ("MEDIUM", "🟑"), ("LOW", "🟒")]:
severity_issues = [issue for issue in results['detailed_issues'] if issue['severity'] == severity]
if severity_issues and tab_index < len(tabs):
with tabs[tab_index]:
display_severity_issues(severity_issues, severity, df)
tab_index += 1
else:
st.info("πŸŽ‰ No data quality issues detected! Your dataset appears to be logically consistent.")
# Recommendations section
if results['recommendations']:
st.subheader("πŸ’‘ Recommendations")
for rec_group in results['recommendations']:
priority_color = {
'HIGH': 'πŸ”΄',
'MEDIUM': '🟑',
'LOW': '🟒'
}
with st.expander(f"{priority_color[rec_group['priority']]} {rec_group['title']}", expanded=rec_group['priority'] == 'HIGH'):
for i, item in enumerate(rec_group['items'], 1):
st.markdown(f"{i}. {item}")
# Export options
st.subheader("πŸ“₯ Export Validation Results")
col1, col2 = st.columns(2)
with col1:
if st.button("πŸ“„ Download Detailed Report"):
report_json = json.dumps(results, indent=2, default=str)
st.download_button(
label="Download JSON Report",
data=report_json,
file_name=f"advanced_validation_report_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.json",
mime="application/json"
)
with col2:
if st.button("πŸ“Š Download Issue Summary"):
# Create summary DataFrame
summary_data = []
for issue in results['detailed_issues']:
summary_data.append({
'Category': issue['category'],
'Severity': issue['severity'],
'Description': issue['description'],
'Affected Records': issue['count'],
'Percentage': f"{issue['affected_percentage']}%",
'Recommendation': issue['recommendation']
})
summary_df = pd.DataFrame(summary_data)
csv = summary_df.to_csv(index=False)
st.download_button(
label="Download CSV Summary",
data=csv,
file_name=f"validation_summary_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv",
mime="text/csv"
)
def display_severity_issues(issues: list, severity: str, df: pd.DataFrame):
"""Display issues for a specific severity level"""
for i, issue in enumerate(issues):
with st.container():
# Issue header
st.markdown(f"### {i+1}. {issue['category']}")
# Issue details
col1, col2 = st.columns([2, 1])
with col1:
st.markdown(f"**Description:** {issue['description']}")
st.markdown(f"**Recommendation:** {issue['recommendation']}")
with col2:
st.metric("Affected Records", issue['count'])
st.metric("Percentage", f"{issue['affected_percentage']}%")
# Examples section
if issue['examples']:
with st.expander(f"πŸ“‹ View Examples ({len(issue['examples'])} shown)", expanded=False):
# Handle different example formats
if isinstance(issue['examples'][0], dict):
if 'email' in issue['examples'][0]:
# Email duplicate examples
for example in issue['examples']:
st.markdown(f"**Email:** `{example['email']}` (appears {example['count']} times)")
if 'records' in example:
example_df = pd.DataFrame(example['records'])
st.dataframe(example_df, use_container_width=True)
elif 'normalized_phone' in issue['examples'][0]:
# Phone duplicate examples
for example in issue['examples']:
st.markdown(f"**Phone:** `{example['normalized_phone']}` (appears {example['count']} times)")
if 'records' in example:
example_df = pd.DataFrame(example['records'])
st.dataframe(example_df, use_container_width=True)
elif 'period' in issue['examples'][0]:
# Bulk import pattern examples
for example in issue['examples']:
st.markdown(f"**Period:** {example['period']} ({example['count']} records, {example['percentage_of_total']}% of total)")
if 'sample_records' in example:
example_df = pd.DataFrame(example['sample_records'])
st.dataframe(example_df, use_container_width=True)
else:
# Generic examples
for j, example in enumerate(issue['examples']):
st.markdown(f"**Example {j+1}:**")
st.json(example)
else:
# Simple list examples
example_df = pd.DataFrame(issue['examples'])
st.dataframe(example_df, use_container_width=True)
st.markdown("---")
def create_validation_visualization(results: Dict[str, Any]) -> go.Figure:
"""Create comprehensive validation visualization"""
# Create subplots
fig = make_subplots(
rows=2, cols=2,
subplot_titles=('Issues by Severity', 'Affected Records Distribution',
'Data Quality Score', 'Issue Categories'),
specs=[[{"type": "bar"}, {"type": "pie"}],
[{"type": "indicator"}, {"type": "bar"}]]
)
summary = results['summary']
# Severity distribution
fig.add_trace(
go.Bar(
x=['High', 'Medium', 'Low'],
y=[summary['high_severity_issues'], summary['medium_severity_issues'], summary['low_severity_issues']],
marker_color=['#FF4B4B', '#FFA500', '#32CD32'],
name='Issues by Severity'
),
row=1, col=1
)
# Affected vs Clean records
affected = summary['total_affected_records']
clean = summary['total_records'] - affected
fig.add_trace(
go.Pie(
labels=['Clean Records', 'Affected Records'],
values=[clean, affected],
marker_colors=['#32CD32', '#FF4B4B'],
name='Record Distribution'
),
row=1, col=2
)
# Data Quality Score
fig.add_trace(
go.Indicator(
mode="gauge+number",
value=summary['data_quality_score'],
domain={'x': [0, 1], 'y': [0, 1]},
title={'text': "Data Quality Score"},
gauge={
'axis': {'range': [None, 100]},
'bar': {'color': "darkblue"},
'steps': [
{'range': [0, 50], 'color': "lightgray"},
{'range': [50, 80], 'color': "yellow"},
{'range': [80, 100], 'color': "lightgreen"}
],
'threshold': {
'line': {'color': "red", 'width': 4},
'thickness': 0.75,
'value': 90
}
}
),
row=2, col=1
)
# Issue categories
if results['detailed_issues']:
categories = [issue['category'] for issue in results['detailed_issues']]
counts = [issue['count'] for issue in results['detailed_issues']]
fig.add_trace(
go.Bar(
x=categories,
y=counts,
marker_color='#1f77b4',
name='Issues by Category'
),
row=2, col=2
)
fig.update_layout(height=800, showlegend=False, title_text="Advanced Data Quality Validation Dashboard")
return fig