AI-Based-Data-Cleaner / src /streamlit_app.py
midlajvalappil's picture
Update src/streamlit_app.py
9d06884 verified
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from io import BytesIO
import json
from datetime import datetime
# Import our modules
from data_processor import DataProcessor
from cleaning_engine import CleaningEngine
from config import Config
from utils.logger import setup_logger
from advanced_validation_ui import render_advanced_validation_tab
# Configure page
st.set_page_config(
page_title="AI-Based Data Cleaner",
page_icon="🧹",
layout="wide",
initial_sidebar_state="expanded"
)
# Initialize logger
logger = setup_logger(__name__)
# Initialize session state
if 'original_df' not in st.session_state:
st.session_state.original_df = None
if 'cleaned_df' not in st.session_state:
st.session_state.cleaned_df = None
if 'cleaning_report' not in st.session_state:
st.session_state.cleaning_report = None
if 'file_uploaded' not in st.session_state:
st.session_state.file_uploaded = False
def main():
"""Main application function"""
# Header
st.title("🧹 AI-Based Data Cleaner")
st.markdown("Upload your Excel or CSV file and let AI clean your data automatically!")
# Sidebar for configuration
with st.sidebar:
st.header("βš™οΈ Configuration")
# Check API key
if not Config.OPENAI_API_KEY:
st.error("⚠️ OpenAI API key not found! Please set OPENAI_API_KEY in your .env file.")
st.stop()
else:
st.success("βœ… OpenAI API key configured")
st.header("πŸ”§ Cleaning Options")
cleaning_options = {
'remove_duplicates': st.checkbox("Remove duplicate rows", value=True),
'ai_text_cleaning': st.checkbox("AI-powered text cleaning", value=True),
'fill_missing_values': st.checkbox("Fill missing values", value=True),
'use_ai_for_missing': st.checkbox("Use AI for missing values (small datasets)", value=False),
'fix_data_types': st.checkbox("Fix data types", value=True),
'handle_outliers': st.checkbox("Handle outliers", value=False)
}
if st.button("ℹ️ About"):
show_about()
# Main content area
tab1, tab2, tab3, tab4, tab5 = st.tabs(["πŸ“ Upload & Clean", "πŸ“Š Data Analysis", "πŸ” Before/After", "πŸ”¬ Advanced Validation", "πŸ“₯ Download"])
with tab1:
upload_and_clean_tab(cleaning_options)
with tab2:
data_analysis_tab()
with tab3:
before_after_tab()
with tab4:
advanced_validation_tab()
with tab5:
download_tab()
def upload_and_clean_tab(cleaning_options):
"""File upload and cleaning tab"""
st.header("πŸ“ Upload Your Data File")
uploaded_file = st.file_uploader(
"Choose a CSV or Excel file",
type=['csv', 'xlsx', 'xls'],
help=f"Maximum file size: {Config.MAX_FILE_SIZE_MB}MB"
)
if uploaded_file is not None:
try:
# Check file size
file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
if file_size_mb > Config.MAX_FILE_SIZE_MB:
st.error(f"File size ({file_size_mb:.1f}MB) exceeds maximum allowed size ({Config.MAX_FILE_SIZE_MB}MB)")
return
# Load data
with st.spinner("Loading data..."):
data_processor = DataProcessor()
df = data_processor.read_file(uploaded_file.getvalue(), uploaded_file.name)
st.session_state.original_df = df
st.session_state.file_uploaded = True
st.success(f"βœ… File loaded successfully! Shape: {df.shape}")
# Show data preview
st.subheader("πŸ“‹ Data Preview")
st.dataframe(df.head(10), use_container_width=True)
# Data quality overview
st.subheader("πŸ“ˆ Data Quality Overview")
show_data_quality_overview(df)
# Clean data button
if st.button("🧹 Clean Data", type="primary", use_container_width=True):
clean_data(df, cleaning_options)
except Exception as e:
st.error(f"Error loading file: {str(e)}")
logger.error(f"Error loading file {uploaded_file.name}: {str(e)}")
def clean_data(df, cleaning_options):
"""Clean the data using the cleaning engine"""
try:
with st.spinner("πŸ€– AI is cleaning your data... This may take a few minutes."):
# Initialize cleaning engine
cleaning_engine = CleaningEngine()
# Clean the data
cleaned_df, cleaning_report = cleaning_engine.clean_dataset(df, cleaning_options)
# Store results in session state
st.session_state.cleaned_df = cleaned_df
st.session_state.cleaning_report = cleaning_report
st.success("βœ… Data cleaning completed!")
# Show summary
if cleaning_report['statistics'].get('summary'):
summary = cleaning_report['statistics']['summary']
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Changes", summary['total_changes'])
with col2:
st.metric("Missing Values",
f"{summary['missing_values_after']}",
f"-{summary['missing_values_before'] - summary['missing_values_after']}")
with col3:
st.metric("Rows", summary['rows_after'],
f"{summary['rows_after'] - summary['rows_before']}")
with col4:
st.metric("Missing Reduction", f"{summary['missing_reduction_percentage']:.1f}%")
# Show errors if any
if cleaning_report.get('errors'):
st.warning("⚠️ Some issues occurred during cleaning:")
for error in cleaning_report['errors']:
st.write(f"β€’ {error}")
except Exception as e:
st.error(f"Error during data cleaning: {str(e)}")
logger.error(f"Error during data cleaning: {str(e)}")
def show_data_quality_overview(df):
"""Show data quality overview"""
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Rows", f"{df.shape[0]:,}")
with col2:
st.metric("Columns", df.shape[1])
with col3:
missing_count = df.isnull().sum().sum()
st.metric("Missing Values", f"{missing_count:,}")
with col4:
duplicate_count = df.duplicated().sum()
st.metric("Duplicates", f"{duplicate_count:,}")
# Missing values by column
if missing_count > 0:
st.subheader("Missing Values by Column")
missing_data = df.isnull().sum()
missing_data = missing_data[missing_data > 0].sort_values(ascending=False)
if len(missing_data) > 0:
fig = px.bar(
x=missing_data.values,
y=missing_data.index,
orientation='h',
title="Missing Values Count by Column"
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
def data_analysis_tab():
"""Data analysis and statistics tab"""
if st.session_state.original_df is None:
st.info("Please upload a file first.")
return
st.header("πŸ“Š Data Analysis")
df = st.session_state.original_df
# Column selection
selected_columns = st.multiselect(
"Select columns to analyze:",
df.columns.tolist(),
default=df.columns.tolist()[:5]
)
if selected_columns:
# Basic statistics
st.subheader("πŸ“ˆ Basic Statistics")
st.dataframe(df[selected_columns].describe(), use_container_width=True)
# Data types
st.subheader("🏷️ Data Types")
dtype_df = pd.DataFrame({
'Column': selected_columns,
'Data Type': [str(df[col].dtype) for col in selected_columns],
'Non-Null Count': [df[col].count() for col in selected_columns],
'Unique Values': [df[col].nunique() for col in selected_columns]
})
st.dataframe(dtype_df, use_container_width=True)
def before_after_tab():
"""Before/after comparison tab"""
if st.session_state.original_df is None or st.session_state.cleaned_df is None:
st.info("Please upload and clean a file first.")
return
st.header("πŸ” Before/After Comparison")
original_df = st.session_state.original_df
cleaned_df = st.session_state.cleaned_df
# Column selection for comparison
column = st.selectbox("Select column to compare:", original_df.columns)
col1, col2 = st.columns(2)
with col1:
st.subheader("πŸ“‹ Before Cleaning")
st.dataframe(original_df[column].head(20), use_container_width=True)
# Statistics
if original_df[column].dtype in ['int64', 'float64']:
st.write("**Statistics:**")
st.write(f"Mean: {original_df[column].mean():.2f}")
st.write(f"Missing: {original_df[column].isnull().sum()}")
with col2:
st.subheader("✨ After Cleaning")
st.dataframe(cleaned_df[column].head(20), use_container_width=True)
# Statistics
if cleaned_df[column].dtype in ['int64', 'float64']:
st.write("**Statistics:**")
st.write(f"Mean: {cleaned_df[column].mean():.2f}")
st.write(f"Missing: {cleaned_df[column].isnull().sum()}")
# Show changes made
if st.session_state.cleaning_report:
st.subheader("πŸ“ Changes Made")
changes = st.session_state.cleaning_report.get('changes', [])
column_changes = [change for change in changes if change.get('column') == column]
if column_changes:
for change in column_changes:
st.write(f"β€’ **{change['type']}**: {change}")
else:
st.info("No changes made to this column.")
def download_tab():
"""Download cleaned data tab"""
if st.session_state.cleaned_df is None:
st.info("Please clean your data first.")
return
st.header("πŸ“₯ Download Cleaned Data")
cleaned_df = st.session_state.cleaned_df
# File format selection
file_format = st.radio("Select download format:", ["CSV", "Excel"])
# Generate download
if file_format == "CSV":
csv_buffer = BytesIO()
cleaned_df.to_csv(csv_buffer, index=False)
csv_data = csv_buffer.getvalue()
st.download_button(
label="πŸ“₯ Download CSV",
data=csv_data,
file_name=f"cleaned_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
mime="text/csv",
use_container_width=True
)
else: # Excel
excel_buffer = BytesIO()
with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
cleaned_df.to_excel(writer, sheet_name='Cleaned_Data', index=False)
# Add cleaning report if available
if st.session_state.cleaning_report:
report_df = pd.DataFrame(st.session_state.cleaning_report['changes'])
if not report_df.empty:
report_df.to_excel(writer, sheet_name='Cleaning_Report', index=False)
excel_data = excel_buffer.getvalue()
st.download_button(
label="πŸ“₯ Download Excel",
data=excel_data,
file_name=f"cleaned_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
use_container_width=True
)
# Show cleaning report
if st.session_state.cleaning_report:
st.subheader("πŸ“‹ Cleaning Report")
with st.expander("View detailed cleaning report"):
st.json(st.session_state.cleaning_report)
def advanced_validation_tab():
"""Advanced data quality validation tab"""
if st.session_state.original_df is not None:
render_advanced_validation_tab(st.session_state.original_df)
else:
st.info("πŸ“ Please upload a data file first to perform advanced validation.")
st.markdown("""
### πŸ”¬ Advanced Data Quality Validation
This feature provides comprehensive analysis of your data for:
- **πŸ” Duplicate Identity Detection**: Find duplicate emails, phone numbers, and contact information
- **πŸ“Š Data Pattern Anomalies**: Detect suspicious clustering and artificial standardization
- **βš–οΈ Business Logic Violations**: Identify chronological inconsistencies and employment logic errors
- **🎯 Contextual Integrity Issues**: Spot bulk import patterns and unrealistic data ranges
Upload your data file to get started!
""")
def show_about():
"""Show about information"""
st.info("""
**AI-Based Data Cleaner** 🧹
This application uses artificial intelligence to automatically clean and validate your data:
**Features:**
β€’ πŸ€– AI-powered text cleaning and spelling correction
β€’ πŸ“Š Intelligent missing value imputation
β€’ πŸ” Duplicate detection and removal
β€’ πŸ“ˆ Data type optimization
β€’ πŸ“‰ Outlier detection and handling
**Supported Formats:**
β€’ CSV files (.csv)
β€’ Excel files (.xlsx, .xls)
**Requirements:**
β€’ OpenAI API key for AI features
β€’ Maximum file size: 50MB
""")
if __name__ == "__main__":
main()