File size: 13,849 Bytes
eea5189 9d06884 eea5189 9d06884 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 | import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from io import BytesIO
import json
from datetime import datetime
# Import our modules
from data_processor import DataProcessor
from cleaning_engine import CleaningEngine
from config import Config
from utils.logger import setup_logger
from advanced_validation_ui import render_advanced_validation_tab
# Configure page
st.set_page_config(
page_title="AI-Based Data Cleaner",
page_icon="π§Ή",
layout="wide",
initial_sidebar_state="expanded"
)
# Initialize logger
logger = setup_logger(__name__)
# Initialize session state
if 'original_df' not in st.session_state:
st.session_state.original_df = None
if 'cleaned_df' not in st.session_state:
st.session_state.cleaned_df = None
if 'cleaning_report' not in st.session_state:
st.session_state.cleaning_report = None
if 'file_uploaded' not in st.session_state:
st.session_state.file_uploaded = False
def main():
"""Main application function"""
# Header
st.title("π§Ή AI-Based Data Cleaner")
st.markdown("Upload your Excel or CSV file and let AI clean your data automatically!")
# Sidebar for configuration
with st.sidebar:
st.header("βοΈ Configuration")
# Check API key
if not Config.OPENAI_API_KEY:
st.error("β οΈ OpenAI API key not found! Please set OPENAI_API_KEY in your .env file.")
st.stop()
else:
st.success("β
OpenAI API key configured")
st.header("π§ Cleaning Options")
cleaning_options = {
'remove_duplicates': st.checkbox("Remove duplicate rows", value=True),
'ai_text_cleaning': st.checkbox("AI-powered text cleaning", value=True),
'fill_missing_values': st.checkbox("Fill missing values", value=True),
'use_ai_for_missing': st.checkbox("Use AI for missing values (small datasets)", value=False),
'fix_data_types': st.checkbox("Fix data types", value=True),
'handle_outliers': st.checkbox("Handle outliers", value=False)
}
if st.button("βΉοΈ About"):
show_about()
# Main content area
tab1, tab2, tab3, tab4, tab5 = st.tabs(["π Upload & Clean", "π Data Analysis", "π Before/After", "π¬ Advanced Validation", "π₯ Download"])
with tab1:
upload_and_clean_tab(cleaning_options)
with tab2:
data_analysis_tab()
with tab3:
before_after_tab()
with tab4:
advanced_validation_tab()
with tab5:
download_tab()
def upload_and_clean_tab(cleaning_options):
"""File upload and cleaning tab"""
st.header("π Upload Your Data File")
uploaded_file = st.file_uploader(
"Choose a CSV or Excel file",
type=['csv', 'xlsx', 'xls'],
help=f"Maximum file size: {Config.MAX_FILE_SIZE_MB}MB"
)
if uploaded_file is not None:
try:
# Check file size
file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
if file_size_mb > Config.MAX_FILE_SIZE_MB:
st.error(f"File size ({file_size_mb:.1f}MB) exceeds maximum allowed size ({Config.MAX_FILE_SIZE_MB}MB)")
return
# Load data
with st.spinner("Loading data..."):
data_processor = DataProcessor()
df = data_processor.read_file(uploaded_file.getvalue(), uploaded_file.name)
st.session_state.original_df = df
st.session_state.file_uploaded = True
st.success(f"β
File loaded successfully! Shape: {df.shape}")
# Show data preview
st.subheader("π Data Preview")
st.dataframe(df.head(10), use_container_width=True)
# Data quality overview
st.subheader("π Data Quality Overview")
show_data_quality_overview(df)
# Clean data button
if st.button("π§Ή Clean Data", type="primary", use_container_width=True):
clean_data(df, cleaning_options)
except Exception as e:
st.error(f"Error loading file: {str(e)}")
logger.error(f"Error loading file {uploaded_file.name}: {str(e)}")
def clean_data(df, cleaning_options):
"""Clean the data using the cleaning engine"""
try:
with st.spinner("π€ AI is cleaning your data... This may take a few minutes."):
# Initialize cleaning engine
cleaning_engine = CleaningEngine()
# Clean the data
cleaned_df, cleaning_report = cleaning_engine.clean_dataset(df, cleaning_options)
# Store results in session state
st.session_state.cleaned_df = cleaned_df
st.session_state.cleaning_report = cleaning_report
st.success("β
Data cleaning completed!")
# Show summary
if cleaning_report['statistics'].get('summary'):
summary = cleaning_report['statistics']['summary']
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Changes", summary['total_changes'])
with col2:
st.metric("Missing Values",
f"{summary['missing_values_after']}",
f"-{summary['missing_values_before'] - summary['missing_values_after']}")
with col3:
st.metric("Rows", summary['rows_after'],
f"{summary['rows_after'] - summary['rows_before']}")
with col4:
st.metric("Missing Reduction", f"{summary['missing_reduction_percentage']:.1f}%")
# Show errors if any
if cleaning_report.get('errors'):
st.warning("β οΈ Some issues occurred during cleaning:")
for error in cleaning_report['errors']:
st.write(f"β’ {error}")
except Exception as e:
st.error(f"Error during data cleaning: {str(e)}")
logger.error(f"Error during data cleaning: {str(e)}")
def show_data_quality_overview(df):
"""Show data quality overview"""
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Rows", f"{df.shape[0]:,}")
with col2:
st.metric("Columns", df.shape[1])
with col3:
missing_count = df.isnull().sum().sum()
st.metric("Missing Values", f"{missing_count:,}")
with col4:
duplicate_count = df.duplicated().sum()
st.metric("Duplicates", f"{duplicate_count:,}")
# Missing values by column
if missing_count > 0:
st.subheader("Missing Values by Column")
missing_data = df.isnull().sum()
missing_data = missing_data[missing_data > 0].sort_values(ascending=False)
if len(missing_data) > 0:
fig = px.bar(
x=missing_data.values,
y=missing_data.index,
orientation='h',
title="Missing Values Count by Column"
)
fig.update_layout(height=400)
st.plotly_chart(fig, use_container_width=True)
def data_analysis_tab():
"""Data analysis and statistics tab"""
if st.session_state.original_df is None:
st.info("Please upload a file first.")
return
st.header("π Data Analysis")
df = st.session_state.original_df
# Column selection
selected_columns = st.multiselect(
"Select columns to analyze:",
df.columns.tolist(),
default=df.columns.tolist()[:5]
)
if selected_columns:
# Basic statistics
st.subheader("π Basic Statistics")
st.dataframe(df[selected_columns].describe(), use_container_width=True)
# Data types
st.subheader("π·οΈ Data Types")
dtype_df = pd.DataFrame({
'Column': selected_columns,
'Data Type': [str(df[col].dtype) for col in selected_columns],
'Non-Null Count': [df[col].count() for col in selected_columns],
'Unique Values': [df[col].nunique() for col in selected_columns]
})
st.dataframe(dtype_df, use_container_width=True)
def before_after_tab():
"""Before/after comparison tab"""
if st.session_state.original_df is None or st.session_state.cleaned_df is None:
st.info("Please upload and clean a file first.")
return
st.header("π Before/After Comparison")
original_df = st.session_state.original_df
cleaned_df = st.session_state.cleaned_df
# Column selection for comparison
column = st.selectbox("Select column to compare:", original_df.columns)
col1, col2 = st.columns(2)
with col1:
st.subheader("π Before Cleaning")
st.dataframe(original_df[column].head(20), use_container_width=True)
# Statistics
if original_df[column].dtype in ['int64', 'float64']:
st.write("**Statistics:**")
st.write(f"Mean: {original_df[column].mean():.2f}")
st.write(f"Missing: {original_df[column].isnull().sum()}")
with col2:
st.subheader("β¨ After Cleaning")
st.dataframe(cleaned_df[column].head(20), use_container_width=True)
# Statistics
if cleaned_df[column].dtype in ['int64', 'float64']:
st.write("**Statistics:**")
st.write(f"Mean: {cleaned_df[column].mean():.2f}")
st.write(f"Missing: {cleaned_df[column].isnull().sum()}")
# Show changes made
if st.session_state.cleaning_report:
st.subheader("π Changes Made")
changes = st.session_state.cleaning_report.get('changes', [])
column_changes = [change for change in changes if change.get('column') == column]
if column_changes:
for change in column_changes:
st.write(f"β’ **{change['type']}**: {change}")
else:
st.info("No changes made to this column.")
def download_tab():
"""Download cleaned data tab"""
if st.session_state.cleaned_df is None:
st.info("Please clean your data first.")
return
st.header("π₯ Download Cleaned Data")
cleaned_df = st.session_state.cleaned_df
# File format selection
file_format = st.radio("Select download format:", ["CSV", "Excel"])
# Generate download
if file_format == "CSV":
csv_buffer = BytesIO()
cleaned_df.to_csv(csv_buffer, index=False)
csv_data = csv_buffer.getvalue()
st.download_button(
label="π₯ Download CSV",
data=csv_data,
file_name=f"cleaned_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
mime="text/csv",
use_container_width=True
)
else: # Excel
excel_buffer = BytesIO()
with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
cleaned_df.to_excel(writer, sheet_name='Cleaned_Data', index=False)
# Add cleaning report if available
if st.session_state.cleaning_report:
report_df = pd.DataFrame(st.session_state.cleaning_report['changes'])
if not report_df.empty:
report_df.to_excel(writer, sheet_name='Cleaning_Report', index=False)
excel_data = excel_buffer.getvalue()
st.download_button(
label="π₯ Download Excel",
data=excel_data,
file_name=f"cleaned_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
use_container_width=True
)
# Show cleaning report
if st.session_state.cleaning_report:
st.subheader("π Cleaning Report")
with st.expander("View detailed cleaning report"):
st.json(st.session_state.cleaning_report)
def advanced_validation_tab():
"""Advanced data quality validation tab"""
if st.session_state.original_df is not None:
render_advanced_validation_tab(st.session_state.original_df)
else:
st.info("π Please upload a data file first to perform advanced validation.")
st.markdown("""
### π¬ Advanced Data Quality Validation
This feature provides comprehensive analysis of your data for:
- **π Duplicate Identity Detection**: Find duplicate emails, phone numbers, and contact information
- **π Data Pattern Anomalies**: Detect suspicious clustering and artificial standardization
- **βοΈ Business Logic Violations**: Identify chronological inconsistencies and employment logic errors
- **π― Contextual Integrity Issues**: Spot bulk import patterns and unrealistic data ranges
Upload your data file to get started!
""")
def show_about():
"""Show about information"""
st.info("""
**AI-Based Data Cleaner** π§Ή
This application uses artificial intelligence to automatically clean and validate your data:
**Features:**
β’ π€ AI-powered text cleaning and spelling correction
β’ π Intelligent missing value imputation
β’ π Duplicate detection and removal
β’ π Data type optimization
β’ π Outlier detection and handling
**Supported Formats:**
β’ CSV files (.csv)
β’ Excel files (.xlsx, .xls)
**Requirements:**
β’ OpenAI API key for AI features
β’ Maximum file size: 50MB
""")
if __name__ == "__main__":
main() |