# Data-Cleaner / app.py
# translators-will's picture
# Update app.py
# 9aae495 verified
# app.py
import streamlit as st
import pandas as pd
from io import BytesIO
import os
import tempfile
from data_clean_simple import clean_data, display_suggestions_report
# Configure the browser tab (title and icon) and switch to the wide layout.
st.set_page_config(
    page_title="Data Cleaning App",
    page_icon=":sparkles:",
    layout="wide",
)

# Seed the per-session cache on the very first run so that later reruns can
# reuse the cleaned result instead of reprocessing the upload.
if 'processed_data' not in st.session_state:
    for _state_key in ('processed_data', 'suggestions', 'file_details'):
        st.session_state[_state_key] = None
# Page heading followed by a short description of what the app does.
st.title("Smart Data Cleaner :sparkles:")

intro_text = """
Upload a CSV, TSV, or Excel file, and we'll clean it for you using smart data cleaning techniques.
The system will automatically:
- Fix formatting issues
- Handle missing values
- Standardize data entries
- Provide practical suggestions for data improvements
Then, you can download the cleaned data for your analysis.
"""
st.markdown(intro_text)
def _load_dataframe(file_bytes, file_type):
    """Parse the raw upload bytes into a DataFrame based on the extension.

    ``.tsv`` is read as tab-separated text and ``.xlsx`` as an Excel
    workbook; any other extension is read as comma-separated text.
    """
    buffer = BytesIO(file_bytes)
    if file_type == ".tsv":
        return pd.read_csv(buffer, sep='\t')
    if file_type == ".xlsx":
        return pd.read_excel(buffer)
    return pd.read_csv(buffer)


def _null_percentage(df):
    """Return the share of null cells in *df* as a percentage (2 decimals).

    A frame with no rows or no columns yields ``0.0`` instead of dividing
    by zero.
    """
    total_cells = df.shape[0] * df.shape[1]
    if total_cells == 0:
        return 0.0
    return round(float(df.isnull().sum().sum()) / total_cells * 100, 2)


def _build_download(df, file_type):
    """Serialize *df* in the same format as the upload.

    Returns a ``(payload_bytes, file_name, mime_type)`` tuple. Unknown
    extensions fall back to CSV, mirroring how the upload is read.
    """
    if file_type == ".xlsx":
        output = BytesIO()
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False)
        return (
            output.getvalue(),
            "cleaned_data.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
    if file_type == ".tsv":
        return (
            df.to_csv(index=False, sep="\t").encode("utf-8"),
            "cleaned_data.tsv",
            # IANA-registered media type for tab-separated values.
            "text/tab-separated-values",
        )
    return (
        df.to_csv(index=False).encode("utf-8"),
        "cleaned_data.csv",
        "text/csv",
    )


# File uploader
uploaded_file = st.file_uploader("Choose a file", type=["csv", "tsv", "xlsx"])

if uploaded_file:
    # Fingerprint of the upload, used to detect whether a new file arrived.
    file_details = {
        'name': uploaded_file.name,
        'size': uploaded_file.size,
        'type': os.path.splitext(uploaded_file.name)[1].lower()
    }

    # Only process if it's a new file
    if st.session_state.file_details != file_details:
        # Drop everything cached for the previous upload first. If loading or
        # cleaning fails below, the next rerun must not present the old
        # file's results as if they belonged to this one.
        st.session_state.processed_data = None
        st.session_state.suggestions = None
        st.session_state.file_details = None

        # getvalue() returns the full content regardless of stream position.
        file_bytes = uploaded_file.getvalue()
        file_type = file_details['type']

        # Show original data before processing
        st.subheader("Original Data")
        try:
            original_df = _load_dataframe(file_bytes, file_type)
            st.dataframe(original_df.head(10), use_container_width=True)

            # Show data info before processing
            total_rows = len(original_df)
            total_cols = len(original_df.columns)
            st.info(f"File contains {total_rows} rows and {total_cols} columns.")
        except Exception as e:
            st.error(f"Error loading original data: {str(e)}")
            st.stop()

        # Create a progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()

        def _report_progress(p):
            # NOTE(review): p is presumably a fraction in [0, 1] reported by
            # clean_data - confirm. Clamp it so the bar value stays within
            # the 10-90 band and never leaves the 0-100 range.
            fraction = min(max(p, 0.0), 1.0)
            progress_bar.progress(10 + int(fraction * 80))

        # Clean data and get suggestions with progress updates
        with st.spinner("Cleaning your data..."):
            # clean_data takes a file path, so spill the upload to disk. The
            # file is closed (and therefore flushed) before it is read back.
            with tempfile.NamedTemporaryFile(delete=False, suffix=file_type) as temp_file:
                temp_file.write(file_bytes)
                temp_file_path = temp_file.name

            try:
                # Update progress for file loading
                status_text.text("Loading and preparing data...")
                progress_bar.progress(10)

                # Process data with progress updates
                status_text.text("Cleaning data...")
                cleaned_df, suggestions = clean_data(
                    temp_file_path,
                    progress_callback=_report_progress,
                )

                # Final progress update
                status_text.text("Finalizing results...")
                progress_bar.progress(95)
            except Exception as e:
                # Same reporting style as the load step; the cache was already
                # cleared above, so nothing stale is shown afterwards.
                st.error(f"Error cleaning data: {str(e)}")
                st.stop()
            else:
                # Record the results and the fingerprint only once cleaning
                # has actually succeeded.
                st.session_state.processed_data = cleaned_df
                st.session_state.suggestions = suggestions
                st.session_state.file_details = file_details

                # Complete the progress bar
                progress_bar.progress(100)
                status_text.text("Processing complete!")
                st.success("Data cleaned successfully!")
            finally:
                # Clean up the temporary file
                if os.path.exists(temp_file_path):
                    os.unlink(temp_file_path)

    # Show cleaned data
    if st.session_state.processed_data is not None:
        cleaned_df = st.session_state.processed_data

        st.subheader("Cleaned Data Preview")
        st.dataframe(cleaned_df.head(10), use_container_width=True)

        # Data statistics
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Rows", len(cleaned_df))
        with col2:
            st.metric("Total Columns", len(cleaned_df.columns))
        with col3:
            null_percentage = _null_percentage(cleaned_df)
            st.metric("Null Values (%)", f"{null_percentage}%")

        # Show data cleaning suggestions
        display_suggestions_report(st.session_state.suggestions)

        # Convert to downloadable format (same format as the upload)
        cleaned_file, download_name, mime_type = _build_download(
            cleaned_df,
            st.session_state.file_details['type'],
        )

        # Download button
        st.download_button(
            label="📁 Download Cleaned Data",
            data=cleaned_file,
            file_name=download_name,
            mime=mime_type
        )