Spaces:
Sleeping
Sleeping
File size: 5,414 Bytes
523bc48 05d617b 523bc48 05d617b 523bc48 9aae495 523bc48 9aae495 523bc48 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# app.py
import streamlit as st
import pandas as pd
from io import BytesIO
import os
import tempfile
from data_clean_simple import clean_data, display_suggestions_report
# Configure the browser tab (title, icon) and use the wide page layout.
st.set_page_config(
    layout="wide",
    page_icon=":sparkles:",
    page_title="Data Cleaning App",
)

# Seed the per-session slots once, so later reruns of the script can reuse
# what an earlier run stored instead of loading the data again.
if "processed_data" not in st.session_state:
    for _slot in ("processed_data", "suggestions", "file_details"):
        st.session_state[_slot] = None
# Introductory copy rendered under the page title.
_INTRO_MARKDOWN = """
Upload a CSV, TSV, or Excel file, and we'll clean it for you using smart data cleaning techniques.
The system will automatically:
- Fix formatting issues
- Handle missing values
- Standardize data entries
- Provide practical suggestions for data improvements
Then, you can download the cleaned data for your analysis.
"""

# Page heading followed by the description of what the app does.
st.title("Smart Data Cleaner :sparkles:")
st.markdown(_INTRO_MARKDOWN)
# --- Upload, clean, preview and download ------------------------------------
# Flow: read the upload, preview the raw data, run clean_data() on a temporary
# copy of the file, keep the result in st.session_state, then render the
# cleaned preview, summary metrics, suggestions report and a download button.

# Download file name and MIME type for each extension the uploader accepts.
_DOWNLOAD_FORMATS = {
    ".csv": ("cleaned_data.csv", "text/csv"),
    ".tsv": ("cleaned_data.tsv", "text/tab-separated-values"),
    ".xlsx": (
        "cleaned_data.xlsx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ),
}


def _read_upload(file_bytes, file_type):
    """Parse the raw upload into a DataFrame.

    Args:
        file_bytes: Full content of the uploaded file.
        file_type: Lower-cased extension including the dot (".csv", ".tsv",
            ".xlsx"). Anything other than ".tsv"/".xlsx" is read as CSV.

    Returns:
        The parsed pandas DataFrame.
    """
    buffer = BytesIO(file_bytes)
    if file_type == ".tsv":
        return pd.read_csv(buffer, sep="\t")
    if file_type == ".xlsx":
        return pd.read_excel(buffer)
    return pd.read_csv(buffer)


def _null_percentage(df):
    """Return the percentage of null cells in *df*, rounded to 2 decimals.

    An empty frame (no rows or no columns) yields 0.0 instead of dividing
    by zero.
    """
    total_cells = df.shape[0] * df.shape[1]
    if total_cells == 0:
        return 0.0
    return round(float(df.isnull().sum().sum()) / total_cells * 100, 2)


def _serialize_for_download(df, file_type):
    """Encode *df* in the format matching *file_type*.

    Args:
        df: DataFrame to export.
        file_type: Lower-cased extension including the dot. Unknown values
            fall back to CSV so the download is always available.

    Returns:
        A ``(payload_bytes, download_name, mime_type)`` tuple.
    """
    if file_type == ".xlsx":
        output = BytesIO()
        with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
            df.to_excel(writer, index=False)
        payload = output.getvalue()
    elif file_type == ".tsv":
        payload = df.to_csv(index=False, sep="\t").encode("utf-8")
    else:
        file_type = ".csv"
        payload = df.to_csv(index=False).encode("utf-8")
    download_name, mime_type = _DOWNLOAD_FORMATS[file_type]
    return payload, download_name, mime_type


# File uploader
uploaded_file = st.file_uploader("Choose a file", type=["csv", "tsv", "xlsx"])

if uploaded_file:
    # Identity of the current upload; compared against the one already handled.
    file_details = {
        "name": uploaded_file.name,
        "size": uploaded_file.size,
        "type": os.path.splitext(uploaded_file.name)[1].lower(),
    }

    # Only process if it's a new file
    if st.session_state.file_details != file_details:
        # Forget the previous file first. file_details is stored again only
        # after a successful run, so a failure is retried on the next rerun
        # and can never leave another file's results on screen.
        st.session_state.processed_data = None
        st.session_state.suggestions = None
        st.session_state.file_details = None

        # getvalue() returns the whole buffer regardless of the current
        # stream position, unlike read().
        file_bytes = uploaded_file.getvalue()
        file_type = file_details["type"]

        # Show original data before processing
        st.subheader("Original Data")
        try:
            original_df = _read_upload(file_bytes, file_type)
            st.dataframe(original_df.head(10), use_container_width=True)
            # Show data info before processing
            total_rows = len(original_df)
            total_cols = len(original_df.columns)
            st.info(f"File contains {total_rows} rows and {total_cols} columns.")
        except Exception as e:
            st.error(f"Error loading original data: {str(e)}")
            st.stop()

        # Create a progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()

        def _report_progress(fraction):
            # The 10 + 80 * p scaling implies clean_data reports a 0..1
            # fraction (TODO confirm); clamp so an out-of-range value cannot
            # push the bar outside its valid range.
            clamped = min(max(fraction, 0.0), 1.0)
            progress_bar.progress(10 + int(clamped * 80))

        # Clean data and get suggestions with progress updates
        with st.spinner("Cleaning your data..."):
            temp_file_path = None
            try:
                # clean_data takes a path, so write the upload to a temporary
                # file and close it before handing the path over.
                with tempfile.NamedTemporaryFile(delete=False, suffix=file_type) as temp_file:
                    temp_file_path = temp_file.name
                    temp_file.write(file_bytes)

                # Update progress for file loading
                status_text.text("Loading and preparing data...")
                progress_bar.progress(10)

                # Process data with progress updates
                status_text.text("Cleaning data...")
                cleaned_df, suggestions = clean_data(
                    temp_file_path, progress_callback=_report_progress
                )

                # Final progress update
                status_text.text("Finalizing results...")
                progress_bar.progress(95)
                st.session_state.processed_data = cleaned_df
                st.session_state.suggestions = suggestions
                st.session_state.file_details = file_details

                # Complete the progress bar
                progress_bar.progress(100)
                status_text.text("Processing complete!")
                st.success("Data cleaned successfully!")
            finally:
                # Clean up the temporary file
                if temp_file_path is not None:
                    try:
                        os.unlink(temp_file_path)
                    except FileNotFoundError:
                        pass

    # Show cleaned data
    cleaned_df = st.session_state.processed_data
    if cleaned_df is not None:
        st.subheader("Cleaned Data Preview")
        st.dataframe(cleaned_df.head(10), use_container_width=True)

        # Data statistics
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Rows", len(cleaned_df))
        with col2:
            st.metric("Total Columns", len(cleaned_df.columns))
        with col3:
            null_percentage = _null_percentage(cleaned_df)
            st.metric("Null Values (%)", f"{null_percentage}%")

        # Show data cleaning suggestions
        display_suggestions_report(st.session_state.suggestions)

        # Convert to downloadable format (same format as the upload)
        cleaned_file, download_name, mime_type = _serialize_for_download(
            cleaned_df, st.session_state.file_details["type"]
        )

        # Download button
        st.download_button(
            label="📁 Download Cleaned Data",
            data=cleaned_file,
            file_name=download_name,
            mime=mime_type,
        )