Spaces:

translators-will
/

Data-Cleaner

Sleeping

App Files Files Community

Data-Cleaner / app.py

translators-will

Update app.py

9aae495 verified 10 months ago

raw

history blame contribute delete

5.41 kB

	# app.py

	import streamlit as st
	import pandas as pd
	from io import BytesIO
	import os
	import tempfile
	from data_clean_simple import clean_data, display_suggestions_report

	# Page-level configuration: browser tab title, icon and wide layout.
	st.set_page_config(
	    page_title="Data Cleaning App",
	    page_icon=":sparkles:",
	    layout="wide",
	)

	# Seed the session-state slots that later sections read. The guard only
	# checks 'processed_data'; when it is absent all three slots are reset to
	# None together, so a rerun that already holds a result is left untouched.
	if 'processed_data' not in st.session_state:
	    for slot in ('processed_data', 'suggestions', 'file_details'):
	        st.session_state[slot] = None

	# Page heading, followed by the introductory blurb rendered as Markdown.
	st.title("Smart Data Cleaner :sparkles:")

	# The literal is kept verbatim; it is only bound to a name before rendering.
	intro_markdown = (
	"""
	Upload a CSV, TSV, or Excel file, and we'll clean it for you using smart data cleaning techniques.
	The system will automatically:
	- Fix formatting issues
	- Handle missing values
	- Standardize data entries
	- Provide practical suggestions for data improvements

	Then, you can download the cleaned data for your analysis.
	"""
	)
	st.markdown(intro_markdown)

	# ---------------------------------------------------------------------------
	# Private helpers for the upload -> clean -> download flow below.
	# ---------------------------------------------------------------------------
	def _load_dataframe(file_bytes, file_type):
	    """Parse the raw upload into a DataFrame, chosen by file extension.

	    ``.tsv`` is read as tab-separated text, ``.xlsx`` through
	    ``pd.read_excel``; every other extension is read as comma-separated text.
	    """
	    buffer = BytesIO(file_bytes)
	    if file_type == ".tsv":
	        return pd.read_csv(buffer, sep='\t')
	    if file_type == ".xlsx":
	        return pd.read_excel(buffer)
	    return pd.read_csv(buffer)


	def _null_percentage(df):
	    """Return the percentage of null cells in *df*, rounded to 2 decimals.

	    A frame with no rows or no columns has no cells, so 0.0 is returned
	    instead of dividing by zero (which would surface as "nan%").
	    """
	    total_cells = df.shape[0] * df.shape[1]
	    if total_cells == 0:
	        return 0.0
	    return round((df.isnull().sum().sum() / total_cells) * 100, 2)


	def _build_download(df, file_type):
	    """Serialize *df* for the download button.

	    Returns a ``(payload_bytes, file_name, mime_type)`` tuple. ``.tsv`` and
	    ``.xlsx`` keep their own format; ``.csv`` and any unexpected extension
	    are written as CSV, mirroring the fallback used when reading the upload.
	    """
	    if file_type == ".tsv":
	        payload = df.to_csv(index=False, sep="\t").encode("utf-8")
	        # "text/tab-separated-values" is the registered media type for TSV.
	        return payload, "cleaned_data.tsv", "text/tab-separated-values"
	    if file_type == ".xlsx":
	        output = BytesIO()
	        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
	            df.to_excel(writer, index=False)
	        return (
	            output.getvalue(),
	            "cleaned_data.xlsx",
	            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
	        )
	    payload = df.to_csv(index=False).encode("utf-8")
	    return payload, "cleaned_data.csv", "text/csv"


	# File uploader
	uploaded_file = st.file_uploader("Choose a file", type=["csv", "tsv", "xlsx"])

	if uploaded_file:
	    # Identity of the current upload; compared against the details stored
	    # with the last successful result to decide whether to (re)process.
	    file_details = {
	        'name': uploaded_file.name,
	        'size': uploaded_file.size,
	        'type': os.path.splitext(uploaded_file.name)[1].lower()
	    }

	    # Only process if it's a new file
	    if st.session_state.file_details != file_details:
	        # Drop any earlier result first. If loading or cleaning fails below,
	        # a later rerun must neither show the previous file's output nor
	        # treat this upload as already processed.
	        st.session_state.processed_data = None
	        st.session_state.suggestions = None
	        st.session_state.file_details = None

	        # getvalue() returns the whole buffer regardless of the current
	        # stream position, so a retry after a failed run still sees the data.
	        file_bytes = uploaded_file.getvalue()
	        file_type = file_details['type']

	        # Show original data before processing
	        st.subheader("Original Data")
	        try:
	            original_df = _load_dataframe(file_bytes, file_type)

	            st.dataframe(original_df.head(10), use_container_width=True)

	            # Show data info before processing
	            total_rows = len(original_df)
	            total_cols = len(original_df.columns)
	            st.info(f"File contains {total_rows} rows and {total_cols} columns.")

	        except Exception as e:
	            # Deliberately broad: any parse/render failure is reported to the
	            # user and the script run ends here.
	            st.error(f"Error loading original data: {str(e)}")
	            st.stop()

	        # Create a progress bar
	        progress_bar = st.progress(0)
	        status_text = st.empty()

	        # Clean data and get suggestions with progress updates
	        with st.spinner("Cleaning your data..."):

	            # clean_data is given a file path, so the upload is spilled to a
	            # temporary file that is removed in the ``finally`` below.
	            with tempfile.NamedTemporaryFile(delete=False, suffix=file_type) as temp_file:
	                temp_file.write(file_bytes)
	                temp_file_path = temp_file.name

	            try:
	                # Update progress for file loading
	                status_text.text("Loading and preparing data...")
	                progress_bar.progress(10)

	                # Process data with progress updates.
	                # NOTE(review): the callback maps p onto the 10..90 range,
	                # which presumes p is a fraction in [0, 1] - confirm against
	                # data_clean_simple.clean_data.
	                status_text.text("Cleaning data...")
	                cleaned_df, suggestions = clean_data(
	                    temp_file_path,
	                    progress_callback=lambda p: progress_bar.progress(10 + int(p * 80)),
	                )

	                # Final progress update
	                status_text.text("Finalizing results...")
	                progress_bar.progress(95)

	                st.session_state.processed_data = cleaned_df
	                st.session_state.suggestions = suggestions
	                # Recorded only after success, together with the result it
	                # describes, so the two can never get out of sync.
	                st.session_state.file_details = file_details

	                # Complete the progress bar
	                progress_bar.progress(100)
	                status_text.text("Processing complete!")

	                st.success("Data cleaned successfully!")

	            finally:
	                # Clean up the temporary file
	                if os.path.exists(temp_file_path):
	                    os.unlink(temp_file_path)

	    # Show cleaned data
	    if st.session_state.processed_data is not None:
	        cleaned_df = st.session_state.processed_data

	        st.subheader("Cleaned Data Preview")
	        st.dataframe(cleaned_df.head(10), use_container_width=True)

	        # Data statistics
	        col1, col2, col3 = st.columns(3)
	        with col1:
	            st.metric("Total Rows", len(cleaned_df))
	        with col2:
	            st.metric("Total Columns", len(cleaned_df.columns))
	        with col3:
	            null_percentage = _null_percentage(cleaned_df)
	            st.metric("Null Values (%)", f"{null_percentage}%")

	        # Show data cleaning suggestions
	        display_suggestions_report(st.session_state.suggestions)

	        # Convert to downloadable format matching the uploaded file's type
	        file_type = st.session_state.file_details['type']
	        cleaned_file, download_name, mime_type = _build_download(cleaned_df, file_type)

	        # Download button
	        st.download_button(
	            label="📁 Download Cleaned Data",
	            data=cleaned_file,
	            file_name=download_name,
	            mime=mime_type
	        )