Spaces:
Sleeping
Sleeping
| # app.py | |
| import streamlit as st | |
| import pandas as pd | |
| from io import BytesIO | |
| import os | |
| import tempfile | |
| from data_clean_simple import clean_data, display_suggestions_report | |
# Configure the browser tab (title, icon) and use the full-width layout.
st.set_page_config(
    page_title="Data Cleaning App",
    page_icon=":sparkles:",
    layout="wide",
)

# Seed the session-state slots on the first run only; later reruns keep
# whatever was stored so the upload does not have to be processed again.
if "processed_data" not in st.session_state:
    for state_key in ("processed_data", "suggestions", "file_details"):
        st.session_state[state_key] = None
# Page heading.
st.title("Smart Data Cleaner :sparkles:")

# Introductory blurb rendered as Markdown below the heading.
intro_text = """
Upload a CSV, TSV, or Excel file, and we'll clean it for you using smart data cleaning techniques.
The system will automatically:
- Fix formatting issues
- Handle missing values
- Standardize data entries
- Provide practical suggestions for data improvements
Then, you can download the cleaned data for your analysis.
"""
st.markdown(intro_text)

# Upload widget, restricted to the tabular formats the app can read.
accepted_extensions = ["csv", "tsv", "xlsx"]
uploaded_file = st.file_uploader("Choose a file", type=accepted_extensions)
def _read_table(file_bytes, file_type):
    """Parse raw upload bytes into a DataFrame, chosen by file extension.

    ``.tsv`` is read as tab-separated text, ``.xlsx`` as an Excel workbook,
    and anything else as comma-separated text.
    """
    buffer = BytesIO(file_bytes)
    if file_type == ".tsv":
        return pd.read_csv(buffer, sep="\t")
    if file_type == ".xlsx":
        return pd.read_excel(buffer)
    return pd.read_csv(buffer)


def _null_percentage(df):
    """Return the percentage of null cells in *df*, rounded to 2 decimals.

    An empty frame (no rows or no columns) yields ``0.0`` instead of
    dividing by zero.
    """
    total_cells = df.shape[0] * df.shape[1]
    if total_cells == 0:
        return 0.0
    return round((df.isnull().sum().sum() / total_cells) * 100, 2)


def _build_download(df, file_type):
    """Serialize *df* for download in the same format as the upload.

    Returns a ``(payload_bytes, file_name, mime_type)`` tuple. Any extension
    other than ``.tsv`` / ``.xlsx`` is exported as CSV, mirroring how
    ``_read_table`` treats unknown extensions.
    """
    if file_type == ".tsv":
        payload = df.to_csv(index=False, sep="\t").encode("utf-8")
        # "text/tab-separated-values" is the registered media type for TSV.
        return payload, "cleaned_data.tsv", "text/tab-separated-values"
    if file_type == ".xlsx":
        output = BytesIO()
        with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
            df.to_excel(writer, index=False)
        return (
            output.getvalue(),
            "cleaned_data.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
    payload = df.to_csv(index=False).encode("utf-8")
    return payload, "cleaned_data.csv", "text/csv"


if uploaded_file:
    # Identity of the current upload; compared against the last file that was
    # processed successfully to decide whether cleaning must run again.
    file_details = {
        "name": uploaded_file.name,
        "size": uploaded_file.size,
        "type": os.path.splitext(uploaded_file.name)[1].lower(),
    }
    file_type = file_details["type"]

    # Only process if it's a new file (or the previous attempt did not finish).
    if st.session_state.file_details != file_details:
        # Drop the results of any earlier upload first, so a failure below can
        # never leave another file's cleaned data on screen.
        st.session_state.processed_data = None
        st.session_state.suggestions = None
        st.session_state.file_details = None

        # getvalue() returns the full content regardless of the current
        # stream position, so it is safe on every rerun.
        file_bytes = uploaded_file.getvalue()

        # Show original data before processing.
        st.subheader("Original Data")
        try:
            original_df = _read_table(file_bytes, file_type)
            st.dataframe(original_df.head(10), use_container_width=True)
            total_rows = len(original_df)
            total_cols = len(original_df.columns)
            st.info(f"File contains {total_rows} rows and {total_cols} columns.")
        except Exception as e:
            # UI boundary: report the problem to the user and end this run.
            st.error(f"Error loading original data: {str(e)}")
            st.stop()

        progress_bar = st.progress(0)
        status_text = st.empty()

        def _report_progress(fraction):
            # Assumes clean_data reports a 0..1 fraction — TODO confirm.
            # Clamp so an out-of-range value cannot make st.progress raise.
            percent = 10 + int(fraction * 80)
            progress_bar.progress(max(0, min(100, percent)))

        with st.spinner("Cleaning your data..."):
            # clean_data takes a file path, so the upload is written to a
            # temporary file that is closed before the cleaner is called.
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_type)
            temp_file_path = temp_file.name
            try:
                with temp_file:
                    temp_file.write(file_bytes)

                status_text.text("Loading and preparing data...")
                progress_bar.progress(10)

                status_text.text("Cleaning data...")
                cleaned_df, suggestions = clean_data(
                    temp_file_path, progress_callback=_report_progress
                )

                status_text.text("Finalizing results...")
                progress_bar.progress(95)

                # Commit results and the file identity together, only after
                # cleaning succeeded, so a failed run is retried on rerun.
                st.session_state.processed_data = cleaned_df
                st.session_state.suggestions = suggestions
                st.session_state.file_details = file_details

                progress_bar.progress(100)
                status_text.text("Processing complete!")
                st.success("Data cleaned successfully!")
            finally:
                # Clean up the temporary file even if writing or cleaning failed.
                try:
                    os.unlink(temp_file_path)
                except FileNotFoundError:
                    pass

    # Show cleaned data (freshly computed above or cached from an earlier run).
    cleaned_df = st.session_state.processed_data
    if cleaned_df is not None:
        st.subheader("Cleaned Data Preview")
        st.dataframe(cleaned_df.head(10), use_container_width=True)

        # Data statistics.
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Rows", len(cleaned_df))
        with col2:
            st.metric("Total Columns", len(cleaned_df.columns))
        with col3:
            null_percentage = _null_percentage(cleaned_df)
            st.metric("Null Values (%)", f"{null_percentage}%")

        # Show data cleaning suggestions.
        display_suggestions_report(st.session_state.suggestions)

        # Offer the cleaned data in the same format as the upload.
        cleaned_file, download_name, mime_type = _build_download(cleaned_df, file_type)
        st.download_button(
            label="📁 Download Cleaned Data",
            data=cleaned_file,
            file_name=download_name,
            mime=mime_type,
        )