# app.py
"""Streamlit front end for the data cleaner.

The user uploads a CSV, TSV, or Excel file; the script previews it, runs
``clean_data`` from ``data_clean_simple`` on it, shows the cleaned result with
a few summary metrics and the suggestions report, and offers the cleaned data
for download in the same format as the upload.
"""

import os
import tempfile
from io import BytesIO

import pandas as pd
import streamlit as st

from data_clean_simple import clean_data, display_suggestions_report

# Number of rows shown in the "original" and "cleaned" previews.
PREVIEW_ROWS = 10


def _read_dataframe(file_bytes, file_type):
    """Parse raw upload bytes into a DataFrame, choosing the reader by extension.

    Anything that is not ``.tsv`` or ``.xlsx`` is read as comma-separated text.
    """
    buffer = BytesIO(file_bytes)
    if file_type == ".tsv":
        return pd.read_csv(buffer, sep='\t')
    if file_type == ".xlsx":
        return pd.read_excel(buffer)
    return pd.read_csv(buffer)


def _null_percentage(df):
    """Return the percentage of null cells in *df*, rounded to 2 decimals.

    An empty frame (no rows or no columns) yields 0.0 instead of dividing by
    zero and displaying "nan%".
    """
    total_cells = df.shape[0] * df.shape[1]
    if total_cells == 0:
        return 0.0
    return round(float(df.isnull().sum().sum()) / total_cells * 100, 2)


def _scaled_progress(fraction):
    """Map a cleaning progress fraction onto the 10-90 slice of the progress bar.

    NOTE(review): assumes ``clean_data`` reports progress as a fraction in
    [0, 1] -- confirm against data_clean_simple. The value is clamped so an
    out-of-range report cannot push ``st.progress`` outside 0-100.
    """
    bounded = min(max(fraction, 0.0), 1.0)
    return 10 + int(bounded * 80)


def _export_dataframe(df, file_type):
    """Serialize *df* for download.

    Returns a ``(payload, file_name, mime_type)`` tuple. Unknown extensions
    fall back to CSV so the download button always has data to serve.
    """
    if file_type == ".tsv":
        payload = df.to_csv(index=False, sep="\t").encode("utf-8")
        # "text/tab-separated-values" is the registered media type for TSV.
        return payload, "cleaned_data.tsv", "text/tab-separated-values"
    if file_type == ".xlsx":
        output = BytesIO()
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False)
        return (
            output.getvalue(),
            "cleaned_data.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
    return df.to_csv(index=False).encode("utf-8"), "cleaned_data.csv", "text/csv"


# Set page config
st.set_page_config(page_title="Data Cleaning App", page_icon=":sparkles:", layout="wide")

# Use session state to avoid reloading data
if 'processed_data' not in st.session_state:
    st.session_state.processed_data = None
    st.session_state.suggestions = None
    st.session_state.file_details = None

st.title("Smart Data Cleaner :sparkles:")
st.markdown(
    """
Upload a CSV, TSV, or Excel file, and we'll clean it for you using smart data cleaning techniques.
The system will automatically:
- Fix formatting issues
- Handle missing values
- Standardize data entries
- Provide practical suggestions for data improvements

Then, you can download the cleaned data for your analysis.
"""
)

# File uploader
uploaded_file = st.file_uploader("Choose a file", type=["csv", "tsv", "xlsx"])

if uploaded_file:
    # Identify the upload so reruns (e.g. clicking the download button) do not
    # re-clean a file that has already been processed.
    file_details = {
        'name': uploaded_file.name,
        'size': uploaded_file.size,
        'type': os.path.splitext(uploaded_file.name)[1].lower()
    }

    # Only process if it's a new file
    if st.session_state.file_details != file_details:
        # Drop results from any previous file first. file_details is recorded
        # only after cleaning succeeds, so a failed run never leaves stale data
        # paired with the new file and is retried on the next rerun.
        st.session_state.file_details = None
        st.session_state.processed_data = None
        st.session_state.suggestions = None

        # getvalue() returns the full contents regardless of stream position.
        file_bytes = uploaded_file.getvalue()
        file_type = file_details['type']

        # Show original data before processing
        st.subheader("Original Data")
        try:
            original_df = _read_dataframe(file_bytes, file_type)
            st.dataframe(original_df.head(PREVIEW_ROWS), use_container_width=True)

            # Show data info before processing
            total_rows = len(original_df)
            total_cols = len(original_df.columns)
            st.info(f"File contains {total_rows} rows and {total_cols} columns.")
        except Exception as e:
            st.error(f"Error loading original data: {str(e)}")
            st.stop()

        # Create a progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()

        # Clean data and get suggestions with progress updates
        with st.spinner("Cleaning your data..."):
            # clean_data takes a path, so spill the upload to a temp file that
            # keeps the original extension. The file is closed before cleaning
            # starts and removed in the finally block below.
            with tempfile.NamedTemporaryFile(delete=False, suffix=file_type) as temp_file:
                temp_file.write(file_bytes)
                temp_file_path = temp_file.name

            try:
                # Update progress for file loading
                status_text.text("Loading and preparing data...")
                progress_bar.progress(10)

                # Process data with progress updates
                status_text.text("Cleaning data...")
                cleaned_df, suggestions = clean_data(
                    temp_file_path,
                    progress_callback=lambda p: progress_bar.progress(_scaled_progress(p)),
                )

                # Final progress update
                status_text.text("Finalizing results...")
                progress_bar.progress(95)

                st.session_state.processed_data = cleaned_df
                st.session_state.suggestions = suggestions
                st.session_state.file_details = file_details

                # Complete the progress bar
                progress_bar.progress(100)
                status_text.text("Processing complete!")
                st.success("Data cleaned successfully!")
            except Exception as e:
                # Same handling as a load failure: report and halt this run.
                status_text.text("Processing failed.")
                st.error(f"Error cleaning data: {str(e)}")
                st.stop()
            finally:
                # Clean up the temporary file
                if os.path.exists(temp_file_path):
                    os.unlink(temp_file_path)

    # Show cleaned data
    if st.session_state.processed_data is not None:
        cleaned_df = st.session_state.processed_data

        st.subheader("Cleaned Data Preview")
        st.dataframe(cleaned_df.head(PREVIEW_ROWS), use_container_width=True)

        # Data statistics
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Rows", len(cleaned_df))
        with col2:
            st.metric("Total Columns", len(cleaned_df.columns))
        with col3:
            null_percentage = _null_percentage(cleaned_df)
            st.metric("Null Values (%)", f"{null_percentage}%")

        # Show data cleaning suggestions
        display_suggestions_report(st.session_state.suggestions)

        # Convert to a downloadable payload in the same format as the upload
        file_type = st.session_state.file_details['type']
        cleaned_file, download_name, mime_type = _export_dataframe(cleaned_df, file_type)

        # Download button
        st.download_button(
            label="📁 Download Cleaned Data",
            data=cleaned_file,
            file_name=download_name,
            mime=mime_type
        )