Spaces:

translators-will
/

Data-Cleaner

Sleeping

App Files Files Community

translators-will committed on Apr 9, 2025

Commit

523bc48

verified ·

1 Parent(s): 394c7a7

Create app.py

Browse files

Files changed (1) hide show

app.py +132 -0

app.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# app.py
+# app.py
+import streamlit as st
+import pandas as pd
+from io import BytesIO
+import os
+import tempfile
+from data_clean_simple import clean_data, display_suggestions_report
+# Set page config
+st.set_page_config(page_title="Data Cleaning App", page_icon=":sparkles:", layout="wide")
+# Use session state to avoid reloading data
+if 'processed_data' not in st.session_state:
+    st.session_state.processed_data = None
+    st.session_state.suggestions = None
+    st.session_state.file_details = None
+st.title("Smart Data Cleaner :sparkles:")
+st.markdown(
+    """
+    Upload a CSV, TSV, or Excel file, and we'll clean it for you using smart data cleaning techniques.
+    The system will automatically:
+    - Fix formatting issues
+    - Handle missing values
+    - Standardize data entries
+    - Provide practical suggestions for data improvements
+    Then, you can download the cleaned data for your analysis.
+    """
+)
+# File uploader
+uploaded_file = st.file_uploader("Choose a file", type=["csv", "tsv", "xlsx"])
+if uploaded_file:
+    # Check if we need to process the file (new file or button clicked)
+    file_details = (
+        'name': uploaded_file.name,
+        'size': uploaded_file.size,
+        'type': os.path.splitext(uploaded_file.name)[1].lower()
+    )
+    # Only process if it's a new file
+    if st.session_state.file_details != file_details:
+        st.session_state.file_details = file_details
+        file_bytes = uploaded_file.read()
+        file_type = file_details['type']
+    # Clean data and get suggestions
+    with st.spinner("Cleaning your data..."):
+        with tempfile.NamedTemporaryFile(delete=False, suffix=file_type) as temp_file:
+            temp_file.write(file_bytes)
+            temp_file_path = temp_file.name
+        try:
+            # Temporary file to clean_data function
+            cleaned_df, suggestions = clean_data(temp_file_path)
+            st.success("Data cleaned successfully!")
+        finally:
+            # Clean up the temporary file
+            if os.path.exists(temp_file_path):
+                os.unlink(temp_file_path)
+    # Show original data
+    st.subheader("Original Data")
+    try:
+        if file_type == ".tsv":
+            original_df = pd.read_csv(BytesIO(file_bytes), sep='\t')
+        elif file_type == ".xlsx":
+            original_df = pd.read_excel(BytesIO(file_bytes))
+        else:
+            original_df = pd.read_csv(BytesIO(file_bytes))
+        st.dataframe(original_df.head(10), use_container_width=True)
+    except Exception as e:
+        st.error(f"Error loading original data: {str(e)}")
+    # Show cleaned data
+    if st.session_state.processed_data is not None:
+        st.subheader("Cleaned Data Preview")
+        st.dataframe(st.session_state.processed_data.head(10), use_container_width=True)
+    # Data statistics
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.metric("Total Rows", len(st.session_state.processed_data))
+    with col2:
+        st.metric("Total Columns", len(st.session_state.processed_data.columns))
+    with col3:
+        null_percentage = round((st.session_state.processed_data.isnull().sum().sum() / (st.session_state.processed_data.shape[0] * st.session_state.processed_data.shape[1])) * 100, 2)
+        st.metric("Null Values (%)", f"{null_percentage}%")
+    # Show data cleaning suggestions
+    display_suggestions_report(st.session_state.suggestions)
+    # Prepare data
+    file_type = st.session_state.file_details['type']
+    cleaned_df = st.session_state.processed_data
+    # Convert to downloadable format
+    if file_type == ".csv":
+        cleaned_file = cleaned_df.to_csv(index=False).encode("utf-8")
+        download_name = "cleaned_data.csv"
+        mime_type = "text/csv"
+    elif file_type == ".tsv":
+        cleaned_file = cleaned_df.to_csv(index=False, sep="\t").encode("utf-8")
+        download_name = "cleaned_data.tsv"
+        mime_type = "text/tsv"
+    elif file_type == ".xlsx":
+        output = BytesIO()
+        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
+            cleaned_df.to_excel(writer, index=False)
+        cleaned_file = output.getvalue()
+        download_name = "cleaned_data.xlsx"
+        mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+    # Download button
+    st.download_button(
+        label="📁 Download Cleaned Data",
+        data=cleaned_file,
+        file_name=download_name,
+        mime=mime_type
+    )