File size: 5,414 Bytes
523bc48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05d617b
523bc48
 
 
05d617b
523bc48
 
 
 
 
 
 
9aae495
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523bc48
 
 
 
 
 
 
9aae495
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523bc48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# app.py

import streamlit as st
import pandas as pd
from io import BytesIO
import os
import tempfile
from data_clean_simple import clean_data, display_suggestions_report

# Page-level configuration (title, icon, wide layout)
st.set_page_config(
    page_title="Data Cleaning App",
    page_icon=":sparkles:",
    layout="wide",
)

# Seed the session-state slots once so that results can be kept across reruns
if 'processed_data' not in st.session_state:
    st.session_state['processed_data'] = None
    st.session_state['suggestions'] = None
    st.session_state['file_details'] = None

# Introductory text rendered under the page title
intro_markdown = """
    Upload a CSV, TSV, or Excel file, and we'll clean it for you using smart data cleaning techniques.
    The system will automatically:
    - Fix formatting issues
    - Handle missing values
    - Standardize data entries
    - Provide practical suggestions for data improvements

    Then, you can download the cleaned data for your analysis.
    """

st.title("Smart Data Cleaner :sparkles:")
st.markdown(intro_markdown)

# --- Helpers for the upload -> clean -> download flow ---

def _load_dataframe(file_bytes, file_type):
    """Parse raw upload bytes into a DataFrame based on the file extension.

    ``.tsv`` is read as tab-separated text and ``.xlsx`` via
    ``pd.read_excel``; any other extension is read as comma-separated text.
    """
    buffer = BytesIO(file_bytes)
    if file_type == ".tsv":
        return pd.read_csv(buffer, sep='\t')
    if file_type == ".xlsx":
        return pd.read_excel(buffer)
    return pd.read_csv(buffer)


def _null_percentage(df):
    """Return the percentage of null cells in *df*, rounded to 2 decimals.

    A frame with no cells (zero rows or zero columns) yields 0.0 rather than
    dividing by zero.
    """
    if df.size == 0:
        return 0.0
    return round(float(df.isnull().sum().sum()) / df.size * 100, 2)


def _export_for_download(df, file_type):
    """Serialize *df* in the same format as the uploaded file.

    Returns a ``(payload_bytes, download_name, mime_type)`` tuple, or ``None``
    when *file_type* is not one of ``.csv``, ``.tsv`` or ``.xlsx``.
    """
    if file_type == ".csv":
        payload = df.to_csv(index=False).encode("utf-8")
        return payload, "cleaned_data.csv", "text/csv"
    if file_type == ".tsv":
        payload = df.to_csv(index=False, sep="\t").encode("utf-8")
        # "text/tab-separated-values" is the registered media type for TSV
        return payload, "cleaned_data.tsv", "text/tab-separated-values"
    if file_type == ".xlsx":
        output = BytesIO()
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False)
        return (
            output.getvalue(),
            "cleaned_data.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
    return None


# File uploader
uploaded_file = st.file_uploader("Choose a file", type=["csv", "tsv", "xlsx"])

if uploaded_file:
    file_details = {
        'name': uploaded_file.name,
        'size': uploaded_file.size,
        'type': os.path.splitext(uploaded_file.name)[1].lower(),
    }
    file_type = file_details['type']

    # Streamlit re-executes this script on every interaction (including a
    # click on the download button), so the bytes and extension must be
    # available on every run, not only the first time a file is seen.
    # getvalue() returns the whole buffer regardless of the read position.
    file_bytes = uploaded_file.getvalue()

    # Only a file that has not been cleaned successfully yet is processed
    is_new_file = st.session_state.get('file_details') != file_details

    # Show original data before processing
    st.subheader("Original Data")
    try:
        original_df = _load_dataframe(file_bytes, file_type)
        st.dataframe(original_df.head(10), use_container_width=True)

        # Show data info before processing
        total_rows = len(original_df)
        total_cols = len(original_df.columns)
        st.info(f"File contains {total_rows} rows and {total_cols} columns.")
    except Exception as e:
        st.error(f"Error loading original data: {str(e)}")
        st.stop()

    if is_new_file:
        # Drop results of any previous upload so a failed run cannot leave
        # stale output associated with the new file.
        st.session_state.processed_data = None
        st.session_state.suggestions = None

        # Create a progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()

        def _report_progress(fraction):
            """Map the cleaner's progress onto the 10-90 band of the bar."""
            # NOTE(review): assumes clean_data reports a 0..1 fraction; the
            # clamp keeps the bar valid if it reports anything else.
            progress_bar.progress(min(90, max(10, 10 + int(fraction * 80))))

        # Clean data and get suggestions with progress updates
        with st.spinner("Cleaning your data..."):
            # clean_data takes a path, so the upload is spooled to a temp file
            with tempfile.NamedTemporaryFile(delete=False, suffix=file_type) as temp_file:
                temp_file.write(file_bytes)
                temp_file_path = temp_file.name

            try:
                status_text.text("Loading and preparing data...")
                progress_bar.progress(10)

                status_text.text("Cleaning data...")
                cleaned_df, suggestions = clean_data(
                    temp_file_path, progress_callback=_report_progress
                )

                status_text.text("Finalizing results...")
                progress_bar.progress(95)
            except Exception as e:
                st.error(f"Error cleaning data: {str(e)}")
                st.stop()
            finally:
                # Clean up the temporary file
                if os.path.exists(temp_file_path):
                    os.unlink(temp_file_path)

        # Record the file as processed only after cleaning succeeded, so a
        # failed attempt is retried on the next run.
        st.session_state.processed_data = cleaned_df
        st.session_state.suggestions = suggestions
        st.session_state.file_details = file_details

        # Complete the progress bar
        progress_bar.progress(100)
        status_text.text("Processing complete!")
        st.success("Data cleaned successfully!")

    cleaned_df = st.session_state.get('processed_data')
    if cleaned_df is not None:
        # Show cleaned data
        st.subheader("Cleaned Data Preview")
        st.dataframe(cleaned_df.head(10), use_container_width=True)

        # Data statistics
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Rows", len(cleaned_df))
        with col2:
            st.metric("Total Columns", len(cleaned_df.columns))
        with col3:
            st.metric("Null Values (%)", f"{_null_percentage(cleaned_df)}%")

        # Show data cleaning suggestions
        display_suggestions_report(st.session_state.get('suggestions'))

        # Convert to downloadable format
        export = _export_for_download(cleaned_df, file_type)
        if export is None:
            st.warning(f"Download is not available for '{file_type}' files.")
        else:
            cleaned_file, download_name, mime_type = export

            # Download button
            st.download_button(
                label="📁 Download Cleaned Data",
                data=cleaned_file,
                file_name=download_name,
                mime=mime_type
            )