translators-will committed on
Commit
523bc48
·
verified ·
1 Parent(s): 394c7a7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -0
app.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# app.py
"""Streamlit front end for the data cleaner.

The user uploads a CSV, TSV, or Excel file. The file is handed to
``clean_data`` (from ``data_clean_simple``) through a temporary file, the
cleaned frame and the suggestions are cached in ``st.session_state`` so that
Streamlit reruns (e.g. clicking the download button) do not re-clean the
same upload, and the result is offered back for download in the same format
it was uploaded in.
"""

import os
import tempfile
from io import BytesIO

import pandas as pd
import streamlit as st

from data_clean_simple import clean_data, display_suggestions_report


def _read_original(file_bytes, file_type):
    """Parse the raw upload into a DataFrame for the "Original Data" preview."""
    buffer = BytesIO(file_bytes)
    if file_type == ".tsv":
        return pd.read_csv(buffer, sep="\t")
    if file_type == ".xlsx":
        return pd.read_excel(buffer)
    return pd.read_csv(buffer)


def _export(cleaned_df, file_type):
    """Serialize *cleaned_df* in the upload's format.

    Returns a ``(payload_bytes, download_name, mime_type)`` tuple. Any
    extension other than ``.tsv`` / ``.xlsx`` falls back to CSV so the
    download button always has data to serve.
    """
    if file_type == ".tsv":
        payload = cleaned_df.to_csv(index=False, sep="\t").encode("utf-8")
        # "text/tab-separated-values" is the IANA-registered type for TSV.
        return payload, "cleaned_data.tsv", "text/tab-separated-values"
    if file_type == ".xlsx":
        output = BytesIO()
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            cleaned_df.to_excel(writer, index=False)
        return (
            output.getvalue(),
            "cleaned_data.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
    payload = cleaned_df.to_csv(index=False).encode("utf-8")
    return payload, "cleaned_data.csv", "text/csv"


# Set page config
st.set_page_config(page_title="Data Cleaning App", page_icon=":sparkles:", layout="wide")

# Use session state to avoid reloading data
if 'processed_data' not in st.session_state:
    st.session_state.processed_data = None
    st.session_state.suggestions = None
    st.session_state.file_details = None

st.title("Smart Data Cleaner :sparkles:")
st.markdown(
    """
Upload a CSV, TSV, or Excel file, and we'll clean it for you using smart data cleaning techniques.
The system will automatically:
- Fix formatting issues
- Handle missing values
- Standardize data entries
- Provide practical suggestions for data improvements

Then, you can download the cleaned data for your analysis.
"""
)

# File uploader
uploaded_file = st.file_uploader("Choose a file", type=["csv", "tsv", "xlsx"])

if uploaded_file:
    # Identity of the current upload; compared against the cached identity to
    # decide whether cleaning has to run again.
    file_details = {
        'name': uploaded_file.name,
        'size': uploaded_file.size,
        'type': os.path.splitext(uploaded_file.name)[1].lower(),
    }
    file_type = file_details['type']
    # getvalue() returns the whole buffer regardless of the stream position,
    # so it is safe to call on every rerun (read() would not be).
    file_bytes = uploaded_file.getvalue()

    # Only process if it's a new file
    if st.session_state.file_details != file_details:
        # Clean data and get suggestions
        with st.spinner("Cleaning your data..."):
            # clean_data takes a path, so spill the upload to a temp file that
            # keeps the original extension.
            with tempfile.NamedTemporaryFile(delete=False, suffix=file_type) as temp_file:
                temp_file.write(file_bytes)
                temp_file_path = temp_file.name

            try:
                cleaned_df, suggestions = clean_data(temp_file_path)
            finally:
                # Clean up the temporary file
                if os.path.exists(temp_file_path):
                    os.unlink(temp_file_path)

        # Cache the results only after cleaning succeeded; recording the file
        # identity earlier would stop a failed run from ever being retried.
        st.session_state.processed_data = cleaned_df
        st.session_state.suggestions = suggestions
        st.session_state.file_details = file_details
        st.success("Data cleaned successfully!")

    # Show original data
    st.subheader("Original Data")
    try:
        original_df = _read_original(file_bytes, file_type)
        st.dataframe(original_df.head(10), use_container_width=True)
    except Exception as e:
        # UI boundary: report the parse failure and keep the page alive.
        st.error(f"Error loading original data: {str(e)}")

    # Show cleaned data
    if st.session_state.processed_data is not None:
        cleaned_df = st.session_state.processed_data

        st.subheader("Cleaned Data Preview")
        st.dataframe(cleaned_df.head(10), use_container_width=True)

        # Data statistics
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Rows", len(cleaned_df))
        with col2:
            st.metric("Total Columns", len(cleaned_df.columns))
        with col3:
            total_cells = cleaned_df.shape[0] * cleaned_df.shape[1]
            # An empty frame has no cells; report 0% instead of dividing by zero.
            if total_cells:
                null_count = cleaned_df.isnull().sum().sum()
                null_percentage = round((null_count / total_cells) * 100, 2)
            else:
                null_percentage = 0.0
            st.metric("Null Values (%)", f"{null_percentage}%")

        # Show data cleaning suggestions
        display_suggestions_report(st.session_state.suggestions)

        # Convert to downloadable format
        cleaned_file, download_name, mime_type = _export(
            cleaned_df, st.session_state.file_details['type']
        )

        # Download button
        st.download_button(
            label="📁 Download Cleaned Data",
            data=cleaned_file,
            file_name=download_name,
            mime=mime_type,
        )