enacimie committed on
Commit
67cadf7
·
verified ·
1 Parent(s): 3877afb

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +264 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,266 @@
1
# Previous version of the app (removed by this commit): an interactive
# Altair spiral demo driven by two sliders.
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st


"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

# Slider-driven spiral parameters: how many points to draw and how many
# full turns the spiral winds.
num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

# Parameter t evenly spaced in [0, 1]; radius grows linearly with t,
# so the curve is an Archimedean spiral.
indices = np.linspace(0, 1, num_points)
theta = 2 * np.pi * num_turns * indices
radius = indices

# Polar -> Cartesian coordinates.
x = radius * np.cos(theta)
y = radius * np.sin(theta)

df = pd.DataFrame({
    "x": x,
    "y": y,
    "idx": indices,
    "rand": np.random.randn(num_points),
})

# Scatter plot: color encodes progress along the curve, point size is
# taken from the random column.
st.altair_chart(alt.Chart(df, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    ))
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.impute import SimpleImputer, KNNImputer
5
+ from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
6
+ from sklearn.compose import ColumnTransformer
7
+ import plotly.express as px
8
+ import io
9
+
10
# App metadata, shown in the header and the footer.
AUTHOR = "Eduardo Nacimiento García"
EMAIL = "enacimie@ull.edu.es"
LICENSE = "Apache 2.0"

# Page configuration — must be the first Streamlit call in the script.
_PAGE_SETTINGS = dict(
    page_title="SimpleClean",
    page_icon="🧹",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.set_page_config(**_PAGE_SETTINGS)

# Header: title, author line, and a short description of the tool.
st.title("🧹 SimpleClean")
st.markdown(f"**Author:** {AUTHOR} | **Email:** {EMAIL} | **License:** {LICENSE}")
st.write("""
Upload a CSV or use the demo dataset to interactively clean your data: handle missing values, encode categories, scale features, and more.
""")
29
+
30
# === GENERATE DEMO DATASET ===
@st.cache_data
def create_demo_data():
    """Build a small synthetic customer dataset with deliberate gaps.

    Returns a 300-row DataFrame mixing numeric ("Age", "Income",
    "Satisfaction"), categorical ("City", "Gender") and binary
    ("Has_Children") columns, with missing values injected so the
    imputation tools have something to work on. Seeded for
    reproducibility across reruns.
    """
    np.random.seed(42)
    n = 300
    data = {
        "Age": np.random.normal(35, 12, n).astype(int),
        "Income": np.random.normal(45000, 15000, n),
        "City": np.random.choice(["Madrid", "Barcelona", "Valencia", "Seville", None], n, p=[0.25, 0.25, 0.25, 0.2, 0.05]),
        "Gender": np.random.choice(["M", "F", None], n, p=[0.45, 0.45, 0.10]),
        "Has_Children": np.random.choice([0, 1, None], n, p=[0.4, 0.4, 0.2]),
        "Satisfaction": np.random.randint(1, 11, n)
    }
    df = pd.DataFrame(data)
    # Introduce some nulls. replace=False guarantees the stated number of
    # distinct missing cells; sampling with replacement (the previous
    # behavior) could pick duplicate indices and silently inject fewer NaNs.
    df.loc[np.random.choice(df.index, 15, replace=False), "Age"] = np.nan
    df.loc[np.random.choice(df.index, 20, replace=False), "Income"] = np.nan
    df.loc[np.random.choice(df.index, 10, replace=False), "Satisfaction"] = np.nan
    return df
49
+
50
# === LOAD DATA ===
# Two entry points: a demo-dataset button and a CSV upload. Either one
# stores an untouched original plus a working copy in session_state.
if st.button("🧪 Load Demo Dataset"):
    demo = create_demo_data()
    st.session_state['df_original'] = demo
    st.session_state['df_clean'] = demo.copy()
    st.success("✅ Demo dataset loaded!")

uploaded_file = st.file_uploader("📂 Upload your CSV file", type=["csv"])

if uploaded_file:
    loaded = pd.read_csv(uploaded_file)
    st.session_state['df_original'] = loaded
    st.session_state['df_clean'] = loaded.copy()
    st.success("✅ File uploaded successfully.")

# Nothing loaded yet: explain how to start and halt this script run.
if 'df_clean' not in st.session_state:
    st.info("👆 Upload a CSV or click 'Load Demo Dataset' to begin.")
    st.stop()

df_original = st.session_state['df_original']
df_clean = st.session_state['df_clean']
70
+
71
# Side-by-side previews of the original upload and the working copy.
st.subheader("🔍 Data Preview")
for preview_label, preview_frame in (
    ("Original Data (first 10 rows)", df_original),
    ("Current Cleaned Data (first 10 rows)", df_clean),
):
    with st.expander(preview_label):
        st.dataframe(preview_frame.head(10))

# === DATA QUALITY REPORT ===
st.header("📊 Data Quality Report")

# Headline metrics, one per column slot.
headline_metrics = (
    ("Rows", df_clean.shape[0]),
    ("Columns", df_clean.shape[1]),
    ("Missing Cells", df_clean.isnull().sum().sum()),
    ("Duplicate Rows", df_clean.duplicated().sum()),
)
for slot, (metric_label, metric_value) in zip(st.columns(4), headline_metrics):
    slot.metric(metric_label, metric_value)

# Per-column missing-value counts as a bar chart.
st.subheader("🕳️ Missing Values by Column")
missing_data = df_clean.isnull().sum()
fig_missing = px.bar(
    missing_data,
    title="Missing Values per Column",
    labels={'value': 'Count', 'index': 'Column'},
    color=missing_data,
)
st.plotly_chart(fig_missing, use_container_width=True)

# Column/dtype table.
st.subheader("🔤 Column Data Types")
dtypes_df = pd.DataFrame(df_clean.dtypes, columns=['Data Type']).reset_index()
dtypes_df.columns = ['Column', 'Data Type']
st.dataframe(dtypes_df)
104
+
105
# === CLEANING OPTIONS ===
st.header("🧼 Cleaning Actions")

# One tab per cleaning action; the later sections attach to these handles.
tab1, tab2, tab3, tab4 = st.tabs([
    "🧹 Remove Duplicates",
    "🩹 Handle Missing Values",
    "🔠 Encode Categorical Variables",
    "📏 Scale Numeric Variables"
])

# Tab 1: drop exact duplicate rows from the working copy.
with tab1:
    st.subheader("Remove Duplicate Rows")
    if st.button("Remove All Duplicates"):
        rows_before = len(df_clean)
        df_clean = df_clean.drop_duplicates().reset_index(drop=True)
        st.session_state['df_clean'] = df_clean
        removed = rows_before - len(df_clean)
        st.success(f"✅ Removed {removed} duplicate rows.")
124
+
125
# Tab 2: Handle Missing Values
# Widget flow: pick a column with NaNs, pick a dtype-appropriate strategy
# (and a constant when relevant), then apply on button press. The widgets
# below are order-sensitive: the button label embeds col_to_impute, so it
# can only exist inside the else-branch where that selectbox ran.
with tab2:
    st.subheader("Impute Missing Values")

    # Select column with missing values
    cols_with_missing = df_clean.columns[df_clean.isnull().any()].tolist()
    if not cols_with_missing:
        st.success("✅ No missing values to impute.")
    else:
        col_to_impute = st.selectbox("Select column to impute:", cols_with_missing)

        # Detect column type; object/category columns get the categorical
        # strategies, everything else is treated as numeric.
        col_dtype = df_clean[col_to_impute].dtype
        if col_dtype in ['object', 'category']:
            strategy = st.selectbox(
                f"Imputation strategy for {col_to_impute} (categorical):",
                ["Most Frequent", "Constant"]
            )
            if strategy == "Constant":
                fill_value = st.text_input("Constant value:", value="Unknown")
            else:
                fill_value = None
        else:
            strategy = st.selectbox(
                f"Imputation strategy for {col_to_impute} (numeric):",
                ["Mean", "Median", "Most Frequent", "Constant", "KNN Imputer"]
            )
            if strategy == "Constant":
                fill_value = st.number_input("Constant value:", value=0.0)
            else:
                fill_value = None

        if st.button(f"Apply Imputation to '{col_to_impute}'"):
            try:
                if strategy == "Mean":
                    imputer = SimpleImputer(strategy='mean')
                elif strategy == "Median":
                    imputer = SimpleImputer(strategy='median')
                elif strategy == "Most Frequent":
                    imputer = SimpleImputer(strategy='most_frequent')
                elif strategy == "Constant":
                    imputer = SimpleImputer(strategy='constant', fill_value=fill_value)
                elif strategy == "KNN Imputer" and col_dtype in [np.number, 'float64', 'int64']:
                    # Only for numeric
                    # NOTE(review): `col_dtype == np.number` is generally False
                    # for concrete dtypes; this guard effectively matches via
                    # the 'float64'/'int64' string comparisons — confirm it
                    # covers all numeric dtypes you expect (e.g. int32).
                    imputer = KNNImputer(n_neighbors=5)
                else:
                    st.error("Invalid strategy for this column type.")
                    st.stop()

                # Apply imputation
                if strategy == "KNN Imputer":
                    # Only apply to numeric columns for KNN — note this fits
                    # and rewrites EVERY numeric column, not just the one
                    # selected above.
                    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
                    df_clean[numeric_cols] = imputer.fit_transform(df_clean[numeric_cols])
                else:
                    # SimpleImputer expects 2-D input; ravel() flattens the
                    # single-column result back to a Series-shaped array.
                    df_clean[col_to_impute] = imputer.fit_transform(df_clean[[col_to_impute]]).ravel()

                st.session_state['df_clean'] = df_clean
                st.success(f"✅ Missing values in '{col_to_impute}' imputed using '{strategy}'.")
            except Exception as e:
                st.error(f"❌ Error during imputation: {e}")
186
+
187
# Tab 3: Encode Categorical Variables
with tab3:
    st.subheader("Encode Categorical Variables")

    categorical_cols = df_clean.select_dtypes(include=['object', 'category']).columns.tolist()
    if not categorical_cols:
        st.info("ℹ️ No categorical columns to encode.")
    else:
        col_to_encode = st.selectbox("Select categorical column to encode:", categorical_cols)
        encoding_method = st.radio(
            "Encoding method:",
            ["Label Encoding", "One-Hot Encoding"]
        )

        if st.button(f"Apply {encoding_method} to '{col_to_encode}'"):
            try:
                if encoding_method == "Label Encoding":
                    # Map each distinct value (stringified, so NaN becomes
                    # the literal 'nan' category) to an integer code.
                    df_clean[col_to_encode] = LabelEncoder().fit_transform(df_clean[col_to_encode].astype(str))
                    st.session_state['df_clean'] = df_clean
                    st.success(f"✅ '{col_to_encode}' label encoded.")
                else:  # One-Hot Encoding
                    # Replace the column with one indicator column per category.
                    df_encoded = pd.get_dummies(df_clean[col_to_encode], prefix=col_to_encode)
                    df_clean = pd.concat([df_clean.drop(columns=[col_to_encode]), df_encoded], axis=1)
                    st.session_state['df_clean'] = df_clean
                    st.success(f"✅ '{col_to_encode}' one-hot encoded. {df_encoded.shape[1]} new columns added.")
            except Exception as e:
                st.error(f"❌ Error during encoding: {e}")
216
+
217
# Tab 4: Scale Numeric Variables
with tab4:
    st.subheader("Scale Numeric Variables")

    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        st.info("ℹ️ No numeric columns to scale.")
    else:
        # Preselect the first two numeric columns when at least two exist.
        preselected = numeric_cols[:2] if len(numeric_cols) >= 2 else numeric_cols
        cols_to_scale = st.multiselect("Select numeric columns to scale:", numeric_cols, default=preselected)
        scaling_method = st.radio("Scaling method:", ["StandardScaler (Z-score)", "MinMaxScaler (0-1)"])

        if st.button("Apply Scaling"):
            try:
                # Z-score standardization or 0-1 min-max rescaling.
                if scaling_method == "StandardScaler (Z-score)":
                    scaler = StandardScaler()
                else:
                    scaler = MinMaxScaler()
                df_clean[cols_to_scale] = scaler.fit_transform(df_clean[cols_to_scale])
                st.session_state['df_clean'] = df_clean
                st.success(f"✅ Columns {cols_to_scale} scaled using {scaling_method}.")
            except Exception as e:
                st.error(f"❌ Error during scaling: {e}")
240
+
241
# === DOWNLOAD CLEANED DATA ===
st.header("📥 Download Cleaned Data")

df_clean_final = st.session_state['df_clean']

# Final look at the data before exporting it.
with st.expander("Final Cleaned Data Preview"):
    st.dataframe(df_clean_final.head(10))

# Offer the working copy as a UTF-8 encoded CSV file.
csv_payload = df_clean_final.to_csv(index=False).encode('utf-8')
st.download_button(
    label="💾 Download Cleaned CSV",
    data=csv_payload,
    file_name="cleaned_data.csv",
    mime="text/csv",
)

# Discard every cleaning action and restore the untouched original.
if st.button("🔄 Reset to Original Data"):
    st.session_state['df_clean'] = st.session_state['df_original'].copy()
    st.success("✅ Data reset to original state.")

# Footer
st.markdown("---")
st.caption(f"© {AUTHOR} | License {LICENSE} | Contact: {EMAIL}")