1MR committed on
Commit
395ef11
·
verified ·
1 Parent(s): 587e047

Update Preprocessing2.py

Browse files
Files changed (1) hide show
  1. Preprocessing2.py +212 -217
Preprocessing2.py CHANGED
@@ -1,217 +1,212 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import numpy as np
4
- import io
5
- import matplotlib.pyplot as plt
6
- from sklearn.preprocessing import LabelEncoder
7
- import seaborn as sns
8
- import base64
9
-
10
-
11
- def handle_categorical_values():
12
- if "data" in st.session_state:
13
- data = st.session_state["data"]
14
-
15
- st.subheader("Handle Categorical Values")
16
-
17
- categorical_cols_features = list(
18
- data.select_dtypes(include="object").columns)
19
-
20
- # One-Hot Encoding for nominal categorical features
21
- one_hot_enc = st.multiselect(
22
- "Select nominal categorical columns", categorical_cols_features)
23
-
24
- # Apply one-hot encoding to selected columns
25
- if one_hot_enc:
26
- for column in one_hot_enc:
27
- if data[column].dtype == 'object': # Only apply to categorical/string columns
28
- data = pd.get_dummies(data, columns=[column])
29
- st.write("### Data after One-Hot Encoding:")
30
- st.write(data.head())
31
-
32
- # Label Encoding for ordinal categorical features
33
- label_encoder = LabelEncoder()
34
- label_enc = st.multiselect(
35
- "Select ordinal categorical columns", categorical_cols_features)
36
-
37
- # Apply label encoding to selected columns
38
- if label_enc:
39
- for column in label_enc:
40
- if data[column].dtype == 'object': # Only apply to categorical/string columns
41
- data[column] = label_encoder.fit_transform(data[column])
42
- st.write("### Data after Label Encoding:")
43
- st.write(data.head())
44
-
45
- else:
46
- st.warning("Please upload a dataset to handle categorical values.")
47
-
48
-
49
- def missing_values():
50
- st.title("Handle Missing Values")
51
-
52
- if "data" in st.session_state:
53
- data = st.session_state["data"].copy()
54
-
55
- action = st.selectbox(
56
- "Select Action", ["Drop", "Dropna", "Fill missing val"])
57
-
58
- column = st.selectbox("Select Column", data.columns)
59
-
60
- st.write("### Before:")
61
- st.dataframe(data)
62
-
63
- modified_data = data.copy()
64
-
65
- if action == "Drop":
66
- modified_data.drop(columns=[column], inplace=True)
67
- elif action == "Dropna":
68
- modified_data.dropna(subset=[column], inplace=True)
69
- elif action == "Fill missing val":
70
- fill_method = st.selectbox(
71
- "Select fill method", ["Mean", "Mode", "Median"])
72
-
73
- if fill_method == "Mean":
74
- fill_value = data[column].mean()
75
- elif fill_method == "Mode":
76
- fill_value = data[column].mode()[0]
77
- elif fill_method == "Median":
78
- fill_value = data[column].median()
79
-
80
- modified_data[column].fillna(fill_value, inplace=True)
81
-
82
- st.write("### After (Preview):")
83
- st.dataframe(modified_data)
84
-
85
- if st.button("OK"):
86
- st.session_state["data"] = modified_data
87
- st.success("Done! The action has been applied.")
88
- st.write("### After:")
89
- st.dataframe(modified_data)
90
-
91
- else:
92
- st.warning("Please upload a dataset first.")
93
-
94
-
95
- def handle_duplicates():
96
- st.title("Handle Duplicates")
97
-
98
- if "data" in st.session_state:
99
- data = st.session_state["data"].copy()
100
-
101
- action = st.selectbox(
102
- "Select Action", ["Drop Duplicates", "Drop Duplicates in Column", "Keep First", "Keep Last"])
103
-
104
- if action in ["Drop Duplicates in Column", "Keep First", "Keep Last"]:
105
- column = st.selectbox("Select Column", data.columns)
106
- else:
107
- column = None
108
-
109
- st.write("### Before:")
110
- st.dataframe(data)
111
-
112
- after_placeholder = st.empty()
113
-
114
- modified_data = data.copy()
115
-
116
- if action == "Drop Duplicates":
117
- modified_data.drop_duplicates(inplace=True)
118
- elif action == "Drop Duplicates in Column":
119
- modified_data.drop_duplicates(subset=[column], inplace=True)
120
- elif action == "Keep First":
121
- modified_data.drop_duplicates(
122
- subset=[column], keep="first", inplace=True)
123
- elif action == "Keep Last":
124
- modified_data.drop_duplicates(
125
- subset=[column], keep="last", inplace=True)
126
-
127
- st.write("### After (Preview):")
128
- st.dataframe(modified_data)
129
-
130
- if st.button("OK"):
131
- st.session_state["data"] = modified_data
132
- st.success("Done! The action has been applied.")
133
- st.write("### After:")
134
- st.dataframe(modified_data)
135
-
136
- else:
137
- st.warning("Please upload a dataset first.")
138
-
139
-
140
- def handle_outliers():
141
- st.title("Handle Outliers")
142
-
143
- if "data" in st.session_state:
144
- data = st.session_state["data"].copy()
145
-
146
- column = st.selectbox("Select Column", data.select_dtypes(
147
- include=[np.number]).columns)
148
-
149
- action = st.selectbox(
150
- "Select Action",
151
- ["Remove Outliers (IQR)", "Set Bounds Manually",
152
- "Replace Outliers"]
153
- )
154
-
155
- st.write("### Before:")
156
- st.dataframe(data)
157
-
158
- after_placeholder = st.empty()
159
-
160
- modified_data = data.copy()
161
-
162
- if action == "Remove Outliers (IQR)":
163
- Q1 = data[column].quantile(0.25)
164
- Q3 = data[column].quantile(0.75)
165
- IQR = Q3 - Q1
166
- lower_bound = Q1 - 1.5 * IQR
167
- upper_bound = Q3 + 1.5 * IQR
168
-
169
- # Remove outliers
170
- modified_data = modified_data[(
171
- modified_data[column] >= lower_bound) & (modified_data[column] <= upper_bound)]
172
-
173
- elif action == "Set Bounds Manually":
174
- # User inputs for bounds
175
- lower_bound = st.number_input(
176
- f"Set lower bound for {column}", value=float(data[column].min()))
177
- upper_bound = st.number_input(
178
- f"Set upper bound for {column}", value=float(data[column].max()))
179
-
180
- modified_data = modified_data[(
181
- modified_data[column] >= lower_bound) & (modified_data[column] <= upper_bound)]
182
-
183
- elif action == "Replace Outliers":
184
-
185
- Q1 = data[column].quantile(0.25)
186
- Q3 = data[column].quantile(0.75)
187
- IQR = Q3 - Q1
188
- lower_bound = Q1 - 1.5 * IQR
189
- upper_bound = Q3 + 1.5 * IQR
190
-
191
- replace_method = st.radio(
192
- "Select Replacement Method",
193
- ["Mean", "Median"]
194
- )
195
-
196
- if replace_method == "Mean":
197
- replacement_value = data[column].mean()
198
- else:
199
- replacement_value = data[column].median()
200
-
201
- # Replace outliers
202
- modified_data[column] = modified_data[column].apply(
203
- lambda x: replacement_value if x < lower_bound or x > upper_bound else x
204
- )
205
-
206
- # After Visualization
207
- st.write("### After (Preview):")
208
- st.dataframe(modified_data)
209
-
210
- if st.button("OK"):
211
- st.session_state["data"] = modified_data
212
- st.success("Done! The action has been applied.")
213
- st.write("### After:")
214
- st.dataframe(modified_data)
215
-
216
- else:
217
- st.warning("Please upload a dataset first.")
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import io
5
+ import matplotlib.pyplot as plt
6
+ from sklearn.preprocessing import LabelEncoder
7
+ import seaborn as sns
8
+ import base64
9
+
10
+
11
def handle_categorical_values():
    """Render the categorical-encoding controls and apply the chosen encodings.

    Reads the working DataFrame from ``st.session_state["data"]``. Columns
    picked in the first multiselect are one-hot encoded with
    ``pd.get_dummies``; columns picked in the second multiselect are
    integer-encoded with ``LabelEncoder``. After each step the result is
    written back to ``st.session_state["data"]`` and its head is displayed.
    When no dataset is present in the session state a warning is shown
    instead.
    """
    if "data" not in st.session_state:
        st.warning("Please upload a dataset to handle categorical values.")
        return

    # Work on a copy so the stored frame is only ever replaced through the
    # explicit session-state assignments below, never mutated in place.
    data = st.session_state["data"].copy()

    st.subheader("Handle Categorical Values")

    categorical_cols_features = list(
        data.select_dtypes(include="object").columns)

    one_hot_enc = st.multiselect(
        "Select nominal categorical columns", categorical_cols_features)

    if one_hot_enc:
        # Every option comes from the object-dtype column list above, so the
        # whole selection can be encoded in a single call.
        data = pd.get_dummies(data, columns=one_hot_enc)
        st.session_state["data"] = data
        st.write("### Data after One-Hot Encoding:")
        st.write(data.head())

    label_enc = st.multiselect(
        "Select ordinal categorical columns", categorical_cols_features)

    if label_enc:
        for column in label_enc:
            # A column that was also selected for one-hot encoding has been
            # replaced by its dummy columns and no longer exists; skip it
            # instead of failing with a KeyError.
            if column not in data.columns:
                continue
            if data[column].dtype == "object":
                # NOTE: LabelEncoder numbers the labels in sorted order, so
                # the codes only express a ranking if that ranking happens to
                # match the sort order of the labels.
                data[column] = LabelEncoder().fit_transform(data[column])
        st.session_state["data"] = data
        st.write("### Data after Label Encoding:")
        st.write(data.head())
42
+
43
+
44
def missing_values():
    """Render the missing-value controls and preview/apply the chosen action.

    Reads the working DataFrame from ``st.session_state["data"]`` and lets the
    user pick a column and one of three actions:

    * ``"Drop"`` removes the selected column.
    * ``"Dropna"`` removes the rows whose value in that column is missing.
    * ``"Fill missing val"`` fills the missing values of that column with its
      mean, mode or median.

    The result is shown as a preview and is written back to
    ``st.session_state["data"]`` only when the ``OK`` button is pressed. When
    no dataset is present in the session state a warning is shown instead.
    """
    st.title("Handle Missing Values")

    if "data" not in st.session_state:
        st.warning("Please upload a dataset first.")
        return

    # The stored frame is never mutated below: every action builds a new frame.
    data = st.session_state["data"]

    action = st.selectbox(
        "Select Action", ["Drop", "Dropna", "Fill missing val"])

    # Explicit key: the sibling handlers in this file create a selectbox and
    # a button with the very same labels, and keyless widgets with identical
    # arguments collide if they are rendered in the same script run.
    column = st.selectbox(
        "Select Column", data.columns, key="missing_values_column")

    if column is None:
        # An empty column list leaves the selectbox without a value.
        st.warning("The dataset has no columns.")
        return

    st.write("### Before:")
    st.dataframe(data)

    if action == "Drop":
        modified_data = data.drop(columns=[column])
    elif action == "Dropna":
        modified_data = data.dropna(subset=[column])
    else:  # "Fill missing val"
        modified_data = data.copy()

        fill_method = st.selectbox(
            "Select fill method", ["Mean", "Mode", "Median"])
        fill_value = _missing_values_fill_value(data[column], fill_method)

        if fill_value is None:
            st.warning(
                f"Cannot compute the {fill_method.lower()} of column "
                f"'{column}'; no values were filled.")
        else:
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the column selection: that chained form
            # updates a temporary object under pandas Copy-on-Write and would
            # leave modified_data unchanged.
            modified_data[column] = modified_data[column].fillna(fill_value)

    st.write("### After (Preview):")
    st.dataframe(modified_data)

    if st.button("OK", key="missing_values_ok"):
        st.session_state["data"] = modified_data
        st.success("Done! The action has been applied.")
        st.write("### After:")
        st.dataframe(modified_data)


def _missing_values_fill_value(series, fill_method):
    """Return the value used to fill *series*, or None if it cannot be computed.

    ``fill_method`` is one of ``"Mean"``, ``"Mode"`` or ``"Median"``. ``None``
    is returned when the column has no non-missing value to derive a statistic
    from, or when mean/median is requested for a non-numeric column.
    """
    if fill_method == "Mode":
        modes = series.mode()
        # mode() is empty when every value in the column is missing.
        return None if modes.empty else modes.iloc[0]

    if not pd.api.types.is_numeric_dtype(series):
        return None

    value = series.mean() if fill_method == "Mean" else series.median()
    return None if pd.isna(value) else value
88
+
89
+
90
def handle_duplicates():
    """Render the duplicate-handling controls and preview/apply the action.

    Reads the working DataFrame from ``st.session_state["data"]`` and offers
    four actions:

    * ``"Drop Duplicates"`` removes fully identical rows, keeping the first.
    * ``"Drop Duplicates in Column"`` and ``"Keep First"`` both remove rows
      that repeat a value of the selected column, keeping the first
      occurrence.
    * ``"Keep Last"`` does the same but keeps the last occurrence.

    The result is shown as a preview and is written back to
    ``st.session_state["data"]`` only when the ``OK`` button is pressed. When
    no dataset is present in the session state a warning is shown instead.
    """
    st.title("Handle Duplicates")

    if "data" not in st.session_state:
        st.warning("Please upload a dataset first.")
        return

    # drop_duplicates() below returns a new frame, so the stored frame is
    # never mutated and no defensive copy is needed.
    data = st.session_state["data"]

    action = st.selectbox(
        "Select Action",
        ["Drop Duplicates", "Drop Duplicates in Column",
         "Keep First", "Keep Last"])

    if action in ["Drop Duplicates in Column", "Keep First", "Keep Last"]:
        # Explicit key: the sibling handlers in this file create a selectbox
        # and a button with the very same labels, and keyless widgets with
        # identical arguments collide if rendered in the same script run.
        column = st.selectbox(
            "Select Column", data.columns, key="handle_duplicates_column")
    else:
        column = None

    st.write("### Before:")
    st.dataframe(data)

    # All four actions are one drop_duplicates() call that differs only in
    # the column subset and in which occurrence is kept. A missing column
    # selection falls back to whole-row comparison.
    subset = None if column is None else [column]
    keep = "last" if action == "Keep Last" else "first"
    modified_data = data.drop_duplicates(subset=subset, keep=keep)

    st.write("### After (Preview):")
    st.dataframe(modified_data)

    if st.button("OK", key="handle_duplicates_ok"):
        st.session_state["data"] = modified_data
        st.success("Done! The action has been applied.")
        st.write("### After:")
        st.dataframe(modified_data)
133
+
134
+
135
def handle_outliers():
    """Render the outlier-handling controls and preview/apply the action.

    Reads the working DataFrame from ``st.session_state["data"]``, lets the
    user pick one numeric column and one of three actions:

    * ``"Remove Outliers (IQR)"`` drops the rows whose value lies outside
      ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``.
    * ``"Set Bounds Manually"`` drops the rows whose value lies outside the
      bounds entered by the user (pre-filled with the column min and max).
    * ``"Replace Outliers"`` replaces values outside the IQR fence with the
      column mean or median.

    Rows whose value in the selected column is missing are neither dropped
    nor replaced here. The result is shown as a preview and is written back
    to ``st.session_state["data"]`` only when the ``OK`` button is pressed.
    When no dataset is present in the session state a warning is shown
    instead.
    """
    st.title("Handle Outliers")

    if "data" not in st.session_state:
        st.warning("Please upload a dataset first.")
        return

    # Every action below builds a new frame; the stored one is not mutated.
    data = st.session_state["data"]

    numeric_columns = data.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) == 0:
        # Without options the selectbox has no value and the column lookups
        # further down would fail.
        st.warning("The dataset has no numeric columns to check for outliers.")
        return

    # Explicit key: the sibling handlers in this file create a selectbox and
    # a button with the very same labels, and keyless widgets with identical
    # arguments collide if they are rendered in the same script run.
    column = st.selectbox(
        "Select Column", numeric_columns, key="handle_outliers_column")

    action = st.selectbox(
        "Select Action",
        ["Remove Outliers (IQR)", "Set Bounds Manually",
         "Replace Outliers"]
    )

    st.write("### Before:")
    st.dataframe(data)

    values = data[column]

    if action == "Set Bounds Manually":
        lower_bound = st.number_input(
            f"Set lower bound for {column}", value=float(values.min()))
        upper_bound = st.number_input(
            f"Set upper bound for {column}", value=float(values.max()))
    else:
        lower_bound, upper_bound = _outlier_iqr_bounds(values)

    # between() is inclusive on both ends and is False for missing values.
    in_bounds = values.between(lower_bound, upper_bound)

    if action == "Replace Outliers":
        replace_method = st.radio(
            "Select Replacement Method",
            ["Mean", "Median"]
        )

        if replace_method == "Mean":
            replacement_value = values.mean()
        else:
            replacement_value = values.median()

        # Only real values outside the fence are outliers; missing values
        # are left as they are.
        outliers = values.notna() & ~in_bounds
        modified_data = data.copy()
        modified_data[column] = values.mask(outliers, replacement_value)
    else:
        # Keep rows with a missing value: a plain >= / <= comparison is False
        # for NaN and would silently drop them together with the outliers.
        modified_data = data[in_bounds | values.isna()]

    st.write("### After (Preview):")
    st.dataframe(modified_data)

    if st.button("OK", key="handle_outliers_ok"):
        st.session_state["data"] = modified_data
        st.success("Done! The action has been applied.")
        st.write("### After:")
        st.dataframe(modified_data)


def _outlier_iqr_bounds(series):
    """Return the ``(lower, upper)`` 1.5 * IQR fence of a numeric *series*."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr