saherPervaiz commited on
Commit
4409f0b
·
verified ·
1 Parent(s): 76454b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -159
app.py CHANGED
@@ -14,31 +14,33 @@ import matplotlib.pyplot as plt
14
  import seaborn as sns
15
  from io import BytesIO
16
 
 
 
 
17
  # File uploader
18
- st.title("Model Training with Metrics and Correlation Heatmap")
19
  uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
20
 
21
  if uploaded_file is not None:
 
22
  df = pd.read_csv(uploaded_file)
23
-
24
- # Show the dataset
25
  st.write("Dataset:")
26
  st.dataframe(df)
27
 
28
  # Convert categorical (str) data to numerical
29
  st.write("Converting Categorical Columns to Numerical Values:")
30
  label_encoder = LabelEncoder()
31
-
32
  for col in df.columns:
33
  if df[col].dtype == 'object' or len(df[col].unique()) <= 10:
34
  st.write(f"Encoding Column: **{col}**")
35
  df[col] = label_encoder.fit_transform(df[col])
36
-
37
  # Display the dataset after conversion
38
  st.write("Dataset After Conversion:")
39
  st.dataframe(df)
40
-
41
- # Handle Null Values (Missing Data)
42
  st.write("Handling Missing (Null) Values:")
43
  fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
44
  if fill_method == "Drop rows":
@@ -49,18 +51,24 @@ if uploaded_file is not None:
49
  df[col].fillna(df[col].mean(), inplace=True)
50
  else:
51
  df[col].fillna(df[col].mode()[0], inplace=True)
52
-
53
- # Handle Outliers using IQR method
54
- st.write("Handling Outliers:")
55
- def remove_outliers_iqr(dataframe):
56
- Q1 = dataframe.quantile(0.25)
57
- Q3 = dataframe.quantile(0.75)
58
  IQR = Q3 - Q1
59
- return dataframe[~((dataframe < (Q1 - 1.5 * IQR)) | (dataframe > (Q3 + 1.5 * IQR))).any(axis=1)]
60
-
61
- df = remove_outliers_iqr(df)
62
-
63
- # Cap Extreme Values
 
 
 
 
 
 
64
  st.write("Handling Extreme Values (Capping):")
65
  def cap_extreme_values(dataframe):
66
  for col in dataframe.select_dtypes(include=[np.number]).columns:
@@ -70,9 +78,9 @@ if uploaded_file is not None:
70
  return dataframe
71
 
72
  df = cap_extreme_values(df)
73
-
74
- # Show cleaned dataset
75
- st.write("Cleaned Dataset:")
76
  st.dataframe(df)
77
 
78
  # Add clean data download option
@@ -90,7 +98,7 @@ if uploaded_file is not None:
90
  plt.figure(figsize=(10, 8))
91
  sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", cbar=True)
92
  st.pyplot(plt)
93
-
94
  # Save heatmap as PNG
95
  buf = BytesIO()
96
  plt.savefig(buf, format="png")
@@ -101,149 +109,165 @@ if uploaded_file is not None:
101
  file_name="correlation_heatmap.png",
102
  mime="image/png"
103
  )
104
-
105
  # Highlight highly correlated pairs
106
  st.subheader("Highly Correlated Features")
107
  high_corr = corr.abs().unstack().sort_values(ascending=False).drop_duplicates()
108
  high_corr = high_corr[high_corr.index.get_level_values(0) != high_corr.index.get_level_values(1)]
109
- high_corr_df = pd.DataFrame(high_corr)
110
- st.write(high_corr_df)
111
 
 
 
 
 
 
 
 
 
 
112
  target = st.selectbox("Select Target Variable", df.columns)
113
  features = [col for col in df.columns if col != target]
114
  X = df[features]
115
  y = df[target]
116
-
117
- if y.dtype == 'object' or len(y.unique()) <= 10: # Categorical target (classification)
118
- st.subheader("Classification Model Training")
119
- classifiers = {
120
- 'Logistic Regression': LogisticRegression(max_iter=5000, solver='saga', penalty='l1'),
121
- 'Decision Tree': DecisionTreeClassifier(),
122
- 'Random Forest': RandomForestClassifier(),
123
- 'Support Vector Machine (SVM)': SVC(),
124
- 'K-Nearest Neighbors (k-NN)': KNeighborsClassifier(),
125
- 'Naive Bayes': GaussianNB()
126
- }
127
-
128
- metrics = []
129
- train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
130
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
131
-
132
- for name, classifier in classifiers.items():
133
- classifier.fit(X_train, y_train)
134
- y_pred = classifier.predict(X_test)
135
- metrics.append({
136
- 'Model': name,
137
- 'Accuracy': round(accuracy_score(y_test, y_pred), 2),
138
- 'Precision': round(precision_score(y_test, y_pred, zero_division=1, average='macro'), 2),
139
- 'Recall': round(recall_score(y_test, y_pred, zero_division=1, average='macro'), 2),
140
- 'F1-Score': round(f1_score(y_test, y_pred, zero_division=1, average='macro'), 2)
141
- })
142
-
143
- metrics_df = pd.DataFrame(metrics)
144
- st.subheader("Classification Model Performance Metrics")
145
- st.dataframe(metrics_df)
146
-
147
- # Save metrics as PNG (table form)
148
- fig, ax = plt.subplots(figsize=(8, 4))
149
- ax.axis('tight')
150
- ax.axis('off')
151
- table = plt.table(cellText=metrics_df.values, colLabels=metrics_df.columns, cellLoc='center', loc='center')
152
- table.auto_set_font_size(False)
153
- table.set_fontsize(10)
154
- table.auto_set_column_width(col=list(range(len(metrics_df.columns))))
155
- buf = BytesIO()
156
- fig.savefig(buf, format="png")
157
- buf.seek(0)
158
- st.download_button(
159
- label="Download Classification Metrics Table as PNG",
160
- data=buf,
161
- file_name="classification_metrics_table.png",
162
- mime="image/png"
163
- )
164
-
165
- # Visualization (Bar Graphs for Classification)
166
- st.subheader("Classification Model Performance Metrics Graph")
167
- metrics_df.set_index('Model', inplace=True)
168
- ax = metrics_df.plot(kind='bar', figsize=(10, 6), colormap='coolwarm', rot=45)
169
- plt.title("Classification Models - Performance Metrics")
170
- plt.ylabel("Scores")
171
- plt.xlabel("Models")
172
- st.pyplot(plt)
173
-
174
- # Download button for the bar graph
175
- buf = BytesIO()
176
- ax.figure.savefig(buf, format="png")
177
- buf.seek(0)
178
- st.download_button(
179
- label="Download Classification Performance Graph as PNG",
180
- data=buf,
181
- file_name="classification_performance_graph.png",
182
- mime="image/png"
183
- )
184
-
185
- else: # Continuous target (regression)
186
- st.subheader("Regression Model Training")
187
- regressors = {
188
- 'Linear Regression': LinearRegression(),
189
- 'Decision Tree Regressor': DecisionTreeRegressor(),
190
- 'Random Forest Regressor': RandomForestRegressor(),
191
- 'Support Vector Regressor (SVR)': SVR(),
192
- 'K-Nearest Neighbors Regressor (k-NN)': KNeighborsRegressor()
193
- }
194
-
195
- regression_metrics = []
196
- train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
197
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
198
-
199
- for name, regressor in regressors.items():
200
- regressor.fit(X_train, y_train)
201
- y_pred = regressor.predict(X_test)
202
- regression_metrics.append({
203
- 'Model': name,
204
- 'Mean Squared Error (MSE)': round(mean_squared_error(y_test, y_pred), 2),
205
- 'Mean Absolute Error (MAE)': round(mean_absolute_error(y_test, y_pred), 2),
206
- 'R² Score': round(r2_score(y_test, y_pred), 2)
207
- })
208
-
209
- regression_metrics_df = pd.DataFrame(regression_metrics)
210
- st.subheader("Regression Model Performance Metrics")
211
- st.dataframe(regression_metrics_df)
212
-
213
- # Save metrics as PNG (table form)
214
- fig, ax = plt.subplots(figsize=(8, 4))
215
- ax.axis('tight')
216
- ax.axis('off')
217
- table = plt.table(cellText=regression_metrics_df.values, colLabels=regression_metrics_df.columns, cellLoc='center', loc='center')
218
- table.auto_set_font_size(False)
219
- table.set_fontsize(10)
220
- table.auto_set_column_width(col=list(range(len(regression_metrics_df.columns))))
221
- buf = BytesIO()
222
- fig.savefig(buf, format="png")
223
- buf.seek(0)
224
- st.download_button(
225
- label="Download Regression Metrics Table as PNG",
226
- data=buf,
227
- file_name="regression_metrics_table.png",
228
- mime="image/png"
229
- )
230
-
231
- # Visualization (Bar Graphs for Regression)
232
- st.subheader("Regression Model Performance Metrics Graph")
233
- regression_metrics_df.set_index('Model', inplace=True)
234
- regression_metrics_df.plot(kind='bar', figsize=(10, 6), colormap='coolwarm', rot=45)
235
- plt.title("Regression Models - Performance Metrics")
236
- plt.ylabel("Scores")
237
- plt.xlabel("Models")
238
- st.pyplot(plt)
239
-
240
- # Download button for the bar graph
241
- buf = BytesIO()
242
- plt.savefig(buf, format="png")
243
- buf.seek(0)
244
- st.download_button(
245
- label="Download Regression Performance Graph as PNG",
246
- data=buf,
247
- file_name="regression_performance_graph.png",
248
- mime="image/png"
249
- )
 
 
 
 
 
 
 
 
14
  import seaborn as sns
15
  from io import BytesIO
16
 
17
+ # Streamlit app title
18
+ st.title("Model Training with Outlier Removal, Metrics, and Correlation Heatmap")
19
+
20
  # File uploader
 
21
  uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
22
 
23
  if uploaded_file is not None:
24
+ # Read the uploaded CSV file
25
  df = pd.read_csv(uploaded_file)
26
+
27
+ # Display the dataset
28
  st.write("Dataset:")
29
  st.dataframe(df)
30
 
31
  # Convert categorical (str) data to numerical
32
  st.write("Converting Categorical Columns to Numerical Values:")
33
  label_encoder = LabelEncoder()
 
34
  for col in df.columns:
35
  if df[col].dtype == 'object' or len(df[col].unique()) <= 10:
36
  st.write(f"Encoding Column: **{col}**")
37
  df[col] = label_encoder.fit_transform(df[col])
38
+
39
  # Display the dataset after conversion
40
  st.write("Dataset After Conversion:")
41
  st.dataframe(df)
42
+
43
+ # Handle missing values
44
  st.write("Handling Missing (Null) Values:")
45
  fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
46
  if fill_method == "Drop rows":
 
51
  df[col].fillna(df[col].mean(), inplace=True)
52
  else:
53
  df[col].fillna(df[col].mode()[0], inplace=True)
54
+
55
+ # Remove outliers using the IQR method
56
+ st.write("Removing Outliers Using IQR:")
57
+ def remove_outliers_iqr(data, column):
58
+ Q1 = data[column].quantile(0.25)
59
+ Q3 = data[column].quantile(0.75)
60
  IQR = Q3 - Q1
61
+ lower_bound = Q1 - 1.5 * IQR
62
+ upper_bound = Q3 + 1.5 * IQR
63
+ return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
64
+
65
+ numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
66
+ for col in numeric_cols:
67
+ original_count = len(df)
68
+ df = remove_outliers_iqr(df, col)
69
+ st.write(f"Removed outliers from **{col}**: {original_count - len(df)} rows removed.")
70
+
71
+ # Capping Extreme Values (based on 5% and 95% percentiles)
72
  st.write("Handling Extreme Values (Capping):")
73
  def cap_extreme_values(dataframe):
74
  for col in dataframe.select_dtypes(include=[np.number]).columns:
 
78
  return dataframe
79
 
80
  df = cap_extreme_values(df)
81
+
82
+ # Display dataset after cleaning
83
+ st.write("Dataset After Outlier Removal and Capping Extreme Values:")
84
  st.dataframe(df)
85
 
86
  # Add clean data download option
 
98
  plt.figure(figsize=(10, 8))
99
  sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", cbar=True)
100
  st.pyplot(plt)
101
+
102
  # Save heatmap as PNG
103
  buf = BytesIO()
104
  plt.savefig(buf, format="png")
 
109
  file_name="correlation_heatmap.png",
110
  mime="image/png"
111
  )
112
+
113
  # Highlight highly correlated pairs
114
  st.subheader("Highly Correlated Features")
115
  high_corr = corr.abs().unstack().sort_values(ascending=False).drop_duplicates()
116
  high_corr = high_corr[high_corr.index.get_level_values(0) != high_corr.index.get_level_values(1)]
117
+ high_corr_df = pd.DataFrame(high_corr, columns=["Correlation"])
118
+ st.dataframe(high_corr_df)
119
 
120
+ # Download correlation table as CSV
121
+ st.download_button(
122
+ label="Download Correlation Table (CSV)",
123
+ data=high_corr_df.to_csv(index=True),
124
+ file_name="correlation_table.csv",
125
+ mime="text/csv"
126
+ )
127
+
128
+ # Select target variable
129
  target = st.selectbox("Select Target Variable", df.columns)
130
  features = [col for col in df.columns if col != target]
131
  X = df[features]
132
  y = df[target]
133
+
134
+ if len(y.unique()) > 1: # Ensure the target variable has at least two unique classes/values
135
+ if y.dtype == 'object' or len(y.unique()) <= 10: # Classification
136
+ st.subheader("Classification Model Training")
137
+ classifiers = {
138
+ 'Logistic Regression': LogisticRegression(max_iter=5000),
139
+ 'Decision Tree': DecisionTreeClassifier(),
140
+ 'Random Forest': RandomForestClassifier(),
141
+ 'Support Vector Machine (SVM)': SVC(),
142
+ 'K-Nearest Neighbors (k-NN)': KNeighborsClassifier(),
143
+ 'Naive Bayes': GaussianNB()
144
+ }
145
+
146
+ metrics = []
147
+ train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
148
+ X_train, X_test, y_train, y_test = train_test_split(
149
+ X, y, test_size=1-train_size, stratify=y, random_state=42
150
+ )
151
+
152
+ for name, classifier in classifiers.items():
153
+ classifier.fit(X_train, y_train)
154
+ y_pred = classifier.predict(X_test)
155
+ metrics.append({
156
+ 'Model': name,
157
+ 'Accuracy': round(accuracy_score(y_test, y_pred), 2),
158
+ 'Precision': round(precision_score(y_test, y_pred, zero_division=1, average='macro'), 2),
159
+ 'Recall': round(recall_score(y_test, y_pred, zero_division=1, average='macro'), 2),
160
+ 'F1-Score': round(f1_score(y_test, y_pred, zero_division=1, average='macro'), 2)
161
+ })
162
+
163
+ metrics_df = pd.DataFrame(metrics)
164
+ st.subheader("Classification Model Performance Metrics")
165
+ st.dataframe(metrics_df)
166
+
167
+ # Save metrics as PNG (table form)
168
+ fig, ax = plt.subplots(figsize=(8, 4))
169
+ ax.axis('tight')
170
+ ax.axis('off')
171
+ table = plt.table(cellText=metrics_df.values, colLabels=metrics_df.columns, cellLoc='center', loc='center')
172
+ table.auto_set_font_size(False)
173
+ table.set_fontsize(10)
174
+ table.auto_set_column_width(col=list(range(len(metrics_df.columns))))
175
+ buf = BytesIO()
176
+ fig.savefig(buf, format="png")
177
+ buf.seek(0)
178
+ st.download_button(
179
+ label="Download Classification Metrics Table as PNG",
180
+ data=buf,
181
+ file_name="classification_metrics_table.png",
182
+ mime="image/png"
183
+ )
184
+
185
+ # Visualization (Bar Graphs for Classification)
186
+ st.subheader("Classification Model Performance Metrics Graph")
187
+ metrics_df.set_index('Model', inplace=True)
188
+ ax = metrics_df.plot(kind='bar', figsize=(10, 6), colormap='coolwarm', rot=45)
189
+ plt.title("Classification Models - Performance Metrics")
190
+ plt.ylabel("Scores")
191
+ plt.xlabel("Models")
192
+ st.pyplot(plt)
193
+
194
+ # Download button for the bar graph
195
+ buf = BytesIO()
196
+ ax.figure.savefig(buf, format="png")
197
+ buf.seek(0)
198
+ st.download_button(
199
+ label="Download Classification Performance Graph as PNG",
200
+ data=buf,
201
+ file_name="classification_performance_graph.png",
202
+ mime="image/png"
203
+ )
204
+
205
+ else: # Regression
206
+ st.subheader("Regression Model Training")
207
+ regressors = {
208
+ 'Linear Regression': LinearRegression(),
209
+ 'Decision Tree Regressor': DecisionTreeRegressor(),
210
+ 'Random Forest Regressor': RandomForestRegressor(),
211
+ 'Support Vector Regressor (SVR)': SVR(),
212
+ 'K-Nearest Neighbors Regressor (k-NN)': KNeighborsRegressor()
213
+ }
214
+
215
+ regression_metrics = []
216
+ train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
217
+ X_train, X_test, y_train, y_test = train_test_split(
218
+ X, y, test_size=1-train_size, random_state=42
219
+ )
220
+
221
+ for name, regressor in regressors.items():
222
+ regressor.fit(X_train, y_train)
223
+ y_pred = regressor.predict(X_test)
224
+ regression_metrics.append({
225
+ 'Model': name,
226
+ 'Mean Squared Error (MSE)': round(mean_squared_error(y_test, y_pred), 2),
227
+ 'Mean Absolute Error (MAE)': round(mean_absolute_error(y_test, y_pred), 2),
228
+ 'R² Score': round(r2_score(y_test, y_pred), 2)
229
+ })
230
+
231
+ regression_metrics_df = pd.DataFrame(regression_metrics)
232
+ st.subheader("Regression Model Performance Metrics")
233
+ st.dataframe(regression_metrics_df)
234
+
235
+ # Save metrics as PNG (table form)
236
+ fig, ax = plt.subplots(figsize=(8, 4))
237
+ ax.axis('tight')
238
+ ax.axis('off')
239
+ table = plt.table(cellText=regression_metrics_df.values, colLabels=regression_metrics_df.columns, cellLoc='center', loc='center')
240
+ table.auto_set_font_size(False)
241
+ table.set_fontsize(10)
242
+ table.auto_set_column_width(col=list(range(len(regression_metrics_df.columns))))
243
+ buf = BytesIO()
244
+ fig.savefig(buf, format="png")
245
+ buf.seek(0)
246
+ st.download_button(
247
+ label="Download Regression Metrics Table as PNG",
248
+ data=buf,
249
+ file_name="regression_metrics_table.png",
250
+ mime="image/png"
251
+ )
252
+
253
+ # Visualization (Bar Graphs for Regression)
254
+ st.subheader("Regression Model Performance Metrics Graph")
255
+ regression_metrics_df.set_index('Model', inplace=True)
256
+ ax = regression_metrics_df.plot(kind='bar', figsize=(10, 6), colormap='coolwarm', rot=45)
257
+ plt.title("Regression Models - Performance Metrics")
258
+ plt.ylabel("Scores")
259
+ plt.xlabel("Models")
260
+ st.pyplot(plt)
261
+
262
+ # Download button for the bar graph
263
+ buf = BytesIO()
264
+ ax.figure.savefig(buf, format="png")
265
+ buf.seek(0)
266
+ st.download_button(
267
+ label="Download Regression Performance Graph as PNG",
268
+ data=buf,
269
+ file_name="regression_performance_graph.png",
270
+ mime="image/png"
271
+ )
272
+ else:
273
+ st.error("The target variable must contain at least two unique values for classification or regression. Please check your dataset.")