saherPervaiz committed on
Commit
fc84ade
·
verified ·
1 Parent(s): 6e605a0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +226 -0
app.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.preprocessing import LabelEncoder
5
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
6
+ from sklearn.linear_model import LogisticRegression, LinearRegression
7
+ from sklearn.svm import SVC, SVR
8
+ from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
9
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
10
+ from sklearn.naive_bayes import GaussianNB
11
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
12
+ import numpy as np
13
+ import matplotlib.pyplot as plt
14
+ import seaborn as sns
15
+ from io import BytesIO
16
+
17
+ # File uploader
18
+ st.title("Model Training with Metrics and Correlation Heatmap")
19
+ uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
20
+
21
+ if uploaded_file is not None:
22
+ df = pd.read_csv(uploaded_file)
23
+
24
+ # Show the dataset
25
+ st.write("Dataset:")
26
+ st.dataframe(df)
27
+
28
+ # Convert categorical (str) data to numerical
29
+ st.write("Converting Categorical Columns to Numerical Values:")
30
+ label_encoder = LabelEncoder()
31
+
32
+ for col in df.columns:
33
+ if df[col].dtype == 'object' or len(df[col].unique()) <= 10:
34
+ st.write(f"Encoding Column: **{col}**")
35
+ df[col] = label_encoder.fit_transform(df[col])
36
+
37
+ # Display the dataset after conversion
38
+ st.write("Dataset After Conversion:")
39
+ st.dataframe(df)
40
+
41
+ # Handle Null Values (Missing Data)
42
+ st.write("Handling Missing (Null) Values:")
43
+ fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
44
+ if fill_method == "Drop rows":
45
+ df = df.dropna()
46
+ elif fill_method == "Fill with mean/median":
47
+ for col in df.columns:
48
+ if df[col].dtype in ['float64', 'int64']:
49
+ df[col].fillna(df[col].mean(), inplace=True)
50
+ else:
51
+ df[col].fillna(df[col].mode()[0], inplace=True)
52
+
53
+ # Handle Outliers using IQR method
54
+ st.write("Handling Outliers:")
55
def remove_outliers_iqr(dataframe):
    """Drop rows containing an outlier in any numeric column.

    A value is an outlier when it lies outside
    [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for its column (Tukey's fences).

    Only numeric columns are examined: computing quantiles over the whole
    frame raises in recent pandas when non-numeric columns remain, and
    non-numeric data has no meaningful IQR anyway. For an all-numeric
    frame the result is identical to the original implementation.
    """
    numeric = dataframe.select_dtypes(include=[np.number])
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    # Keep only rows whose every numeric value falls inside the fences.
    inside = ~((numeric < (q1 - 1.5 * iqr)) | (numeric > (q3 + 1.5 * iqr))).any(axis=1)
    return dataframe[inside]
60
+
61
+ df = remove_outliers_iqr(df)
62
+
63
+ # Cap Extreme Values
64
+ st.write("Handling Extreme Values (Capping):")
65
def cap_extreme_values(dataframe):
    """Winsorize every numeric column to its 5th–95th percentile range.

    Values are clipped in place on the passed frame, and the same frame
    is returned for convenient chaining.
    """
    numeric_columns = dataframe.select_dtypes(include=[np.number]).columns
    for column in numeric_columns:
        # Percentile bounds for this column.
        low, high = dataframe[column].quantile([0.05, 0.95])
        dataframe[column] = np.clip(dataframe[column], low, high)
    return dataframe
71
+
72
+ df = cap_extreme_values(df)
73
+
74
+ # Show cleaned dataset
75
+ st.write("Cleaned Dataset:")
76
+ st.dataframe(df)
77
+
78
+ # Add clean data download option
79
+ st.subheader("Download Cleaned Dataset")
80
+ st.download_button(
81
+ label="Download Cleaned Dataset (CSV)",
82
+ data=df.to_csv(index=False),
83
+ file_name="cleaned_dataset.csv",
84
+ mime="text/csv"
85
+ )
86
+
87
+ # Correlation Heatmap
88
+ st.subheader("Correlation Heatmap")
89
+ corr = df.corr()
90
+ plt.figure(figsize=(10, 8))
91
+ sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", cbar=True)
92
+ st.pyplot(plt)
93
+
94
+ # Save heatmap as PNG
95
+ buf = BytesIO()
96
+ plt.savefig(buf, format="png")
97
+ buf.seek(0)
98
+ st.download_button(
99
+ label="Download Correlation Heatmap as PNG",
100
+ data=buf,
101
+ file_name="correlation_heatmap.png",
102
+ mime="image/png"
103
+ )
104
+
105
+ # Highlight highly correlated pairs
106
+ st.subheader("Highly Correlated Features")
107
+ high_corr = corr.abs().unstack().sort_values(ascending=False).drop_duplicates()
108
+ high_corr = high_corr[high_corr.index.get_level_values(0) != high_corr.index.get_level_values(1)]
109
+ st.write(high_corr_df)
110
+
111
+ target = st.selectbox("Select Target Variable", df.columns)
112
+ features = [col for col in df.columns if col != target]
113
+ X = df[features]
114
+ y = df[target]
115
+
116
+ if y.dtype == 'object' or len(y.unique()) <= 10: # Categorical target (classification)
117
+ st.subheader("Classification Model Training")
118
+ classifiers = {
119
+ 'Logistic Regression': LogisticRegression(max_iter=5000, solver='saga', penalty='l1'),
120
+ 'Decision Tree': DecisionTreeClassifier(),
121
+ 'Random Forest': RandomForestClassifier(),
122
+ 'Support Vector Machine (SVM)': SVC(),
123
+ 'K-Nearest Neighbors (k-NN)': KNeighborsClassifier(),
124
+ 'Naive Bayes': GaussianNB()
125
+ }
126
+
127
+ metrics = []
128
+ train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
129
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
130
+
131
+ for name, classifier in classifiers.items():
132
+ classifier.fit(X_train, y_train)
133
+ y_pred = classifier.predict(X_test)
134
+ metrics.append({
135
+ 'Model': name,
136
+ 'Accuracy': round(accuracy_score(y_test, y_pred), 2),
137
+ 'Precision': round(precision_score(y_test, y_pred, zero_division=1, average='macro'), 2),
138
+ 'Recall': round(recall_score(y_test, y_pred, zero_division=1, average='macro'), 2),
139
+ 'F1-Score': round(f1_score(y_test, y_pred, zero_division=1, average='macro'), 2)
140
+ })
141
+
142
+ metrics_df = pd.DataFrame(metrics)
143
+ st.subheader("Classification Model Performance Metrics")
144
+ st.dataframe(metrics_df)
145
+
146
+ # Save metrics as PNG (table form)
147
+ fig, ax = plt.subplots(figsize=(8, 4))
148
+ ax.axis('tight')
149
+ ax.axis('off')
150
+ table = plt.table(cellText=metrics_df.values, colLabels=metrics_df.columns, cellLoc='center', loc='center')
151
+ table.auto_set_font_size(False)
152
+ table.set_fontsize(10)
153
+ table.auto_set_column_width(col=list(range(len(metrics_df.columns))))
154
+ buf = BytesIO()
155
+ fig.savefig(buf, format="png")
156
+ buf.seek(0)
157
+ st.download_button(
158
+ label="Download Classification Metrics Table as PNG",
159
+ data=buf,
160
+ file_name="classification_metrics_table.png",
161
+ mime="image/png"
162
+ )
163
+
164
+ # Visualization (Bar Graphs for Classification)
165
+ st.subheader("Classification Model Performance Metrics Graph")
166
+ metrics_df.set_index('Model', inplace=True)
167
+ metrics_df.plot(kind='bar', figsize=(10, 6), colormap='viridis', rot=45)
168
+ plt.title("Classification Models - Performance Metrics")
169
+ plt.ylabel("Scores")
170
+ plt.xlabel("Models")
171
+ st.pyplot(plt)
172
+
173
+ else: # Continuous target (regression)
174
+ st.subheader("Regression Model Training")
175
+ regressors = {
176
+ 'Linear Regression': LinearRegression(),
177
+ 'Decision Tree Regressor': DecisionTreeRegressor(),
178
+ 'Random Forest Regressor': RandomForestRegressor(),
179
+ 'Support Vector Regressor (SVR)': SVR(),
180
+ 'K-Nearest Neighbors Regressor (k-NN)': KNeighborsRegressor()
181
+ }
182
+
183
+ regression_metrics = []
184
+ train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
185
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
186
+
187
+ for name, regressor in regressors.items():
188
+ regressor.fit(X_train, y_train)
189
+ y_pred = regressor.predict(X_test)
190
+ regression_metrics.append({
191
+ 'Model': name,
192
+ 'Mean Squared Error (MSE)': round(mean_squared_error(y_test, y_pred), 2),
193
+ 'Mean Absolute Error (MAE)': round(mean_absolute_error(y_test, y_pred), 2),
194
+ 'R² Score': round(r2_score(y_test, y_pred), 2)
195
+ })
196
+
197
+ regression_metrics_df = pd.DataFrame(regression_metrics)
198
+ st.subheader("Regression Model Performance Metrics")
199
+ st.dataframe(regression_metrics_df)
200
+
201
+ # Save metrics as PNG (table form)
202
+ fig, ax = plt.subplots(figsize=(8, 4))
203
+ ax.axis('tight')
204
+ ax.axis('off')
205
+ table = plt.table(cellText=regression_metrics_df.values, colLabels=regression_metrics_df.columns, cellLoc='center', loc='center')
206
+ table.auto_set_font_size(False)
207
+ table.set_fontsize(10)
208
+ table.auto_set_column_width(col=list(range(len(regression_metrics_df.columns))))
209
+ buf = BytesIO()
210
+ fig.savefig(buf, format="png")
211
+ buf.seek(0)
212
+ st.download_button(
213
+ label="Download Regression Metrics Table as PNG",
214
+ data=buf,
215
+ file_name="regression_metrics_table.png",
216
+ mime="image/png"
217
+ )
218
+
219
+ # Visualization (Bar Graphs for Regression)
220
+ st.subheader("Regression Model Performance Metrics Graph")
221
+ regression_metrics_df.set_index('Model', inplace=True)
222
+ regression_metrics_df.plot(kind='bar', figsize=(10, 6), colormap='coolwarm', rot=45)
223
+ plt.title("Regression Models - Performance Metrics")
224
+ plt.ylabel("Scores")
225
+ plt.xlabel("Models")
226
+ st.pyplot(plt)