Gourav18 committed on
Commit
38939c4
·
verified ·
1 Parent(s): 9b1bea9

Upload ML.py

Browse files
Files changed (1) hide show
  1. ML.py +644 -0
ML.py ADDED
@@ -0,0 +1,644 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
5
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
6
+ from sklearn.ensemble import RandomForestClassifier
7
+ from sklearn.linear_model import LogisticRegression
8
+ from sklearn.svm import SVC
9
+ from xgboost import XGBClassifier
10
+ from sklearn.pipeline import Pipeline
11
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc,classification_report
12
+ from sklearn.impute import SimpleImputer
13
+ import openpyxl
14
+ import optuna
15
+ import joblib
16
+ import plotly.express as px
17
+ import seaborn as sns
18
+ import matplotlib.pyplot as plt
19
+
20
# Page-level configuration: browser tab title and the wide layout. This is the
# first Streamlit call in the module, issued before main() draws anything.
st.set_page_config(
    layout="wide",
    page_title="ML Model Deployment",
)
21
+
22
def load_data(file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    Args:
        file: File-like object exposing a ``name`` attribute (for example the
            object returned by ``st.file_uploader``). The extension of
            ``name`` selects the parser.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the extension is
        not supported or parsing fails. Both failure cases are reported to
        the user through ``st.error``.
    """
    try:
        # Compare case-insensitively so names such as "DATA.CSV" are accepted.
        filename = file.name.lower()
        if filename.endswith('.csv'):
            return pd.read_csv(file)
        if filename.endswith(('.xls', '.xlsx')):
            return pd.read_excel(file)
    except Exception as e:
        # UI boundary: surface any parser/IO failure instead of crashing the app.
        st.error(f"Error loading file: {e}")
        return None

    # No parser matched. Report that explicitly rather than failing on an
    # unbound local variable.
    st.error(f"Error loading file: unsupported file type '{file.name}'")
    return None
32
+
33
def auto_process_data(data):
    """Impute missing values and label-encode text columns.

    The input frame is not modified; all work happens on a copy.

    Args:
        data: ``pandas.DataFrame`` as loaded from the uploaded file.

    Returns:
        A ``(processed_data, label_encoders)`` tuple where ``processed_data``
        is the cleaned copy and ``label_encoders`` maps each encoded
        object-dtype column name to its fitted ``LabelEncoder``.
    """
    processed_data = data.copy()
    label_encoders = {}

    if processed_data.isnull().sum().sum() > 0:
        st.info("Automatically handling missing values...")

        # Numeric gaps are filled with the column median.
        num_cols = processed_data.select_dtypes(include=['int64', 'float64']).columns
        if len(num_cols) > 0:
            num_imputer = SimpleImputer(strategy='median')
            processed_data[num_cols] = num_imputer.fit_transform(processed_data[num_cols])

        # Categorical gaps are filled with the most frequent value.
        for col in processed_data.select_dtypes(include=['object']).columns:
            if not processed_data[col].isnull().any():
                continue
            modes = processed_data[col].mode()
            # An all-missing column has no mode. Leave it untouched here; the
            # astype(str) below still gives it a single encodable label.
            if modes.empty:
                continue
            # Assign the result back instead of a chained ``inplace=True``
            # call, which may act on a temporary copy and change nothing.
            processed_data[col] = processed_data[col].fillna(modes.iloc[0])

    for column in processed_data.select_dtypes(include=['object']).columns:
        encoder = LabelEncoder()
        processed_data[column] = encoder.fit_transform(processed_data[column].astype(str))
        label_encoders[column] = encoder

    return processed_data, label_encoders
57
+
58
def get_model_configs():
    """Build a fresh set of candidate classifiers and their search grids.

    Returns:
        dict keyed by display name. Each value is a dict with a
        ``'pipeline'`` entry (StandardScaler followed by the classifier,
        stored under the step name ``'classifier'``) and a ``'params'`` entry
        holding the hyperparameter grid addressed through
        ``classifier__<name>`` keys.
    """
    def _scaled(classifier, scaler_step='scaler'):
        # Every candidate shares the same two-step shape.
        return Pipeline([
            (scaler_step, StandardScaler()),
            ('classifier', classifier),
        ])

    logistic_regression = {
        'pipeline': _scaled(LogisticRegression()),
        'params': {
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.01, 0.1, 1],
            'classifier__max_iter': [100, 200],
            'classifier__solver': ['liblinear', 'saga'],
        },
    }

    support_vector_machine = {
        'pipeline': _scaled(SVC(probability=True)),
        'params': {
            'classifier__C': [0.001, 0.1, 1],
            'classifier__kernel': ['linear', 'rbf', 'sigmoid'],
            'classifier__gamma': ['scale', 'auto', 0.01, 0.1, 1],
            'classifier__max_iter': [100, 200],
        },
    }

    random_forest = {
        'pipeline': _scaled(RandomForestClassifier()),
        'params': {
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [None, 10, 20],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4],
        },
    }

    xgboost = {
        # This pipeline names its scaling step 'scaled', unlike the others.
        'pipeline': _scaled(
            XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
            scaler_step='scaled',
        ),
        'params': {
            'classifier__n_estimators': [100, 200],
            'classifier__learning_rate': [0.01, 0.05, 0.1],
            'classifier__max_depth': [3, 5, 7],
            'classifier__min_child_weight': [1, 3, 5],
            'classifier__subsample': [0.8, 1.0],
        },
    }

    return {
        'Logistic Regression': logistic_regression,
        'Support Vector Machine': support_vector_machine,
        'Random Forest': random_forest,
        'XgBoost': xgboost,
    }
111
+
112
def train_model(X_train, y_train, selected_model, progress_bar=None):
    """Grid-search a single configured model using 5-fold CV accuracy.

    Args:
        X_train: Training features.
        y_train: Training labels.
        selected_model: Key into the dict returned by ``get_model_configs``.
        progress_bar: Optional object with a ``progress`` method; it is set
            to 1.0 once the search has finished.

    Returns:
        ``(best_estimator, best_cv_score)`` from the fitted ``GridSearchCV``.
    """
    config = get_model_configs()[selected_model]
    search_options = {
        'estimator': config['pipeline'],
        'param_grid': config['params'],
        'cv': 5,
        'n_jobs': -1,
        'verbose': 0,
        'scoring': "accuracy",
    }

    with st.spinner(f"Training {selected_model}..."):
        search = GridSearchCV(**search_options)
        search.fit(X_train, y_train)

        if progress_bar:
            progress_bar.progress(1.0)

    return search.best_estimator_, search.best_score_
131
def objective(trial, X_train, y_train, model_name):
    """Optuna objective: mean cross-validated accuracy of one sampled setup.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train: Training features.
        y_train: Training labels.
        model_name: Key into the dict returned by ``get_model_configs``.

    Returns:
        Mean accuracy over the cross-validation folds for the sampled
        hyperparameters (higher is better).
    """
    model_config = get_model_configs()[model_name]
    dataset_size = len(X_train)
    # Use fewer folds on smaller datasets so each fold keeps enough rows.
    cv_folds = 5 if dataset_size > 1000 else (3 if dataset_size > 500 else min(2, dataset_size))
    params = {}

    # Match names case-insensitively. The config registers the booster as
    # 'XgBoost', so an exact comparison with 'XGBoost' never matched and the
    # model was evaluated with an empty search space.
    model_key = model_name.lower()

    if model_key == 'logistic regression':
        params = {
            'classifier__penalty': trial.suggest_categorical('classifier__penalty', ['l1', 'l2']),
            'classifier__C': trial.suggest_float('classifier__C', 0.01, 1.0, log=True),
            'classifier__solver': trial.suggest_categorical('classifier__solver', ['liblinear', 'saga']),
            'classifier__max_iter': trial.suggest_int('classifier__max_iter', 100, 200)
        }
    elif model_key == 'support vector machine':
        params = {
            'classifier__C': trial.suggest_float('classifier__C', 0.001, 1.0, log=True),
            'classifier__kernel': trial.suggest_categorical('classifier__kernel', ['linear', 'rbf', 'sigmoid']),
            'classifier__gamma': trial.suggest_categorical('classifier__gamma', ['scale', 'auto', 0.01, 0.1, 1]),
            'classifier__max_iter': trial.suggest_int('classifier__max_iter', 100, 200)
        }
    elif model_key == 'random forest':
        params = {
            'classifier__n_estimators': trial.suggest_int('classifier__n_estimators', 100, 200),
            'classifier__max_depth': trial.suggest_categorical('classifier__max_depth', [None, 10, 20]),
            'classifier__min_samples_split': trial.suggest_int('classifier__min_samples_split', 2, 10),
            'classifier__min_samples_leaf': trial.suggest_int('classifier__min_samples_leaf', 1, 4)
        }
    elif model_key == 'xgboost':
        params = {
            'classifier__n_estimators': trial.suggest_int('classifier__n_estimators', 100, 300),
            'classifier__learning_rate': trial.suggest_float('classifier__learning_rate', 0.01, 0.2, log=True),
            'classifier__max_depth': trial.suggest_int('classifier__max_depth', 3, 10),
            'classifier__min_child_weight': trial.suggest_int('classifier__min_child_weight', 1, 6)
        }

    pipeline = model_config['pipeline'].set_params(**params)

    # cross_val_score clones and fits the pipeline on every fold, so fitting
    # it on the full training set first would only be wasted work.
    return cross_val_score(pipeline, X_train, y_train, cv=cv_folds, scoring="accuracy").mean()
174
def auto_train(X_train, y_train, X_test, y_test):
    """Tune every configured model with Optuna and return the best one.

    Each model gets its own 20-trial study, is refitted on the full training
    set with the study's best parameters, and is scored on the test set. A
    comparison table and the winner are written to the Streamlit page.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features used to rank the models.
        y_test: Held-out labels used to rank the models.

    Returns:
        ``(best_model, best_model_name)``: the fitted pipeline with the
        highest test accuracy and its display name.
    """
    models = get_model_configs()
    results = {}
    # Start below any reachable accuracy so that a model is always selected,
    # even when every candidate scores exactly 0.0 on the test set.
    best_score = float('-inf')
    best_model = None
    best_model_name = None

    st.write("🔄 Training models with Optuna hyperparameter tuning...")

    # One progress bar per model, laid out side by side.
    progress_cols = st.columns(len(models))
    progress_bars = {model_name: progress_cols[i].progress(0.0) for i, model_name in enumerate(models)}

    for model_name, model_config in models.items():
        st.write(f"🛠 Training {model_name}...")

        # Run Optuna optimization. The name is bound as a default argument so
        # the callback does not depend on the loop variable.
        study = optuna.create_study(direction='maximize')
        study.optimize(
            lambda trial, name=model_name: objective(trial, X_train, y_train, name),
            n_trials=20,
        )

        # Refit on the whole training set with the best parameters found.
        pipeline = model_config['pipeline'].set_params(**study.best_params)
        pipeline.fit(X_train, y_train)

        # Evaluate model
        test_accuracy = accuracy_score(y_test, pipeline.predict(X_test))

        results[model_name] = {
            'model': pipeline,
            'cv_score': study.best_value,
            'test_accuracy': test_accuracy
        }

        progress_bars[model_name].progress(1.0)

        # Track best model
        if test_accuracy > best_score:
            best_score = test_accuracy
            best_model = pipeline
            best_model_name = model_name

    # Display results
    results_df = pd.DataFrame({
        'Model': list(results),
        'Cross-Validation Score': [entry['cv_score'] for entry in results.values()],
        'Test Accuracy': [entry['test_accuracy'] for entry in results.values()]
    }).sort_values('Test Accuracy', ascending=False)

    st.subheader("📊 Model Performance Comparison")
    st.dataframe(results_df)

    st.success(f"🏆 Best model: **{best_model_name}** with accuracy: **{best_score:.2%}**")

    return best_model, best_model_name
229
+
230
def get_classification_report(y_true, y_pred):
    """Return the classification report as a DataFrame.

    The dict produced by ``classification_report(..., output_dict=True)`` is
    transposed so that each report entry becomes a row.
    """
    return pd.DataFrame(classification_report(y_true, y_pred, output_dict=True)).T
234
def evaluate_models(X_train, X_test, y_train, y_test):
    """Fit every configured pipeline with default settings and plot metrics.

    Draws three matplotlib figures (ROC curves, a 2x2 grid of confusion
    matrices, and a metric bar chart) and shows each with ``plt.show()``.
    The metrics use ``average='binary'`` and the second ``predict_proba``
    column, so a binary target is assumed.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        DataFrame indexed by model name with the columns Accuracy, Precision,
        Recall, F1-score and ROC-AUC.
    """
    # get_model_configs() returns {'pipeline': ..., 'params': ...} per model;
    # only the pipeline is an estimator that can be fitted.
    pipelines = {name: config['pipeline'] for name, config in get_model_configs().items()}

    results = {}

    plt.figure(figsize=(10, 6))

    for name, model in pipelines.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

        roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None

        results[name] = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='binary'),
            "Recall": recall_score(y_test, y_pred, average='binary'),
            "F1-score": f1_score(y_test, y_pred, average='binary'),
            "ROC-AUC": roc_auc
        }

        if y_prob is not None:
            fpr, tpr, _ = roc_curve(y_test, y_prob)
            plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")

    # Diagonal reference line for a random classifier.
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curves")
    plt.legend()
    plt.show()

    # The pipelines are already fitted above, so only prediction is needed.
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    for ax, (name, model) in zip(axes.ravel(), pipelines.items()):
        cm = confusion_matrix(y_test, model.predict(X_test))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_title(f"{name} - Confusion Matrix")
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")

    plt.tight_layout()
    plt.show()

    results_df = pd.DataFrame(results).T
    results_df.plot(kind="bar", figsize=(10, 6))
    plt.title("Model Comparison")
    plt.ylabel("Score")
    plt.xticks(rotation=45)
    plt.legend(title="Metrics")
    plt.show()

    return results_df
292
+
293
def _init_session_state():
    """Create every session key the pages read, defaulting each to None."""
    for key in ('data', 'processed_data', 'label_encoders', 'model',
                'features', 'target', 'model_name'):
        if key not in st.session_state:
            st.session_state[key] = None


def _render_home():
    """Draw the static landing page."""
    st.title("🚀 AutoML: Effortless Machine Learning")
    st.markdown(
        """
        Welcome to **AutoML**, a powerful yet easy-to-use tool that automates the process of building and evaluating
        machine learning models. Whether you're a beginner exploring data or an expert looking for quick model deployment,
        AutoML simplifies the entire workflow.
        """
    )

    st.header("🔹 Features")
    st.markdown(
        """
        - **Automated Model Selection** – Let AutoML pick the best algorithm for your data.
        - **Hyperparameter Tuning** – Optimize model performance without manual tweaking.
        - **Data Preprocessing** – Handle missing values, scaling, encoding, and feature engineering.
        - **Performance Evaluation** – Compare models with key metrics and visualizations.
        - **Model Export** – Save trained models for deployment.
        """
    )

    st.header("🚀 Get Started")
    st.markdown(
        """
        1. **Upload your dataset** – Provide a CSV or Excel file with your data.
        2. **Select your target variable** – Choose the column to predict.
        3. **Let AutoML do the magic!** – Sit back and watch the automation work.
        """
    )

    st.header("📊 Visual Insights")
    st.markdown(
        """
        Explore interactive charts and performance metrics to make informed decisions.
        Use visualizations to compare model accuracy, precision, recall, and other key statistics.
        """
    )

    st.success("Start automating your ML workflows now! 🎯")
    st.write('''Developed By Gourav Singh,Ankit Yadav,Pushpansh''')


def _render_data_upload():
    """Upload a dataset, preprocess it and show summary views."""
    st.header("📊 Data Upload & Analysis")

    uploaded_file = st.file_uploader("Upload your dataset (CSV or Excel)", type=['csv', 'xlsx', 'xls'])
    if uploaded_file is None:
        return

    st.session_state.data = load_data(uploaded_file)
    if st.session_state.data is None:
        # load_data has already reported the problem.
        return

    st.session_state.processed_data, st.session_state.label_encoders = auto_process_data(st.session_state.data)

    st.success("Data loaded and automatically processed!")

    st.subheader("Dataset Overview")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.info(f"Number of rows: {st.session_state.data.shape[0]}")
    with col2:
        st.info(f"Number of columns: {st.session_state.data.shape[1]}")
    with col3:
        missing_values = st.session_state.data.isnull().sum().sum()
        st.info(f"Missing values: {missing_values} (Automatically handled)")

    st.subheader("Original Data Preview")
    st.dataframe(st.session_state.data.head())

    st.subheader("Processed Data Preview")
    st.dataframe(st.session_state.processed_data.head())

    st.subheader("Statistical Description")
    st.dataframe(st.session_state.processed_data.describe())

    st.subheader("Correlation Heatmap")
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(st.session_state.processed_data.corr(), annot=True, cmap='coolwarm', ax=ax)
    st.pyplot(fig)


def _render_model_training():
    """Pick features/target, auto-train all models and offer a download."""
    # Local import: only this page needs an in-memory buffer.
    import io

    st.header("🎯 Auto Model Training")

    if st.session_state.processed_data is None:
        st.warning("Please upload and process your data first!")
        return

    st.subheader("Select Features and Target")
    columns = st.session_state.processed_data.columns.tolist()

    st.session_state.features = st.multiselect("Select features", columns, default=columns[:-1])
    # Default the target to the last column. The default features are every
    # column but the last, so the defaults no longer overlap.
    st.session_state.target = st.selectbox(
        "Select target variable", columns, index=max(len(columns) - 1, 0)
    )

    if not st.button("Auto Train Models"):
        return

    features = st.session_state.features
    target = st.session_state.target
    if not features or not target:
        return

    # Training on the target itself would leak the label into the model.
    if target in features:
        st.error("The target column is also selected as a feature. "
                 "Remove it from the features before training.")
        return

    X = st.session_state.processed_data[features]
    y = st.session_state.processed_data[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    st.session_state.model, st.session_state.model_name = auto_train(X_train, y_train, X_test, y_test)

    y_pred = st.session_state.model.predict(X_test)

    st.subheader("Best Model Performance")

    accuracy = accuracy_score(y_test, y_pred)
    st.metric("Accuracy", f"{accuracy:.2%}")

    st.text("Classification Report:")
    st.dataframe(get_classification_report(y_test, y_pred))

    if st.session_state.model_name == "Random Forest":
        st.subheader("Feature Importance")

        importance_df = pd.DataFrame({
            'Feature': features,
            'Importance': st.session_state.model.named_steps['classifier'].feature_importances_
        }).sort_values('Importance', ascending=False)

        fig = px.bar(importance_df, x='Feature', y='Importance',
                     title='Feature Importance Plot')
        st.plotly_chart(fig)

    model_data = {
        'model': st.session_state.model,
        'model_name': st.session_state.model_name,
        'label_encoders': st.session_state.label_encoders,
        'features': features,
        'target': target
    }
    # Serialise into memory rather than a fixed file in the working
    # directory: no file handle is left open and sessions cannot overwrite
    # each other's artefact.
    buffer = io.BytesIO()
    joblib.dump(model_data, buffer)
    st.download_button(
        label="Download trained model",
        data=buffer.getvalue(),
        file_name='model_data.joblib',
        mime='application/octet-stream'
    )


def _render_visualisation():
    """Compare default-configured models and show the trained model's metrics."""
    st.header("Model Visualisation")
    if st.session_state.model is None:
        st.warning("Please train a model first!")
        return

    if (st.session_state.processed_data is None
            or not st.session_state.features
            or not st.session_state.target):
        st.warning("Please load and preprocess your dataset before running evaluation.")
        return

    X = st.session_state.processed_data[st.session_state.features]
    y = st.session_state.processed_data[st.session_state.target]

    # Same split parameters as on the training page.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 'binary' averaging and single-column ROC only make sense for two
    # classes; fall back to weighted averaging for multiclass targets.
    is_binary = y.nunique() == 2
    average = 'binary' if is_binary else 'weighted'

    # Create visualization options
    viz_option = st.selectbox(
        "Select visualization type",
        ["Model Comparison", "ROC Curves", "Confusion Matrix"]
    )

    if viz_option == "Model Comparison":
        st.subheader("Model Performance Metrics")

        # Train all models to compare
        models = get_model_configs()
        results = {}

        progress_bar = st.progress(0)
        progress_text = st.empty()

        for i, (name, model_config) in enumerate(models.items()):
            progress_text.text(f"Training {name}...")
            pipeline = model_config['pipeline']
            pipeline.fit(X_train, y_train)

            y_pred = pipeline.predict(X_test)

            roc_auc = float('nan')
            if is_binary and hasattr(pipeline, "predict_proba"):
                roc_auc = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

            results[name] = {
                "Accuracy": accuracy_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred, average=average),
                "Recall": recall_score(y_test, y_pred, average=average),
                "F1-score": f1_score(y_test, y_pred, average=average),
                "ROC-AUC": roc_auc
            }

            progress_bar.progress((i + 1) / len(models))

        progress_text.empty()

        results_df = pd.DataFrame(results).T
        st.dataframe(results_df)

        fig = px.bar(
            results_df.reset_index().melt(id_vars='index', var_name='Metric', value_name='Score'),
            x='index', y='Score', color='Metric',
            barmode='group',
            title='Model Comparison',
            labels={'index': 'Model'}
        )
        st.plotly_chart(fig)

    elif viz_option == "ROC Curves":
        st.subheader("ROC Curves")

        if not is_binary:
            st.info("ROC curves are only drawn for binary targets.")
        else:
            fig, ax = plt.subplots(figsize=(10, 6))

            for name, model_config in get_model_configs().items():
                pipeline = model_config['pipeline']
                pipeline.fit(X_train, y_train)

                if hasattr(pipeline, "predict_proba"):
                    y_prob = pipeline.predict_proba(X_test)[:, 1]
                    fpr, tpr, _ = roc_curve(y_test, y_prob)
                    roc_auc = auc(fpr, tpr)
                    ax.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

            # Diagonal reference line for a random classifier.
            ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
            ax.set_xlim([0.0, 1.0])
            ax.set_ylim([0.0, 1.05])
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('Receiver Operating Characteristic (ROC) Curves')
            ax.legend(loc="lower right")

            st.pyplot(fig)

    elif viz_option == "Confusion Matrix":
        st.subheader("Confusion Matrices")

        models = get_model_configs()

        if len(models) > 4:
            st.warning("Showing confusion matrices for the first 4 models")
        model_items = list(models.items())[:4]

        num_models = len(model_items)
        cols = 2
        rows = (num_models + 1) // 2

        # squeeze=False always yields a 2-D array, so flattening works for
        # any number of models, including one.
        fig, axes = plt.subplots(rows, cols, figsize=(12, 10), squeeze=False)
        axes = axes.ravel()

        for i, (name, model_config) in enumerate(model_items):
            pipeline = model_config['pipeline']
            pipeline.fit(X_train, y_train)

            cm = confusion_matrix(y_test, pipeline.predict(X_test))

            sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=axes[i])
            axes[i].set_title(f"{name} - Confusion Matrix")
            axes[i].set_xlabel("Predicted")
            axes[i].set_ylabel("Actual")

        # Remove the unused grid cell when the model count is odd.
        for j in range(num_models, len(axes)):
            fig.delaxes(axes[j])

        plt.tight_layout()
        st.pyplot(fig)

    st.subheader("Current Model Performance")
    best_model_pred = st.session_state.model.predict(X_test)

    st.metric("Accuracy", f"{accuracy_score(y_test, best_model_pred):.2%}")

    col1, col2 = st.columns(2)
    with col1:
        st.metric("Precision", f"{precision_score(y_test, best_model_pred, average=average):.2%}")
        st.metric("F1 Score", f"{f1_score(y_test, best_model_pred, average=average):.2%}")
    with col2:
        st.metric("Recall", f"{recall_score(y_test, best_model_pred, average=average):.2%}")
        if is_binary and hasattr(st.session_state.model, "predict_proba"):
            best_proba = st.session_state.model.predict_proba(X_test)[:, 1]
            st.metric("AUC", f"{roc_auc_score(y_test, best_proba):.2%}")


def _render_prediction():
    """Collect one row of feature values and predict with the trained model."""
    st.header("🎲 Make Predictions")

    if st.session_state.model is None:
        st.warning("Please train a model first!")
        return

    st.subheader("Enter Feature Values")
    st.info(f"Using best model: {st.session_state.model_name}")

    label_encoders = st.session_state.label_encoders

    input_data = {}
    for feature in st.session_state.features:
        if feature in label_encoders:
            # Encoded column: offer the original labels and map the choice
            # back to the integer code the model was trained on.
            options = label_encoders[feature].classes_
            value = st.selectbox(f"Select {feature}", options)
            input_data[feature] = label_encoders[feature].transform([value])[0]
        else:
            input_data[feature] = st.number_input(f"Enter value for {feature}", value=0.0)

    if not st.button("Predict"):
        return

    input_df = pd.DataFrame([input_data])
    target = st.session_state.target
    target_encoder = label_encoders.get(target)

    prediction = st.session_state.model.predict(input_df)

    if target_encoder is not None:
        original_prediction = target_encoder.inverse_transform(prediction)
        st.success(f"Predicted {target}: {original_prediction[0]}")
    else:
        st.success(f"Predicted {target}: {prediction[0]}")

    proba = st.session_state.model.predict_proba(input_df)
    st.subheader("Prediction Probability")

    # Label the columns with the classes the model was actually fitted on.
    # The encoder may know labels that never reached the training split,
    # which would make the column count differ from predict_proba's output.
    classes = st.session_state.model.classes_
    if target_encoder is not None:
        classes = target_encoder.inverse_transform(classes)

    proba_df = pd.DataFrame(
        proba,
        columns=classes
    )
    st.dataframe(proba_df)


def main():
    """Entry point: draw the sidebar navigation and render the chosen page."""
    st.title("🤖 Machine Learning Model Deployment")

    st.sidebar.header("Navigation")
    page = st.sidebar.radio("Go to", ["Home","Data Upload & Analysis", "Model Training","Visualisation", "Prediction"])

    _init_session_state()

    # One renderer per sidebar entry; the keys must match the radio options.
    pages = {
        "Home": _render_home,
        "Data Upload & Analysis": _render_data_upload,
        "Model Training": _render_model_training,
        "Visualisation": _render_visualisation,
        "Prediction": _render_prediction,
    }
    pages[page]()
642
+
643
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()