MENG21 committed on
Commit
d6212ac
·
1 Parent(s): 6883c01

Enhance README.md with detailed application overview, features, installation instructions, and usage guidelines for synthetic data generation and ML model training.

Browse files
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ models/
2
+ temp_uploads/
3
+ __pycache__/
4
+
App.py ADDED
@@ -0,0 +1,1316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import pandas as pd
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.ensemble import RandomForestClassifier
6
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
7
+ import plotly.express as px
8
+ from sklearn.linear_model import LogisticRegression, RidgeClassifier
9
+ from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
10
+ from sklearn.svm import SVC, LinearSVC
11
+ from sklearn.naive_bayes import GaussianNB, MultinomialNB
12
+ from sklearn.neighbors import KNeighborsClassifier
13
+ from sklearn.neural_network import MLPClassifier
14
+ from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler
15
+ import time
16
+ import warnings
17
+ import joblib
18
+ import os
19
+ from datetime import datetime
20
+ import seaborn as sns
21
+ import matplotlib.pyplot as plt
22
+ from matplotlib.colors import LinearSegmentedColormap
23
+ from sklearn.model_selection import learning_curve
24
+ import pickle
25
+ warnings.filterwarnings('ignore')
26
+
27
class DataGenerator:
    """Generates normally-distributed synthetic feature data for a set of classes."""

    def __init__(self):
        # Cached configuration slots; assigned by callers, not used internally.
        self.features = None
        self.feature_configs = None
        self.classes = None
        self.class_configs = None

    def generate_synthetic_data(self, n_samples, feature_configs, classes, class_configs=None):
        """Generate synthetic data based on configurations.

        Args:
            n_samples: Total requested sample count.  Each class receives
                n_samples // len(classes) samples, so up to len(classes) - 1
                samples are dropped by the integer division.
            feature_configs: Mapping of feature name -> {'type', 'center', 'std'}.
            classes: List of class labels.
            class_configs: Optional mapping of class name -> {'mean': [...],
                'std': [...]} that overrides the per-feature configuration.

        Returns:
            Tuple (X, y): X is an ndarray of shape
            (samples_per_class * n_classes, n_features), values rounded to two
            decimals; y is an ndarray of class labels aligned with X's rows.
        """
        n_classes = len(classes)
        samples_per_class = n_samples // n_classes

        X = []
        y = []

        for class_name in classes:
            class_samples = []

            for j, config in enumerate(feature_configs.values()):
                if class_configs and class_name in class_configs:
                    # Class-specific override takes precedence over feature config.
                    center = class_configs[class_name]['mean'][j]
                    std = class_configs[class_name]['std'][j]
                elif config['type'] == 'random':
                    # Random center around 0 with spread ~5.
                    center = np.random.randn() * 5
                    std = config['std']
                else:
                    center = config['center']
                    std = config['std']

                # Round to 2 decimals to mimic real-world measurement precision.
                feature_samples = np.round(
                    np.random.normal(loc=center, scale=std, size=samples_per_class),
                    decimals=2,
                )
                class_samples.append(feature_samples)

            X.append(np.column_stack(class_samples))
            y.extend([class_name] * samples_per_class)

        return np.vstack(X), np.array(y)
71
+
72
class ModelManager:
    """Builds, trains, evaluates, and persists the suite of scikit-learn classifiers."""

    @staticmethod
    def get_classifiers():
        """Return dictionary of classifiers with appropriate preprocessing.

        Each entry maps a model name to {'model': fresh estimator,
        'scaler': paired preprocessor}.  MultinomialNB gets MaxAbsScaler
        because it cannot accept negative inputs; every other model uses
        StandardScaler.
        """
        return {
            'LogisticRegression': {
                'model': LogisticRegression(max_iter=1000),
                'scaler': StandardScaler()
            },
            'RidgeClassifier': {
                'model': RidgeClassifier(),
                'scaler': StandardScaler()
            },
            'RandomForestClassifier': {
                'model': RandomForestClassifier(random_state=42),
                'scaler': StandardScaler()
            },
            'AdaBoostClassifier': {
                'model': AdaBoostClassifier(),
                'scaler': StandardScaler()
            },
            'ExtraTreesClassifier': {
                'model': ExtraTreesClassifier(),
                'scaler': StandardScaler()
            },
            'SVC': {
                'model': SVC(),
                'scaler': StandardScaler()
            },
            'LinearSVC': {
                'model': LinearSVC(max_iter=2000),
                'scaler': StandardScaler()
            },
            'GaussianNB': {
                'model': GaussianNB(),
                'scaler': StandardScaler()
            },
            'KNeighborsClassifier': {
                'model': KNeighborsClassifier(),
                'scaler': StandardScaler()
            },
            'MLPClassifier': {
                'model': MLPClassifier(max_iter=1000),
                'scaler': StandardScaler()
            },
            'MultinomialNB': {
                'model': MultinomialNB(),
                'scaler': MaxAbsScaler()
            }
        }

    @staticmethod
    def ensure_non_negative(X):
        """Ensure data is non-negative by shifting.

        Shifts all values up by the magnitude of the (negative) minimum so the
        smallest value becomes 0.  Accepts a DataFrame or an ndarray and
        returns the same type; the input itself is not modified in place.
        """
        if isinstance(X, pd.DataFrame):
            min_val = X.values.min()
            if min_val < 0:
                return X + abs(min_val)
            return X
        else:
            min_val = X.min()
            if min_val < 0:
                return X - min_val
            return X

    def save_model(self, model_dict, model_name):
        """Save model and its scaler to files.

        Persists both objects under models/<model_name>_{model,scaler}.joblib,
        creating the models/ directory on first use.  Feature names are copied
        onto the scaler so the training column order can be recovered after
        loading.

        Returns:
            Tuple (model_path, scaler_path).
        """
        if not os.path.exists('models'):
            os.makedirs('models')

        base_filename = f"{model_name}"

        # Propagate feature names onto the scaler: prefer those recorded on the
        # fitted model, fall back to the names kept in the Streamlit session.
        if hasattr(model_dict['model'], 'feature_names_in_'):
            model_dict['scaler'].feature_names_in_ = model_dict['model'].feature_names_in_
        elif hasattr(st.session_state, 'features'):
            model_dict['scaler'].feature_names_in_ = np.array(st.session_state.features)

        model_path = os.path.join('models', f"{base_filename}_model.joblib")
        scaler_path = os.path.join('models', f"{base_filename}_scaler.joblib")

        joblib.dump(model_dict['model'], model_path)
        joblib.dump(model_dict['scaler'], scaler_path)

        return model_path, scaler_path

    def train_and_evaluate_model(self, clf_dict, X_train, X_test, y_train, y_test, model_name):
        """Train and evaluate a single model.

        Fits the scaler on the training split only, trains the model, persists
        both to disk, and returns a result record.  Failures are captured and
        reported through the 'status' field instead of raising, so one broken
        model does not abort the whole run.

        Returns:
            Dict with keys: model_name, accuracy, training_time, model,
            predictions, status, scaler (file path), model_path,
            confusion_matrix.
        """
        start_time = time.time()

        try:
            scaler = clf_dict['scaler']
            feature_names = st.session_state.features if hasattr(st.session_state, 'features') else None

            if model_name == 'MultinomialNB':
                # MultinomialNB rejects negative inputs, so shift both splits first.
                X_train_positive = self.ensure_non_negative(X_train)
                X_test_positive = self.ensure_non_negative(X_test)
                X_train_scaled = scaler.fit_transform(X_train_positive)
                X_test_scaled = scaler.transform(X_test_positive)

                # The two splits are shifted independently, so negatives can
                # still slip through; fail fast rather than let fit() error.
                if np.any(X_train_scaled < 0) or np.any(X_test_scaled < 0):
                    raise ValueError("Negative values in scaled data")
            else:
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)

            # Record feature names on the fitted objects for later display/export.
            if feature_names is not None:
                if hasattr(clf_dict['model'], 'feature_names_in_'):
                    clf_dict['model'].feature_names_in_ = np.array(feature_names)
                scaler.feature_names_in_ = np.array(feature_names)

            clf_dict['model'].fit(X_train_scaled, y_train)
            y_pred = clf_dict['model'].predict(X_test_scaled)

            accuracy = accuracy_score(y_test, y_pred)
            training_time = time.time() - start_time

            model_path, scaler_path = self.save_model(clf_dict, model_name)
            conf_matrix = confusion_matrix(y_test, y_pred)

            return {
                'model_name': model_name,
                'accuracy': accuracy,
                'training_time': training_time,
                'model': clf_dict['model'],
                'predictions': y_pred,
                'status': 'success',
                'scaler': scaler_path,
                'model_path': model_path,
                'confusion_matrix': conf_matrix
            }
        except Exception as e:
            # Any failure is reported in-band via 'status' with zeroed metrics.
            return {
                'model_name': model_name,
                'accuracy': 0,
                'training_time': 0,
                'model': None,
                'predictions': None,
                'status': f'failed: {str(e)}',
                'scaler': None,
                'model_path': None,
                'confusion_matrix': None
            }
214
+
215
class Visualizer:
    """Builds the matplotlib/seaborn/plotly figures used by the Streamlit UI."""

    @staticmethod
    def plot_learning_curve(estimator, X, y, title, ax):
        """Plot learning curves for a model.

        Runs 5-fold cross-validated learning-curve evaluation over 10 training
        sizes and draws training / cross-validation accuracy, each with a
        +/- 1 standard-deviation band, on the given axes.
        """
        train_sizes, train_scores, test_scores = learning_curve(
            estimator, X, y,
            train_sizes=np.linspace(0.1, 1.0, 10),
            cv=5,
            n_jobs=-1,
            scoring='accuracy'
        )

        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)

        ax.plot(train_sizes, train_mean, label='Training score')
        ax.plot(train_sizes, test_mean, label='Cross-validation score')

        # Shaded +/- 1 std bands around each curve.
        ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
        ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1)

        ax.set_xlabel('Training Examples')
        ax.set_ylabel('Score')
        ax.set_title(title)
        ax.legend(loc='lower right')
        ax.grid(True)

    def create_confusion_matrices_plot(self, successful_results, y_test):
        """Create and display confusion matrices for successful models.

        Lays the heatmaps out two per row and returns the matplotlib figure.
        """
        n_models = len(successful_results)
        n_cols = 2
        n_rows = (n_models + n_cols - 1) // n_cols  # ceiling division

        fig = plt.figure(figsize=(15, 5 * n_rows))

        for idx, result in enumerate(successful_results):
            ax = plt.subplot(n_rows, n_cols, idx + 1)

            sns.heatmap(
                result['confusion_matrix'],
                annot=True,
                fmt='d',
                cmap='viridis',
                ax=ax,
                xticklabels=sorted(set(y_test)),
                yticklabels=sorted(set(y_test))
            )

            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')
            ax.set_title(f"{result['model_name']}\nAccuracy: {result['accuracy']:.4f}")

        plt.tight_layout()
        return fig

    def create_performance_summary_plot(self, successful_df, selected_models):
        """Create performance metrics summary plot.

        Melts the per-model metric columns into long form and returns a grouped
        plotly bar chart, with models ordered by their mean score across the
        four metrics.
        """
        metrics_to_compare = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
        summary_df = successful_df[successful_df['Model'].isin(selected_models)].melt(
            id_vars=['Model'],
            value_vars=metrics_to_compare,
            var_name='Metric',
            value_name='Score'
        )

        fig_summary = px.bar(
            summary_df,
            x='Model',
            y='Score',
            color='Metric',
            barmode='group',
            title="Model Performance Metrics Comparison",
            text='Score'
        )

        fig_summary.update_layout(
            xaxis_tickangle=-45,
            showlegend=True,
            height=600,
            yaxis=dict(
                range=[0, 1],
                title='Score'
            ),
            legend=dict(
                title='Metric',
                orientation='h',
                yanchor='bottom',
                y=1.02,
                xanchor='right',
                x=1
            )
        )

        fig_summary.update_traces(
            texttemplate='%{text:.4f}',
            textposition='outside',
            textangle=0
        )

        # Order the x-axis by each model's average score, best first.
        summary_df['Avg_Score'] = summary_df.groupby('Model')['Score'].transform('mean')
        models_order = summary_df.drop_duplicates('Model').sort_values('Avg_Score', ascending=False)['Model']
        fig_summary.update_layout(xaxis={'categoryorder': 'array', 'categoryarray': models_order})

        return fig_summary
325
+
326
+ class StreamlitUI:
327
    def __init__(self):
        """Wire up collaborators and seed session state with app defaults."""
        self.data_generator = DataGenerator()
        self.model_manager = ModelManager()
        self.visualizer = Visualizer()

        # Default per-class generation settings (instance attribute).
        # Each entry: {'mean': [...], 'std': [...]} aligned with default_features.
        self.default_configs = {
            # Features: [length (mm), width (mm), density (g/cm³), pH]

            # AMPALAYA: Medium length (150-180mm), thin width (40-50mm)
            # Medium density (95 g/cm³) due to hollow interior, slightly basic pH (6.8-7.0)
            "Ampalaya": {'mean': [165, 45, 95, 6.9], 'std': [15, 5, 10, 0.1]},

            # BANANA: Long length (180-220mm), medium width (30-40mm)
            # Low density (85 g/cm³), acidic pH (4.5-5.2)
            "Banana": {'mean': [200, 35, 85, 4.8], 'std': [20, 5, 8, 0.3]},

            # CABBAGE: Round shape - similar length/width (150-200mm x 150-200mm)
            # Very low density (65 g/cm³) due to layered leaves, neutral pH (6.5-7.0)
            "Cabbage": {'mean': [175, 175, 65, 6.8], 'std': [25, 25, 5, 0.2]},

            # CARROT: Medium length (140-180mm), narrow width (25-35mm)
            # High density (115 g/cm³) due to dense flesh, slightly acidic pH (6.0-6.5)
            "Carrot": {'mean': [160, 30, 115, 6.3], 'std': [20, 5, 10, 0.2]},

            # CASSAVA: Long length (200-300mm), thick width (50-80mm)
            # High density (125 g/cm³) due to starchy flesh, slightly acidic pH (6.0-6.5)
            "Cassava": {'mean': [250, 65, 125, 6.2], 'std': [50, 15, 12, 0.2]}
        }

        # Default feature names that match the measurements in default_configs
        self.default_features = [
            'length (mm)',
            'width (mm)',
            'density (g/cm³)',
            'pH'
        ]

        # Seed session state slots used by the static (matplotlib) figures.
        self.initialize_static_visualizations()

        # Seed the data-source flag; 'synthetic' is the only source visible here.
        if 'data_source' not in st.session_state:
            st.session_state.data_source = 'synthetic'
371
+
372
+ def initialize_static_visualizations(self):
373
+ """Initialize session state variables for static visualizations"""
374
+ if 'confusion_matrices_fig' not in st.session_state:
375
+ st.session_state.confusion_matrices_fig = None
376
+ if 'learning_curves_fig' not in st.session_state:
377
+ st.session_state.learning_curves_fig = None
378
+
379
+ def initialize_session_state(self):
380
+ """Initialize all session state variables"""
381
+ session_vars = {
382
+ 'data_generated': False,
383
+ 'df': None,
384
+ 'features': None,
385
+ 'feature_configs': None,
386
+ 'X_train': None,
387
+ 'X_test': None,
388
+ 'y_train': None,
389
+ 'y_test': None,
390
+ 'y_pred': None,
391
+ 'model_results': None,
392
+ 'best_model': None,
393
+ 'accuracy': None,
394
+ 'feature_importance': None,
395
+ 'split_info': None
396
+ }
397
+
398
+ for var, value in session_vars.items():
399
+ if var not in st.session_state:
400
+ st.session_state[var] = value
401
+
402
    def setup_page_config(self):
        """Configure the Streamlit page: title, icon, wide layout, About menu."""
        st.set_page_config(
            page_title="ML Model Generator & Implementation",
            page_icon="🤖",
            layout="wide",
            menu_items={
                'About': """
                ## Final project in Modeling and Simulation \n
                ### Juan Dela Cruz - BSCS 4A"""
            }
        )
414
+
415
+ def get_sidebar_inputs(self):
416
+ """Get all inputs from the sidebar"""
417
+ st.sidebar.header("Data Generation Parameters")
418
+
419
+ # Feature configuration
420
+ st.sidebar.subheader("Feature Configuration")
421
+
422
+ # Initialize default features if not in session state
423
+ if 'features_input' not in st.session_state:
424
+ st.session_state.features_input = ", ".join(self.default_features)
425
+
426
+ features_input = st.sidebar.text_input(
427
+ "Enter feature names (comma-separated)",
428
+ key='features_input'
429
+ )
430
+ features = [f.strip() for f in features_input.split(",")]
431
+
432
+ # Initialize default classes if not in session state
433
+ if 'classes_input' not in st.session_state:
434
+ st.session_state.classes_input = ", ".join(self.default_configs.keys())
435
+
436
+ classes_input = st.sidebar.text_input(
437
+ "Enter class names (comma-separated)",
438
+ key='classes_input'
439
+ )
440
+ classes = [c.strip() for c in classes_input.split(",")]
441
+
442
+ # Generate feature configs
443
+ feature_configs = {}
444
+ for feature in features:
445
+ feature_configs[feature] = {
446
+ 'type': 'random',
447
+ 'std': 20.0,
448
+ 'center': None
449
+ }
450
+
451
+ return features, feature_configs, classes
452
+
453
+ def get_class_configs(self, classes, features):
454
+ """Get class-specific configurations from the sidebar"""
455
+ class_configs = {}
456
+ st.sidebar.subheader("Class-Specific Settings")
457
+
458
+ for class_name in classes:
459
+ with st.sidebar.expander(f"{class_name} Settings", expanded=False):
460
+ checkbox_key = f"use_specific_{class_name}"
461
+
462
+ # Initialize checkbox state if not in session state
463
+ if checkbox_key not in st.session_state:
464
+ st.session_state[checkbox_key] = True
465
+
466
+ use_specific = st.checkbox(
467
+ f"Set specific values for {class_name}",
468
+ key=checkbox_key
469
+ )
470
+
471
+ means = []
472
+ stds = []
473
+
474
+ # Generate unique means for each class if not in default configs
475
+ if class_name not in self.default_configs:
476
+ # Generate random means between 0-100 that are different from other classes
477
+ random_means = []
478
+ for _ in range(len(features)):
479
+ mean = np.random.uniform(0, 100)
480
+ # Ensure means are unique across classes
481
+ while any(abs(mean - c['mean'][_]) < 10 for c in class_configs.values() if 'mean' in c):
482
+ mean = np.random.uniform(0, 100)
483
+ random_means.append(mean)
484
+ default_values = {'mean': random_means, 'std': [20.0] * len(features)}
485
+ else:
486
+ # Ensure default values match the number of features
487
+ default_means = self.default_configs[class_name]['mean']
488
+ default_stds = self.default_configs[class_name]['std']
489
+
490
+ # If we have more features than default values, extend with random values
491
+ if len(features) > len(default_means):
492
+ additional_means = [np.random.uniform(0, 100) for _ in range(len(features) - len(default_means))]
493
+ additional_stds = [20.0 for _ in range(len(features) - len(default_stds))]
494
+ default_means.extend(additional_means)
495
+ default_stds.extend(additional_stds)
496
+ # If we have fewer features than default values, truncate
497
+ elif len(features) < len(default_means):
498
+ default_means = default_means[:len(features)]
499
+ default_stds = default_stds[:len(features)]
500
+
501
+ default_values = {'mean': default_means, 'std': default_stds}
502
+
503
+ if use_specific:
504
+ for idx, feature in enumerate(features):
505
+ mean_key = f"mean_{class_name}_{feature}"
506
+ std_key = f"std_{class_name}_{feature}"
507
+
508
+ if mean_key not in st.session_state:
509
+ st.session_state[mean_key] = float(default_values['mean'][idx])
510
+ if std_key not in st.session_state:
511
+ st.session_state[std_key] = float(default_values['std'][idx])
512
+
513
+ col1, col2 = st.columns(2)
514
+ with col1:
515
+ mean = st.number_input(
516
+ f"Mean for {feature}",
517
+ key=mean_key
518
+ )
519
+ means.append(mean)
520
+ with col2:
521
+ std = st.number_input(
522
+ f"Std Dev for {feature}",
523
+ min_value=0.1,
524
+ key=std_key
525
+ )
526
+ stds.append(std)
527
+ else:
528
+ # Use default values if specific values not requested
529
+ means = default_values['mean']
530
+ stds = default_values['std']
531
+
532
+ class_configs[class_name] = {
533
+ 'mean': means,
534
+ 'std': stds
535
+ }
536
+
537
+ return class_configs
538
+
539
+ def get_training_params(self):
540
+ """Get training parameters from the sidebar"""
541
+ st.sidebar.subheader("Sample Size & Train/Test Split Configuration")
542
+
543
+ # Initialize default values if not in session state
544
+ if 'n_samples' not in st.session_state:
545
+ st.session_state.n_samples = 10000
546
+
547
+ col1, col2 = st.sidebar.columns(2)
548
+
549
+ with col1:
550
+ n_samples = st.slider(
551
+ "Number of samples",
552
+ 500,
553
+ 50000,
554
+ step=500,
555
+ key='n_samples'
556
+ )
557
+
558
+ with col2:
559
+ test_size = st.slider(
560
+ "Test Size",
561
+ min_value=10,
562
+ max_value=50,
563
+ value=30, # Default value directly in the widget
564
+ step=5,
565
+ key='test_size',
566
+ format="%d%%",
567
+ help="Percentage of data to use for testing"
568
+ )
569
+ st.write(f"Test: {test_size}% / Train: {100 - test_size}%")
570
+
571
+ return n_samples, test_size
572
+
573
    def generate_and_train(self, n_samples, feature_configs, classes, class_configs, test_size):
        """Generate data and train models.

        End-to-end pipeline: synthesize a dataset, split it, train every
        classifier from ModelManager.get_classifiers() with a progress bar,
        record results and the best model in session state, and pre-render the
        static confusion-matrix and learning-curve figures.

        Args:
            n_samples: Total number of synthetic samples to generate.
            feature_configs: Per-feature generation settings.
            classes: List of class labels.
            class_configs: Per-class mean/std overrides (may be None).
            test_size: Test split size as a whole-number percentage (e.g. 30).
        """
        X, y = self.data_generator.generate_synthetic_data(
            n_samples,
            feature_configs,
            classes,
            class_configs
        )

        # NOTE(review): assumes st.session_state.features was set by the caller
        # before this runs — confirm against the app's main flow.
        st.session_state.df = pd.DataFrame(X, columns=st.session_state.features)
        st.session_state.df['target'] = y

        # Train test split (test_size arrives as a percentage, hence /100).
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=test_size/100,
            random_state=42
        )

        # Store split data
        st.session_state.X_train = X_train
        st.session_state.X_test = X_test
        st.session_state.y_train = y_train
        st.session_state.y_test = y_test

        # Get classifiers and train models
        classifiers = self.model_manager.get_classifiers()
        results = []

        with st.spinner('Training models... Please wait.'):
            progress_bar = st.progress(0)
            for idx, (name, clf_dict) in enumerate(classifiers.items()):
                result = self.model_manager.train_and_evaluate_model(
                    clf_dict,
                    X_train,
                    X_test,
                    y_train,
                    y_test,
                    name
                )
                results.append(result)
                progress_bar.progress((idx + 1) / len(classifiers))

        st.session_state.model_results = results
        st.session_state.data_generated = True

        # Find best model (highest accuracy among the runs that succeeded)
        successful_results = [r for r in results if r['status'] == 'success']
        if successful_results:
            best_model = max(successful_results, key=lambda x: x['accuracy'])
            st.session_state.best_model = best_model

        # Store split information
        st.session_state.split_info = {
            'total_samples': len(X),
            'train_samples': len(X_train),
            'test_samples': len(X_test),
            'test_percentage': test_size
        }
        st.session_state.feature_configs = feature_configs

        # Generate static visualizations after training
        successful_results = [r for r in st.session_state.model_results if r['status'] == 'success']
        if successful_results:
            # Generate and store confusion matrices
            st.session_state.confusion_matrices_fig = self.visualizer.create_confusion_matrices_plot(
                successful_results,
                st.session_state.y_test
            )

            # Generate and store learning curves
            st.session_state.learning_curves_fig = self.generate_learning_curves_figure(successful_results)
645
+
646
+ def generate_learning_curves_figure(self, successful_results):
647
+ """Generate learning curves figure"""
648
+ successful_results.sort(key=lambda x: x['accuracy'], reverse=True)
649
+ n_models = len(successful_results)
650
+ n_cols = 2
651
+ n_rows = (n_models + n_cols - 1) // n_cols
652
+
653
+ fig_learning = plt.figure(figsize=(15, 5 * n_rows))
654
+
655
+ for idx, result in enumerate(successful_results):
656
+ ax = plt.subplot(n_rows, n_cols, idx + 1)
657
+
658
+ model_name = result['model_name']
659
+ model = result['model']
660
+ scaler = joblib.load(result['scaler'])
661
+
662
+ if model_name == 'MultinomialNB':
663
+ X_scaled = self.model_manager.ensure_non_negative(
664
+ st.session_state.df.drop('target', axis=1)
665
+ )
666
+ X_scaled = scaler.transform(X_scaled)
667
+ else:
668
+ X_scaled = scaler.transform(st.session_state.df.drop('target', axis=1))
669
+
670
+ y = st.session_state.df['target']
671
+
672
+ self.visualizer.plot_learning_curve(
673
+ model,
674
+ X_scaled,
675
+ y,
676
+ f'Learning Curve - {model_name}\nFinal Accuracy: {result["accuracy"]:.4f}',
677
+ ax
678
+ )
679
+
680
+ plt.tight_layout()
681
+ return fig_learning
682
+
683
    def display_model_comparison(self):
        """Display model comparison section.

        Builds one row per trained model (macro-averaged precision/recall/F1
        from the classification report; zeros for failed runs), renders the
        table sorted by accuracy, and returns the underlying DataFrame.
        """
        st.subheader("Model Comparison")

        comparison_data = []
        for result in st.session_state.model_results:
            if result['status'] == 'success':
                report_dict = classification_report(
                    st.session_state.y_test,
                    result['predictions'],
                    output_dict=True
                )

                macro_avg = report_dict['macro avg']

                # float(f"...") rounds to 4 decimals so sorting matches display.
                comparison_data.append({
                    'Model': result['model_name'],
                    'Accuracy': float(f"{result['accuracy']:.4f}"),
                    'Precision': float(f"{macro_avg['precision']:.4f}"),
                    'Recall': float(f"{macro_avg['recall']:.4f}"),
                    'F1-Score': float(f"{macro_avg['f1-score']:.4f}"),
                    'Training Time (s)': float(f"{result['training_time']:.3f}"),
                    'Status': 'Success'
                })
            else:
                # Failed runs keep their error message in the Status column.
                comparison_data.append({
                    'Model': result['model_name'],
                    'Accuracy': 0,
                    'Precision': 0,
                    'Recall': 0,
                    'F1-Score': 0,
                    'Training Time (s)': 0,
                    'Status': result['status']
                })

        comparison_df = pd.DataFrame(comparison_data)
        comparison_df = comparison_df.sort_values('Accuracy', ascending=False)

        st.dataframe(comparison_df.style.format({
            'Accuracy': '{:.4f}',
            'Precision': '{:.4f}',
            'Recall': '{:.4f}',
            'F1-Score': '{:.4f}',
            'Training Time (s)': '{:.3f}'
        }))

        return comparison_df
730
+
731
+ def display_metric_visualization(self, comparison_df):
732
+ """Display metric visualization section"""
733
+ metric_to_plot = st.selectbox(
734
+ "Select metric to visualize",
735
+ ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Training Time (s)']
736
+ )
737
+
738
+ successful_df = comparison_df[comparison_df['Status'] == 'Success']
739
+
740
+ if metric_to_plot == 'Training Time (s)':
741
+ successful_df = successful_df.sort_values(metric_to_plot)
742
+ else:
743
+ successful_df = successful_df.sort_values(metric_to_plot, ascending=False)
744
+
745
+ fig_comparison = px.bar(
746
+ successful_df,
747
+ x='Model',
748
+ y=metric_to_plot,
749
+ title=f"Model {metric_to_plot} Comparison",
750
+ color=metric_to_plot,
751
+ text=metric_to_plot
752
+ )
753
+
754
+ fig_comparison.update_layout(
755
+ xaxis_tickangle=-45,
756
+ showlegend=True,
757
+ height=500,
758
+ yaxis=dict(
759
+ range=[0, 1] if metric_to_plot != 'Training Time (s)' else None
760
+ )
761
+ )
762
+
763
+ fig_comparison.update_traces(
764
+ texttemplate='%{text:.4f}',
765
+ textposition='outside',
766
+ textangle=0
767
+ )
768
+
769
+ st.plotly_chart(fig_comparison)
770
+ return successful_df
771
+
772
+ def display_best_model_performance(self):
773
+ """Display best model performance section"""
774
+ if hasattr(st.session_state, 'best_model'):
775
+ st.subheader("Best Model Performance")
776
+ best_model = st.session_state.best_model
777
+ st.write(f"Best Model: **{best_model['model_name']}**")
778
+ st.write(f"Accuracy: {best_model['accuracy']:.4f}")
779
+
780
+ st.write("Classification Report (Best Model):")
781
+ report_dict = classification_report(
782
+ st.session_state.y_test,
783
+ best_model['predictions'],
784
+ output_dict=True
785
+ )
786
+ report_df = pd.DataFrame(report_dict).transpose()
787
+ st.dataframe(report_df.style.format('{:.4f}'))
788
+
789
+ def display_dataset_info(self):
790
+ """Display dataset split information"""
791
+ if st.session_state.split_info:
792
+ st.subheader("Dataset Split Information")
793
+ col1, col2, col3 = st.columns(3)
794
+
795
+ with col1:
796
+ st.metric(
797
+ "Total Samples",
798
+ st.session_state.split_info['total_samples']
799
+ )
800
+
801
+ with col2:
802
+ st.metric(
803
+ "Training Samples",
804
+ f"{st.session_state.split_info['train_samples']} "
805
+ f"({100 - st.session_state.split_info['test_percentage']}%)"
806
+ )
807
+
808
+ with col3:
809
+ st.metric(
810
+ "Testing Samples",
811
+ f"{st.session_state.split_info['test_samples']} "
812
+ f"({st.session_state.split_info['test_percentage']}%)"
813
+ )
814
+
815
+ def display_feature_configs(self):
816
+ """Display feature configurations"""
817
+ st.subheader("Feature Configurations")
818
+ config_data = []
819
+ for feature, config in st.session_state.feature_configs.items():
820
+ config_data.append({
821
+ 'Feature': feature,
822
+ 'Type': config['type'],
823
+ 'Std Dev': config['std'],
824
+ 'Center': config['center'] if config['type'] == 'user-defined' else 'Random'
825
+ })
826
+ st.table(pd.DataFrame(config_data))
827
+
828
    def display_data_samples(self):
        """Display original and scaled data samples.

        Shows a shuffled draw of up to two rows per class side-by-side with the
        same rows transformed by the best model's fitted scaler.
        """
        st.subheader("Generated Data Sample")

        # Get random samples from each class
        unique_classes = st.session_state.df['target'].unique()
        samples_per_class = 2  # Number of samples to show per class

        sampled_data = []
        for class_name in unique_classes:
            class_data = st.session_state.df[st.session_state.df['target'] == class_name]
            # min() guards against classes with fewer rows than requested.
            sampled_data.append(class_data.sample(n=min(samples_per_class, len(class_data))))

        # sample(frac=1) shuffles the concatenated rows before display.
        sampled_df = pd.concat(sampled_data).sample(frac=1).reset_index(drop=True)

        col1, col2 = st.columns(2)

        with col1:
            st.write("Original Data (Random samples from each class):")
            st.write(sampled_df)

        with col2:
            st.write("Scaled Data (using best model's scaler):")
            if st.session_state.best_model and st.session_state.best_model['status'] == 'success':
                best_model_name = st.session_state.best_model['model_name']
                # 'scaler' holds a file path; load the fitted scaler from disk.
                scaler = joblib.load(st.session_state.best_model['scaler'])

                features_df = sampled_df.drop('target', axis=1)

                # MultinomialNB was trained on shifted (non-negative) data, so
                # apply the same shift before its scaler.
                if best_model_name == 'MultinomialNB':
                    features_scaled = self.model_manager.ensure_non_negative(features_df)
                    features_scaled = scaler.transform(features_scaled)
                else:
                    features_scaled = scaler.transform(features_df)

                scaled_df = pd.DataFrame(
                    features_scaled,
                    columns=features_df.columns,
                    index=features_df.index
                )
                scaled_df['target'] = sampled_df['target']

                st.write(scaled_df)
            else:
                st.write("No scaled data available (best model not found)")
873
+
874
+ def display_confusion_matrices(self):
875
+ """Display confusion matrices section"""
876
+ st.subheader("Confusion Matrices")
877
+ st.write("""
878
+ Confusion matrices show the model's prediction performance across different classes.
879
+ - Each row represents the actual class
880
+ - Each column represents the predicted class
881
+ - Diagonal elements represent correct predictions (True Positives for each class)
882
+ - Off-diagonal elements represent incorrect predictions
883
+ - Numbers show how many samples were classified for each combination
884
+ - Colors range from yellow (high values) to green-blue (low values) using the viridis colormap
885
+ """)
886
+ if st.session_state.confusion_matrices_fig is not None:
887
+ st.pyplot(st.session_state.confusion_matrices_fig)
888
+ plt.close()
889
+
890
+
891
+
892
+ def display_performance_summary(self, successful_df):
893
+ """Display performance metrics summary"""
894
+ st.subheader("Performance Metrics Summary")
895
+
896
+ all_models = successful_df['Model'].unique().tolist()
897
+ default_selection = all_models
898
+
899
+ col1, col2 = st.columns([3, 1])
900
+ with col1:
901
+ selected_models = st.multiselect(
902
+ "Select models to compare",
903
+ all_models,
904
+ default=default_selection
905
+ )
906
+
907
+ if not selected_models:
908
+ st.warning("Please select at least one model to display the comparison.")
909
+ return
910
+
911
+ fig_summary = self.visualizer.create_performance_summary_plot(
912
+ successful_df,
913
+ selected_models
914
+ )
915
+ st.plotly_chart(fig_summary, use_container_width=True)
916
+
917
+ def display_saved_models(self):
918
+ """Display saved models information and download buttons"""
919
+ st.subheader("Saved Models")
920
+ saved_models = []
921
+
922
+ for result in st.session_state.model_results:
923
+ if result['status'] == 'success' and result['model_path']:
924
+ # Load model and scaler
925
+ model = joblib.load(result['model_path'])
926
+ scaler = joblib.load(result['scaler'])
927
+
928
+ # Create binary data for download using pickle
929
+ model_bytes = pickle.dumps(model)
930
+ scaler_bytes = pickle.dumps(scaler)
931
+
932
+ saved_models.append({
933
+ 'Model': result['model_name'],
934
+ 'Accuracy': result['accuracy'],
935
+ 'Model_Binary': model_bytes,
936
+ 'Scaler_Binary': scaler_bytes
937
+ })
938
+
939
+ if saved_models:
940
+ # Display models table
941
+ display_df = pd.DataFrame([{
942
+ 'Model': m['Model'],
943
+ 'Accuracy': m['Accuracy']
944
+ } for m in saved_models])
945
+
946
+ st.dataframe(display_df.style.format({
947
+ 'Accuracy': '{:.4f}'
948
+ }))
949
+
950
+ # Add download buttons for each model
951
+ st.write("Download Models:")
952
+ for model_data in saved_models:
953
+ col1, col2 = st.columns(2)
954
+
955
+ with col1:
956
+ st.download_button(
957
+ label=f"Download {model_data['Model']} Model",
958
+ data=model_data['Model_Binary'],
959
+ file_name=f"{model_data['Model']}_model.pkl",
960
+ mime="application/octet-stream"
961
+ )
962
+
963
+ with col2:
964
+ st.download_button(
965
+ label=f"Download {model_data['Model']} Scaler",
966
+ data=model_data['Scaler_Binary'],
967
+ file_name=f"{model_data['Model']}_scaler.pkl",
968
+ mime="application/octet-stream"
969
+ )
970
+ else:
971
+ st.info("No models were saved. Models are saved automatically when accuracy exceeds 0.5")
972
+
973
+ def display_download_section(self):
974
+ """Display dataset download section"""
975
+ st.subheader("Download Dataset")
976
+ col1, col2 = st.columns(2)
977
+
978
+ with col1:
979
+ if st.session_state.df is not None:
980
+ csv = st.session_state.df.to_csv(index=False)
981
+ st.download_button(
982
+ label="Download Original Dataset (CSV)",
983
+ data=csv,
984
+ file_name=f"synthetic_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
985
+ mime='text/csv',
986
+ help="Download the original unscaled dataset"
987
+ )
988
+
989
+ with col2:
990
+ if st.session_state.best_model and st.session_state.best_model['status'] == 'success':
991
+ best_model_name = st.session_state.best_model['model_name']
992
+ scaler = joblib.load(st.session_state.best_model['scaler'])
993
+
994
+ features_df = st.session_state.df.drop('target', axis=1)
995
+ if best_model_name == 'MultinomialNB':
996
+ features_scaled = self.model_manager.ensure_non_negative(features_df)
997
+ features_scaled = scaler.transform(features_scaled)
998
+ else:
999
+ features_scaled = scaler.transform(features_df)
1000
+
1001
+ scaled_df = pd.DataFrame(
1002
+ features_scaled,
1003
+ columns=features_df.columns,
1004
+ index=features_df.index
1005
+ )
1006
+ scaled_df['target'] = st.session_state.df['target']
1007
+
1008
+ csv_scaled = scaled_df.to_csv(index=False)
1009
+ st.download_button(
1010
+ label="Download Scaled Dataset (CSV)",
1011
+ data=csv_scaled,
1012
+ file_name=f"synthetic_data_scaled_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
1013
+ mime='text/csv',
1014
+ help="Download the scaled dataset (using best model's scaler)"
1015
+ )
1016
+
1017
+ def display_dataset_statistics(self):
1018
+ """Display dataset statistics"""
1019
+ with st.expander("Dataset Statistics"):
1020
+ col1, col2 = st.columns(2)
1021
+
1022
+ with col1:
1023
+ st.write("Original Dataset Statistics:")
1024
+ st.write(st.session_state.df.describe())
1025
+
1026
+ with col2:
1027
+ if st.session_state.best_model and st.session_state.best_model['status'] == 'success':
1028
+ st.write("Scaled Dataset Statistics:")
1029
+ best_model_name = st.session_state.best_model['model_name']
1030
+ scaler = joblib.load(st.session_state.best_model['scaler'])
1031
+
1032
+ features_df = st.session_state.df.drop('target', axis=1)
1033
+ if best_model_name == 'MultinomialNB':
1034
+ features_scaled = self.model_manager.ensure_non_negative(features_df)
1035
+ features_scaled = scaler.transform(features_scaled)
1036
+ else:
1037
+ features_scaled = scaler.transform(features_df)
1038
+
1039
+ scaled_df = pd.DataFrame(
1040
+ features_scaled,
1041
+ columns=features_df.columns,
1042
+ index=features_df.index
1043
+ )
1044
+ scaled_df['target'] = st.session_state.df['target']
1045
+ st.write(scaled_df.describe())
1046
+
1047
+ def display_learning_curves(self):
1048
+ """Display learning curves section"""
1049
+ st.subheader("Learning Curves")
1050
+ st.write("""
1051
+ Learning curves show how model performance changes with increasing training data.
1052
+ - Blue line: Training score
1053
+ - Orange line: Cross-validation score
1054
+ - Shaded areas represent standard deviation
1055
+ """)
1056
+
1057
+ if st.session_state.learning_curves_fig is not None:
1058
+ st.pyplot(st.session_state.learning_curves_fig)
1059
+ plt.close()
1060
+
1061
+ def display_feature_visualization(self):
1062
+ """Display 2D and 3D feature visualizations"""
1063
+ st.subheader("Feature Visualization")
1064
+ plot_type = st.radio("Select plot type", ["2D Plot", "3D Plot"], index=1)
1065
+
1066
+ if plot_type == "2D Plot":
1067
+ col1, col2 = st.columns(2)
1068
+
1069
+ with col1:
1070
+ x_feature = st.selectbox(
1071
+ "Select X-axis feature",
1072
+ st.session_state.features,
1073
+ index=0,
1074
+ key='x_2d'
1075
+ )
1076
+
1077
+ with col2:
1078
+ y_features = [f for f in st.session_state.features if f != x_feature]
1079
+ y_feature = st.selectbox(
1080
+ "Select Y-axis feature",
1081
+ y_features,
1082
+ index=0,
1083
+ key='y_2d'
1084
+ )
1085
+
1086
+ fig = px.scatter(
1087
+ st.session_state.df,
1088
+ x=x_feature,
1089
+ y=y_feature,
1090
+ color='target',
1091
+ title=f"2D Visualization of {x_feature} vs {y_feature}",
1092
+ labels={'target': 'Class'}
1093
+ )
1094
+
1095
+ st.plotly_chart(fig, use_container_width=True)
1096
+
1097
+ else: # 3D Plot
1098
+ col1, col2, col3 = st.columns(3)
1099
+
1100
+ with col1:
1101
+ x_feature = st.selectbox(
1102
+ "Select X-axis feature",
1103
+ st.session_state.features,
1104
+ index=0,
1105
+ key='x_3d'
1106
+ )
1107
+
1108
+ with col2:
1109
+ y_features = [f for f in st.session_state.features if f != x_feature]
1110
+ y_feature = st.selectbox(
1111
+ "Select Y-axis feature",
1112
+ y_features,
1113
+ index=0,
1114
+ key='y_3d'
1115
+ )
1116
+
1117
+ with col3:
1118
+ z_features = [f for f in st.session_state.features if f not in [x_feature, y_feature]]
1119
+ z_feature = st.selectbox(
1120
+ "Select Z-axis feature",
1121
+ z_features,
1122
+ index=0,
1123
+ key='z_3d'
1124
+ )
1125
+
1126
+ fig = px.scatter_3d(
1127
+ st.session_state.df,
1128
+ x=x_feature,
1129
+ y=y_feature,
1130
+ z=z_feature,
1131
+ color='target',
1132
+ title=f"3D Visualization of {x_feature} vs {y_feature} vs {z_feature}",
1133
+ labels={'target': 'Class'}
1134
+ )
1135
+
1136
+ fig.update_layout(
1137
+ scene = dict(
1138
+ xaxis_title=x_feature,
1139
+ yaxis_title=y_feature,
1140
+ zaxis_title=z_feature
1141
+ ),
1142
+ scene_camera=dict(
1143
+ up=dict(x=0, y=0, z=1),
1144
+ center=dict(x=0, y=0, z=0),
1145
+ eye=dict(x=1.5, y=1.5, z=1.5)
1146
+ )
1147
+ )
1148
+
1149
+ st.plotly_chart(fig, use_container_width=True)
1150
+
1151
+ def get_data_source(self):
1152
+ """Get user's choice of data source"""
1153
+ st.sidebar.header("Data Source")
1154
+ data_source = st.sidebar.radio(
1155
+ "Choose data source",
1156
+ ['Generate Synthetic Data', 'Upload Dataset'],
1157
+ key='data_source_radio'
1158
+ )
1159
+ st.session_state.data_source = 'synthetic' if data_source == 'Generate Synthetic Data' else 'upload'
1160
+ return st.session_state.data_source
1161
+
1162
+ def upload_dataset(self):
1163
+ """Handle dataset upload"""
1164
+ st.sidebar.header("Upload Dataset")
1165
+ uploaded_file = st.sidebar.file_uploader(
1166
+ "Choose a CSV file",
1167
+ type="csv",
1168
+ help="Upload a CSV file with features and target column"
1169
+ )
1170
+
1171
+ if uploaded_file is not None:
1172
+ try:
1173
+ df = pd.read_csv(uploaded_file)
1174
+
1175
+ # Let user select target column
1176
+ target_col = st.sidebar.selectbox(
1177
+ "Select target column",
1178
+ df.columns.tolist()
1179
+ )
1180
+
1181
+ # Store features and target
1182
+ features = [col for col in df.columns if col != target_col]
1183
+ X = df[features]
1184
+ y = df[target_col]
1185
+
1186
+ # Store in session state
1187
+ st.session_state.df = df
1188
+ st.session_state.features = features
1189
+
1190
+ # Train test split
1191
+ test_size = st.sidebar.slider(
1192
+ "Test Size",
1193
+ min_value=10,
1194
+ max_value=50,
1195
+ value=30,
1196
+ step=5,
1197
+ format="%d%%",
1198
+ help="Percentage of data to use for testing"
1199
+ )
1200
+
1201
+ X_train, X_test, y_train, y_test = train_test_split(
1202
+ X, y,
1203
+ test_size=test_size/100,
1204
+ random_state=42
1205
+ )
1206
+
1207
+ # Store split data
1208
+ st.session_state.X_train = X_train
1209
+ st.session_state.X_test = X_test
1210
+ st.session_state.y_train = y_train
1211
+ st.session_state.y_test = y_test
1212
+
1213
+ # Store split information
1214
+ st.session_state.split_info = {
1215
+ 'total_samples': len(X),
1216
+ 'train_samples': len(X_train),
1217
+ 'test_samples': len(X_test),
1218
+ 'test_percentage': test_size
1219
+ }
1220
+
1221
+ return True
1222
+ except Exception as e:
1223
+ st.sidebar.error(f"Error loading dataset: {str(e)}")
1224
+ return False
1225
+ return False
1226
+
1227
+ def run(self):
1228
+ """Main application logic"""
1229
+ self.setup_page_config()
1230
+ self.initialize_session_state()
1231
+
1232
+ st.title("ML Model Generator")
1233
+
1234
+ # Get data source choice
1235
+ data_source = self.get_data_source()
1236
+
1237
+ if data_source == 'synthetic':
1238
+ st.sidebar.header("Synthetic Data Generation")
1239
+ # Get inputs from sidebar for synthetic data
1240
+ features, feature_configs, classes = self.get_sidebar_inputs()
1241
+ class_configs = self.get_class_configs(classes, features)
1242
+ n_samples, test_size = self.get_training_params()
1243
+
1244
+ # Store features in session state
1245
+ st.session_state.features = features
1246
+
1247
+ # Generate Data button
1248
+ if st.sidebar.button("Generate Data and Train Models"):
1249
+ self.generate_and_train(n_samples, feature_configs, classes, class_configs, test_size)
1250
+
1251
+ else: # upload
1252
+ # Handle dataset upload
1253
+ if self.upload_dataset():
1254
+ if st.sidebar.button("Train Models"):
1255
+ # Get classifiers and train models
1256
+ classifiers = self.model_manager.get_classifiers()
1257
+ results = []
1258
+
1259
+ with st.spinner('Training models... Please wait.'):
1260
+ progress_bar = st.progress(0)
1261
+ for idx, (name, clf_dict) in enumerate(classifiers.items()):
1262
+ result = self.model_manager.train_and_evaluate_model(
1263
+ clf_dict,
1264
+ st.session_state.X_train,
1265
+ st.session_state.X_test,
1266
+ st.session_state.y_train,
1267
+ st.session_state.y_test,
1268
+ name
1269
+ )
1270
+ results.append(result)
1271
+ progress_bar.progress((idx + 1) / len(classifiers))
1272
+
1273
+ st.session_state.model_results = results
1274
+ st.session_state.data_generated = True
1275
+
1276
+ # Find best model
1277
+ successful_results = [r for r in results if r['status'] == 'success']
1278
+ if successful_results:
1279
+ best_model = max(successful_results, key=lambda x: x['accuracy'])
1280
+ st.session_state.best_model = best_model
1281
+
1282
+ # Generate static visualizations
1283
+ st.session_state.confusion_matrices_fig = self.visualizer.create_confusion_matrices_plot(
1284
+ successful_results,
1285
+ st.session_state.y_test
1286
+ )
1287
+ st.session_state.learning_curves_fig = self.generate_learning_curves_figure(successful_results)
1288
+
1289
+ # Display results if data has been generated/uploaded and trained
1290
+ if st.session_state.data_generated:
1291
+ self.display_dataset_info()
1292
+ self.display_data_samples()
1293
+ self.display_feature_visualization()
1294
+ self.display_download_section()
1295
+ self.display_dataset_statistics()
1296
+ self.display_best_model_performance()
1297
+ successful_df = self.display_model_comparison()
1298
+
1299
+ if successful_df is not None and not successful_df.empty:
1300
+ self.display_performance_summary(successful_df)
1301
+ self.display_saved_models()
1302
+ self.display_learning_curves()
1303
+ self.display_confusion_matrices()
1304
+ else:
1305
+ if data_source == 'synthetic':
1306
+ st.info("Please generate data using the sidebar button to view visualizations and results.")
1307
+ else:
1308
+ st.info("Please upload a dataset and click 'Train Models' to view visualizations and results.")
1309
+
1310
+ def main():
1311
+ app = StreamlitUI()
1312
+ app.run()
1313
+
1314
+ if __name__ == "__main__":
1315
+ main()
1316
+
README.md CHANGED
@@ -1,13 +1,102 @@
1
- ---
2
- title: Synthetic Data Generation
3
- emoji: 📚
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: streamlit
7
- sdk_version: 1.44.1
8
- app_file: app.py
9
- pinned: false
10
- short_description: synthetic data generation, modeling and integration
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Synthetic Data Generation and ML Model Training
2
+
3
+ A comprehensive Streamlit application for generating synthetic data, training machine learning models, and educational visualization of algorithm performance.
4
+
5
+ ## Live Demo
6
+
7
+ **[Try the application online!](https://projectsyntheticdatageneration.streamlit.app/)**
8
+
9
+ ## Overview
10
+
11
+ This application provides an end-to-end platform for:
12
+ 1. Generating customizable synthetic datasets
13
+ 2. Training and evaluating multiple machine learning classifiers
14
+ 3. Visualizing model performance and data characteristics
15
+ 4. Learning about different ML algorithms through interactive education
16
+ 5. Implementing and testing trained models
17
+
18
+ ## Features
19
+
20
+ ### Main App (`App.py`)
21
+ - Synthetic data generation with customizable feature distributions
22
+ - Support for multiple classifier algorithms with automatic preprocessing
23
+ - Real-time visualization of model performance metrics
24
+ - Model comparison and selection
25
+ - Dataset exploration and visualization tools
26
+ - Model saving and exporting functionality
27
+
28
+ ### Algorithm Education (`pages/02_Algorithm_Education.py`)
29
+ - Detailed explanations of various ML classification algorithms
30
+ - Interactive demonstrations with customizable parameters
31
+ - Mathematical foundations and implementation details
32
+ - Algorithm strengths, limitations, and use cases
33
+ - Performance visualization across different data distributions
34
+
35
+ ### Model Implementation (`pages/03_Model_Implementation.py`)
36
+ - Upload and use previously trained models
37
+ - Real-time prediction with custom input values
38
+ - Model and scaler integration
39
+
40
+ ## Installation
41
+
42
+ ```bash
43
+ # Clone the repository
44
+ git clone https://github.com/yourusername/synthetic_data_generation.git
45
+ cd synthetic_data_generation
46
+
47
+ # Install dependencies
48
+ pip install -r requirements.txt
49
+
50
+ # Run the application
51
+ streamlit run App.py
52
+ ```
53
+
54
+ ## Requirements
55
+
56
+ - Python 3.8+ (required by streamlit>=1.28)
57
+ - streamlit>=1.28.0
58
+ - numpy>=1.24.0
59
+ - pandas>=2.0.0
60
+ - scikit-learn>=1.2.0
61
+ - plotly>=5.13.0
62
+ - seaborn>=0.12.0
63
+ - matplotlib>=3.7.0
64
+ - joblib>=1.2.0
65
+
66
+ ## Usage
67
+
68
+ ### Generating Synthetic Data
69
+ 1. Define features and their distributions
70
+ 2. Configure class characteristics
71
+ 3. Set sample size and other generation parameters
72
+ 4. Generate and explore your synthetic dataset
73
+
74
+ ### Training Models
75
+ 1. Select classifier algorithms to evaluate
76
+ 2. Configure training parameters (test split, etc.)
77
+ 3. Train models and view performance metrics
78
+ 4. Compare model results through interactive visualizations
79
+
80
+ ### Educational Resources
81
+ 1. Navigate to the Algorithm Education page
82
+ 2. Select an algorithm to learn about
83
+ 3. Interact with the demo to see how parameters affect performance
84
+ 4. Examine mathematical foundations and implementation details
85
+
86
+ ### Model Implementation
87
+ 1. Upload previously saved model and scaler files
88
+ 2. Input feature values or generate random test values
89
+ 3. Make predictions and view results
90
+
91
+ ## Project Structure
92
+
93
+ ```
94
+ synthetic_data_generation/
95
+ ├── App.py # Main application
96
+ ├── models/ # Directory for saved models
97
+ ├── pages/ # Additional application pages
98
+ │ ├── 02_Algorithm_Education.py # Educational content about ML algorithms
99
+ │ └── 03_Model_Implementation.py # Model deployment and usage interface
100
+ ├── temp_uploads/ # Temporary directory for file uploads
101
+ └── requirements.txt # Project dependencies
102
+ ```
pages/02_Algorithm_Education.py ADDED
@@ -0,0 +1,1250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ from sklearn.naive_bayes import GaussianNB
4
+ from sklearn.svm import LinearSVC, SVC
5
+ from sklearn.neural_network import MLPClassifier
6
+ from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
7
+ from sklearn.neighbors import KNeighborsClassifier
8
+ from sklearn.linear_model import RidgeClassifier
9
+ from sklearn.naive_bayes import MultinomialNB
10
+ from sklearn.ensemble import AdaBoostClassifier
11
+ from sklearn.metrics import accuracy_score, confusion_matrix
12
+ from sklearn.model_selection import train_test_split, learning_curve
13
+ import matplotlib.pyplot as plt
14
+ import seaborn as sns
15
+
16
+ def setup_page_config():
17
+ """Configure the Streamlit page"""
18
+ st.set_page_config(
19
+ page_title="Algorithm Education",
20
+ page_icon="🤖",
21
+ layout="wide"
22
+ )
23
+
def page_introduction():
    """Display the introduction section of the page.

    Renders the page title plus a static markdown overview (how-to, list of
    covered algorithms, motivation). Purely presentational — no state.
    """
    st.title("Machine Learning Algorithm Education 🎓")

    # Static intro copy; rendered as-is by Streamlit's markdown engine.
    st.markdown("""
    Welcome to the Algorithm Education page! This interactive guide helps you understand various machine learning
    algorithms used in classification tasks. Each algorithm is explained in detail with:

    - 📝 Clear descriptions and explanations
    - ✅ Advantages and limitations
    - 🎯 Practical use cases
    - 📊 Mathematical foundations
    - 💻 Implementation examples
    - 🔬 Interactive demonstrations
    - 📚 Academic references

    ### How to Use This Guide
    1. Select an algorithm from the dropdown menu below
    2. Explore its characteristics and implementation details
    3. Try the interactive demo with different datasets
    4. Compare performance metrics and visualizations

    ### Available Algorithms
    This guide covers popular classification algorithms including:
    - Naive Bayes variants
    - Support Vector Machines
    - Neural Networks
    - Tree-based methods
    - Nearest Neighbors
    - Linear Classifiers
    - Ensemble Methods

    ### Why Understanding Algorithms Matters
    Choosing the right algorithm for your machine learning task is crucial for:
    - Achieving optimal performance
    - Efficient resource utilization
    - Meeting specific problem constraints
    - Understanding model behavior and limitations
    """)
+ """)
63
+
64
+ def algorithm_info():
65
+ """Display detailed algorithm information"""
66
+ # First show the introduction
67
+ page_introduction()
68
+
69
+ algorithms = {
70
+ "Gaussian Naive Bayes (GaussianNB)": {
71
+ "description": """
72
+ A probabilistic classifier based on Bayes' theorem with strong independence assumptions between features.
73
+ Assumes features follow a Gaussian (normal) distribution.
74
+ """,
75
+ "pros": [
76
+ "Simple and fast",
77
+ "Works well with small datasets",
78
+ "Good for high-dimensional data",
79
+ "Performs well when features are normally distributed"
80
+ ],
81
+ "cons": [
82
+ "Assumes feature independence (often unrealistic)",
83
+ "Limited by Gaussian distribution assumption",
84
+ "May underperform when features are highly correlated"
85
+ ],
86
+ "use_cases": [
87
+ "Text classification",
88
+ "Spam detection",
89
+ "Medical diagnosis",
90
+ "Real-time prediction scenarios"
91
+ ],
92
+ "math_details": {
93
+ "main_formula": r"""
94
+ P(y|x_1,...,x_n) = \frac{P(y)\prod_{i=1}^{n}P(x_i|y)}{P(x_1,...,x_n)}
95
+ """,
96
+ "component_formulas": [
97
+ {
98
+ "name": "Gaussian Probability Density",
99
+ "formula": r"""
100
+ P(x_i|y) = \frac{1}{\sqrt{2\pi\sigma^2_y}} \exp\left(-\frac{(x_i-\mu_y)^2}{2\sigma^2_y}\right)
101
+ """
102
+ },
103
+ {
104
+ "name": "Class Prior Probability",
105
+ "formula": r"""
106
+ P(y) = \frac{\text{number of samples in class y}}{\text{total number of samples}}
107
+ """
108
+ }
109
+ ],
110
+ "explanation": """
111
+ - P(y|x₁,...,xₙ) is the posterior probability of class y given features
112
+ - P(y) is the prior probability of class y
113
+ - P(xᵢ|y) is the likelihood of feature xᵢ given class y
114
+ - μy and σ²y are the mean and variance of features in class y
115
+ """
116
+ },
117
+ "references": [
118
+ {
119
+ "title": "Naive Bayes and Text Classification",
120
+ "authors": "Sebastian Raschka",
121
+ "publication": "arXiv preprint",
122
+ "year": "2014",
123
+ "url": "https://arxiv.org/abs/1410.5329"
124
+ },
125
+ {
126
+ "title": "scikit-learn: Machine Learning in Python",
127
+ "authors": "Pedregosa et al.",
128
+ "publication": "Journal of Machine Learning Research",
129
+ "year": "2011",
130
+ "url": "https://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html"
131
+ },
132
+ {
133
+ "title": "Fundamental Mathematical Formulas Used in Machine Learning",
134
+ "authors": "Showmik Setta",
135
+ "publication": "Medium",
136
+ "year": "2023",
137
+ "url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
138
+ }
139
+ ]
140
+ },
141
+ "Linear Support Vector Classification (LinearSVC)": {
142
+ "description": """
143
+ A linear classifier that finds the hyperplane that best separates classes by maximizing the margin between them.
144
+ Optimized implementation of Support Vector Classification for linear classification.
145
+ """,
146
+ "pros": [
147
+ "Effective for high-dimensional spaces",
148
+ "Memory efficient",
149
+ "Faster than standard SVC with linear kernel",
150
+ "Works well when classes are linearly separable"
151
+ ],
152
+ "cons": [
153
+ "Only suitable for linear classification",
154
+ "Sensitive to feature scaling",
155
+ "May struggle with overlapping classes",
156
+ "No probability estimates by default"
157
+ ],
158
+ "use_cases": [
159
+ "Text classification",
160
+ "Image classification",
161
+ "Bioinformatics",
162
+ "High-dimensional data analysis"
163
+ ],
164
+ "math_details": {
165
+ "main_formula": r"""
166
+ \min_{w,b} \frac{1}{2}||w||^2 + C\sum_{i=1}^{n} \max(0, 1-y_i(w^Tx_i+b))
167
+ """,
168
+ "component_formulas": [
169
+ {
170
+ "name": "Decision Function",
171
+ "formula": r"""
172
+ f(x) = w^Tx + b
173
+ """
174
+ },
175
+ {
176
+ "name": "Margin Width",
177
+ "formula": r"""
178
+ \text{margin} = \frac{2}{||w||}
179
+ """
180
+ }
181
+ ],
182
+ "explanation": """
183
+ - w is the weight vector
184
+ - b is the bias term
185
+ - C is the regularization parameter
186
+ - yᵢ are the true labels (±1)
187
+ - xᵢ are the input features
188
+ """
189
+ },
190
+ "references": [
191
+ {
192
+ "title": "A Tutorial on Support Vector Machines for Pattern Recognition",
193
+ "authors": "Christopher J.C. Burges",
194
+ "publication": "Data Mining and Knowledge Discovery",
195
+ "year": "1998",
196
+ "url": "https://link.springer.com/article/10.1023/A:1009715923555"
197
+ },
198
+ {
199
+ "title": "Support Vector Machines",
200
+ "authors": "Andrew Ng",
201
+ "publication": "CS229 Lecture Notes, Stanford University",
202
+ "year": "2018",
203
+ "url": "http://cs229.stanford.edu/notes/cs229-notes3.pdf"
204
+ },
205
+ {
206
+ "title": "Machine Learning Algorithms: Mathematical Deep Dive",
207
+ "authors": "Vidushi Meel",
208
+ "publication": "viso.ai",
209
+ "year": "2021",
210
+ "url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
211
+ }
212
+ ]
213
+ },
214
+ "Support Vector Classification (SVC)": {
215
+ "description": """
216
+ A powerful classifier that can perform non-linear classification using different kernel functions to transform
217
+ the feature space. Creates an optimal hyperplane in a transformed feature space.
218
+ """,
219
+ "pros": [
220
+ "Effective for non-linear classification",
221
+ "Works well with high-dimensional data",
222
+ "Robust against overfitting",
223
+ "Versatile through different kernel functions"
224
+ ],
225
+ "cons": [
226
+ "Computationally intensive for large datasets",
227
+ "Sensitive to feature scaling",
228
+ "Kernel selection can be challenging",
229
+ "Memory intensive for large datasets"
230
+ ],
231
+ "use_cases": [
232
+ "Image classification",
233
+ "Handwriting recognition",
234
+ "Bioinformatics",
235
+ "Pattern recognition"
236
+ ],
237
+ "math_details": {
238
+ "main_formula": r"""
239
+ \min_{w,b} \frac{1}{2}||w||^2 + C\sum_{i=1}^{n} \xi_i
240
+ """,
241
+ "component_formulas": [
242
+ {
243
+ "name": "Kernel Function (RBF)",
244
+ "formula": r"""
245
+ K(x,x') = \exp\left(-\gamma ||x-x'||^2\right)
246
+ """
247
+ },
248
+ {
249
+ "name": "Decision Function",
250
+ "formula": r"""
251
+ f(x) = \sum_{i=1}^{n} \alpha_i y_i K(x_i,x) + b
252
+ """
253
+ }
254
+ ],
255
+ "explanation": """
256
+ - K(x,x') is the kernel function
257
+ - γ is the kernel coefficient
258
+ - αᵢ are the dual coefficients
259
+ - ξᵢ are the slack variables
260
+ """
261
+ },
262
+ "references": [
263
+ {
264
+ "title": "Support Vector Networks",
265
+ "authors": "Cortes C., Vapnik V.",
266
+ "publication": "Machine Learning",
267
+ "year": "1995",
268
+ "url": "https://link.springer.com/article/10.1007/BF00994018"
269
+ },
270
+ {
271
+ "title": "A Practical Guide to Support Vector Classification",
272
+ "authors": "Hsu, Chang, and Lin",
273
+ "publication": "BJU International",
274
+ "year": "2003",
275
+ "url": "https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf"
276
+ },
277
+ {
278
+ "title": "Machine Learning Algorithms: Mathematical Deep Dive",
279
+ "authors": "Vidushi Meel",
280
+ "publication": "viso.ai",
281
+ "year": "2021",
282
+ "url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
283
+ }
284
+ ]
285
+ },
286
+ "Multi-layer Perceptron (MLPClassifier)": {
287
+ "description": """
288
+ A neural network classifier that learns non-linear models by training multiple layers of nodes.
289
+ Each node uses a non-linear activation function to transform inputs.
290
+ """,
291
+ "pros": [
292
+ "Can learn highly non-linear patterns",
293
+ "Capable of learning complex relationships",
294
+ "Good generalization with proper regularization",
295
+ "Can handle multiple classes naturally"
296
+ ],
297
+ "cons": [
298
+ "Requires careful hyperparameter tuning",
299
+ "Computationally intensive",
300
+ "Sensitive to feature scaling",
301
+ "May get stuck in local minima"
302
+ ],
303
+ "use_cases": [
304
+ "Image recognition",
305
+ "Speech recognition",
306
+ "Complex pattern recognition",
307
+ "Financial prediction"
308
+ ],
309
+ "math_details": {
310
+ "main_formula": r"""
311
+ h_l = \sigma(W_l h_{l-1} + b_l)
312
+ """,
313
+ "component_formulas": [
314
+ {
315
+ "name": "ReLU Activation",
316
+ "formula": r"""
317
+ \sigma(x) = \max(0,x)
318
+ """
319
+ },
320
+ {
321
+ "name": "Softmax Output",
322
+ "formula": r"""
323
+ P(y=j|x) = \frac{e^{z_j}}{\sum_{k=1}^K e^{z_k}}
324
+ """
325
+ }
326
+ ],
327
+ "explanation": """
328
+ - hₗ is the output of layer l
329
+ - Wₗ is the weight matrix for layer l
330
+ - bₗ is the bias vector for layer l
331
+ - σ is the activation function
332
+ """
333
+ },
334
+ "references": [
335
+ {
336
+ "title": "Learning representations by back-propagating errors",
337
+ "authors": "Rumelhart, D. E., Hinton, G. E., & Williams, R. J.",
338
+ "publication": "Nature",
339
+ "year": "1986",
340
+ "url": "https://www.nature.com/articles/323533a0"
341
+ },
342
+ {
343
+ "title": "Gradient-based learning applied to document recognition",
344
+ "authors": "LeCun Y., Bottou L., Bengio Y., & Haffner P.",
345
+ "publication": "Proceedings of the IEEE",
346
+ "year": "1998",
347
+ "url": "https://ieeexplore.ieee.org/document/726791"
348
+ },
349
+ {
350
+ "title": "Fundamental Mathematical Formulas Used in Machine Learning",
351
+ "authors": "Showmik Setta",
352
+ "publication": "Medium",
353
+ "year": "2023",
354
+ "url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
355
+ }
356
+ ]
357
+ },
358
+ "Extra Trees Classifier": {
359
+ "description": """
360
+ An ensemble method that builds multiple randomized decision trees and averages their predictions.
361
+ Similar to Random Forest but with additional randomization in the tree-building process.
362
+ """,
363
+ "pros": [
364
+ "Lower variance than Random Forest",
365
+ "Faster training than Random Forest",
366
+ "Good at handling high-dimensional data",
367
+ "Less prone to overfitting"
368
+ ],
369
+ "cons": [
370
+ "May have slightly lower accuracy than Random Forest",
371
+ "Can be memory intensive",
372
+ "Less interpretable than single decision trees",
373
+ "May require more trees than Random Forest"
374
+ ],
375
+ "use_cases": [
376
+ "Feature selection",
377
+ "Large dataset classification",
378
+ "Remote sensing",
379
+ "Biomedical classification"
380
+ ],
381
+ "math_details": {
382
+ "main_formula": r"""
383
+ \hat{f}_{et}(x) = \frac{1}{B}\sum_{b=1}^B \hat{f}_b(x)
384
+ """,
385
+ "component_formulas": [
386
+ {
387
+ "name": "Random Split Selection",
388
+ "formula": r"""
389
+ \text{gain}(s,D) = \frac{|D_l|}{|D|}H(D_l) + \frac{|D_r|}{|D|}H(D_r)
390
+ """
391
+ },
392
+ {
393
+ "name": "Entropy",
394
+ "formula": r"""
395
+ H(D) = -\sum_{k=1}^K p_k\log(p_k)
396
+ """
397
+ }
398
+ ],
399
+ "explanation": """
400
+ - B is the number of trees
401
+ - fᵦ is the prediction of the b-th tree
402
+ - Dₗ and Dᵣ are left and right splits
403
+ - pₖ is the proportion of class k in the node
404
+ """
405
+ },
406
+ "references": [
407
+ {
408
+ "title": "Extremely randomized trees",
409
+ "authors": "Geurts P., Ernst D., & Wehenkel L.",
410
+ "publication": "Machine Learning",
411
+ "year": "2006",
412
+ "url": "https://link.springer.com/article/10.1007/s10994-006-6226-1"
413
+ },
414
+ {
415
+ "title": "scikit-learn: Machine Learning in Python",
416
+ "authors": "Pedregosa et al.",
417
+ "publication": "Journal of Machine Learning Research",
418
+ "year": "2011",
419
+ "url": "https://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html"
420
+ },
421
+ {
422
+ "title": "Fundamental Mathematical Formulas Used in Machine Learning",
423
+ "authors": "Showmik Setta",
424
+ "publication": "Medium",
425
+ "year": "2023",
426
+ "url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
427
+ }
428
+ ]
429
+ },
430
+ "Random Forest Classifier": {
431
+ "description": """
432
+ An ensemble learning method that constructs multiple decision trees and combines their predictions.
433
+ Each tree is built using a random subset of features and bootstrap samples of the data.
434
+ """,
435
+ "pros": [
436
+ "Robust against overfitting",
437
+ "Handles non-linear relationships well",
438
+ "Provides feature importance",
439
+ "Works well with high-dimensional data"
440
+ ],
441
+ "cons": [
442
+ "Can be computationally intensive",
443
+ "Less interpretable than single decision trees",
444
+ "Memory intensive for large datasets",
445
+ "May overfit on noisy datasets"
446
+ ],
447
+ "use_cases": [
448
+ "Credit risk assessment",
449
+ "Medical diagnosis",
450
+ "Market prediction",
451
+ "Image classification"
452
+ ],
453
+ "math_details": {
454
+ "main_formula": r"""
455
+ \hat{f}_{rf}(x) = \frac{1}{B}\sum_{b=1}^B \hat{f}_b(x)
456
+ """,
457
+ "component_formulas": [
458
+ {
459
+ "name": "Random Split Selection",
460
+ "formula": r"""
461
+ \text{gain}(s,D) = \frac{|D_l|}{|D|}H(D_l) + \frac{|D_r|}{|D|}H(D_r)
462
+ """
463
+ },
464
+ {
465
+ "name": "Entropy",
466
+ "formula": r"""
467
+ H(D) = -\sum_{k=1}^K p_k\log(p_k)
468
+ """
469
+ }
470
+ ],
471
+ "explanation": """
472
+ - B is the number of trees
473
+ - fᵦ is the prediction of the b-th tree
474
+ - Dₗ and Dᵣ are left and right splits
475
+ - pₖ is the proportion of class k in the node
476
+ """
477
+ },
478
+ "references": [
479
+ {
480
+ "title": "Random Forests",
481
+ "authors": "Breiman L.",
482
+ "publication": "Machine Learning",
483
+ "year": "2001",
484
+ "url": "https://link.springer.com/article/10.1023/A:1010933404324"
485
+ },
486
+ {
487
+ "title": "An Introduction to Statistical Learning",
488
+ "authors": "James G., Witten D., Hastie T., & Tibshirani R.",
489
+ "publication": "Springer",
490
+ "year": "2013",
491
+ "url": "https://www.statlearning.com/"
492
+ },
493
+ {
494
+ "title": "Machine Learning Algorithms: Mathematical Deep Dive",
495
+ "authors": "Vidushi Meel",
496
+ "publication": "viso.ai",
497
+ "year": "2021",
498
+ "url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
499
+ }
500
+ ]
501
+ },
502
+ "K-Nearest Neighbors (KNeighborsClassifier)": {
503
+ "description": """
504
+ A non-parametric method that classifies a data point based on the majority class of its k nearest neighbors
505
+ in the feature space. Simple but effective algorithm.
506
+ """,
507
+ "pros": [
508
+ "Simple to understand and implement",
509
+ "No training phase",
510
+ "Naturally handles multi-class cases",
511
+ "Non-parametric (no assumptions about data)"
512
+ ],
513
+ "cons": [
514
+ "Computationally intensive for large datasets",
515
+ "Sensitive to irrelevant features",
516
+ "Requires feature scaling",
517
+ "Memory intensive (stores all training data)"
518
+ ],
519
+ "use_cases": [
520
+ "Recommendation systems",
521
+ "Pattern recognition",
522
+ "Data imputation",
523
+ "Anomaly detection"
524
+ ],
525
+ "math_details": {
526
+ "main_formula": r"""
527
+ \hat{f}_{knn}(x) = \frac{1}{k}\sum_{i=1}^k y_i
528
+ """,
529
+ "component_formulas": [
530
+ {
531
+ "name": "Distance Function",
532
+ "formula": r"""
533
+ d(x,x') = \sum_{i=1}^p |x_i - x'_i|^2
534
+ """
535
+ },
536
+ {
537
+ "name": "Decision Function",
538
+ "formula": r"""
539
+ f(x) = \text{sign}\left(\sum_{i=1}^k y_i \cdot \text{weight}(d(x,x_i))\right)
540
+ """
541
+ }
542
+ ],
543
+ "explanation": """
544
+ - d(x,x') is the distance function
545
+ - xᵢ are the k nearest neighbors
546
+ - yᵢ are the labels of the k nearest neighbors
547
+ - weight(d(x,x')) is the weight function based on distance
548
+ """
549
+ },
550
+ "references": [
551
+ {
552
+ "title": "Nearest Neighbor Pattern Classification",
553
+ "authors": "Cover T. & Hart P.",
554
+ "publication": "IEEE Transactions on Information Theory",
555
+ "year": "1967",
556
+ "url": "https://ieeexplore.ieee.org/document/1053964"
557
+ },
558
+ {
559
+ "title": "A Survey of Nearest Neighbor Techniques",
560
+ "authors": "Bhatia N. & Vandana",
561
+ "publication": "International Journal of Computer Science and Information Security",
562
+ "year": "2010",
563
+ "url": "https://arxiv.org/abs/1007.0085"
564
+ },
565
+ {
566
+ "title": "Machine Learning Algorithms: Mathematical Deep Dive",
567
+ "authors": "Vidushi Meel",
568
+ "publication": "viso.ai",
569
+ "year": "2021",
570
+ "url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
571
+ }
572
+ ]
573
+ },
574
+ "Ridge Classifier": {
575
+ "description": """
576
+ A linear classifier that uses L2 regularization to prevent overfitting. Similar to logistic regression
577
+ but with different loss function and regularization.
578
+ """,
579
+ "pros": [
580
+ "Good for multicollinear data",
581
+ "Less prone to overfitting",
582
+ "Computationally efficient",
583
+ "Works well with many features"
584
+ ],
585
+ "cons": [
586
+ "Only for linear classification",
587
+ "May underfit complex patterns",
588
+ "Sensitive to feature scaling",
589
+ "No probability estimates"
590
+ ],
591
+ "use_cases": [
592
+ "High-dimensional data classification",
593
+ "Text classification",
594
+ "Gene expression analysis",
595
+ "Simple binary classification"
596
+ ],
597
+ "math_details": {
598
+ "main_formula": r"""
599
+ \min_{w} ||Xw - y||^2_2 + \alpha ||w||^2_2
600
+ """,
601
+ "component_formulas": [
602
+ {
603
+ "name": "Decision Function",
604
+ "formula": r"""
605
+ f(x) = w^Tx
606
+ """
607
+ },
608
+ {
609
+ "name": "L2 Penalty",
610
+ "formula": r"""
611
+ \text{penalty} = \alpha ||w||^2_2 = \alpha \sum_{j=1}^p w_j^2
612
+ """
613
+ }
614
+ ],
615
+ "explanation": """
616
+ - w is the weight vector
617
+ - α is the regularization strength
618
+ - X is the feature matrix
619
+ - y is the target vector
620
+ - p is the number of features
621
+ """
622
+ },
623
+ "references": [
624
+ {
625
+ "title": "Ridge Regression: Biased Estimation for Nonorthogonal Problems",
626
+ "authors": "Hoerl A.E. & Kennard R.W.",
627
+ "publication": "Technometrics",
628
+ "year": "1970",
629
+ "url": "https://www.tandfonline.com/doi/abs/10.1080/00401706.1970.10488634"
630
+ },
631
+ {
632
+ "title": "The Elements of Statistical Learning",
633
+ "authors": "Hastie T., Tibshirani R., & Friedman J.",
634
+ "publication": "Springer",
635
+ "year": "2009",
636
+ "url": "https://web.stanford.edu/~hastie/ElemStatLearn/"
637
+ },
638
+ {
639
+ "title": "Fundamental Mathematical Formulas Used in Machine Learning",
640
+ "authors": "Showmik Setta",
641
+ "publication": "Medium",
642
+ "year": "2023",
643
+ "url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
644
+ }
645
+ ]
646
+ },
647
+ "Multinomial Naive Bayes": {
648
+ "description": """
649
+ A specialized version of Naive Bayes for multinomially distributed data. Commonly used for text
650
+ classification with word counts.
651
+ """,
652
+ "pros": [
653
+ "Fast training and prediction",
654
+ "Works well with high-dimensional data",
655
+ "Good for text classification",
656
+ "Handles multiple classes well"
657
+ ],
658
+ "cons": [
659
+ "Assumes feature independence",
660
+ "Requires non-negative features",
661
+ "Sensitive to feature distribution",
662
+ "May underperform with continuous data"
663
+ ],
664
+ "use_cases": [
665
+ "Document classification",
666
+ "Spam detection",
667
+ "Language detection",
668
+ "Topic modeling"
669
+ ],
670
+ "math_details": {
671
+ "main_formula": r"""
672
+ P(y|x) = \frac{P(y)\prod_{i=1}^n P(x_i|y)}{\sum_{k} P(y_k)\prod_{i=1}^n P(x_i|y_k)}
673
+ """,
674
+ "component_formulas": [
675
+ {
676
+ "name": "Feature Probability",
677
+ "formula": r"""
678
+ P(x_i|y) = \frac{N_{yi} + \alpha}{N_y + \alpha n}
679
+ """
680
+ },
681
+ {
682
+ "name": "Log Probability",
683
+ "formula": r"""
684
+ \log P(y|x) = \log P(y) + \sum_{i=1}^n \log P(x_i|y)
685
+ """
686
+ }
687
+ ],
688
+ "explanation": """
689
+ - Nyᵢ is the count of feature i in class y
690
+ - Ny is the total count of all features in class y
691
+ - α is the smoothing parameter
692
+ - n is the number of features
693
+ """
694
+ },
695
+ "references": [
696
+ {
697
+ "title": "A comparison of event models for naive Bayes text classification",
698
+ "authors": "McCallum A. & Nigam K.",
699
+ "publication": "AAAI-98 Workshop on Learning for Text Categorization",
700
+ "year": "1998",
701
+ "url": "https://www.cs.cmu.edu/~knigam/papers/multinomial-aaaiws98.pdf"
702
+ },
703
+ {
704
+ "title": "An empirical study of the naive Bayes classifier",
705
+ "authors": "Rish I.",
706
+ "publication": "IJCAI 2001 Workshop on Empirical Methods in Artificial Intelligence",
707
+ "year": "2001",
708
+ "url": "https://www.researchgate.net/publication/228845263_An_Empirical_Study_of_the_Naive_Bayes_Classifier"
709
+ },
710
+ {
711
+ "title": "Fundamental Mathematical Formulas Used in Machine Learning",
712
+ "authors": "Showmik Setta",
713
+ "publication": "Medium",
714
+ "year": "2023",
715
+ "url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
716
+ }
717
+ ]
718
+ },
719
+ "AdaBoost Classifier": {
720
+ "description": """
721
+ An ensemble method that builds a strong classifier by iteratively adding weak learners, focusing on
722
+ previously misclassified examples.
723
+ """,
724
+ "pros": [
725
+ "Good generalization",
726
+ "Less prone to overfitting",
727
+ "Can identify hard-to-classify instances",
728
+ "Works well with weak learners"
729
+ ],
730
+ "cons": [
731
+ "Sensitive to noisy data and outliers",
732
+ "Sequential nature (can't parallelize)",
733
+ "Can be computationally intensive",
734
+ "May require careful tuning"
735
+ ],
736
+ "use_cases": [
737
+ "Face detection",
738
+ "Object recognition",
739
+ "Medical diagnosis",
740
+ "Fraud detection"
741
+ ],
742
+ "math_details": {
743
+ "main_formula": r"""
744
+ F(x) = \text{sign}\left(\sum_{t=1}^T \alpha_t h_t(x)\right)
745
+ """,
746
+ "component_formulas": [
747
+ {
748
+ "name": "Weak Learner Weight",
749
+ "formula": r"""
750
+ \alpha_t = \frac{1}{2}\ln\left(\frac{1-\epsilon_t}{\epsilon_t}\right)
751
+ """
752
+ },
753
+ {
754
+ "name": "Sample Weight Update",
755
+ "formula": r"""
756
+ w_{i,t+1} = w_{i,t}\exp(-y_i\alpha_th_t(x_i))
757
+ """
758
+ }
759
+ ],
760
+ "explanation": """
761
+ - hₜ(x) is the weak learner prediction
762
+ - αₜ is the weight of weak learner t
763
+ - εₜ is the weighted error rate
764
+ - wᵢ,ₜ is the weight of sample i at iteration t
765
+ """
766
+ },
767
+ "references": [
768
+ {
769
+ "title": "A Decision-Theoretic Generalization of On-Line Learning and an Application to Boosting",
770
+ "authors": "Freund Y. & Schapire R.E.",
771
+ "publication": "Journal of Computer and System Sciences",
772
+ "year": "1997",
773
+ "url": "https://www.sciencedirect.com/science/article/pii/S002200009791504X"
774
+ },
775
+ {
776
+ "title": "Experiments with a New Boosting Algorithm",
777
+ "authors": "Freund Y. & Schapire R.E.",
778
+ "publication": "International Conference on Machine Learning",
779
+ "year": "1996",
780
+ "url": "https://icml.cc/Conferences/1996/papers/boosting.pdf"
781
+ },
782
+ {
783
+ "title": "Machine Learning Algorithms: Mathematical Deep Dive",
784
+ "authors": "Vidushi Meel",
785
+ "publication": "viso.ai",
786
+ "year": "2021",
787
+ "url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
788
+ }
789
+ ]
790
+ }
791
+ }
792
+
793
+ # Add implementation details to each algorithm
794
+ for algo_name in algorithms:
795
+ algorithms[algo_name]["implementation"] = {
796
+ "Gaussian Naive Bayes (GaussianNB)": {
797
+ "code": """
798
+ from sklearn.naive_bayes import GaussianNB
799
+ from sklearn.datasets import make_classification
800
+
801
+ # Create sample dataset
802
+ X, y = make_classification(n_samples=1000, n_features=20, n_classes=2)
803
+
804
+ # Initialize and train the model
805
+ gnb = GaussianNB()
806
+ gnb.fit(X, y)
807
+
808
+ # Make predictions
809
+ y_pred = gnb.predict(X)
810
+ """,
811
+ "key_parameters": {
812
+ "var_smoothing": "Portion of the largest variance of all features that is added to variances for calculation stability",
813
+ "priors": "Prior probabilities of the classes"
814
+ },
815
+ "tips": [
816
+ "Normalize features if they have very different scales",
817
+ "Good as a baseline model for comparison",
818
+ "Check feature distributions - should be roughly Gaussian"
819
+ ]
820
+ },
821
+ "Linear Support Vector Classification (LinearSVC)": {
822
+ "code": """
823
+ from sklearn.svm import LinearSVC
824
+ from sklearn.preprocessing import StandardScaler
825
+
826
+ # Scale the features
827
+ scaler = StandardScaler()
828
+ X_scaled = scaler.fit_transform(X)
829
+
830
+ # Initialize and train the model
831
+ svc = LinearSVC(random_state=42, max_iter=1000)
832
+ svc.fit(X_scaled, y)
833
+ """,
834
+ "key_parameters": {
835
+ "C": "Regularization parameter (default=1.0)",
836
+ "max_iter": "Maximum iterations for convergence",
837
+ "dual": "Dual or primal formulation"
838
+ },
839
+ "tips": [
840
+ "Always scale your features",
841
+ "Increase max_iter if model doesn't converge",
842
+ "Try different C values using cross-validation"
843
+ ]
844
+ },
845
+ "Support Vector Classification (SVC)": {
846
+ "code": """
847
+ from sklearn.svm import SVC
848
+ from sklearn.preprocessing import StandardScaler
849
+
850
+ # Scale the features
851
+ scaler = StandardScaler()
852
+ X_scaled = scaler.fit_transform(X)
853
+
854
+ # Initialize and train the model
855
+ svc = SVC(random_state=42)
856
+ svc.fit(X_scaled, y)
857
+ """,
858
+ "key_parameters": {
859
+ "C": "Regularization parameter (default=1.0)",
860
+ "kernel": "Kernel function used to transform the data",
861
+ "gamma": "Kernel coefficient for 'rbf', 'poly', and 'sigmoid' kernels"
862
+ },
863
+ "tips": [
864
+ "Always scale your features",
865
+ "Try different kernels and gamma values",
866
+ "Increase C if model underfits",
867
+ "Decrease C if model overfits"
868
+ ]
869
+ },
870
+ "Multi-layer Perceptron (MLPClassifier)": {
871
+ "code": """
872
+ from sklearn.neural_network import MLPClassifier
873
+ from sklearn.preprocessing import StandardScaler
874
+
875
+ # Scale the features
876
+ scaler = StandardScaler()
877
+ X_scaled = scaler.fit_transform(X)
878
+
879
+ # Initialize and train the model
880
+ mlp = MLPClassifier(random_state=42)
881
+ mlp.fit(X_scaled, y)
882
+ """,
883
+ "key_parameters": {
884
+ "hidden_layer_sizes": "Number of neurons in each layer",
885
+ "activation": "Activation function used in the hidden layers",
886
+ "solver": "Optimization algorithm used to train the model",
887
+ "alpha": "L2 regularization parameter"
888
+ },
889
+ "tips": [
890
+ "Always scale your features",
891
+ "Try different activation functions",
892
+ "Increase hidden_layer_sizes if model underfits",
893
+ "Decrease hidden_layer_sizes if model overfits"
894
+ ]
895
+ },
896
+ "Extra Trees Classifier": {
897
+ "code": """
898
+ from sklearn.ensemble import ExtraTreesClassifier
899
+ from sklearn.preprocessing import StandardScaler
900
+
901
+ # Scale the features
902
+ scaler = StandardScaler()
903
+ X_scaled = scaler.fit_transform(X)
904
+
905
+ # Initialize and train the model
906
+ et = ExtraTreesClassifier(random_state=42)
907
+ et.fit(X_scaled, y)
908
+ """,
909
+ "key_parameters": {
910
+ "n_estimators": "Number of trees in the forest",
911
+ "max_depth": "Maximum depth of the trees",
912
+ "min_samples_split": "Minimum number of samples required to split an internal node",
913
+ "min_samples_leaf": "Minimum number of samples required to be at a leaf node"
914
+ },
915
+ "tips": [
916
+ "Always scale your features",
917
+ "Try different max_depth values",
918
+ "Increase n_estimators if model underfits",
919
+ "Decrease n_estimators if model overfits"
920
+ ]
921
+ },
922
+ "Random Forest Classifier": {
923
+ "code": """
924
+ from sklearn.ensemble import RandomForestClassifier
925
+ from sklearn.preprocessing import StandardScaler
926
+
927
+ # Scale the features
928
+ scaler = StandardScaler()
929
+ X_scaled = scaler.fit_transform(X)
930
+
931
+ # Initialize and train the model
932
+ rf = RandomForestClassifier(random_state=42)
933
+ rf.fit(X_scaled, y)
934
+ """,
935
+ "key_parameters": {
936
+ "n_estimators": "Number of trees in the forest",
937
+ "max_depth": "Maximum depth of the trees",
938
+ "min_samples_split": "Minimum number of samples required to split an internal node",
939
+ "min_samples_leaf": "Minimum number of samples required to be at a leaf node"
940
+ },
941
+ "tips": [
942
+ "Always scale your features",
943
+ "Try different max_depth values",
944
+ "Increase n_estimators if model underfits",
945
+ "Decrease n_estimators if model overfits"
946
+ ]
947
+ },
948
+ "K-Nearest Neighbors (KNeighborsClassifier)": {
949
+ "code": """
950
+ from sklearn.neighbors import KNeighborsClassifier
951
+ from sklearn.preprocessing import StandardScaler
952
+
953
+ # Scale the features
954
+ scaler = StandardScaler()
955
+ X_scaled = scaler.fit_transform(X)
956
+
957
+ # Initialize and train the model
958
+ knn = KNeighborsClassifier()
959
+ knn.fit(X_scaled, y)
960
+ """,
961
+ "key_parameters": {
962
+ "n_neighbors": "Number of neighbors to use",
963
+ "weights": "Weight function used in prediction",
964
+ "algorithm": "Algorithm used to compute the nearest neighbors",
965
+ "leaf_size": "Maximum number of samples in each leaf"
966
+ },
967
+ "tips": [
968
+ "Always scale your features",
969
+ "Try different n_neighbors values",
970
+ "Increase leaf_size if model underfits",
971
+ "Decrease leaf_size if model overfits"
972
+ ]
973
+ },
974
+ "Ridge Classifier": {
975
+ "code": """
976
+ from sklearn.linear_model import RidgeClassifier
977
+ from sklearn.preprocessing import StandardScaler
978
+
979
+ # Scale the features
980
+ scaler = StandardScaler()
981
+ X_scaled = scaler.fit_transform(X)
982
+
983
+ # Initialize and train the model
984
+ ridge = RidgeClassifier(random_state=42)
985
+ ridge.fit(X_scaled, y)
986
+ """,
987
+ "key_parameters": {
988
+ "alpha": "Regularization parameter (default=1.0)",
989
+ "solver": "Optimization algorithm used to train the model",
990
+ "max_iter": "Maximum number of iterations for the solver to converge"
991
+ },
992
+ "tips": [
993
+ "Always scale your features",
994
+ "Try different alpha values",
995
+ "Increase max_iter if model doesn't converge",
996
+ "Decrease max_iter if model overfits"
997
+ ]
998
+ },
999
+ "Multinomial Naive Bayes": {
1000
+ "code": """
1001
+ from sklearn.naive_bayes import MultinomialNB
1002
+ from sklearn.preprocessing import StandardScaler
1003
+
1004
+ # Scale the features
1005
+ scaler = StandardScaler()
1006
+ X_scaled = scaler.fit_transform(X)
1007
+
1008
+ # Initialize and train the model
1009
+ nb = MultinomialNB()
1010
+ nb.fit(X_scaled, y)
1011
+ """,
1012
+ "key_parameters": {
1013
+ "alpha": "Regularization parameter (default=1.0)",
1014
+ "fit_prior": "Whether to learn class prior probabilities or not",
1015
+ "class_prior": "Prior probabilities of the classes"
1016
+ },
1017
+ "tips": [
1018
+ "Always scale your features",
1019
+ "Try different alpha values",
1020
+ "Increase alpha if model underfits",
1021
+ "Decrease alpha if model overfits"
1022
+ ]
1023
+ },
1024
+ "AdaBoost Classifier": {
1025
+ "code": """
1026
+ from sklearn.ensemble import AdaBoostClassifier
1027
+ from sklearn.preprocessing import StandardScaler
1028
+
1029
+ # Scale the features
1030
+ scaler = StandardScaler()
1031
+ X_scaled = scaler.fit_transform(X)
1032
+
1033
+ # Initialize and train the model
1034
+ ada = AdaBoostClassifier(random_state=42)
1035
+ ada.fit(X_scaled, y)
1036
+ """,
1037
+ "key_parameters": {
1038
+ "n_estimators": "Number of trees in the forest",
1039
+ "learning_rate": "Learning rate used to update the weights of the weak classifiers",
1040
+ "algorithm": "Optimization algorithm used to train the model"
1041
+ },
1042
+ "tips": [
1043
+ "Always scale your features",
1044
+ "Try different learning_rate values",
1045
+ "Increase n_estimators if model underfits",
1046
+ "Decrease n_estimators if model overfits"
1047
+ ]
1048
+ }
1049
+ }.get(algo_name, {})
1050
+
1051
+ # Algorithm selector
1052
+ selected_algo = st.selectbox(
1053
+ "Select an algorithm to learn more:",
1054
+ list(algorithms.keys())
1055
+ )
1056
+
1057
+ # Display algorithm information
1058
+ if selected_algo:
1059
+ st.header(selected_algo)
1060
+
1061
+ # Description
1062
+ st.subheader("Description")
1063
+ st.write(algorithms[selected_algo]["description"])
1064
+
1065
+ # Two-column layout for pros and cons
1066
+ col1, col2 = st.columns(2)
1067
+
1068
+ with col1:
1069
+ st.subheader("Advantages")
1070
+ for pro in algorithms[selected_algo]["pros"]:
1071
+ st.markdown(f"✅ {pro}")
1072
+
1073
+ with col2:
1074
+ st.subheader("Disadvantages")
1075
+ for con in algorithms[selected_algo]["cons"]:
1076
+ st.markdown(f"⚠️ {con}")
1077
+
1078
+ # Use cases
1079
+ st.subheader("Common Use Cases")
1080
+ for use_case in algorithms[selected_algo]["use_cases"]:
1081
+ st.markdown(f"🎯 {use_case}")
1082
+
1083
+ # Add mathematical details section
1084
+ st.markdown("---")
1085
+ display_math_details(algorithms[selected_algo])
1086
+
1087
+ # Add visual separator
1088
+ st.markdown("---")
1089
+
1090
+ # Implementation section
1091
+ if "implementation" in algorithms[selected_algo]:
1092
+ st.subheader("Implementation Example")
1093
+
1094
+ # Code example
1095
+ st.code(algorithms[selected_algo]["implementation"]["code"], language="python")
1096
+
1097
+ # Key Parameters
1098
+ st.subheader("Key Parameters")
1099
+ for param, desc in algorithms[selected_algo]["implementation"]["key_parameters"].items():
1100
+ st.markdown(f"**`{param}`**: {desc}")
1101
+
1102
+ # Implementation Tips
1103
+ st.subheader("Implementation Tips")
1104
+ for tip in algorithms[selected_algo]["implementation"]["tips"]:
1105
+ st.markdown(f"💡 {tip}")
1106
+
1107
+ # Add interactive demo section
1108
+ st.subheader("Interactive Demo")
1109
+ if st.checkbox("Show Interactive Demo"):
1110
+ st.write("Select dataset:")
1111
+ dataset_choice = st.selectbox(
1112
+ "Choose a sample dataset",
1113
+ ["Iris", "Breast Cancer", "Wine", "Digits"]
1114
+ )
1115
+
1116
+ if st.button("Run Demo"):
1117
+ try:
1118
+ with st.spinner("Running demo..."):
1119
+ demo_results = run_algorithm_demo(selected_algo, dataset_choice)
1120
+
1121
+ # Display results
1122
+ st.write("Model Performance:")
1123
+ st.write(f"Accuracy: {demo_results['accuracy']:.4f}")
1124
+
1125
+ # Show confusion matrix
1126
+ st.write("Confusion Matrix:")
1127
+ st.pyplot(demo_results['confusion_matrix_plot'])
1128
+
1129
+ # Show learning curve
1130
+ st.write("Learning Curve:")
1131
+ st.pyplot(demo_results['learning_curve_plot'])
1132
+ except Exception as e:
1133
+ st.error(f"Error running demo: {str(e)}")
1134
+
1135
+ # Add a references section to display in the UI
1136
+ if st.checkbox("Show References"):
1137
+ st.subheader("Academic References")
1138
+ if "references" in algorithms[selected_algo]:
1139
+ for ref in algorithms[selected_algo]["references"]:
1140
+ st.markdown(f"**{ref['title']}**")
1141
+ st.markdown(f"*{ref['authors']}* ({ref['year']})")
1142
+ st.markdown(f"Published in: {ref['publication']}")
1143
+ st.markdown(f"[Link to Publication]({ref['url']})")
1144
+ st.markdown("---")
1145
+ else:
1146
+ st.write("No references available for this algorithm.")
1147
+
1148
+ def run_algorithm_demo(algorithm_name, dataset_name):
1149
+ """Run a demo of the selected algorithm on the chosen dataset."""
1150
+ from sklearn.datasets import load_iris, load_breast_cancer, load_wine, load_digits
1151
+ from sklearn.model_selection import train_test_split, learning_curve
1152
+ from sklearn.preprocessing import StandardScaler
1153
+ import matplotlib.pyplot as plt
1154
+ import seaborn as sns
1155
+
1156
+ # Load dataset
1157
+ dataset_loaders = {
1158
+ "Iris": load_iris,
1159
+ "Breast Cancer": load_breast_cancer,
1160
+ "Wine": load_wine,
1161
+ "Digits": load_digits
1162
+ }
1163
+
1164
+ data = dataset_loaders[dataset_name]()
1165
+ X, y = data.data, data.target
1166
+
1167
+ # Split and scale data
1168
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
1169
+ scaler = StandardScaler()
1170
+ X_train_scaled = scaler.fit_transform(X_train)
1171
+ X_test_scaled = scaler.transform(X_test)
1172
+
1173
+ # Initialize and train model
1174
+ model = get_model_instance(algorithm_name)
1175
+ model.fit(X_train_scaled, y_train)
1176
+
1177
+ # Get predictions and accuracy
1178
+ y_pred = model.predict(X_test_scaled)
1179
+ accuracy = accuracy_score(y_test, y_pred)
1180
+
1181
+ # Create confusion matrix plot
1182
+ plt.figure(figsize=(8, 6))
1183
+ cm = confusion_matrix(y_test, y_pred)
1184
+ sns.heatmap(cm, annot=True, fmt='d', cmap='viridis')
1185
+ plt.title('Confusion Matrix')
1186
+ plt.ylabel('True Label')
1187
+ plt.xlabel('Predicted Label')
1188
+ cm_plot = plt.gcf()
1189
+ plt.close()
1190
+
1191
+ # Create learning curve plot
1192
+ train_sizes, train_scores, test_scores = learning_curve(
1193
+ model, X_train_scaled, y_train, cv=5,
1194
+ train_sizes=np.linspace(0.1, 1.0, 5)
1195
+ )
1196
+
1197
+ plt.figure(figsize=(8, 6))
1198
+ plt.plot(train_sizes, np.mean(train_scores, axis=1), label='Training score')
1199
+ plt.plot(train_sizes, np.mean(test_scores, axis=1), label='Cross-validation score')
1200
+ plt.xlabel('Training Examples')
1201
+ plt.ylabel('Score')
1202
+ plt.title('Learning Curve')
1203
+ plt.legend(loc='best')
1204
+ lc_plot = plt.gcf()
1205
+ plt.close()
1206
+
1207
+ return {
1208
+ 'accuracy': accuracy,
1209
+ 'confusion_matrix_plot': cm_plot,
1210
+ 'learning_curve_plot': lc_plot
1211
+ }
1212
+
1213
+ def get_model_instance(algorithm_name):
1214
+ """Return an instance of the specified algorithm."""
1215
+ models = {
1216
+ "Gaussian Naive Bayes (GaussianNB)": GaussianNB(),
1217
+ "Linear Support Vector Classification (LinearSVC)": LinearSVC(random_state=42),
1218
+ "Support Vector Classification (SVC)": SVC(random_state=42),
1219
+ "Multi-layer Perceptron (MLPClassifier)": MLPClassifier(random_state=42),
1220
+ "Extra Trees Classifier": ExtraTreesClassifier(random_state=42),
1221
+ "Random Forest Classifier": RandomForestClassifier(random_state=42),
1222
+ "K-Nearest Neighbors (KNeighborsClassifier)": KNeighborsClassifier(),
1223
+ "Ridge Classifier": RidgeClassifier(random_state=42),
1224
+ "Multinomial Naive Bayes": MultinomialNB(),
1225
+ "AdaBoost Classifier": AdaBoostClassifier(random_state=42)
1226
+ }
1227
+ return models[algorithm_name]
1228
+
1229
+ def display_math_details(algorithm):
1230
+ """Display mathematical details for the algorithm."""
1231
+ if "math_details" in algorithm:
1232
+ st.subheader("Mathematical Details")
1233
+
1234
+ # Main formula
1235
+ st.write("Main Formula:")
1236
+ st.latex(algorithm["math_details"]["main_formula"])
1237
+
1238
+ # Component formulas
1239
+ st.write("Component Formulas:")
1240
+ for component in algorithm["math_details"]["component_formulas"]:
1241
+ st.write(f"**{component['name']}:**")
1242
+ st.latex(component["formula"])
1243
+
1244
+ # Explanation
1245
+ st.write("**Variable Explanations:**")
1246
+ st.markdown(algorithm["math_details"]["explanation"])
1247
+
1248
+ if __name__ == "__main__":
1249
+ setup_page_config()
1250
+ algorithm_info()
pages/03_Model_implementation.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import pickle
5
+ import os
6
+ from datetime import datetime
7
+
8
+ from App import StreamlitUI
9
+
10
def setup_page_config():
    """Configure the Streamlit page (title, icon, wide layout)."""
    page_options = {
        "page_title": "Model Implementation",
        "page_icon": "🤖",
        "layout": "wide",
    }
    st.set_page_config(**page_options)
17
+
18
def load_model_and_scaler(model_file, scaler_file):
    """Deserialize an uploaded model/scaler pair from Streamlit file uploads.

    The uploaded buffers are staged to uniquely-named files under
    ``temp_uploads/``, unpickled, and the staged files are always removed —
    even when unpickling fails (the original leaked them on failure).

    SECURITY NOTE: ``pickle.load`` executes arbitrary code embedded in the
    file; only accept uploads from trusted sources.

    Args:
        model_file: Uploaded .pkl file-like object (supports .getbuffer()).
        scaler_file: Uploaded .pkl file-like object for the fitted scaler.

    Returns:
        (model, scaler) on success, or (None, None) after reporting the
        failure via ``st.error``.
    """
    temp_dir = 'temp_uploads'
    os.makedirs(temp_dir, exist_ok=True)

    # Timestamped names avoid collisions between concurrent uploads.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    temp_model_path = os.path.join(temp_dir, f'model_{timestamp}.pkl')
    temp_scaler_path = os.path.join(temp_dir, f'scaler_{timestamp}.pkl')

    try:
        # Stage the uploaded buffers to disk.
        with open(temp_model_path, 'wb') as f:
            f.write(model_file.getbuffer())
        with open(temp_scaler_path, 'wb') as f:
            f.write(scaler_file.getbuffer())

        # Load the staged files using pickle.
        with open(temp_model_path, 'rb') as f:
            model = pickle.load(f)
        with open(temp_scaler_path, 'rb') as f:
            scaler = pickle.load(f)
        return model, scaler
    except Exception as e:
        st.error(f"Error loading model or scaler: {str(e)}")
        return None, None
    finally:
        # Always clean up the staged files, even when loading fails.
        for path in (temp_model_path, temp_scaler_path):
            if os.path.exists(path):
                os.remove(path)
50
+
51
def predict(model, scaler, features):
    """Scale a single raw feature vector and predict its class.

    Args:
        model: Fitted classifier exposing ``predict()`` and, optionally,
            ``predict_proba()``.
        scaler: Fitted scaler exposing ``transform()``.
        features: 1-D sequence of raw feature values for one sample.

    Returns:
        ``(predicted_class, probabilities)`` where ``probabilities`` is the
        per-class probability vector, or None when the model does not
        support ``predict_proba``. Returns ``(None, None)`` on failure (the
        error is reported via ``st.error``).
    """
    try:
        # One sample -> shape (1, n_features), as scikit-learn expects.
        features_array = np.array(features).reshape(1, -1)

        # Apply the same scaling the model was trained with.
        features_scaled = scaler.transform(features_array)

        prediction = model.predict(features_scaled)

        # Bare `except:` replaced with the specific exceptions a missing
        # predict_proba raises, so real errors are no longer swallowed.
        try:
            probabilities = model.predict_proba(features_scaled)
        except (AttributeError, NotImplementedError):
            return prediction[0], None
        return prediction[0], probabilities[0]

    except Exception as e:
        st.error(f"Error making prediction: {str(e)}")
        return None, None
72
+
73
def generate_random_features(feature_names):
    """Generate random but realistic values for features.

    Ranges are derived from the per-class mean/std configs on
    ``App.StreamlitUI`` (mean ± 3 std covers ~99.7% of each class's
    distribution); features not present in the defaults fall back to
    the range [0, 100].

    Args:
        feature_names: Iterable of feature-name strings.

    Returns:
        Dict mapping each feature name to a uniformly-random value
        rounded to 2 decimal places.
    """
    random_values = {}

    # Hoisted: the original constructed a fresh StreamlitUI inside the
    # nested loops (once per class per feature); one instance suffices.
    ui = StreamlitUI()
    default_configs = ui.default_configs
    default_features = ui.default_features

    feature_ranges = {}
    for feature_name in feature_names:
        min_val = float('inf')
        max_val = float('-inf')

        # Take the min/max across all classes in the default configs.
        for class_config in default_configs.values():
            mean = class_config['mean']
            std = class_config['std']

            try:
                idx = default_features.index(feature_name)
                feature_min = mean[idx] - 3*std[idx]  # 3 std deviations for 99.7% coverage
                feature_max = mean[idx] + 3*std[idx]

                min_val = min(min_val, feature_min)
                max_val = max(max_val, feature_max)
            except ValueError:
                # Feature not present in the defaults list.
                continue

        # If feature not found in defaults, use reasonable fallback range.
        if min_val == float('inf'):
            min_val, max_val = 0, 100

        feature_ranges[feature_name] = (min_val, max_val)

    for feature in feature_names:
        # Default range if no known feature name matches as a substring.
        min_val, max_val = 0, 100

        for key, (min_range, max_range) in feature_ranges.items():
            if key.lower() in feature.lower():
                min_val, max_val = min_range, max_range
                break

        random_values[feature] = round(np.random.uniform(min_val, max_val), 2)

    return random_values
118
+
119
def show():
    """Render the Model Implementation page.

    Flow: upload a pickled model + scaler in the sidebar, enter (or
    randomize) one value per feature, then display the predicted class and,
    when the model supports predict_proba, a bar chart of class
    probabilities.
    """
    st.title("Model Implementation")

    # Initialize session state for random values if not exists
    if 'random_values' not in st.session_state:
        st.session_state.random_values = {}

    # Keep file uploaders in sidebar
    st.sidebar.subheader("Upload Model Files")
    model_file = st.sidebar.file_uploader("Upload Model (.pkl)", type=['pkl'])
    scaler_file = st.sidebar.file_uploader("Upload Scaler (.pkl)", type=['pkl'])

    # Only proceed if both files are uploaded
    if model_file and scaler_file:
        model, scaler = load_model_and_scaler(model_file, scaler_file)

        # NOTE(review): truthiness check — most sklearn estimators are
        # truthy, but `model is not None and scaler is not None` would be
        # safer for objects with custom __bool__/__len__; confirm.
        if model and scaler:
            st.sidebar.success("Model and scaler loaded successfully!")

            # Get feature names from scaler (sklearn fitted estimators
            # expose feature_names_in_ when fit on a DataFrame).
            feature_names = None
            if hasattr(scaler, 'feature_names_in_'):
                feature_names = scaler.feature_names_in_
            elif hasattr(model, 'feature_names_in_'):
                feature_names = model.feature_names_in_

            # Fall back to manual entry when neither object carries names.
            if feature_names is None:
                feature_names_input = st.sidebar.text_input(
                    "Enter feature names (comma-separated)",
                    "feature1, feature2, feature3"
                )
                feature_names = [f.strip() for f in feature_names_input.split(",")]
                st.sidebar.info("Feature names were not found in the model/scaler. Using manually entered names.")

            # Create two main columns for the page layout
            input_col, result_col = st.columns(2)

            # Left column for feature inputs
            with input_col:
                st.subheader("Enter Feature Values")

                # Add randomization button
                col1, col2 = st.columns([1, 2])
                with col1:
                    if st.button("🎲 Randomize"):
                        # Generate new random values
                        st.session_state.random_values = generate_random_features(feature_names)
                        # Update session state for each feature so the
                        # number_input widgets pick up the new values.
                        for feature in feature_names:
                            st.session_state[f"input_{feature}"] = st.session_state.random_values[feature]
                with col2:
                    st.markdown("<div style='margin-top: 8px;'>Generate realistic random values</div>",
                                unsafe_allow_html=True)

                # Create feature inputs in a grid layout
                feature_values = {}
                input_cols = st.columns(2)  # 2 columns for feature inputs
                for idx, feature in enumerate(feature_names):
                    with input_cols[idx % 2]:
                        # Initialize session state for this input if not exists
                        if f"input_{feature}" not in st.session_state:
                            st.session_state[f"input_{feature}"] = 0.0

                        # Widget state is keyed per-feature; the key also
                        # lets the Randomize button overwrite the value.
                        feature_values[feature] = st.number_input(
                            f"{feature}",
                            key=f"input_{feature}",
                            step=1.0,
                            format="%.2f"
                        )

                # Make prediction button
                predict_clicked = st.button("Make Prediction")

            # Right column for prediction results
            with result_col:
                st.subheader("Prediction Results")

                # Make prediction when values are available or button is clicked
                if predict_clicked or st.session_state.random_values:
                    # Prepare features in correct order
                    features = [feature_values[feature] for feature in feature_names]

                    # Get prediction
                    prediction, probabilities = predict(model, scaler, features)

                    if prediction is not None:
                        st.write(f"Predicted Class: **{prediction}**")

                        # Display probabilities if available
                        if probabilities is not None:
                            st.write("Class Probabilities:")
                            prob_df = pd.DataFrame({
                                'Class': model.classes_,
                                'Probability': probabilities
                            })

                            # Display as bar chart
                            st.bar_chart(
                                prob_df.set_index('Class')
                            )
                else:
                    st.info("Enter feature values and click 'Make Prediction' to see results.")
    else:
        st.sidebar.info("Please upload both model and scaler files to proceed.")
223
+
224
+
225
if __name__ == "__main__":
    # Entry point when this file is run as a Streamlit page: configure the
    # page first, then render the model-implementation UI.
    setup_page_config()
    show()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.28.0
2
+ numpy>=1.24.0
3
+ pandas>=2.0.0
4
+ scikit-learn>=1.2.0
5
+ plotly>=5.13.0
6
+ seaborn>=0.12.0
7
+ matplotlib>=3.7.0
8
+ joblib>=1.2.0