shvy committed on
Commit
994aeff
·
verified ·
1 Parent(s): e0fd09a
Files changed (1) hide show
  1. app.py +403 -0
app.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
8
+ from sklearn.compose import ColumnTransformer
9
+ from sklearn.pipeline import Pipeline
10
+ from sklearn.impute import SimpleImputer
11
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
12
+ from sklearn.linear_model import LogisticRegression, LinearRegression
13
+ from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
14
+ import xgboost as xgb
15
+ from catboost import CatBoostClassifier, CatBoostRegressor
16
+ import lightgbm as lgb
17
+ import io
18
+ import base64
19
+ from PIL import Image
20
+ import os
21
+ import pickle
22
+ import warnings
23
+ warnings.filterwarnings('ignore')
24
+
25
def infer_problem_type(df, target_col, class_threshold=10):
    """Guess whether ``target_col`` describes a classification or regression task.

    The target is treated as a classification label when it is boolean,
    object, string or categorical, or when it has fewer than
    ``class_threshold`` distinct non-null values. Anything else is regression.

    Args:
        df: DataFrame containing the target column.
        target_col: Name of the target column in ``df``.
        class_threshold: Targets with fewer distinct values than this are
            considered class labels even when they are numeric.

    Returns:
        ``"Classification"`` or ``"Regression"``.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    target = df[target_col]
    dtype_checks = pd.api.types

    # Use the pandas dtype predicates instead of comparing the dtype against
    # string names: the comparison is fragile for extension dtypes and does
    # not recognise the dedicated pandas string dtype.
    label_like = (
        dtype_checks.is_bool_dtype(target)
        or dtype_checks.is_object_dtype(target)
        or dtype_checks.is_string_dtype(target)
        or isinstance(target.dtype, pd.CategoricalDtype)
    )
    if label_like or target.nunique() < class_threshold:
        return "Classification"
    return "Regression"
34
+
35
def generate_eda_report(df):
    """Collect basic exploratory statistics for ``df``.

    Args:
        df: DataFrame to summarise.

    Returns:
        A dict with the keys ``shape``, ``dtypes``, ``null_counts``,
        ``desc_stats`` (HTML table from ``DataFrame.describe``),
        ``categorical_cols`` and ``numerical_cols``. When ``df`` has numeric
        data, ``corr_heatmap`` is also present and holds a base64-encoded PNG
        of the correlation matrix; otherwise that key is absent.
    """
    numeric_df = df.select_dtypes(include=['number'])

    # Basic info
    report = {
        'shape': df.shape,
        'dtypes': df.dtypes.astype(str).to_dict(),
        'null_counts': df.isnull().sum().to_dict(),
        'desc_stats': df.describe().to_html(),
    }

    # Correlation heatmap. The figure is only created when there is something
    # to plot and is always closed afterwards, so repeated calls do not
    # accumulate open matplotlib figures.
    if not numeric_df.empty:
        fig, ax = plt.subplots(figsize=(10, 8))
        try:
            sns.heatmap(
                numeric_df.corr(),
                annot=True,
                cmap='coolwarm',
                linewidths=0.5,
                ax=ax,
            )
            ax.set_title('Correlation Matrix')
            fig.tight_layout()
            with io.BytesIO() as buffer:
                fig.savefig(buffer, format='png')
                report['corr_heatmap'] = base64.b64encode(buffer.getvalue()).decode('utf-8')
        finally:
            plt.close(fig)

    # Column names grouped by kind
    report['categorical_cols'] = df.select_dtypes(include=['object', 'category']).columns.tolist()
    report['numerical_cols'] = numeric_df.columns.tolist()

    return report
69
+
70
def clean_and_preprocess(df, problem_type, target_col):
    """Split ``df`` into train/test sets and fit the feature preprocessor.

    Rows whose target is missing are dropped. Numeric features are
    median-imputed and standardised; object/category features are imputed
    with the most frequent value and one-hot encoded. Columns of any other
    dtype are not listed in either transformer and are therefore dropped by
    the ``ColumnTransformer`` default.

    The preprocessor is fitted on the training split only and then applied to
    the test split, so imputation and scaling statistics do not leak from the
    held-out rows into training.

    Args:
        df: Raw input DataFrame (not modified).
        problem_type: ``"Classification"`` enables label encoding of the
            target; any other value leaves the target untouched.
        target_col: Name of the target column in ``df``.

    Returns:
        A dict with the keys ``preprocessor``, ``X_train``, ``X_test``,
        ``y_train``, ``y_test``, ``categorical_cols``, ``numerical_cols`` and
        ``target_encoder`` (``None`` unless ``problem_type`` is
        ``"Classification"``).
    """
    # A missing label cannot be learned from; filling it with a median/mode
    # would fabricate targets. dropna returns a new frame, so `df` is untouched.
    # Feature gaps are handled by the imputers inside the pipelines below
    # (the previous chained `df[col].fillna(..., inplace=True)` is not
    # guaranteed to write back to the frame and failed on all-NaN columns).
    labelled_df = df.dropna(subset=[target_col])

    # Split features and target
    X = labelled_df.drop(columns=[target_col])
    y = labelled_df[target_col]

    # Identify categorical and numerical columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

    numeric_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories that only occur in the test split are encoded as all-zero.
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numerical_cols),
            ('cat', categorical_pipeline, categorical_cols),
        ]
    )

    # Encode class labels as integers for classification
    target_encoder = None
    if problem_type == "Classification":
        target_encoder = LabelEncoder()
        y = target_encoder.fit_transform(y)

    # Split first, then fit the preprocessor on the training rows only.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    X_train = preprocessor.fit_transform(X_train_raw)
    X_test = preprocessor.transform(X_test_raw)

    preprocessing_info = {
        'preprocessor': preprocessor,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'categorical_cols': categorical_cols,
        'numerical_cols': numerical_cols,
        'target_encoder': target_encoder,
    }

    return preprocessing_info
127
+
128
def train_and_evaluate_models(preprocessing_info, problem_type):
    """Fit every candidate model, score it on the test split and pick the best.

    Args:
        preprocessing_info: Dict providing ``X_train``, ``X_test``,
            ``y_train`` and ``y_test``.
        problem_type: ``"Classification"`` trains classifiers ranked by
            accuracy; any other value trains regressors ranked by R².

    Returns:
        A dict with ``results`` (per-model metrics), ``best_model_name``,
        ``best_score``, ``metric_name``, ``models`` (all fitted estimators)
        and ``best_model``.
    """
    is_classification = problem_type == "Classification"

    train_features = preprocessing_info['X_train']
    test_features = preprocessing_info['X_test']
    train_target = preprocessing_info['y_train']
    test_target = preprocessing_info['y_test']

    if is_classification:
        # Classification models
        candidates = {
            'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
            'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
            'XGBoost': xgb.XGBClassifier(random_state=42),
            'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
            'LightGBM': lgb.LGBMClassifier(random_state=42),
        }
    else:
        # Regression models
        candidates = {
            'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
            'LinearRegression': LinearRegression(),
            'XGBoost': xgb.XGBRegressor(random_state=42),
            'CatBoost': CatBoostRegressor(verbose=0, random_state=42),
            'LightGBM': lgb.LGBMRegressor(random_state=42),
        }

    results = {}
    models = {}
    for label, estimator in candidates.items():
        estimator.fit(train_features, train_target)
        predictions = estimator.predict(test_features)

        if is_classification:
            results[label] = {
                'accuracy': accuracy_score(test_target, predictions),
                'report': classification_report(test_target, predictions, output_dict=True),
            }
        else:
            results[label] = {
                'mse': mean_squared_error(test_target, predictions),
                'r2': r2_score(test_target, predictions),
            }
        models[label] = estimator

    # Find best model: highest accuracy for classification, highest R² otherwise
    if is_classification:
        ranking_key, metric_name = 'accuracy', 'accuracy'
    else:
        ranking_key, metric_name = 'r2', 'R²'
    best_model_name = max(results, key=lambda label: results[label][ranking_key])

    return {
        'results': results,
        'best_model_name': best_model_name,
        'best_score': results[best_model_name][ranking_key],
        'metric_name': metric_name,
        'models': models,
        'best_model': models[best_model_name],
    }
202
+
203
def save_model(model, preprocessor, target_encoder=None, output_dir='models'):
    """Pickle the model and its preprocessing objects into ``output_dir``.

    Writes ``model.pkl`` and ``preprocessor.pkl``. ``target_encoder.pkl`` is
    written only when ``target_encoder`` is given; otherwise any existing
    ``target_encoder.pkl`` in ``output_dir`` is removed so the directory never
    pairs a model with an encoder left over from an earlier call.

    Args:
        model: Fitted estimator to persist.
        preprocessor: Fitted feature preprocessor to persist.
        target_encoder: Optional fitted label encoder.
        output_dir: Directory to write into; created if missing.

    Returns:
        Path of the written model file (``<output_dir>/model.pkl``).
    """
    os.makedirs(output_dir, exist_ok=True)

    model_path = os.path.join(output_dir, 'model.pkl')
    preprocessor_path = os.path.join(output_dir, 'preprocessor.pkl')
    encoder_path = os.path.join(output_dir, 'target_encoder.pkl')

    artifacts = {model_path: model, preprocessor_path: preprocessor}
    if target_encoder is not None:
        artifacts[encoder_path] = target_encoder
    elif os.path.exists(encoder_path):
        # Stale encoder from a previous call does not belong to this model.
        os.remove(encoder_path)

    for path, obj in artifacts.items():
        with open(path, 'wb') as f:
            pickle.dump(obj, f)

    return model_path
221
+
222
def process_dataset(df, target_col):
    """Run the full pipeline: task inference, EDA, preprocessing, training, saving.

    Args:
        df: Input DataFrame.
        target_col: Name of the target column in ``df``.

    Returns:
        A dict with ``problem_type``, ``eda_report``, ``preprocessing_info``,
        ``model_results`` and ``model_path``.
    """
    task = infer_problem_type(df, target_col)
    eda = generate_eda_report(df)
    prepared = clean_and_preprocess(df, task, target_col)
    trained = train_and_evaluate_models(prepared, task)

    # Persist the winning model together with the objects needed to reuse it
    saved_to = save_model(
        trained['best_model'],
        prepared['preprocessor'],
        prepared.get('target_encoder'),
    )

    return dict(
        problem_type=task,
        eda_report=eda,
        preprocessing_info=prepared,
        model_results=trained,
        model_path=saved_to,
    )
250
+
251
def format_results_html(results_data):
    """Render the pipeline results as an HTML fragment.

    Column names, class labels and model names are HTML-escaped before being
    inserted, because column names and labels originate from the uploaded
    file.

    Args:
        results_data: Dict with ``problem_type``, ``eda_report`` (providing
            ``shape``, ``numerical_cols``, ``categorical_cols``,
            ``null_counts`` and optionally ``corr_heatmap``) and
            ``model_results`` (providing ``best_model_name``, ``metric_name``,
            ``best_score`` and ``results``).

    Returns:
        The HTML fragment as a string.
    """
    # Local import: the function historically used a local named `html`, so
    # only the helper is imported here instead of the module.
    from html import escape

    def esc(value):
        """Stringify ``value`` and escape HTML metacharacters."""
        return escape(str(value))

    problem_type = results_data['problem_type']
    eda_report = results_data['eda_report']
    model_results = results_data['model_results']
    is_classification = problem_type == "Classification"

    shape = eda_report['shape']
    numerical = ', '.join(esc(col) for col in eda_report['numerical_cols'])
    categorical = ', '.join(esc(col) for col in eda_report['categorical_cols'])

    parts = [
        "<h2>AutoML Analysis Results</h2>",
        f"<h3>Problem Type: {esc(problem_type)}</h3>",
        "<h3>Dataset Information</h3>",
        f"<p><strong>Shape:</strong> {shape[0]} rows, {shape[1]} columns</p>",
        f"<p><strong>Numerical Columns:</strong> {numerical}</p>",
        f"<p><strong>Categorical Columns:</strong> {categorical}</p>",
        "<h3>Missing Values</h3>",
        "<ul>",
    ]
    # Only columns that actually have gaps are listed
    parts.extend(
        f"<li>{esc(col)}: {count} missing values</li>"
        for col, count in eda_report['null_counts'].items()
        if count > 0
    )
    parts.append("</ul>")

    if 'corr_heatmap' in eda_report:
        heatmap = eda_report['corr_heatmap']
        parts.append("<h3>Correlation Heatmap</h3>")
        parts.append(
            f'<img src="data:image/png;base64,{heatmap}" '
            'alt="Correlation Heatmap" width="600">'
        )

    best_name = model_results['best_model_name']
    parts.extend([
        "<h3>Model Results</h3>",
        f"<p><strong>Best Model:</strong> {esc(best_name)}</p>",
        f"<p><strong>Best {esc(model_results['metric_name'])}:</strong> "
        f"{model_results['best_score']:.4f}</p>",
        "<h4>All Models Performance</h4>",
        '<table border="1" cellpadding="5">',
    ])

    if is_classification:
        parts.append("<tr><th>Model</th><th>Accuracy</th></tr>")
        parts.extend(
            f"<tr><td>{esc(name)}</td><td>{result['accuracy']:.4f}</td></tr>"
            for name, result in model_results['results'].items()
        )
    else:
        parts.append("<tr><th>Model</th><th>MSE</th><th>R²</th></tr>")
        parts.extend(
            f"<tr><td>{esc(name)}</td><td>{result['mse']:.4f}</td>"
            f"<td>{result['r2']:.4f}</td></tr>"
            for name, result in model_results['results'].items()
        )
    parts.append("</table>")

    # Add detailed per-class metrics of the best model for classification
    if is_classification:
        report = model_results['results'][best_name]['report']
        parts.append(f"<h4>Classification Report for {esc(best_name)}</h4>")
        parts.append('<table border="1" cellpadding="5">')
        parts.append(
            "<tr><th>Class</th><th>Precision</th><th>Recall</th>"
            "<th>F1-Score</th><th>Support</th></tr>"
        )
        # Aggregate entries are skipped; only per-class rows are shown
        aggregate_keys = ('accuracy', 'macro avg', 'weighted avg')
        for class_name, metrics in report.items():
            if class_name in aggregate_keys:
                continue
            parts.append(
                f"<tr><td>{esc(class_name)}</td>"
                f"<td>{metrics['precision']:.4f}</td>"
                f"<td>{metrics['recall']:.4f}</td>"
                f"<td>{metrics['f1-score']:.4f}</td>"
                f"<td>{metrics['support']}</td></tr>"
            )
        parts.append("</table>")

    parts.append("<h3>Model Download</h3>")
    parts.append("<p>Your model has been saved and is ready for download.</p>")

    return "\n".join(parts)
356
+
357
def process_file(file, target_col):
    """Handle an uploaded CSV: load it, validate the target, run the pipeline.

    Failures are reported as a message string rather than raised, because the
    return value is shown directly in the UI output component.

    Args:
        file: The upload, either a filesystem path or an object exposing the
            path as ``.name``; ``None`` when nothing was uploaded.
        target_col: Name of the target column as typed by the user.

    Returns:
        The HTML results on success, otherwise a human-readable error message.
    """
    if file is None:
        return "Please upload a CSV file."

    # NOTE(review): which form Gradio passes depends on its version and the
    # File component configuration, so both a plain path and a file-like
    # wrapper with `.name` are accepted.
    csv_path = file if isinstance(file, (str, os.PathLike)) else file.name

    # Read the CSV file
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        return f"Error reading the CSV file: {str(e)}"

    # Validate target column, tolerating stray whitespace around the typed name
    if target_col not in df.columns:
        trimmed = (target_col or "").strip()
        if trimmed not in df.columns:
            available = ', '.join(map(str, df.columns))
            return f"Target column '{target_col}' not found in the dataset. Available columns: {available}"
        target_col = trimmed

    # Process the dataset. This is the UI boundary, so any failure in the
    # pipeline is turned into a message instead of propagating.
    try:
        results = process_dataset(df, target_col)
        return format_results_html(results)
    except Exception as e:
        return f"Error processing the dataset: {str(e)}"
378
+
379
# Define Gradio interface
# Module-level UI definition: builds the Blocks layout, wires the submit
# button to process_file, and launches the app at the end of the file.
with gr.Blocks(title="AutoML for Structured Data") as demo:
    gr.Markdown("# AutoML for Structured Data")
    gr.Markdown("""
    Upload a CSV file, specify the target column, and let AutoML do the rest! This app will:
    1. Perform exploratory data analysis (EDA)
    2. Determine if it's a regression or classification problem
    3. Handle preprocessing (cleaning, encoding, etc.)
    4. Train multiple models and select the best one
    5. Display the results and allow you to download the model
    """)

    with gr.Row():
        # First column: the inputs and the trigger button.
        with gr.Column():
            file_input = gr.File(label="Upload CSV File")
            target_col = gr.Textbox(label="Target Column Name")
            submit_btn = gr.Button("Process Dataset")

        # Second column: the HTML (or error message) returned by process_file.
        with gr.Column():
            output = gr.HTML(label="Results")

    # process_file receives (uploaded file, target column text) and its return
    # value is rendered in the HTML output component.
    submit_btn.click(fn=process_file, inputs=[file_input, target_col], outputs=output)

# Launch the app
demo.launch()