shimaa22 committed on
Commit
edf6c00
·
verified ·
1 Parent(s): 2967f48

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -110
app.py CHANGED
@@ -10,20 +10,15 @@ from sklearn.tree import DecisionTreeClassifier
10
  from sklearn.ensemble import RandomForestClassifier
11
  from xgboost import XGBClassifier
12
 
13
- from sklearn.linear_model import LinearRegression
14
- from sklearn.ensemble import RandomForestRegressor
15
- from xgboost import XGBRegressor
16
-
17
  from sklearn.metrics import (
18
  accuracy_score,
19
  precision_score,
20
  recall_score,
21
- confusion_matrix,
22
- mean_absolute_error,
23
- mean_squared_error,
24
- r2_score
25
  )
26
 
 
 
27
  from reportlab.pdfgen import canvas
28
 
29
  # =========================
@@ -63,7 +58,7 @@ def upload_and_clean(file):
63
 
64
 
65
  # =========================
66
- # VISUALIZATION (BAR + PIE)
67
  # =========================
68
  def analyze_data(target):
69
 
@@ -74,11 +69,10 @@ def analyze_data(target):
74
 
75
  cols = [c for c in df.columns if c != target]
76
 
77
- for col in cols[:8]:
78
 
79
  fig, axes = plt.subplots(1, 2, figsize=(12, 4))
80
 
81
- # BAR
82
  df[col].astype(str).value_counts().head(10).plot(
83
  kind="bar",
84
  ax=axes[0]
@@ -86,7 +80,6 @@ def analyze_data(target):
86
  axes[0].set_title(f"Bar - {col}")
87
  axes[0].tick_params(axis='x', rotation=45)
88
 
89
- # PIE
90
  df[col].astype(str).value_counts().head(6).plot(
91
  kind="pie",
92
  ax=axes[1],
@@ -107,7 +100,7 @@ def analyze_data(target):
107
 
108
 
109
  # =========================
110
- # ML TRAINING
111
  # =========================
112
  def run_ml(target):
113
 
@@ -115,7 +108,7 @@ def run_ml(target):
115
 
116
  df = df_global.copy()
117
 
118
- # encode all categorical
119
  for col in df.columns:
120
  if not pd.api.types.is_numeric_dtype(df[col]):
121
  df[col] = LabelEncoder().fit_transform(df[col].astype(str))
@@ -126,88 +119,94 @@ def run_ml(target):
126
  X_global = X
127
  y_global = y
128
 
129
- is_classification = len(np.unique(y)) <= 20
 
 
 
 
 
130
 
 
131
  X_train, X_test, y_train, y_test = train_test_split(
132
  X, y, test_size=0.2, random_state=42
133
  )
134
 
135
- results = []
136
- best_score = -999
137
-
138
- # ================= CLASSIFICATION =================
139
- if is_classification:
140
-
141
- models = {
142
- "Decision Tree": DecisionTreeClassifier(),
143
- "Random Forest": RandomForestClassifier(),
144
- "XGBoost": XGBClassifier(eval_metric="logloss")
145
- }
146
-
147
- for name, model in models.items():
148
-
149
- model.fit(X_train, y_train)
150
- pred = model.predict(X_test)
151
-
152
- acc = accuracy_score(y_test, pred)
153
-
154
- results.append({
155
- "Model": name,
156
- "Accuracy": acc,
157
- "Precision": precision_score(y_test, pred, average="weighted", zero_division=0),
158
- "Recall": recall_score(y_test, pred, average="weighted", zero_division=0)
159
- })
160
-
161
- if acc > best_score:
162
- best_score = acc
163
- best_model_obj = model
164
- best_model_name = name
165
-
166
- leaderboard = pd.DataFrame(results).sort_values("Accuracy", ascending=False)
167
-
168
- # confusion matrix
169
- cm = confusion_matrix(y_test, best_model_obj.predict(X_test))
170
-
171
- fig = plt.figure()
172
- plt.imshow(cm, cmap="Blues")
173
- plt.title(f"Best Model: {best_model_name}")
174
-
175
- for i in range(cm.shape[0]):
176
- for j in range(cm.shape[1]):
177
- plt.text(j, i, cm[i, j], ha="center", va="center")
178
-
179
- cm_path = "/tmp/cm.png"
180
- plt.savefig(cm_path)
181
- plt.close()
182
-
183
- return "Classification", leaderboard, cm_path
184
-
185
- # ================= REGRESSION =================
 
 
 
 
 
 
186
  else:
 
187
 
188
- models = {
189
- "Linear Regression": LinearRegression(),
190
- "Random Forest": RandomForestRegressor(),
191
- "XGBoost": XGBRegressor()
192
- }
193
-
194
- for name, model in models.items():
195
 
196
- model.fit(X_train, y_train)
197
- pred = model.predict(X_test)
198
 
199
- results.append({
200
- "Model": name,
201
- "MAE": mean_absolute_error(y_test, pred),
202
- "MSE": mean_squared_error(y_test, pred),
203
- "R2": r2_score(y_test, pred)
204
- })
205
 
206
- leaderboard = pd.DataFrame(results).sort_values("R2", ascending=False)
207
-
208
- best_model_name = leaderboard.iloc[0]["Model"]
209
-
210
- return "Regression", leaderboard, None
 
211
 
212
 
213
  # =========================
@@ -220,10 +219,9 @@ def feature_importance():
220
  if hasattr(best_model_obj, "feature_importances_"):
221
 
222
  plt.figure(figsize=(6,4))
223
-
224
  plt.barh(X_global.columns, best_model_obj.feature_importances_)
225
 
226
- path = "/tmp/feature.png"
227
  plt.savefig(path)
228
  plt.close()
229
 
@@ -235,33 +233,33 @@ def feature_importance():
235
  # =========================
236
  # PDF REPORT
237
  # =========================
238
- def download_pdf():
239
 
240
  global best_model_name
241
 
242
- file_path = "/tmp/report.pdf"
243
 
244
- c = canvas.Canvas(file_path)
245
 
246
  c.drawString(100, 750, "Auto ML Report")
247
  c.drawString(100, 730, f"Best Model: {best_model_name}")
248
 
249
- c.drawString(100, 700, "Generated Successfully")
250
 
251
  c.save()
252
 
253
- return file_path
254
 
255
 
256
  # =========================
257
- # COMBINED RUN
258
  # =========================
259
- def full_run(target):
260
 
261
- status, leaderboard, cm = run_ml(target)
262
  images = analyze_data(target)
263
 
264
- return status, leaderboard, cm, images
265
 
266
 
267
  # =========================
@@ -269,28 +267,26 @@ def full_run(target):
269
  # =========================
270
  with gr.Blocks() as demo:
271
 
272
- gr.Markdown("# 🚀 Auto ML Dashboard")
273
 
274
  file = gr.File()
275
 
276
- upload_btn = gr.Button("Upload")
277
 
278
  status = gr.Textbox()
279
  preview = gr.Dataframe()
280
 
281
- target = gr.Dropdown(label="Target")
282
 
283
- run_btn = gr.Button("RUN FULL ANALYSIS")
284
 
285
  ml_status = gr.Textbox()
286
- leaderboard = gr.Dataframe()
287
 
288
- cm_img = gr.Image()
 
 
289
 
290
- gallery = gr.Gallery(
291
- label="Analysis Charts (Click to Enlarge)",
292
- columns=2
293
- )
294
 
295
  feat_btn = gr.Button("Feature Importance")
296
  feat_img = gr.Image()
@@ -307,9 +303,9 @@ with gr.Blocks() as demo:
307
 
308
  # full analysis
309
  run_btn.click(
310
- full_run,
311
  target,
312
- [ml_status, leaderboard, cm_img, gallery]
313
  )
314
 
315
  # feature importance
@@ -321,7 +317,7 @@ with gr.Blocks() as demo:
321
 
322
  # pdf
323
  pdf_btn.click(
324
- download_pdf,
325
  None,
326
  pdf_file
327
  )
 
10
  from sklearn.ensemble import RandomForestClassifier
11
  from xgboost import XGBClassifier
12
 
 
 
 
 
13
  from sklearn.metrics import (
14
  accuracy_score,
15
  precision_score,
16
  recall_score,
17
+ confusion_matrix
 
 
 
18
  )
19
 
20
+ from imblearn.over_sampling import SMOTE
21
+
22
  from reportlab.pdfgen import canvas
23
 
24
  # =========================
 
58
 
59
 
60
  # =========================
61
+ # VISUALIZATION
62
  # =========================
63
  def analyze_data(target):
64
 
 
69
 
70
  cols = [c for c in df.columns if c != target]
71
 
72
+ for col in cols[:6]:
73
 
74
  fig, axes = plt.subplots(1, 2, figsize=(12, 4))
75
 
 
76
  df[col].astype(str).value_counts().head(10).plot(
77
  kind="bar",
78
  ax=axes[0]
 
80
  axes[0].set_title(f"Bar - {col}")
81
  axes[0].tick_params(axis='x', rotation=45)
82
 
 
83
  df[col].astype(str).value_counts().head(6).plot(
84
  kind="pie",
85
  ax=axes[1],
 
100
 
101
 
102
  # =========================
103
+ # ML WITH SMOTE + CLASS WEIGHT
104
  # =========================
105
  def run_ml(target):
106
 
 
108
 
109
  df = df_global.copy()
110
 
111
+ # encode
112
  for col in df.columns:
113
  if not pd.api.types.is_numeric_dtype(df[col]):
114
  df[col] = LabelEncoder().fit_transform(df[col].astype(str))
 
119
  X_global = X
120
  y_global = y
121
 
122
+ # =========================
123
+ # imbalance detection
124
+ # =========================
125
+ counts = np.bincount(y)
126
+ imbalance_ratio = min(counts) / max(counts)
127
+ is_imbalanced = imbalance_ratio < 0.5
128
 
129
+ # split
130
  X_train, X_test, y_train, y_test = train_test_split(
131
  X, y, test_size=0.2, random_state=42
132
  )
133
 
134
+ models = {
135
+ "Decision Tree": DecisionTreeClassifier(),
136
+ "Random Forest": RandomForestClassifier(),
137
+ "XGBoost": XGBClassifier(eval_metric="logloss")
138
+ }
139
+
140
+ # =========================
141
+ # RESULT TABLES
142
+ # =========================
143
+ no_results = []
144
+ cw_results = []
145
+ smote_results = []
146
+
147
+ best_score = 0
148
+
149
+ # =====================================================
150
+ # 1️⃣ NO SAMPLING
151
+ # =====================================================
152
+ for name, model in models.items():
153
+
154
+ model.fit(X_train, y_train)
155
+ pred = model.predict(X_test)
156
+
157
+ acc = accuracy_score(y_test, pred)
158
+
159
+ no_results.append({
160
+ "Model": name,
161
+ "Accuracy": acc
162
+ })
163
+
164
+ if acc > best_score:
165
+ best_score = acc
166
+ best_model_obj = model
167
+ best_model_name = name + " (No Sampling)"
168
+
169
+ # =====================================================
170
+ # 2️⃣ CLASS WEIGHT
171
+ # =====================================================
172
+ for name, model in models.items():
173
+
174
+ if name != "XGBoost":
175
+ model = DecisionTreeClassifier(class_weight="balanced") if name=="Decision Tree" else RandomForestClassifier(class_weight="balanced")
176
+
177
+ model.fit(X_train, y_train)
178
+ pred = model.predict(X_test)
179
+
180
+ cw_results.append({
181
+ "Model": name,
182
+ "Accuracy": accuracy_score(y_test, pred)
183
+ })
184
+
185
+ # =====================================================
186
+ # 3️⃣ SMOTE
187
+ # =====================================================
188
+ if is_imbalanced:
189
+ sm = SMOTE(random_state=42)
190
+ X_res, y_res = sm.fit_resample(X_train, y_train)
191
  else:
192
+ X_res, y_res = X_train, y_train
193
 
194
+ for name, model in models.items():
 
 
 
 
 
 
195
 
196
+ model.fit(X_res, y_res)
197
+ pred = model.predict(X_test)
198
 
199
+ smote_results.append({
200
+ "Model": name,
201
+ "Accuracy": accuracy_score(y_test, pred)
202
+ })
 
 
203
 
204
+ return (
205
+ f"Imbalanced Dataset: {is_imbalanced}",
206
+ pd.DataFrame(no_results),
207
+ pd.DataFrame(cw_results),
208
+ pd.DataFrame(smote_results)
209
+ )
210
 
211
 
212
  # =========================
 
219
  if hasattr(best_model_obj, "feature_importances_"):
220
 
221
  plt.figure(figsize=(6,4))
 
222
  plt.barh(X_global.columns, best_model_obj.feature_importances_)
223
 
224
+ path = "/tmp/feat.png"
225
  plt.savefig(path)
226
  plt.close()
227
 
 
233
  # =========================
234
  # PDF REPORT
235
  # =========================
236
+ def download_report():
237
 
238
  global best_model_name
239
 
240
+ path = "/tmp/report.pdf"
241
 
242
+ c = canvas.Canvas(path)
243
 
244
  c.drawString(100, 750, "Auto ML Report")
245
  c.drawString(100, 730, f"Best Model: {best_model_name}")
246
 
247
+ c.drawString(100, 700, "Includes SMOTE + Class Weight Comparison")
248
 
249
  c.save()
250
 
251
+ return path
252
 
253
 
254
  # =========================
255
+ # FULL ANALYSIS
256
  # =========================
257
+ def full_analysis(target):
258
 
259
+ ml_status, no_df, cw_df, smote_df = run_ml(target)
260
  images = analyze_data(target)
261
 
262
+ return ml_status, no_df, cw_df, smote_df, images
263
 
264
 
265
  # =========================
 
267
  # =========================
268
  with gr.Blocks() as demo:
269
 
270
+ gr.Markdown("# 🚀 Advanced AutoML System (SMOTE + Class Weight)")
271
 
272
  file = gr.File()
273
 
274
+ upload_btn = gr.Button("Upload Data")
275
 
276
  status = gr.Textbox()
277
  preview = gr.Dataframe()
278
 
279
+ target = gr.Dropdown(label="Select Target")
280
 
281
+ run_btn = gr.Button("Run Full Analysis")
282
 
283
  ml_status = gr.Textbox()
 
284
 
285
+ no_table = gr.Dataframe(label="No Sampling")
286
+ cw_table = gr.Dataframe(label="Class Weight")
287
+ smote_table = gr.Dataframe(label="SMOTE")
288
 
289
+ gallery = gr.Gallery(label="Visualizations", columns=2)
 
 
 
290
 
291
  feat_btn = gr.Button("Feature Importance")
292
  feat_img = gr.Image()
 
303
 
304
  # full analysis
305
  run_btn.click(
306
+ full_analysis,
307
  target,
308
+ [ml_status, no_table, cw_table, smote_table, gallery]
309
  )
310
 
311
  # feature importance
 
317
 
318
  # pdf
319
  pdf_btn.click(
320
+ download_report,
321
  None,
322
  pdf_file
323
  )