pavanmutha committed on
Commit
c1b291b
·
verified ·
1 Parent(s): f904d02

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -21
app.py CHANGED
@@ -189,40 +189,49 @@ def train_model(_):
189
  wandb_run = wandb.init(project="huggingface-data-analysis", name=f"Optuna_Run_{run_counter}", reinit=True)
190
  run_counter += 1
191
 
 
 
 
192
  target = df_global.columns[-1]
193
  X = df_global.drop(target, axis=1)
194
  y = df_global[target]
195
 
196
- if y.dtype == "object":
197
  y = LabelEncoder().fit_transform(y)
198
 
199
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
 
200
 
201
- # Error analysis
202
- error_df = X_test.copy()
203
- error_df["actual"] = y_test
204
- error_df["predicted"] = y_pred
205
- error_df["error"] = error_df["actual"] != error_df["predicted"]
206
- common_errors = error_df[error_df["error"]].groupby(["actual", "predicted"]).size().reset_index(name='count')
207
 
208
- def generate_report(metrics_df, trials_df, common_errors_df):
209
- report = f"""
210
- # Model Training Report
 
 
 
 
 
 
211
 
212
- ## Metrics
213
- {metrics_df.to_markdown(index=False)}
214
 
215
- ## Top Trials
216
- {trials_df.to_markdown(index=False)}
 
 
217
 
218
- ## Common Errors
219
- {common_errors_df.to_markdown(index=False)}
 
 
 
 
 
 
220
 
221
- _Generated on {time.strftime('%Y-%m-%d %H:%M:%S')}_
222
- """
223
- with open("model_report.md", "w") as f:
224
- f.write(report)
225
- return "Report saved to model_report.md"
226
 
227
 
228
 
 
189
  wandb_run = wandb.init(project="huggingface-data-analysis", name=f"Optuna_Run_{run_counter}", reinit=True)
190
  run_counter += 1
191
 
192
+ def prepare_data():
193
+ """Prepares the dataset by splitting into X and y, and returns training and test sets."""
194
+ global X_train, X_test, y_train, y_test
195
  target = df_global.columns[-1]
196
  X = df_global.drop(target, axis=1)
197
  y = df_global[target]
198
 
199
+ if y.dtype == 'object':
200
  y = LabelEncoder().fit_transform(y)
201
 
202
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
203
+ return X_train, X_test, y_train, y_test
204
 
205
+ # Prepare the data before the optimization process
206
+ X_train, X_test, y_train, y_test = prepare_data()
 
 
 
 
207
 
208
+ def objective(trial):
209
+ params = {
210
+ "n_estimators": trial.suggest_int("n_estimators", 50, 200),
211
+ "max_depth": trial.suggest_int("max_depth", 3, 10),
212
+ }
213
+ model = RandomForestClassifier(**params)
214
+ score = cross_val_score(model, X_train, y_train, cv=3).mean() # Now X_train and y_train are defined
215
+ wandb.log(params | {"cv_score": score})
216
+ return score
217
 
218
+ study = optuna.create_study(direction="maximize")
219
+ study.optimize(objective, n_trials=15)
220
 
221
+ best_params = study.best_params
222
+ model = RandomForestClassifier(**best_params)
223
+ model.fit(X_train, y_train)
224
+ y_pred = model.predict(X_test)
225
 
226
+ metrics = {
227
+ "accuracy": accuracy_score(y_test, y_pred),
228
+ "precision": precision_score(y_test, y_pred, average="weighted", zero_division=0),
229
+ "recall": recall_score(y_test, y_pred, average="weighted", zero_division=0),
230
+ "f1_score": f1_score(y_test, y_pred, average="weighted", zero_division=0),
231
+ }
232
+ wandb.log(metrics)
233
+ wandb_run.finish()
234
 
 
 
 
 
 
235
 
236
 
237