AIDataAgentProjectFinal

Paused

App Files Files Community

pavanmutha committed on Apr 13, 2025

Commit

c1b291b

verified ·

1 Parent(s): f904d02

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -21

app.py CHANGED Viewed

@@ -189,40 +189,49 @@ def train_model(_):
     wandb_run = wandb.init(project="huggingface-data-analysis", name=f"Optuna_Run_{run_counter}", reinit=True)
     run_counter += 1
     target = df_global.columns[-1]
     X = df_global.drop(target, axis=1)
     y = df_global[target]
-    if y.dtype == "object":
         y = LabelEncoder().fit_transform(y)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
-    # Error analysis
-    error_df = X_test.copy()
-    error_df["actual"] = y_test
-    error_df["predicted"] = y_pred
-    error_df["error"] = error_df["actual"] != error_df["predicted"]
-    common_errors = error_df[error_df["error"]].groupby(["actual", "predicted"]).size().reset_index(name='count')
-    def generate_report(metrics_df, trials_df, common_errors_df):
-        report = f"""
-        # Model Training Report
-        ## Metrics
-        {metrics_df.to_markdown(index=False)}
-        ## Top Trials
-        {trials_df.to_markdown(index=False)}
-        ## Common Errors
-        {common_errors_df.to_markdown(index=False)}
-        _Generated on {time.strftime('%Y-%m-%d %H:%M:%S')}_
-        """
-        with open("model_report.md", "w") as f:
-            f.write(report)
-        return "Report saved to model_report.md"

     wandb_run = wandb.init(project="huggingface-data-analysis", name=f"Optuna_Run_{run_counter}", reinit=True)
     run_counter += 1
+def prepare_data():
+    """Prepares the dataset by splitting into X and y, and returns training and test sets."""
+    global X_train, X_test, y_train, y_test
     target = df_global.columns[-1]
     X = df_global.drop(target, axis=1)
     y = df_global[target]
+    if y.dtype == 'object':
         y = LabelEncoder().fit_transform(y)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+    return X_train, X_test, y_train, y_test
+# Prepare the data before the optimization process
+X_train, X_test, y_train, y_test = prepare_data()
+def objective(trial):
+    params = {
+        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
+        "max_depth": trial.suggest_int("max_depth", 3, 10),
+    }
+    model = RandomForestClassifier(**params)
+    score = cross_val_score(model, X_train, y_train, cv=3).mean()  # Now X_train and y_train are defined
+    wandb.log(params | {"cv_score": score})
+    return score
+study = optuna.create_study(direction="maximize")
+study.optimize(objective, n_trials=15)
+best_params = study.best_params
+model = RandomForestClassifier(**best_params)
+model.fit(X_train, y_train)
+y_pred = model.predict(X_test)
+metrics = {
+    "accuracy": accuracy_score(y_test, y_pred),
+    "precision": precision_score(y_test, y_pred, average="weighted", zero_division=0),
+    "recall": recall_score(y_test, y_pred, average="weighted", zero_division=0),
+    "f1_score": f1_score(y_test, y_pred, average="weighted", zero_division=0),
+}
+wandb.log(metrics)
+wandb_run.finish()