Update app.py
app.py CHANGED
@@ -14,10 +14,11 @@ import shutil
 import ast
 from smolagents import HfApiModel, CodeAgent
 from huggingface_hub import login
-from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split, cross_val_score
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
 from sklearn.metrics import ConfusionMatrixDisplay
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.linear_model import LogisticRegression
 from sklearn.preprocessing import LabelEncoder
 from PIL import Image
 
@@ -189,30 +190,62 @@ def train_model(_):
 wandb_run = wandb.init(project="huggingface-data-analysis", name=f"Optuna_Run_{run_counter}", reinit=True)
 run_counter += 1
 
+import optuna
+from sklearn.model_selection import train_test_split, cross_val_score
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.linear_model import LogisticRegression
+import pandas as pd
+
+
 def prepare_data():
-
-    global df_global, X_train, X_test, y_train, y_test
-
-    # Check if df_global is None, which means no file has been uploaded yet
+    global df_global
     if df_global is None:
-        raise ValueError("
+        raise ValueError("No dataset uploaded.")
 
     target = df_global.columns[-1]
-    X = df_global.
+    X = df_global.iloc[:, :-1]
    y = df_global[target]
 
-
-
+    return train_test_split(X, y, test_size=0.2, random_state=42)
+
+def make_objective(X_train, y_train):
+    def objective(trial):
+        model_type = trial.suggest_categorical("model_type", ["RandomForest", "GradientBoosting", "LogisticRegression"])
+
+        if model_type == "RandomForest":
+            model = RandomForestClassifier(
+                n_estimators=trial.suggest_int("n_estimators", 50, 300),
+                max_depth=trial.suggest_int("max_depth", 2, 32)
+            )
+        elif model_type == "GradientBoosting":
+            model = GradientBoostingClassifier(
+                n_estimators=trial.suggest_int("n_estimators", 50, 300),
+                learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
+                max_depth=trial.suggest_int("max_depth", 2, 32)
+            )
+        else:
+            model = LogisticRegression(
+                C=trial.suggest_float("C", 1e-3, 1e2),
+                solver="liblinear"
+            )
+
+        score = cross_val_score(model, X_train, y_train, cv=3).mean()
+        return score
+
+    return objective
+
+# ✅ Call the functions in order
+X_train, X_test, y_train, y_test = prepare_data()
+objective = make_objective(X_train, y_train)  # 👈 wrap with your train data
+
+# ✅ Now run optimization
+study = optuna.create_study(direction="maximize")
+study.optimize(objective, n_trials=15)
+
+# ✅ Print the best params
+print("Best trial:")
+print(study.best_trial)
 
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
-    return X_train, X_test, y_train, y_test
-
-# Prepare the data before the optimization process, with a check for df_global
-try:
-    X_train, X_test, y_train, y_test = prepare_data()
-except ValueError as e:
-    print(e)  # You can log this or return it as a message in the UI
-    # Handle the error by returning or setting defaults as needed.
 
 
 def objective(trial):