Spaces:

szili2011
/

TrainAI

Running

App Files Community

szili2011 committed on Aug 16, 2025

Commit

0987ee1

verified ·

1 Parent(s): fbb3355

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -54

app.py CHANGED Viewed

@@ -28,12 +28,6 @@ from sklearn.metrics import accuracy_score, classification_report, mean_squared_
 from sklearn.datasets import make_classification, make_regression
 import joblib
-# --- Core Machine Learning (PyTorch) ---
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from torch.utils.data import TensorDataset, DataLoader
 # --- ONNX Support for Model Interoperability ---
 import skl2onnx
 from skl2onnx import convert_sklearn
@@ -60,12 +54,6 @@ TEMP_DIR = "temp_outputs"
 os.makedirs(TEMP_DIR, exist_ok=True)
 MAX_GENERATED_ROWS = 50000
 MAX_GENERATED_COLS = 100
-PARAM_RANGES = collections.OrderedDict([
-    ("Tiny (<10k)", (0, 10000)),
-    ("Small (10k-50k)", (10000, 50000)),
-    ("Medium (50k-250k)", (50000, 250000)),
-    ("Large (250k-1M)", (250000, 1000000)),
-])
 # --- Helper Functions ---
 def get_temp_filepath(filename_base, extension):
@@ -73,33 +61,6 @@ def get_temp_filepath(filename_base, extension):
     clean_extension = extension.lstrip('.')
     return os.path.join(TEMP_DIR, f"{filename_base}_{time.strftime('%Y%m%d-%H%M%S')}.{clean_extension}")
-# --- PyTorch Model Definitions ---
-class SimpleMLP(nn.Module):
-    """A simple Multi-Layer Perceptron."""
-    def __init__(self, input_dim, hidden_layers_str, output_dim, activation_fn_str="relu", task_type="classification"):
-        super().__init__()
-        layers = []
-        hidden_units = [int(x.strip()) for x in hidden_layers_str.split(',') if x.strip()]
-        current_dim = input_dim
-        for h_units in hidden_units:
-            layers.append(nn.Linear(current_dim, h_units))
-            if activation_fn_str.lower() == "relu": layers.append(nn.ReLU())
-            elif activation_fn_str.lower() == "tanh": layers.append(nn.Tanh())
-            elif activation_fn_str.lower() == "sigmoid": layers.append(nn.Sigmoid())
-            current_dim = h_units
-        layers.append(nn.Linear(current_dim, output_dim))
-        if task_type == "classification" and output_dim == 1:
-            layers.append(nn.Sigmoid()) # For BCELoss
-        # For multi-class, CrossEntropyLoss expects raw logits, so no final activation.
-        self.network = nn.Sequential(*layers)
-    def forward(self, x):
-        return self.network(x)
 # --- Dataset and Preprocessing Logic ---
 def generate_dataset_backend(task_type, n_samples, n_features, n_classes_or_informative, dataset_format):
     """Generates synthetic data based on user specifications."""
@@ -145,11 +106,13 @@ def train_model_sklearn(data_input, target_column, task_type, model_name, model_
     logs += f"\n--- Training Scikit-learn Model: {model_name} ---\n"
     try:
-        if isinstance(data_input, str): # Is a filepath
             if data_input.endswith('.csv'): df = pd.read_csv(data_input)
             else: raise ValueError("Unsupported file type for upload.")
-        else: # Is a DataFrame from generation
-            df = data_input
         if target_column not in df.columns:
             raise ValueError(f"Target column '{target_column}' not found.")
@@ -206,6 +169,7 @@ def train_model_sklearn(data_input, target_column, task_type, model_name, model_
         # Model Saving
         model_filename_base = f"sklearn_{model_name.replace(' ', '_').lower()}"
         if model_output_format == ".pkl (Scikit-learn)":
             model_path = get_temp_filepath(model_filename_base, "pkl")
             joblib.dump(pipeline, model_path)
@@ -219,7 +183,8 @@ def train_model_sklearn(data_input, target_column, task_type, model_name, model_
                 else:
                     initial_types.append((col_name, StringTensorType([None, 1])))
-            onnx_model = convert_sklearn(pipeline, initial_types=initial_types, target_opset=12)
             with open(model_path, "wb") as f: f.write(onnx_model.SerializeToString())
             logs += f"Model pipeline saved to {os.path.basename(model_path)} as ONNX.\n"
@@ -248,14 +213,10 @@ def train_model_wrapper(data_input, target_column, task_type, model_family, mode
         logs, metrics, model_path = train_model_sklearn(data_input, target_column, task_type, model_specific, model_output_format, logs)
         return logs, metrics, model_path, None # No plot for sklearn
-    # Placeholder for PyTorch integration if added back
-    elif model_family == "PyTorch (Neural Networks)":
-        logs += "PyTorch training is not fully integrated in this version yet.\n"
-        return logs, "PyTorch not available.", None, None
     else:
-        logs += f"Unknown model family: {model_family}\n"
-        return logs, "Error: Unknown model family.", None, None
 # --- Gradio UI Definition ---
 def update_model_options(task_choice, model_family_choice):
@@ -266,7 +227,6 @@ def update_model_options(task_choice, model_family_choice):
             choices = ["Logistic Regression", "Random Forest Classifier", "Support Vector Machine (SVM) Classifier"]
         elif task_choice == "Tabular Regression":
             choices = ["Linear Regression", "Random Forest Regressor", "Support Vector Machine (SVR) Regressor"]
-    # Add PyTorch options here if needed
     value = choices[0] if choices else None
     return gr.update(choices=choices, value=value, visible=bool(choices))
@@ -276,7 +236,6 @@ def update_model_output_formats(model_family_choice):
     formats = []
     if model_family_choice == "Scikit-learn (Classical ML)":
         formats = [".pkl (Scikit-learn)", ".onnx (ONNX)"]
-    # Add PyTorch formats here
     value = formats[0] if formats else None
     return gr.update(choices=formats, value=value)
@@ -306,7 +265,11 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"))
             generate_dataset_btn = gr.Button("Generate & Preview Dataset", variant="secondary")
             target_column_name_txt = gr.Textbox(label="Target Column Name", value="target", interactive=True)
-            dataset_preview_df = gr.DataFrame(label="Dataset Preview (First 5 Rows)", interactive=False, height=200)
             generated_dataset_download_file = gr.File(label="Download Generated Dataset", interactive=False)
         with gr.TabItem("3. Train Model & Get Results"):
@@ -317,7 +280,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"))
             training_log_txt = gr.Textbox(label="Training Log & Status", lines=15, interactive=False, max_lines=50)
             evaluation_metrics_txt = gr.Textbox(label="Evaluation Metrics", lines=7, interactive=False)
             download_trained_model_file = gr.File(label="Download Trained Model", interactive=False)
-            loss_plot_img = gr.Plot(label="Training Loss Curve (PyTorch only)", visible=False) # Hide for now
     # --- Event Handlers ---

 from sklearn.datasets import make_classification, make_regression
 import joblib
 # --- ONNX Support for Model Interoperability ---
 import skl2onnx
 from skl2onnx import convert_sklearn
 os.makedirs(TEMP_DIR, exist_ok=True)
 MAX_GENERATED_ROWS = 50000
 MAX_GENERATED_COLS = 100
 # --- Helper Functions ---
 def get_temp_filepath(filename_base, extension):
     clean_extension = extension.lstrip('.')
     return os.path.join(TEMP_DIR, f"{filename_base}_{time.strftime('%Y%m%d-%H%M%S')}.{clean_extension}")
 # --- Dataset and Preprocessing Logic ---
 def generate_dataset_backend(task_type, n_samples, n_features, n_classes_or_informative, dataset_format):
     """Generates synthetic data based on user specifications."""
     logs += f"\n--- Training Scikit-learn Model: {model_name} ---\n"
     try:
+        # Load data if it's a filepath, otherwise use the DataFrame directly
+        df = data_input
+        if isinstance(data_input, str):
             if data_input.endswith('.csv'): df = pd.read_csv(data_input)
+            elif data_input.endswith('.json'): df = pd.read_json(data_input, lines=True)
+            elif data_input.endswith('.parquet'): df = pd.read_parquet(data_input)
             else: raise ValueError("Unsupported file type for upload.")
         if target_column not in df.columns:
             raise ValueError(f"Target column '{target_column}' not found.")
         # Model Saving
         model_filename_base = f"sklearn_{model_name.replace(' ', '_').lower()}"
+        model_path = None
         if model_output_format == ".pkl (Scikit-learn)":
             model_path = get_temp_filepath(model_filename_base, "pkl")
             joblib.dump(pipeline, model_path)
                 else:
                     initial_types.append((col_name, StringTensorType([None, 1])))
+            options = {'zipmap': False} if task_type == "Tabular Classification" else {}
+            onnx_model = convert_sklearn(pipeline, initial_types=initial_types, target_opset=12, options=options)
             with open(model_path, "wb") as f: f.write(onnx_model.SerializeToString())
             logs += f"Model pipeline saved to {os.path.basename(model_path)} as ONNX.\n"
         logs, metrics, model_path = train_model_sklearn(data_input, target_column, task_type, model_specific, model_output_format, logs)
         return logs, metrics, model_path, None # No plot for sklearn
+    # Placeholder for future PyTorch integration
     else:
+        logs += f"The selected model family '{model_family}' is not supported yet.\n"
+        return logs, "Error: Model family not supported.", None, None
 # --- Gradio UI Definition ---
 def update_model_options(task_choice, model_family_choice):
             choices = ["Logistic Regression", "Random Forest Classifier", "Support Vector Machine (SVM) Classifier"]
         elif task_choice == "Tabular Regression":
             choices = ["Linear Regression", "Random Forest Regressor", "Support Vector Machine (SVR) Regressor"]
     value = choices[0] if choices else None
     return gr.update(choices=choices, value=value, visible=bool(choices))
     formats = []
     if model_family_choice == "Scikit-learn (Classical ML)":
         formats = [".pkl (Scikit-learn)", ".onnx (ONNX)"]
     value = formats[0] if formats else None
     return gr.update(choices=formats, value=value)
             generate_dataset_btn = gr.Button("Generate & Preview Dataset", variant="secondary")
             target_column_name_txt = gr.Textbox(label="Target Column Name", value="target", interactive=True)
+            # --- FIX: Replaced 'height' with 'row_count' ---
+            dataset_preview_df = gr.DataFrame(label="Dataset Preview (First 5 Rows)", interactive=False, row_count=5)
+            # --- END FIX ---
             generated_dataset_download_file = gr.File(label="Download Generated Dataset", interactive=False)
         with gr.TabItem("3. Train Model & Get Results"):
             training_log_txt = gr.Textbox(label="Training Log & Status", lines=15, interactive=False, max_lines=50)
             evaluation_metrics_txt = gr.Textbox(label="Evaluation Metrics", lines=7, interactive=False)
             download_trained_model_file = gr.File(label="Download Trained Model", interactive=False)
+            loss_plot_img = gr.Plot(label="Training Loss Curve (PyTorch only)", visible=False) # Hide as PyTorch is not used
     # --- Event Handlers ---