szili2011 committed on
Commit
0987ee1
·
verified ·
1 Parent(s): fbb3355

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -54
app.py CHANGED
@@ -28,12 +28,6 @@ from sklearn.metrics import accuracy_score, classification_report, mean_squared_
28
  from sklearn.datasets import make_classification, make_regression
29
  import joblib
30
 
31
- # --- Core Machine Learning (PyTorch) ---
32
- import torch
33
- import torch.nn as nn
34
- import torch.optim as optim
35
- from torch.utils.data import TensorDataset, DataLoader
36
-
37
  # --- ONNX Support for Model Interoperability ---
38
  import skl2onnx
39
  from skl2onnx import convert_sklearn
@@ -60,12 +54,6 @@ TEMP_DIR = "temp_outputs"
60
  os.makedirs(TEMP_DIR, exist_ok=True)
61
  MAX_GENERATED_ROWS = 50000
62
  MAX_GENERATED_COLS = 100
63
- PARAM_RANGES = collections.OrderedDict([
64
- ("Tiny (<10k)", (0, 10000)),
65
- ("Small (10k-50k)", (10000, 50000)),
66
- ("Medium (50k-250k)", (50000, 250000)),
67
- ("Large (250k-1M)", (250000, 1000000)),
68
- ])
69
 
70
  # --- Helper Functions ---
71
  def get_temp_filepath(filename_base, extension):
@@ -73,33 +61,6 @@ def get_temp_filepath(filename_base, extension):
73
  clean_extension = extension.lstrip('.')
74
  return os.path.join(TEMP_DIR, f"{filename_base}_{time.strftime('%Y%m%d-%H%M%S')}.{clean_extension}")
75
 
76
- # --- PyTorch Model Definitions ---
77
- class SimpleMLP(nn.Module):
78
- """A simple Multi-Layer Perceptron."""
79
- def __init__(self, input_dim, hidden_layers_str, output_dim, activation_fn_str="relu", task_type="classification"):
80
- super().__init__()
81
- layers = []
82
- hidden_units = [int(x.strip()) for x in hidden_layers_str.split(',') if x.strip()]
83
-
84
- current_dim = input_dim
85
- for h_units in hidden_units:
86
- layers.append(nn.Linear(current_dim, h_units))
87
- if activation_fn_str.lower() == "relu": layers.append(nn.ReLU())
88
- elif activation_fn_str.lower() == "tanh": layers.append(nn.Tanh())
89
- elif activation_fn_str.lower() == "sigmoid": layers.append(nn.Sigmoid())
90
- current_dim = h_units
91
-
92
- layers.append(nn.Linear(current_dim, output_dim))
93
-
94
- if task_type == "classification" and output_dim == 1:
95
- layers.append(nn.Sigmoid()) # For BCELoss
96
- # For multi-class, CrossEntropyLoss expects raw logits, so no final activation.
97
-
98
- self.network = nn.Sequential(*layers)
99
-
100
- def forward(self, x):
101
- return self.network(x)
102
-
103
  # --- Dataset and Preprocessing Logic ---
104
  def generate_dataset_backend(task_type, n_samples, n_features, n_classes_or_informative, dataset_format):
105
  """Generates synthetic data based on user specifications."""
@@ -145,11 +106,13 @@ def train_model_sklearn(data_input, target_column, task_type, model_name, model_
145
  logs += f"\n--- Training Scikit-learn Model: {model_name} ---\n"
146
 
147
  try:
148
- if isinstance(data_input, str): # Is a filepath
 
 
149
  if data_input.endswith('.csv'): df = pd.read_csv(data_input)
 
 
150
  else: raise ValueError("Unsupported file type for upload.")
151
- else: # Is a DataFrame from generation
152
- df = data_input
153
 
154
  if target_column not in df.columns:
155
  raise ValueError(f"Target column '{target_column}' not found.")
@@ -206,6 +169,7 @@ def train_model_sklearn(data_input, target_column, task_type, model_name, model_
206
 
207
  # Model Saving
208
  model_filename_base = f"sklearn_{model_name.replace(' ', '_').lower()}"
 
209
  if model_output_format == ".pkl (Scikit-learn)":
210
  model_path = get_temp_filepath(model_filename_base, "pkl")
211
  joblib.dump(pipeline, model_path)
@@ -219,7 +183,8 @@ def train_model_sklearn(data_input, target_column, task_type, model_name, model_
219
  else:
220
  initial_types.append((col_name, StringTensorType([None, 1])))
221
 
222
- onnx_model = convert_sklearn(pipeline, initial_types=initial_types, target_opset=12)
 
223
  with open(model_path, "wb") as f: f.write(onnx_model.SerializeToString())
224
  logs += f"Model pipeline saved to {os.path.basename(model_path)} as ONNX.\n"
225
 
@@ -248,14 +213,10 @@ def train_model_wrapper(data_input, target_column, task_type, model_family, mode
248
  logs, metrics, model_path = train_model_sklearn(data_input, target_column, task_type, model_specific, model_output_format, logs)
249
  return logs, metrics, model_path, None # No plot for sklearn
250
 
251
- # Placeholder for PyTorch integration if added back
252
- elif model_family == "PyTorch (Neural Networks)":
253
- logs += "PyTorch training is not fully integrated in this version yet.\n"
254
- return logs, "PyTorch not available.", None, None
255
-
256
  else:
257
- logs += f"Unknown model family: {model_family}\n"
258
- return logs, "Error: Unknown model family.", None, None
259
 
260
  # --- Gradio UI Definition ---
261
  def update_model_options(task_choice, model_family_choice):
@@ -266,7 +227,6 @@ def update_model_options(task_choice, model_family_choice):
266
  choices = ["Logistic Regression", "Random Forest Classifier", "Support Vector Machine (SVM) Classifier"]
267
  elif task_choice == "Tabular Regression":
268
  choices = ["Linear Regression", "Random Forest Regressor", "Support Vector Machine (SVR) Regressor"]
269
- # Add PyTorch options here if needed
270
 
271
  value = choices[0] if choices else None
272
  return gr.update(choices=choices, value=value, visible=bool(choices))
@@ -276,7 +236,6 @@ def update_model_output_formats(model_family_choice):
276
  formats = []
277
  if model_family_choice == "Scikit-learn (Classical ML)":
278
  formats = [".pkl (Scikit-learn)", ".onnx (ONNX)"]
279
- # Add PyTorch formats here
280
 
281
  value = formats[0] if formats else None
282
  return gr.update(choices=formats, value=value)
@@ -306,7 +265,11 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"))
306
  generate_dataset_btn = gr.Button("Generate & Preview Dataset", variant="secondary")
307
 
308
  target_column_name_txt = gr.Textbox(label="Target Column Name", value="target", interactive=True)
309
- dataset_preview_df = gr.DataFrame(label="Dataset Preview (First 5 Rows)", interactive=False, height=200)
 
 
 
 
310
  generated_dataset_download_file = gr.File(label="Download Generated Dataset", interactive=False)
311
 
312
  with gr.TabItem("3. Train Model & Get Results"):
@@ -317,7 +280,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"))
317
  training_log_txt = gr.Textbox(label="Training Log & Status", lines=15, interactive=False, max_lines=50)
318
  evaluation_metrics_txt = gr.Textbox(label="Evaluation Metrics", lines=7, interactive=False)
319
  download_trained_model_file = gr.File(label="Download Trained Model", interactive=False)
320
- loss_plot_img = gr.Plot(label="Training Loss Curve (PyTorch only)", visible=False) # Hide for now
321
 
322
  # --- Event Handlers ---
323
 
 
28
  from sklearn.datasets import make_classification, make_regression
29
  import joblib
30
 
 
 
 
 
 
 
31
  # --- ONNX Support for Model Interoperability ---
32
  import skl2onnx
33
  from skl2onnx import convert_sklearn
 
54
  os.makedirs(TEMP_DIR, exist_ok=True)
55
  MAX_GENERATED_ROWS = 50000
56
  MAX_GENERATED_COLS = 100
 
 
 
 
 
 
57
 
58
  # --- Helper Functions ---
59
  def get_temp_filepath(filename_base, extension):
 
61
  clean_extension = extension.lstrip('.')
62
  return os.path.join(TEMP_DIR, f"{filename_base}_{time.strftime('%Y%m%d-%H%M%S')}.{clean_extension}")
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  # --- Dataset and Preprocessing Logic ---
65
  def generate_dataset_backend(task_type, n_samples, n_features, n_classes_or_informative, dataset_format):
66
  """Generates synthetic data based on user specifications."""
 
106
  logs += f"\n--- Training Scikit-learn Model: {model_name} ---\n"
107
 
108
  try:
109
+ # Load data if it's a filepath, otherwise use the DataFrame directly
110
+ df = data_input
111
+ if isinstance(data_input, str):
112
  if data_input.endswith('.csv'): df = pd.read_csv(data_input)
113
+ elif data_input.endswith('.json'): df = pd.read_json(data_input, lines=True)
114
+ elif data_input.endswith('.parquet'): df = pd.read_parquet(data_input)
115
  else: raise ValueError("Unsupported file type for upload.")
 
 
116
 
117
  if target_column not in df.columns:
118
  raise ValueError(f"Target column '{target_column}' not found.")
 
169
 
170
  # Model Saving
171
  model_filename_base = f"sklearn_{model_name.replace(' ', '_').lower()}"
172
+ model_path = None
173
  if model_output_format == ".pkl (Scikit-learn)":
174
  model_path = get_temp_filepath(model_filename_base, "pkl")
175
  joblib.dump(pipeline, model_path)
 
183
  else:
184
  initial_types.append((col_name, StringTensorType([None, 1])))
185
 
186
+ options = {'zipmap': False} if task_type == "Tabular Classification" else {}
187
+ onnx_model = convert_sklearn(pipeline, initial_types=initial_types, target_opset=12, options=options)
188
  with open(model_path, "wb") as f: f.write(onnx_model.SerializeToString())
189
  logs += f"Model pipeline saved to {os.path.basename(model_path)} as ONNX.\n"
190
 
 
213
  logs, metrics, model_path = train_model_sklearn(data_input, target_column, task_type, model_specific, model_output_format, logs)
214
  return logs, metrics, model_path, None # No plot for sklearn
215
 
216
+ # Placeholder for future PyTorch integration
 
 
 
 
217
  else:
218
+ logs += f"The selected model family '{model_family}' is not supported yet.\n"
219
+ return logs, "Error: Model family not supported.", None, None
220
 
221
  # --- Gradio UI Definition ---
222
  def update_model_options(task_choice, model_family_choice):
 
227
  choices = ["Logistic Regression", "Random Forest Classifier", "Support Vector Machine (SVM) Classifier"]
228
  elif task_choice == "Tabular Regression":
229
  choices = ["Linear Regression", "Random Forest Regressor", "Support Vector Machine (SVR) Regressor"]
 
230
 
231
  value = choices[0] if choices else None
232
  return gr.update(choices=choices, value=value, visible=bool(choices))
 
236
  formats = []
237
  if model_family_choice == "Scikit-learn (Classical ML)":
238
  formats = [".pkl (Scikit-learn)", ".onnx (ONNX)"]
 
239
 
240
  value = formats[0] if formats else None
241
  return gr.update(choices=formats, value=value)
 
265
  generate_dataset_btn = gr.Button("Generate & Preview Dataset", variant="secondary")
266
 
267
  target_column_name_txt = gr.Textbox(label="Target Column Name", value="target", interactive=True)
268
+
269
+ # --- FIX: Replaced 'height' with 'row_count' ---
270
+ dataset_preview_df = gr.DataFrame(label="Dataset Preview (First 5 Rows)", interactive=False, row_count=5)
271
+ # --- END FIX ---
272
+
273
  generated_dataset_download_file = gr.File(label="Download Generated Dataset", interactive=False)
274
 
275
  with gr.TabItem("3. Train Model & Get Results"):
 
280
  training_log_txt = gr.Textbox(label="Training Log & Status", lines=15, interactive=False, max_lines=50)
281
  evaluation_metrics_txt = gr.Textbox(label="Evaluation Metrics", lines=7, interactive=False)
282
  download_trained_model_file = gr.File(label="Download Trained Model", interactive=False)
283
+ loss_plot_img = gr.Plot(label="Training Loss Curve (PyTorch only)", visible=False) # Hide as PyTorch is not used
284
 
285
  # --- Event Handlers ---
286