# --- Standard Library Imports ---
import os
import time
import traceback
import tempfile
import json
import math
import uuid
import collections
import collections.abc  # For Gradio compatibility with newer Python versions

# --- UI Framework ---
import gradio as gr

# --- Data Handling & Numerical Ops ---
import pandas as pd
import numpy as np

# --- Core Machine Learning (Scikit-learn) ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.datasets import make_classification, make_regression
import joblib

# --- ONNX Support for Model Interoperability ---
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType

# --- Visualization ---
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend for server environments
import matplotlib.pyplot as plt

# --- Graceful ONNX Runtime Handling ---
# This addresses the system-level ImportError on platforms like Hugging Face Spaces.
try:
    import onnxruntime as rt
    ONNX_RUNTIME_AVAILABLE = True
except ImportError:
    ONNX_RUNTIME_AVAILABLE = False
    print("Warning: onnxruntime could not be imported. ONNX model validation will be skipped.")
# --- End of Imports ---

# --- Global Variables & Constants ---
TEMP_DIR = "temp_outputs"
os.makedirs(TEMP_DIR, exist_ok=True)
MAX_GENERATED_ROWS = 50000
MAX_GENERATED_COLS = 100


# --- Helper Functions ---
def get_temp_filepath(filename_base, extension):
    """Return a unique file path inside TEMP_DIR.

    Args:
        filename_base: Stem of the file name (without extension).
        extension: File extension, with or without a leading dot.

    Returns:
        A path of the form ``TEMP_DIR/<base>_<timestamp>_<random>.<ext>``.
    """
    clean_extension = extension.lstrip('.')
    timestamp = time.strftime('%Y%m%d-%H%M%S')
    # The timestamp only has one-second resolution, so two calls within the
    # same second would otherwise produce the same path; a short random
    # suffix keeps the names distinct.
    unique_suffix = uuid.uuid4().hex[:8]
    return os.path.join(TEMP_DIR, f"{filename_base}_{timestamp}_{unique_suffix}.{clean_extension}")


def _classification_layout(n_features, n_classes):
    """Choose (n_informative, n_clusters_per_class) accepted by make_classification.

    scikit-learn requires ``n_classes * n_clusters_per_class <= 2 ** n_informative``.
    The preferred layout (half of the features informative, two clusters per
    class) is kept whenever it is valid; otherwise the number of informative
    features is raised, and as a last resort one cluster per class is used.

    Raises:
        ValueError: If no layout can fit ``n_classes`` into ``n_features``.
    """
    n_informative = max(1, n_features // 2)
    n_clusters = 2
    if 2 ** n_informative < n_classes * n_clusters:
        # Integer ceil(log2(x)) via bit_length avoids float rounding issues.
        needed_bits = (n_classes * n_clusters - 1).bit_length()
        n_informative = min(n_features, needed_bits)
    if 2 ** n_informative < n_classes * n_clusters:
        n_clusters = 1
    if 2 ** n_informative < n_classes:
        raise ValueError(
            f"Cannot generate {n_classes} classes from only {n_features} feature(s); "
            "increase the number of features or reduce the number of classes."
        )
    return n_informative, n_clusters


# --- Dataset and Preprocessing Logic ---
def generate_dataset_backend(task_type, n_samples, n_features, n_classes_or_informative, dataset_format):
    """Generate a synthetic dataset and save it to a temporary file.

    Args:
        task_type: "Tabular Classification" or "Tabular Regression".
        n_samples: Requested row count (clamped to [10, MAX_GENERATED_ROWS]).
        n_features: Requested feature count (clamped to [1, MAX_GENERATED_COLS]).
        n_classes_or_informative: Number of classes (classification) or number
            of informative features (regression).
        dataset_format: One of ".csv", ".json", ".parquet".

    Returns:
        A tuple ``(preview_df, full_df, logs, file_path)``. On failure the
        DataFrames and the path are ``None`` and ``logs`` holds the traceback.
    """
    logs = "\n--- Generating Dataset ---\n"
    try:
        # Conversions live inside the try block so that an empty or invalid
        # number field is reported in the log instead of raising uncaught.
        n_samples = max(10, min(int(n_samples), MAX_GENERATED_ROWS))
        n_features = max(1, min(int(n_features), MAX_GENERATED_COLS))
        n_classes_or_informative = int(n_classes_or_informative)

        if task_type == "Tabular Classification":
            n_classes = max(2, n_classes_or_informative)
            n_informative, n_clusters = _classification_layout(n_features, n_classes)
            X, y = make_classification(
                n_samples=n_samples,
                n_features=n_features,
                n_informative=n_informative,
                n_redundant=0,
                n_classes=n_classes,
                n_clusters_per_class=n_clusters,
                random_state=42,
            )
        elif task_type == "Tabular Regression":
            X, y = make_regression(
                n_samples=n_samples,
                n_features=n_features,
                n_informative=max(1, min(n_features, n_classes_or_informative)),
                noise=10,
                random_state=42,
            )
        else:
            raise NotImplementedError(f"Dataset generation for '{task_type}' is not implemented.")

        df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(n_features)])
        df['target'] = y
        logs += f"Generated data with shape: {df.shape}\n"

        # Reject unknown formats up front; otherwise a path to a file that
        # was never written would be handed to the download component.
        if dataset_format not in (".csv", ".json", ".parquet"):
            raise ValueError(f"Unsupported dataset format '{dataset_format}'.")

        file_path = get_temp_filepath("generated_dataset", dataset_format)
        if dataset_format == ".csv":
            df.to_csv(file_path, index=False)
        elif dataset_format == ".json":
            df.to_json(file_path, orient='records', lines=True)
        else:  # ".parquet"
            df.to_parquet(file_path, index=False)
        logs += f"Dataset saved to temporary file: {os.path.basename(file_path)}\n"

        return df.head(), df, logs, file_path
    except Exception:
        # UI boundary: report the failure in the log box rather than crash.
        error_msg = f"Error generating dataset: {traceback.format_exc()}"
        logs += error_msg + "\n"
        return None, None, logs, None


def _load_dataframe(data_input):
    """Return ``data_input`` as a DataFrame, reading it from disk if it is a path.

    Raises:
        ValueError: If a path is given whose extension is not supported.
    """
    if not isinstance(data_input, str):
        return data_input
    lowered = data_input.lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(data_input)
    if lowered.endswith('.json'):
        return pd.read_json(data_input, lines=True)
    if lowered.endswith('.parquet'):
        return pd.read_parquet(data_input)
    raise ValueError("Unsupported file type for upload.")


# --- Core Training Functions ---
def train_model_sklearn(data_input, target_column, task_type, model_name, model_output_format, logs=""):
    """Train, evaluate and export a scikit-learn pipeline.

    Args:
        data_input: A DataFrame, or a path to a .csv / .json (lines) / .parquet file.
        target_column: Name of the column to predict.
        task_type: "Tabular Classification"; any other value is treated as regression.
        model_name: Display name of the estimator to train.
        model_output_format: ".pkl (Scikit-learn)" or ".onnx (ONNX)".
        logs: Existing log text to append to.

    Returns:
        A tuple ``(logs, metrics, model_path)``. On failure ``metrics`` holds
        the error message and ``model_path`` is ``None``.
    """
    logs += f"\n--- Training Scikit-learn Model: {model_name} ---\n"
    try:
        # Load data if it's a filepath, otherwise use the DataFrame directly
        df = _load_dataframe(data_input)

        if target_column not in df.columns:
            raise ValueError(f"Target column '{target_column}' not found.")

        # Preprocessing
        X = df.drop(columns=[target_column])
        y = df[target_column]

        numeric_features = X.select_dtypes(include=np.number).columns
        categorical_features = X.select_dtypes(include='object').columns

        numeric_steps = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ])
        categorical_steps = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ])
        preprocessor = ColumnTransformer(transformers=[
            ('num', numeric_steps, numeric_features),
            ('cat', categorical_steps, categorical_features),
        ])

        # Model Selection
        is_classification = task_type == "Tabular Classification"
        if is_classification:
            y = LabelEncoder().fit_transform(y)
            models = {
                "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
                "Random Forest Classifier": RandomForestClassifier(random_state=42),
                "Support Vector Machine (SVM) Classifier": SVC(random_state=42, probability=True),
            }
        else:  # Regression
            models = {
                "Linear Regression": LinearRegression(),
                "Random Forest Regressor": RandomForestRegressor(random_state=42),
                "Support Vector Machine (SVR) Regressor": SVR(),
            }

        model = models.get(model_name)
        if model is None:
            # Happens when the selected model does not belong to the task
            # (for example a stale dropdown value); say so explicitly.
            raise ValueError(
                f"Model '{model_name}' is not available for task '{task_type}'. "
                f"Available models: {', '.join(models)}."
            )

        # Create full pipeline
        pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        logs += f"Data split into training ({X_train.shape}) and testing ({X_test.shape}) sets.\n"

        # Training
        start_time = time.time()
        pipeline.fit(X_train, y_train)
        logs += f"Training completed in {time.time() - start_time:.2f}s.\n"

        # Evaluation
        y_pred = pipeline.predict(X_test)
        if is_classification:
            acc = accuracy_score(y_test, y_pred)
            report = classification_report(y_test, y_pred, zero_division=0)
            metrics = f"Accuracy: {acc:.4f}\n\nClassification Report:\n{report}"
        else:
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            metrics = f"Mean Squared Error: {mse:.4f}\nR² Score: {r2:.4f}"
        logs += "\n--- Evaluation Metrics ---\n" + metrics + "\n"

        # Model Saving
        model_filename_base = f"sklearn_{model_name.replace(' ', '_').lower()}"
        model_path = None

        if model_output_format == ".pkl (Scikit-learn)":
            model_path = get_temp_filepath(model_filename_base, "pkl")
            joblib.dump(pipeline, model_path)
            logs += f"Model pipeline saved to {os.path.basename(model_path)} as PKL.\n"
        elif model_output_format == ".onnx (ONNX)":
            model_path = get_temp_filepath(model_filename_base, "onnx")

            # One ONNX graph input per DataFrame column: numeric columns are
            # declared as float tensors, everything else as string tensors.
            initial_types = [
                (
                    col_name,
                    FloatTensorType([None, 1])
                    if pd.api.types.is_numeric_dtype(X[col_name].dtype)
                    else StringTensorType([None, 1]),
                )
                for col_name in X.columns
            ]

            options = {'zipmap': False} if is_classification else {}
            onnx_model = convert_sklearn(
                pipeline,
                initial_types=initial_types,
                target_opset=12,
                options=options,
            )
            with open(model_path, "wb") as f:
                f.write(onnx_model.SerializeToString())
            logs += f"Model pipeline saved to {os.path.basename(model_path)} as ONNX.\n"

            if ONNX_RUNTIME_AVAILABLE:
                # Loading the file is the validation step. Providers are
                # named explicitly because onnxruntime builds that ship
                # several execution providers require the argument.
                rt.InferenceSession(model_path, providers=["CPUExecutionProvider"])
                logs += "ONNX model successfully loaded and validated with onnxruntime.\n"
            else:
                logs += "ONNX model validation skipped because onnxruntime is not available in this environment.\n"

        return logs, metrics, model_path
    except Exception:
        # UI boundary: surface the traceback in the log/metrics boxes.
        error_msg = f"Scikit-learn training failed: {traceback.format_exc()}"
        logs += error_msg + "\n"
        return logs, error_msg, None


# --- Main Training Dispatcher ---
def train_model_wrapper(data_input, target_column, task_type, model_family, model_specific, model_output_format, logs):
    """Dispatch training to the implementation for the chosen model family.

    Returns:
        A tuple ``(logs, metrics, model_path, plot)``; ``plot`` is always
        ``None`` because only the scikit-learn family is implemented.
    """
    if data_input is None:
        logs += "ERROR: No dataset has been generated or uploaded. Please go to Tab 2.\n"
        return logs, "Error: No dataset available.", None, None

    if model_family == "Scikit-learn (Classical ML)":
        logs, metrics, model_path = train_model_sklearn(
            data_input, target_column, task_type, model_specific, model_output_format, logs
        )
        return logs, metrics, model_path, None  # No plot for sklearn

    # Placeholder for future PyTorch integration
    logs += f"The selected model family '{model_family}' is not supported yet.\n"
    return logs, "Error: Model family not supported.", None, None


# --- Gradio UI Definition ---
def update_model_options(task_choice, model_family_choice):
    """Return a dropdown update listing the models valid for the task and family."""
    choices = []
    if model_family_choice == "Scikit-learn (Classical ML)":
        if task_choice == "Tabular Classification":
            choices = [
                "Logistic Regression",
                "Random Forest Classifier",
                "Support Vector Machine (SVM) Classifier",
            ]
        elif task_choice == "Tabular Regression":
            choices = [
                "Linear Regression",
                "Random Forest Regressor",
                "Support Vector Machine (SVR) Regressor",
            ]
    value = choices[0] if choices else None
    return gr.update(choices=choices, value=value, visible=bool(choices))


def update_model_output_formats(model_family_choice):
    """Return a dropdown update listing the export formats for the model family."""
    formats = []
    if model_family_choice == "Scikit-learn (Classical ML)":
        formats = [".pkl (Scikit-learn)", ".onnx (ONNX)"]
    value = formats[0] if formats else None
    return gr.update(choices=formats, value=value)


# The Gradio App Layout
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange")) as demo:
    gr.Markdown("# 🧠 TrainAI ⚙️")
    gr.Markdown("A simple interface to create, train, and download machine learning models.")

    # State variables to hold data between interactions
    generated_data_state = gr.State(None)

    with gr.Tabs():
        with gr.TabItem("1. Define Task & Model"):
            # NOTE(review): which components sit inside each gr.Row was
            # reconstructed from a flattened source - confirm the layout.
            with gr.Row():
                task_type_dd = gr.Dropdown(
                    ["Tabular Classification", "Tabular Regression"],
                    label="Select Task Type",
                    value="Tabular Classification",
                )
                model_family_dd = gr.Dropdown(
                    ["Scikit-learn (Classical ML)"],
                    label="Select Model Family",
                    value="Scikit-learn (Classical ML)",
                )
            model_specific_dd = gr.Dropdown(
                label="Select Specific Model",
                choices=[
                    "Logistic Regression",
                    "Random Forest Classifier",
                    "Support Vector Machine (SVM) Classifier",
                ],
                value="Logistic Regression",
                interactive=True,
            )

        with gr.TabItem("2. Configure Dataset"):
            # NOTE(review): row membership reconstructed - confirm the layout.
            with gr.Row():
                ds_gen_samples_num = gr.Number(label="# Samples", value=1000, minimum=10, step=100)
                ds_gen_features_num = gr.Number(label="# Features", value=10, minimum=1, step=1)
                ds_gen_classes_num = gr.Number(
                    label="Classes (Classif) / Informative (Regr)", value=2, minimum=1, step=1
                )
            ds_gen_format_dd = gr.Dropdown(
                [".csv", ".json", ".parquet"], label="Generated Dataset Format", value=".csv"
            )
            generate_dataset_btn = gr.Button("Generate & Preview Dataset", variant="secondary")
            target_column_name_txt = gr.Textbox(label="Target Column Name", value="target", interactive=True)
            # 'row_count' is used here in place of a 'height' argument.
            dataset_preview_df = gr.DataFrame(
                label="Dataset Preview (First 5 Rows)", interactive=False, row_count=5
            )
            generated_dataset_download_file = gr.File(label="Download Generated Dataset", interactive=False)

        with gr.TabItem("3. Train Model & Get Results"):
            model_output_format_dd = gr.Dropdown(
                label="Select Model Output Format",
                choices=[".pkl (Scikit-learn)", ".onnx (ONNX)"],
                value=".pkl (Scikit-learn)",
            )
            train_model_btn = gr.Button("🚀 Train Model", variant="primary")
            gr.Markdown("---")
            gr.Markdown("### Training Progress & Results")
            training_log_txt = gr.Textbox(
                label="Training Log & Status", lines=15, interactive=False, max_lines=50
            )
            evaluation_metrics_txt = gr.Textbox(label="Evaluation Metrics", lines=7, interactive=False)
            download_trained_model_file = gr.File(label="Download Trained Model", interactive=False)
            # Hide as PyTorch is not used
            loss_plot_img = gr.Plot(label="Training Loss Curve (PyTorch only)", visible=False)

    # --- Event Handlers ---
    # Update model choices when task or family changes
    task_type_dd.change(
        fn=update_model_options,
        inputs=[task_type_dd, model_family_dd],
        outputs=model_specific_dd,
    )
    model_family_dd.change(
        fn=update_model_options,
        inputs=[task_type_dd, model_family_dd],
        outputs=model_specific_dd,
    )

    # Update output formats when family changes
    model_family_dd.change(
        fn=update_model_output_formats,
        inputs=model_family_dd,
        outputs=model_output_format_dd,
    )

    # Dataset generation button
    generate_dataset_btn.click(
        fn=generate_dataset_backend,
        inputs=[task_type_dd, ds_gen_samples_num, ds_gen_features_num, ds_gen_classes_num, ds_gen_format_dd],
        outputs=[dataset_preview_df, generated_data_state, training_log_txt, generated_dataset_download_file],
    )

    # Main training button
    train_model_btn.click(
        fn=train_model_wrapper,
        inputs=[
            generated_data_state,
            target_column_name_txt,
            task_type_dd,
            model_family_dd,
            model_specific_dd,
            model_output_format_dd,
            training_log_txt,
        ],
        outputs=[training_log_txt, evaluation_metrics_txt, download_trained_model_file, loss_plot_img],
    )

# Launch the application
demo.queue().launch(debug=True, show_error=True)