"""Train a preprocessing + model pipeline on an uploaded CSV and report metrics."""

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

from core.detection import detect_target_type
from core.visuals import classification_graphs, regression_graphs
from models.registry import CLASSIFICATION_MODELS, MODEL_GROUPS, REGRESSION_MODELS
# NOTE(review): this imported name is shadowed by the local build_preprocessor
# defined below; the import is kept so no existing import is removed.
from preprocessing.transformers import build_preprocessor
from utils.metrics import classification_metrics, regression_metrics

# Targets with this many distinct values (or more) are split without stratification.
_MAX_STRATIFY_CLASSES = 20


def build_preprocessor(df):
    """Split *df* into features/target and build an unfitted preprocessor.

    The last column of *df* is used as the target; every preceding column is a
    candidate feature. ``int64``/``float64`` features are median-imputed and
    standardized; ``object``/``category``/``bool`` features are imputed with
    the most frequent value and one-hot encoded (unknown categories are
    ignored at transform time). Columns of any other dtype are dropped.

    Args:
        df: Input frame whose last column is the target.

    Returns:
        A ``(X, y, preprocessor)`` tuple: the feature frame, the target
        series, and the unfitted ``ColumnTransformer``.

    Raises:
        ValueError: If no feature column has a supported dtype.
    """
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
    cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

    if not num_cols and not cat_cols:
        raise ValueError("No usable feature columns found")

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, num_cols),
            ("cat", categorical_pipeline, cat_cols),
        ],
        remainder="drop",
    )
    return X, y, preprocessor


def build_pipeline(model, preprocessor):
    """Return a ``Pipeline`` that runs *preprocessor* and then *model*."""
    return Pipeline([
        ("preprocessor", preprocessor),
        ("model", model),
    ])


def _error_frame(message):
    """Wrap *message* in the one-column ``Error`` frame used to report failures."""
    return pd.DataFrame({"Error": [message]})


def _stratify_labels(y, task_type):
    """Return *y* when a stratified split is appropriate, otherwise ``None``.

    Only classification targets are stratified. Stratification is also skipped
    when there are too many classes or when any class has fewer than two
    members, because ``train_test_split`` rejects such targets with a
    ``ValueError``.
    """
    if task_type != "Classification":
        return None
    _, class_counts = np.unique(y, return_counts=True)
    if 0 < class_counts.size < _MAX_STRATIFY_CLASSES and class_counts.min() >= 2:
        return y
    return None


def train_model(file, task_type, model_group, model_name, graph_type):
    """Train the selected model on the uploaded CSV and return metrics and a figure.

    Args:
        file: Uploaded file object exposing the CSV path as ``file.name``, or
            ``None`` when nothing has been uploaded.
        task_type: ``"Regression"`` or ``"Classification"``; must match the
            task detected from the target column.
        model_group: First-level key into ``MODEL_GROUPS``.
        model_name: Key of the model within the selected group and task.
        graph_type: Passed through to the graph helper for the task.

    Returns:
        ``(metrics_df, fig)`` on success, where ``metrics_df`` has the columns
        ``Metric`` and ``Value``. On a user-facing failure (no file, task
        mismatch, unknown model, or any ``ValueError`` raised while loading,
        splitting or fitting) returns ``(error_df, None)`` where ``error_df``
        has a single ``Error`` column.
    """
    try:
        if file is None:
            return _error_frame("Please upload a csv file first."), None

        df = pd.read_csv(file.name)
        X, y, preprocessor = build_preprocessor(df)

        detected_task = detect_target_type(y)
        if task_type != detected_task:
            return _error_frame(
                f"Detected {detected_task} target, but {task_type} selected."
            ), None

        if task_type == "Classification" and y.dtype == "object":
            y = LabelEncoder().fit_transform(y)

        try:
            # NOTE(review): the registry object is used directly and is fitted
            # in place below; if the registry holds shared estimator instances,
            # consider cloning them per call - confirm against models.registry.
            model = MODEL_GROUPS[model_group][task_type][model_name]
        except KeyError:
            # Report a bad selection through the normal error frame instead of
            # letting a bare KeyError escape.
            raise ValueError(
                f"Unknown model selection: {model_group} / {task_type} / {model_name}"
            ) from None

        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=0.2,
            random_state=42,
            stratify=_stratify_labels(y, task_type),
        )

        pipeline = build_pipeline(model, preprocessor)
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)

        if task_type == "Regression":
            metrics = regression_metrics(y_test, preds)
            fig = regression_graphs(graph_type, X, y, model, pipeline, y_test, preds)
        else:
            metrics = classification_metrics(pipeline, X_test, y_test, preds)
            fig = classification_graphs(graph_type, pipeline, X_test, y_test, preds)

        metrics_df = pd.DataFrame(list(metrics.items()), columns=["Metric", "Value"])
        return metrics_df, fig

    except ValueError as e:
        return _error_frame(str(e)), None