| import pandas as pd | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import LabelEncoder | |
| from core.detection import detect_target_type | |
| from models.registry import REGRESSION_MODELS, CLASSIFICATION_MODELS | |
| from preprocessing.transformers import build_preprocessor | |
| from utils.metrics import regression_metrics, classification_metrics | |
| from core.visuals import regression_graphs, classification_graphs | |
| from models.registry import MODEL_GROUPS | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.model_selection import train_test_split | |
| import numpy as np | |
def build_preprocessor(df):
    """Split *df* into features/target and build an unfitted preprocessor.

    The last column of *df* is taken as the target; every column before it is
    a candidate feature.

    Feature handling:
      * ``int64`` / ``float64`` columns: median imputation, then standard
        scaling.
      * ``object`` / ``category`` / ``bool`` columns: most-frequent imputation,
        then dense one-hot encoding that ignores categories unseen at fit
        time.
      * Columns of any other dtype are dropped by the transformer.

    Args:
        df: DataFrame whose final column is the target.

    Returns:
        A ``(X, y, preprocessor)`` tuple: the feature frame, the target
        series, and the unfitted ``ColumnTransformer``.

    Raises:
        ValueError: If no feature column has one of the supported dtypes.
    """
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    numeric_dtypes = ["int64", "float64"]
    categorical_dtypes = ["object", "category", "bool"]
    numeric_names = features.select_dtypes(include=numeric_dtypes).columns.tolist()
    categorical_names = features.select_dtypes(
        include=categorical_dtypes
    ).columns.tolist()

    # Nothing to feed a model if neither dtype family matched any column.
    if not (len(numeric_names) + len(categorical_names)):
        raise ValueError("No usable feature columns found")

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]

    column_transformer = ColumnTransformer(
        transformers=[
            ("num", Pipeline(numeric_steps), numeric_names),
            ("cat", Pipeline(categorical_steps), categorical_names),
        ],
        remainder="drop",
    )
    return features, target, column_transformer
def build_pipeline(model, preprocessor):
    """Chain *preprocessor* and *model* into a single two-step pipeline.

    Args:
        model: Final estimator, registered under the step name ``"model"``.
        preprocessor: Transformer applied first, registered under the step
            name ``"preprocessor"``.

    Returns:
        An unfitted ``Pipeline`` running the preprocessor, then the model.
    """
    steps = [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
    return Pipeline(steps)
def train_model(file, task_type, model_group, model_name, graph_type):
    """Train the selected model on an uploaded CSV and report metrics and a graph.

    The CSV's last column is used as the target (see ``build_preprocessor``).
    The data is split 80/20 with a fixed seed, the preprocessing + model
    pipeline is fitted on the training part and evaluated on the held-out
    part.

    Args:
        file: Uploaded-file object whose ``name`` attribute is the path of the
            CSV to read, or ``None`` when nothing has been uploaded.
        task_type: Task selected by the user; compared against
            ``"Regression"`` and ``"Classification"`` and required to equal
            the value ``detect_target_type`` returns for the target.
        model_group: Key into ``MODEL_GROUPS``.
        model_name: Key into ``MODEL_GROUPS[model_group][task_type]``.
        graph_type: Passed through unchanged to the graph helper.

    Returns:
        A ``(frame, fig)`` tuple. On success ``frame`` has the columns
        ``"Metric"`` and ``"Value"`` and ``fig`` is whatever the graph helper
        returned. On any handled failure ``frame`` has a single ``"Error"``
        column with one message and ``fig`` is ``None``.
    """
    if file is None:
        return pd.DataFrame({"Error": ["Please upload a csv file first."]}), None

    try:
        df = pd.read_csv(file.name)
        X, y, preprocessor = build_preprocessor(df)

        detected_task = detect_target_type(y)
        if task_type != detected_task:
            return pd.DataFrame({
                "Error": [f"Detected {detected_task} target, but {task_type} selected."]
            }), None

        if task_type == "Classification" and y.dtype == "object":
            y = LabelEncoder().fit_transform(y)

        # An unknown group/model name used to escape as an uncaught KeyError;
        # report it the same way as every other user-facing failure.
        try:
            model = MODEL_GROUPS[model_group][task_type][model_name]
        except KeyError:
            return pd.DataFrame({
                "Error": [
                    f"Unknown model selection: "
                    f"{model_group} / {task_type} / {model_name}."
                ]
            }), None
        # NOTE(review): the registry object is fitted in place below; if
        # MODEL_GROUPS holds shared estimator instances they are re-fitted on
        # every call -- confirm whether a per-call copy is wanted.

        is_regression = task_type == "Regression"

        # Stratify only for classification with a modest number of classes.
        # Stratifying a regression target made small datasets with distinct
        # target values fail ("least populated class ... has only 1 member").
        stratify = None
        if not is_regression and len(np.unique(y)) < 20:
            stratify = y

        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=0.2,
            random_state=42,
            stratify=stratify,
        )

        pipeline = build_pipeline(model, preprocessor)
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)

        if is_regression:
            metrics = regression_metrics(y_test, preds)
            fig = regression_graphs(graph_type, X, y, model, pipeline, y_test, preds)
        else:
            metrics = classification_metrics(pipeline, X_test, y_test, preds)
            fig = classification_graphs(graph_type, pipeline, X_test, y_test, preds)

        metrics_df = pd.DataFrame(metrics.items(), columns=["Metric", "Value"])
        return metrics_df, fig
    except ValueError as e:
        # Covers CSV parsing problems, unusable feature sets and estimator
        # input validation, all of which surface as ValueError.
        return pd.DataFrame({"Error": [str(e)]}), None