File size: 3,599 Bytes
d7e53e8
 
 
 
 
 
 
 
 
 
4928a1a
d7e53e8
4928a1a
 
 
 
 
 
d7e53e8
 
4928a1a
d7e53e8
 
 
4928a1a
 
d7e53e8
4928a1a
 
d7e53e8
4928a1a
 
 
 
d7e53e8
4928a1a
 
 
 
d7e53e8
4928a1a
 
 
 
 
 
d7e53e8
 
4928a1a
d7e53e8
4928a1a
 
 
 
d7e53e8
 
 
4928a1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7e53e8
4928a1a
d7e53e8
4928a1a
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from core.detection import detect_target_type
from models.registry import REGRESSION_MODELS, CLASSIFICATION_MODELS
from preprocessing.transformers import build_preprocessor
from utils.metrics import regression_metrics, classification_metrics
from core.visuals import regression_graphs, classification_graphs
from models.registry import MODEL_GROUPS

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import numpy as np


def build_preprocessor(df):
    """Split *df* into features/target and build an unfitted preprocessor.

    The last column of ``df`` is treated as the target; every other column
    is a feature. Numeric features are median-imputed and standardized;
    categorical (object/category/bool) features are mode-imputed and
    one-hot encoded (unknown categories at predict time are ignored).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; the final column is the prediction target.

    Returns
    -------
    tuple
        ``(X, y, preprocessor)`` — the feature frame, the target series,
        and an unfitted :class:`ColumnTransformer`.

    Raises
    ------
    ValueError
        If no numeric or categorical feature columns are found.
    """
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # np.number matches every numeric dtype (int32, float32, etc.), not
    # just the 64-bit variants the previous explicit list was limited to.
    # bool is NOT a np.number subtype, so it still routes to the
    # categorical pipeline below.
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

    if len(num_cols) + len(cat_cols) == 0:
        raise ValueError("No usable feature columns found")

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, num_cols),
            ("cat", categorical_pipeline, cat_cols),
        ],
        remainder="drop"  # any column that is neither numeric nor categorical is discarded
    )

    return X, y, preprocessor

def build_pipeline(model, preprocessor):
    """Chain the preprocessing step and the estimator into one Pipeline."""
    steps = [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
    return Pipeline(steps)


def train_model(file, task_type, model_group, model_name, graph_type):
    """Train the selected model on an uploaded CSV and report results.

    Parameters
    ----------
    file : file-like or None
        Uploaded file object; its ``.name`` attribute is the CSV path to
        read. The CSV's last column is taken as the target.
    task_type : str
        ``"Regression"`` or ``"Classification"``; must match the type
        detected from the target column.
    model_group, model_name : str
        Keys selecting the estimator from ``MODEL_GROUPS``.
    graph_type : str
        Which diagnostic plot to draw.

    Returns
    -------
    tuple
        ``(metrics_df, fig)`` — a Metric/Value DataFrame and a figure on
        success, or an single-column "Error" DataFrame and ``None`` when
        validation or training fails.
    """
    try:
        if file is None:
            # No upload yet — surface a friendly message instead of crashing.
            return pd.DataFrame({
                "Error": ["Please upload a csv file first."]
            }), None

        df = pd.read_csv(file.name)

        X, y, preprocessor = build_preprocessor(df)

        detected_task = detect_target_type(y)
        if task_type != detected_task:
            return pd.DataFrame({
                "Error": [f"Detected {detected_task} target, but {task_type} selected."]
            }), None

        # String class labels must be integer-encoded before fitting.
        if task_type == "Classification" and y.dtype == "object":
            y = LabelEncoder().fit_transform(y)

        model = MODEL_GROUPS[model_group][task_type][model_name]

        # Stratify only when the target looks discrete (few unique values);
        # stratifying a continuous target raises inside scikit-learn.
        unique_count = len(np.unique(y))
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.2,
            random_state=42,
            stratify=y if unique_count < 20 else None
        )

        pipeline = build_pipeline(model, preprocessor)
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)

        if task_type == "Regression":
            metrics = regression_metrics(y_test, preds)
            fig = regression_graphs(graph_type, X, y, model, pipeline, y_test, preds)
        else:
            metrics = classification_metrics(pipeline, X_test, y_test, preds)
            fig = classification_graphs(graph_type, pipeline, X_test, y_test, preds)

        metrics_df = pd.DataFrame(metrics.items(), columns=["Metric", "Value"])
        return metrics_df, fig
    except (ValueError, KeyError) as e:
        # ValueError covers preprocessing/validation failures; KeyError
        # covers a bad MODEL_GROUPS selection, which the previous handler
        # missed and let propagate to the UI as an unhandled traceback.
        return (
            pd.DataFrame({"Error": [str(e)]}),
            None,
        )