File size: 4,860 Bytes
cdc04d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


class TabularMLService:
    """Train a random-forest baseline on a CSV file and report hold-out metrics.

    The public entry point is :meth:`run`, which always returns a 4-tuple of
    strings ``(model_name, metrics, preview, status)``. On any failure the
    first three items are empty strings and ``status`` holds the error text.
    """

    def run(self, csv_path, target_column, task_type, test_size, random_state):
        """Load a CSV, train a model on it and return displayable results.

        Args:
            csv_path: Path of the CSV file to read with ``pandas.read_csv``.
            target_column: Name of the column to predict.
            task_type: ``"Auto"`` to infer the task from the target column,
                otherwise the task name to use as-is (``"Classification"``
                selects the classifier; any other value selects the regressor).
            test_size: Forwarded to ``train_test_split`` as ``test_size``.
            random_state: Seed used for both the split and the forest.

        Returns:
            ``(model_name, metrics, preview, status)`` as strings. Errors are
            reported through ``status`` instead of being raised.
        """
        if not csv_path:
            return "", "", "", "Upload a CSV file first."

        if not target_column:
            return "", "", "", "Enter the target column name."

        try:
            df = pd.read_csv(csv_path)
        except Exception as exc:
            return "", "", "", f"Could not read CSV: {type(exc).__name__}: {exc}"

        if target_column not in df.columns:
            return "", "", "", f"Target column `{target_column}` was not found in the dataset."

        if len(df.columns) < 2:
            return "", "", "", "Dataset must have at least one feature column and one target column."

        # Rows without a label can be neither fitted nor scored, and the
        # estimators reject NaN in y, so drop them here instead of letting a
        # single empty cell abort the whole run.
        df = df.dropna(subset=[target_column])
        if df.empty:
            return "", "", "", f"Target column `{target_column}` has no non-missing values."

        try:
            inferred_task = self._resolve_task(df[target_column], task_type)
            model_name, metrics, preview = self._train(df, target_column, inferred_task, test_size, random_state)
            status = (
                f"Trained a {inferred_task.lower()} model on {len(df)} rows and {len(df.columns) - 1} feature columns."
            )
            return model_name, metrics, preview, status
        except Exception as exc:
            # Boundary handler: the caller expects a status string, not a raise.
            return "", "", "", f"Training failed: {type(exc).__name__}: {exc}"

    def _resolve_task(self, target_series, task_type):
        """Return the task to run, inferring it from the target when ``task_type`` is ``"Auto"``.

        Boolean, object, string and categorical targets are treated as
        classification. Any other target is classification when it has at most
        20 distinct non-missing values and regression otherwise.
        """
        if task_type != "Auto":
            return task_type

        dtype = target_series.dtype
        # Use dtype predicates rather than comparing str(dtype): pandas string
        # dtypes ("string"/"str") and the nullable "boolean" dtype do not
        # stringify to "object"/"bool" but are still label-like targets.
        label_like = (
            pd.api.types.is_bool_dtype(dtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if label_like:
            return "Classification"

        unique_count = target_series.nunique(dropna=True)
        if unique_count <= 20:
            return "Classification"
        return "Regression"

    def _train(self, df, target_column, task_type, test_size, random_state):
        """Fit a preprocessing + random-forest pipeline and score it on a hold-out split.

        Numeric feature columns are median-imputed and standardized; all other
        feature columns are imputed with the most frequent value and one-hot
        encoded. ``run`` removes rows with a missing target before calling this.

        Returns:
            ``(model_name, metrics, preview)`` where ``metrics`` is a
            newline-separated text report and ``preview`` is the first eight
            feature rows rendered as text.
        """
        x = df.drop(columns=[target_column])
        y = df[target_column]

        numeric_cols = x.select_dtypes(include=["number"]).columns.tolist()
        categorical_cols = [col for col in x.columns if col not in numeric_cols]

        numeric_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )
        categorical_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_pipeline, numeric_cols),
                ("cat", categorical_pipeline, categorical_cols),
            ]
        )

        is_classification = task_type == "Classification"
        if is_classification:
            model = RandomForestClassifier(n_estimators=180, random_state=random_state)
            # A stratified split needs at least two samples of every class;
            # with a singleton class train_test_split raises, so fall back to
            # a plain random split in that case.
            class_counts = y.value_counts()
            can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
            stratify = y if can_stratify else None
        else:
            model = RandomForestRegressor(n_estimators=220, random_state=random_state)
            stratify = None

        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            test_size=test_size,
            random_state=random_state,
            stratify=stratify,
        )

        pipeline = Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("model", model),
            ]
        )
        pipeline.fit(x_train, y_train)
        preds = pipeline.predict(x_test)

        if is_classification:
            metrics = "\n".join(
                [
                    f"Accuracy: {accuracy_score(y_test, preds):.4f}",
                    f"Macro F1: {f1_score(y_test, preds, average='macro'):.4f}",
                    f"Train Rows: {len(x_train)}",
                    f"Test Rows: {len(x_test)}",
                    f"Classes: {y.nunique()}",
                ]
            )
            model_name = "RandomForestClassifier"
        else:
            metrics = "\n".join(
                [
                    f"R2: {r2_score(y_test, preds):.4f}",
                    f"MAE: {mean_absolute_error(y_test, preds):.4f}",
                    f"Train Rows: {len(x_train)}",
                    f"Test Rows: {len(x_test)}",
                ]
            )
            model_name = "RandomForestRegressor"

        preview = x.head(8).to_string(index=False)
        return model_name, metrics, preview