# Spaces: abersbail / machine-learning-tabular-lab (Sleeping)
# File size: 4,860 Bytes
# cdc04d9

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


class TabularMLService:
    """Train a baseline random-forest model on a CSV dataset.

    The public entry point is :meth:`run`, which never raises: every failure
    is reported through the status string of the returned 4-tuple.
    """

    # In "Auto" mode a numeric target with at most this many distinct values
    # is treated as a set of class labels rather than a continuous quantity.
    _MAX_CLASSIFICATION_LABELS = 20

    def run(self, csv_path, target_column, task_type, test_size, random_state):
        """Load a CSV, train a model, and return printable results.

        Args:
            csv_path: Path (or file-like object) accepted by ``pd.read_csv``.
            target_column: Name of the column to predict.
            task_type: ``"Auto"`` to infer the task from the target column;
                any other value is used as the task as-is.
            test_size: Forwarded to ``train_test_split``.
            random_state: Seed forwarded to the split and the model.

        Returns:
            ``(model_name, metrics, preview, status)``. On any failure the
            first three items are empty strings and ``status`` holds the
            error message.
        """
        if not csv_path:
            return "", "", "", "Upload a CSV file first."

        if not target_column:
            return "", "", "", "Enter the target column name."

        try:
            df = pd.read_csv(csv_path)
        except Exception as exc:
            return "", "", "", f"Could not read CSV: {type(exc).__name__}: {exc}"

        if target_column not in df.columns:
            return "", "", "", f"Target column `{target_column}` was not found in the dataset."

        if len(df.columns) < 2:
            return "", "", "", "Dataset must have at least one feature column and one target column."

        # Rows without a label cannot be used for fitting or scoring; the
        # estimators reject NaN targets, so remove them up front.
        total_rows = len(df)
        df = df.dropna(subset=[target_column])
        skipped_rows = total_rows - len(df)
        if df.empty:
            return "", "", "", f"Target column `{target_column}` has no non-missing values."

        try:
            inferred_task = self._resolve_task(df[target_column], task_type)
            model_name, metrics, preview = self._train(df, target_column, inferred_task, test_size, random_state)
            status = (
                f"Trained a {inferred_task.lower()} model on {len(df)} rows and {len(df.columns) - 1} feature columns."
            )
            if skipped_rows:
                status += f" Skipped {skipped_rows} row(s) with a missing target value."
            return model_name, metrics, preview, status
        except Exception as exc:
            # Top-level boundary: surface the failure as a status message
            # instead of propagating it to the caller.
            return "", "", "", f"Training failed: {type(exc).__name__}: {exc}"

    def _resolve_task(self, target_series, task_type):
        """Return the task to run, inferring it when ``task_type`` is "Auto".

        Boolean and non-numeric targets (object, string, category, ...) are
        classification. Numeric targets are classification when they have at
        most ``_MAX_CLASSIFICATION_LABELS`` distinct non-null values, and
        regression otherwise.
        """
        if task_type != "Auto":
            return task_type

        dtype = target_series.dtype
        # is_numeric_dtype() reports True for bool, so test bool explicitly.
        # Checking "not numeric" (rather than matching dtype names) also
        # covers pandas' dedicated string dtypes.
        if pd.api.types.is_bool_dtype(dtype) or not pd.api.types.is_numeric_dtype(dtype):
            return "Classification"
        if target_series.nunique(dropna=True) <= self._MAX_CLASSIFICATION_LABELS:
            return "Classification"
        return "Regression"

    @staticmethod
    def _normalize_random_state(random_state):
        """Convert a whole-valued float seed (e.g. ``42.0``) to ``int``.

        Any other value is returned unchanged.
        """
        if isinstance(random_state, float) and random_state.is_integer():
            return int(random_state)
        return random_state

    @staticmethod
    def _stratify_labels(y):
        """Return ``y`` when a stratified split is feasible, else ``None``.

        Stratification needs more than one class and at least two samples in
        every class; otherwise ``train_test_split`` raises.
        """
        class_counts = y.value_counts()
        if len(class_counts) > 1 and class_counts.min() >= 2:
            return y
        return None

    def _train(self, df, target_column, task_type, test_size, random_state):
        """Fit a random-forest pipeline and evaluate it on a held-out split.

        Rows with a missing target are expected to have been removed already.

        Args:
            df: Full dataset including the target column.
            target_column: Name of the column to predict.
            task_type: ``"Classification"`` selects the classifier; any other
                value selects the regressor.
            test_size: Forwarded to ``train_test_split``.
            random_state: Seed for the split and the model.

        Returns:
            ``(model_name, metrics, preview)`` where ``metrics`` is a
            newline-separated report and ``preview`` is the first 8 feature
            rows rendered as text.
        """
        random_state = self._normalize_random_state(random_state)

        x = df.drop(columns=[target_column])
        y = df[target_column]

        numeric_cols = x.select_dtypes(include=["number"]).columns.tolist()
        categorical_cols = [col for col in x.columns if col not in numeric_cols]

        numeric_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )
        categorical_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                # Categories unseen during fit are encoded as all zeros
                # instead of raising at predict time.
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_pipeline, numeric_cols),
                ("cat", categorical_pipeline, categorical_cols),
            ]
        )

        is_classification = task_type == "Classification"
        if is_classification:
            model = RandomForestClassifier(n_estimators=180, random_state=random_state)
            stratify = self._stratify_labels(y)
        else:
            model = RandomForestRegressor(n_estimators=220, random_state=random_state)
            stratify = None

        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            test_size=test_size,
            random_state=random_state,
            stratify=stratify,
        )

        pipeline = Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("model", model),
            ]
        )
        pipeline.fit(x_train, y_train)
        preds = pipeline.predict(x_test)

        if is_classification:
            # zero_division=0 keeps the default score for classes that were
            # never predicted, without emitting a warning.
            macro_f1 = f1_score(y_test, preds, average="macro", zero_division=0)
            metrics = "\n".join(
                [
                    f"Accuracy: {accuracy_score(y_test, preds):.4f}",
                    f"Macro F1: {macro_f1:.4f}",
                    f"Train Rows: {len(x_train)}",
                    f"Test Rows: {len(x_test)}",
                    f"Classes: {y.nunique()}",
                ]
            )
            model_name = "RandomForestClassifier"
        else:
            metrics = "\n".join(
                [
                    f"R2: {r2_score(y_test, preds):.4f}",
                    f"MAE: {mean_absolute_error(y_test, preds):.4f}",
                    f"Train Rows: {len(x_train)}",
                    f"Test Rows: {len(x_test)}",
                ]
            )
            model_name = "RandomForestRegressor"

        preview = x.head(8).to_string(index=False)
        return model_name, metrics, preview