# Spaces: HF-Pawan / Supervised-Learning-Model-Trainer — Running
# File size: 6,928 Bytes
# d7e53e8

import gradio as gr
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action="ignore")

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    LabelEncoder,
)
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

# ======================
# Models
# ======================
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    Perceptron,
)
from sklearn.neighbors import (
    KNeighborsClassifier,
    KNeighborsRegressor,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import (
    DecisionTreeClassifier,
    DecisionTreeRegressor,
)
from sklearn.svm import SVC, SVR
from sklearn.neural_network import (
    MLPClassifier,
    MLPRegressor,
)

from sklearn.utils.multiclass import type_of_target

# ======================
# Model Registry
# ======================
# Regression estimators, keyed by the display name shown in the "Model"
# dropdown. update_models lists the keys in this order and train_model looks
# the selected estimator up by name.
# Each value is a single estimator instance created once at import time.
REGRESSION_MODELS = {
    "Linear Regression": LinearRegression(),
    "KNN Regressor": KNeighborsRegressor(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "SVR": SVR(),
    # Explicit iteration limit of 1000 for the MLP optimizer.
    "MLP Regressor": MLPRegressor(max_iter=1000),
}

# Classification estimators, keyed by the display name shown in the "Model"
# dropdown. update_models lists the keys in this order and train_model looks
# the selected estimator up by name.
# Each value is a single estimator instance created once at import time.
CLASSIFICATION_MODELS = {
    # Explicit iteration limit of 500 for the solver.
    "Logistic Regression": LogisticRegression(max_iter=500),
    "KNN Classifier": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Perceptron": Perceptron(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    # probability=True enables predict_proba, which train_model checks for
    # before computing ROC-AUC.
    "SVM Classifier": SVC(probability=True),
    # Explicit iteration limit of 1000 for the MLP optimizer.
    "MLP Classifier": MLPClassifier(max_iter=1000),
}

# ======================
# UI Helpers
# ======================
def update_models(task_type):
    """Build the dropdown update listing the models for *task_type*.

    "Regression" selects the regression registry; any other value selects the
    classification registry. The current selection is cleared (value=None).
    """
    registry = (
        REGRESSION_MODELS if task_type == "Regression" else CLASSIFICATION_MODELS
    )
    return gr.update(choices=list(registry), value=None)

def preview_csv(file):
    """Read the uploaded CSV into a DataFrame for the preview table.

    Returns None when *file* is None; otherwise the CSV at ``file.name`` is
    parsed with pandas and returned in full.
    """
    return None if file is None else pd.read_csv(file.name)


def detect_target_type(y, max_classes=20):
    """Guess whether target column *y* describes a classification or regression task.

    Args:
        y: pandas Series holding the target values.
        max_classes: Largest number of distinct values for which a numeric
            target is still treated as a set of class labels.

    Returns:
        "Classification" or "Regression".
    """
    # Any non-numeric target (object, category, pandas "string" dtype, ...)
    # can only be a set of labels. Checking for "not numeric" instead of
    # listing label dtypes keeps string-dtype columns with many distinct
    # values from falling through to "Regression". Booleans count as numeric
    # for pandas but are labels as well.
    is_numeric = pd.api.types.is_numeric_dtype(y)
    if not is_numeric or pd.api.types.is_bool_dtype(y):
        return "Classification"

    # Numeric but low cardinality -> classification
    if y.nunique() <= max_classes:
        return "Classification"

    return "Regression"


def auto_set_task(file):
    """Choose the task type for the uploaded CSV.

    The last column of the file is taken as the target and classified with
    detect_target_type. When *file* is None the result is "Regression".
    """
    if file is not None:
        frame = pd.read_csv(file.name)
        return detect_target_type(frame.iloc[:, -1])
    return "Regression"


# ======================
# Core Training Logic
# ======================
def _error_frame(message):
    """Return *message* as the single-row "Error" table shown in the metrics output."""
    return pd.DataFrame({"Error": [message]})


def _build_preprocessor(X):
    """Build the transformer that scales numeric and one-hot encodes all other columns of *X*."""
    # Select by "numeric or not" so that no feature column (bool, string
    # dtype, narrower int/float widths, ...) is silently dropped.
    num_cols = X.select_dtypes(include="number").columns
    cat_cols = X.select_dtypes(exclude="number").columns

    return ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ],
        # Always hand the model a dense array: some estimators in the
        # registry (e.g. GaussianNB) reject sparse input.
        sparse_threshold=0,
    )


def _regression_metrics(y_true, preds):
    """Compute MAE, MSE, RMSE and R² for regression predictions."""
    mse = mean_squared_error(y_true, preds)
    return {
        "MAE": mean_absolute_error(y_true, preds),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "R²": r2_score(y_true, preds),
    }


def _classification_metrics(pipeline, X_test, y_test, preds):
    """Compute weighted classification metrics, plus ROC-AUC when it can be computed."""
    metrics = {
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds, average="weighted"),
        "Recall": recall_score(y_test, preds, average="weighted"),
        "F1 Score": f1_score(y_test, preds, average="weighted"),
    }

    classifier = pipeline.named_steps["model"]
    if not hasattr(classifier, "predict_proba"):
        return metrics

    probs = pipeline.predict_proba(X_test)
    try:
        if probs.shape[1] == 2:
            # Binary: score with the probability of the second class.
            metrics["ROC-AUC"] = roc_auc_score(y_test, probs[:, 1])
        else:
            # Multiclass: one-vs-rest, with the probability columns tied to
            # the classes the model was trained on.
            metrics["ROC-AUC"] = roc_auc_score(
                y_test,
                probs,
                multi_class="ovr",
                average="weighted",
                labels=classifier.classes_,
            )
    except ValueError:
        # ROC-AUC is undefined for this split (e.g. a class is missing from
        # the test rows); report the remaining metrics without it.
        pass

    return metrics


def train_model(file, task_type, model_name):
    """Train the selected model on the uploaded CSV and report test-set metrics.

    The last CSV column is the target and all other columns are features.
    The data is split 80/20 (random_state=42), features are scaled / one-hot
    encoded, and a fresh copy of the registry estimator is fitted.

    Args:
        file: Uploaded file object exposing the CSV path as ``.name``, or None.
        task_type: "Regression" or "Classification".
        model_name: Key into the model registry of the chosen task type.

    Returns:
        A DataFrame with "Metric" and "Value" columns on success, or a
        DataFrame with a single "Error" column when no file was uploaded,
        the task type does not match the detected target type, or no valid
        model was selected.
    """
    if file is None:
        return _error_frame("Please upload a CSV file first.")

    # Local import keeps this block self-contained; scikit-learn is already
    # a dependency of this module.
    from sklearn.base import clone

    df = pd.read_csv(file.name)

    # Target = last column
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    detected_task = detect_target_type(y)

    # Mismatch validation
    if task_type != detected_task:
        return _error_frame(
            f"Dataset target detected as {detected_task}, "
            f"but {task_type} model selected."
        )

    is_regression = task_type == "Regression"
    registry = REGRESSION_MODELS if is_regression else CLASSIFICATION_MODELS

    # The model dropdown is reset to None whenever the task type changes, so
    # a missing selection is an expected state rather than a KeyError.
    if model_name not in registry:
        return _error_frame(f"Please select a {task_type} model before training.")

    # Encode every classification target, not only object columns: this also
    # covers string-dtype labels and low-cardinality non-integer floats,
    # which the classifiers would otherwise reject as continuous targets.
    if not is_regression:
        y = LabelEncoder().fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    pipeline = Pipeline(
        steps=[
            ("preprocessing", _build_preprocessor(X)),
            # Fit an unfitted copy so the shared registry instance is never
            # mutated and no state carries over between runs.
            ("model", clone(registry[model_name])),
        ]
    )

    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)

    if is_regression:
        metrics = _regression_metrics(y_test, preds)
    else:
        metrics = _classification_metrics(pipeline, X_test, y_test, preds)

    return pd.DataFrame(list(metrics.items()), columns=["Metric", "Value"])

# ======================
# Gradio UI
# ======================
# Build the single-page UI. Components are declared in the order:
# intro text, file upload, CSV preview, task type, model, metrics, button.
with gr.Blocks() as app:
    gr.Markdown("## Supervised Learning Model Trainer")
    gr.Markdown(
        "• Upload CSV\n"
        "• Last column is target\n"
        "• Automatic preprocessing & metrics"
    )

    # Only .csv uploads are offered in the file picker.
    file_input = gr.File(label="Upload CSV", file_types=[".csv"])

    # Read-only table filled by preview_csv.
    csv_preview = gr.Dataframe(
        label="CSV Preview",
        interactive=False,
    )

    task_type = gr.Dropdown(
        ["Regression", "Classification"], label="Task Type", value="Regression"
    )
    # Created without choices; update_models fills them in (see the
    # task_type.change and app.load wiring below).
    model_name = gr.Dropdown(label="Model")
    output = gr.Dataframe(label="Evaluation Metrics")

    run_btn = gr.Button("Train & Evaluate")


    # Two handlers are registered on the same upload event: one fills the
    # preview table, the other sets the task type from the last column.
    file_input.change(
        preview_csv,
        inputs=file_input,
        outputs=csv_preview,
    )

    file_input.change(
        auto_set_task,
        inputs=file_input,
        outputs=task_type,
    )

    # Refresh the model choices whenever the task type changes.
    task_type.change(
        update_models, inputs=task_type, outputs=model_name
    )

    # Populate the model choices for the initial task type on page load.
    app.load(
        update_models,
        inputs=task_type,
        outputs=model_name,
    )

    # Train on click; the result table (metrics or an error row) goes to
    # the "Evaluation Metrics" Dataframe.
    run_btn.click(
        train_model,
        inputs=[file_input, task_type, model_name],
        outputs=output,
    )

# Start the Gradio app as soon as this module is executed.
app.launch()