# abersbail's picture
# Add machine learning tabular lab Space
# cdc04d9 verified
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
class TabularMLService:
    """Train a random-forest baseline on a CSV file and report hold-out metrics.

    The public entry point is :meth:`run`, which never raises: every failure
    is reported through the status string of its return value.
    """

    def run(self, csv_path, target_column, task_type, test_size, random_state):
        """Load a CSV, train a model on it and summarise the outcome.

        Args:
            csv_path: Path (or file-like object) handed to ``pd.read_csv``.
            target_column: Name of the column to predict.
            task_type: ``"Auto"`` to infer the task from the target column;
                any other value is used as the task as-is
                (``"Classification"`` selects the classifier, everything
                else selects the regressor).
            test_size: Forwarded to ``train_test_split``.
            random_state: Seed forwarded to the split and to the model.

        Returns:
            A ``(model_name, metrics, preview, status)`` tuple of strings.
            On any failure the first three items are empty and ``status``
            carries the error message.
        """
        if not csv_path:
            return "", "", "", "Upload a CSV file first."
        if not target_column:
            return "", "", "", "Enter the target column name."

        try:
            df = pd.read_csv(csv_path)
        except Exception as exc:
            return "", "", "", f"Could not read CSV: {type(exc).__name__}: {exc}"

        if target_column not in df.columns:
            return "", "", "", f"Target column `{target_column}` was not found in the dataset."
        if len(df.columns) < 2:
            return "", "", "", "Dataset must have at least one feature column and one target column."

        # Rows without a label cannot take part in supervised training, and
        # a NaN label would make the fit fail, so drop them up front.
        total_rows = len(df)
        df = df.dropna(subset=[target_column])
        dropped_rows = total_rows - len(df)
        if df.empty:
            return "", "", "", f"Target column `{target_column}` has no non-missing values."

        try:
            inferred_task = self._resolve_task(df[target_column], task_type)
            model_name, metrics, preview = self._train(
                df, target_column, inferred_task, test_size, random_state
            )
            status = (
                f"Trained a {inferred_task.lower()} model on {len(df)} rows and {len(df.columns) - 1} feature columns."
            )
            if dropped_rows:
                status += f" Skipped {dropped_rows} rows with a missing target value."
            return model_name, metrics, preview, status
        except Exception as exc:
            # Top-level boundary: surface the failure to the caller as text.
            return "", "", "", f"Training failed: {type(exc).__name__}: {exc}"

    def _resolve_task(self, target_series, task_type):
        """Return the task to run, inferring it when ``task_type`` is "Auto".

        Inference rules, in order:

        * boolean or non-numeric targets (text, categorical, ...) are
          classification, whatever the concrete pandas dtype is called;
        * float targets holding non-integer values are regression, because
          such labels are continuous and cannot be used as classes;
        * otherwise at most 20 distinct values means classification and
          more means regression.
        """
        if task_type != "Auto":
            return task_type

        dtype_api = pd.api.types
        # ``is_numeric_dtype`` is true for booleans, so test for bool first.
        if dtype_api.is_bool_dtype(target_series) or not dtype_api.is_numeric_dtype(target_series):
            return "Classification"

        observed = target_series.dropna()
        if dtype_api.is_float_dtype(observed) and not (observed % 1 == 0).all():
            return "Regression"

        if target_series.nunique(dropna=True) <= 20:
            return "Classification"
        return "Regression"

    def _build_preprocessor(self, features):
        """Build the column transformer for ``features``.

        Numeric columns are median-imputed and standardised; all remaining
        columns are imputed with their most frequent value and one-hot
        encoded (categories unseen during fitting are ignored).
        """
        numeric_cols = features.select_dtypes(include=["number"]).columns.tolist()
        categorical_cols = [col for col in features.columns if col not in numeric_cols]

        numeric_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )
        categorical_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]
        )
        return ColumnTransformer(
            transformers=[
                ("num", numeric_pipeline, numeric_cols),
                ("cat", categorical_pipeline, categorical_cols),
            ]
        )

    def _train(self, df, target_column, task_type, test_size, random_state):
        """Fit a random forest on a train split and score it on the test split.

        Args:
            df: Full dataset, including the target column.
            target_column: Name of the column to predict.
            task_type: ``"Classification"`` trains a classifier; any other
                value trains a regressor.
            test_size: Forwarded to ``train_test_split``.
            random_state: Seed for the split and the model.

        Returns:
            ``(model_name, metrics, preview)`` where ``metrics`` is a
            newline-separated report and ``preview`` is the first eight
            feature rows rendered as text.
        """
        x = df.drop(columns=[target_column])
        y = df[target_column]
        is_classification = task_type == "Classification"

        stratify_labels = None
        if is_classification:
            model_name = "RandomForestClassifier"
            model = RandomForestClassifier(n_estimators=180, random_state=random_state)
            # A stratified split needs at least two rows of every class;
            # fall back to a plain random split when a class is a singleton.
            if y.nunique() > 1 and y.value_counts().min() >= 2:
                stratify_labels = y
        else:
            model_name = "RandomForestRegressor"
            model = RandomForestRegressor(n_estimators=220, random_state=random_state)

        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            test_size=test_size,
            random_state=random_state,
            stratify=stratify_labels,
        )

        pipeline = Pipeline(
            steps=[
                ("preprocessor", self._build_preprocessor(x)),
                ("model", model),
            ]
        )
        pipeline.fit(x_train, y_train)
        preds = pipeline.predict(x_test)

        if is_classification:
            report = [
                f"Accuracy: {accuracy_score(y_test, preds):.4f}",
                f"Macro F1: {f1_score(y_test, preds, average='macro'):.4f}",
                f"Train Rows: {len(x_train)}",
                f"Test Rows: {len(x_test)}",
                f"Classes: {y.nunique()}",
            ]
        else:
            report = [
                f"R2: {r2_score(y_test, preds):.4f}",
                f"MAE: {mean_absolute_error(y_test, preds):.4f}",
                f"Train Rows: {len(x_train)}",
                f"Test Rows: {len(x_test)}",
            ]

        metrics = "\n".join(report)
        preview = x.head(8).to_string(index=False)
        return model_name, metrics, preview