File size: 4,860 Bytes
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
class TabularMLService:
    """Train a random-forest baseline on a CSV file and report results as text.

    ``run`` is the entry point. It always returns a 4-tuple of strings
    ``(model_name, metrics, preview, status)``; on any failure the first
    three items are empty strings and ``status`` carries the error message.
    """

    # In "Auto" mode a numeric target with at most this many distinct values
    # is treated as a classification target.
    AUTO_CLASSIFICATION_MAX_UNIQUE = 20

    def run(self, csv_path, target_column, task_type, test_size, random_state):
        """Load a CSV, train a model on it and return a textual report.

        Args:
            csv_path: Anything ``pandas.read_csv`` accepts; a falsy value
                short-circuits with an "upload" message.
            target_column: Name of the column to predict.
            task_type: ``"Auto"`` to infer the task from the target column,
                otherwise the task name to use as-is (``"Classification"``
                selects the classifier, anything else the regressor).
            test_size: Forwarded to ``train_test_split``.
            random_state: Forwarded to the split and to the forest model.

        Returns:
            ``(model_name, metrics, preview, status)`` as strings. No
            exception escapes: read and training errors are reported through
            ``status``.
        """
        if not csv_path:
            return "", "", "", "Upload a CSV file first."
        if not target_column:
            return "", "", "", "Enter the target column name."

        try:
            df = pd.read_csv(csv_path)
        except Exception as exc:
            return "", "", "", f"Could not read CSV: {type(exc).__name__}: {exc}"

        if target_column not in df.columns:
            return "", "", "", f"Target column `{target_column}` was not found in the dataset."
        if len(df.columns) < 2:
            return "", "", "", "Dataset must have at least one feature column and one target column."

        # Rows without a target value cannot be used for supervised training
        # (the estimators reject NaN in y), so drop them up front instead of
        # letting the whole run fail.
        df = df.dropna(subset=[target_column])
        if df.empty:
            return "", "", "", f"Target column `{target_column}` has no non-missing values."

        try:
            inferred_task = self._resolve_task(df[target_column], task_type)
            model_name, metrics, preview = self._train(
                df, target_column, inferred_task, test_size, random_state
            )
        except Exception as exc:
            # Boundary handler: callers only ever see a status string.
            return "", "", "", f"Training failed: {type(exc).__name__}: {exc}"

        status = (
            f"Trained a {inferred_task.lower()} model on {len(df)} rows "
            f"and {len(df.columns) - 1} feature columns."
        )
        return model_name, metrics, preview, status

    def _resolve_task(self, target_series, task_type):
        """Return the task to run, inferring it from the target when "Auto".

        Any ``task_type`` other than ``"Auto"`` is returned unchanged.
        In auto mode, boolean and non-numeric targets (object, string,
        categorical, ...) are classification targets; numeric targets are
        classification only when they have few distinct values.
        """
        if task_type != "Auto":
            return task_type

        dtype = target_series.dtype
        # is_numeric_dtype() reports True for booleans, so test them first.
        # Using the dtype predicates instead of comparing dtype names also
        # covers pandas' "string"/"str"/"boolean" extension dtypes.
        if pd.api.types.is_bool_dtype(dtype) or not pd.api.types.is_numeric_dtype(dtype):
            return "Classification"

        if target_series.nunique(dropna=True) <= self.AUTO_CLASSIFICATION_MAX_UNIQUE:
            return "Classification"
        return "Regression"

    def _build_preprocessor(self, x):
        """Build the column-wise preprocessing step for feature frame ``x``.

        Numeric columns are median-imputed and standardized; every other
        column is imputed with its most frequent value and one-hot encoded
        (categories unseen during fit are ignored at predict time).
        """
        numeric_cols = x.select_dtypes(include=["number"]).columns.tolist()
        categorical_cols = [col for col in x.columns if col not in numeric_cols]

        numeric_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )
        categorical_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]
        )
        return ColumnTransformer(
            transformers=[
                ("num", numeric_pipeline, numeric_cols),
                ("cat", categorical_pipeline, categorical_cols),
            ]
        )

    def _train(self, df, target_column, task_type, test_size, random_state):
        """Fit a random forest on a train split and score it on the test split.

        Args:
            df: Full dataset including the target column.
            target_column: Name of the column to predict.
            task_type: ``"Classification"`` trains a classifier; any other
                value trains a regressor.
            test_size: Forwarded to ``train_test_split``.
            random_state: Seed used for both the split and the model.

        Returns:
            ``(model_name, metrics, preview)`` where ``metrics`` is a
            newline-separated report and ``preview`` is the first eight
            feature rows rendered as text.
        """
        x = df.drop(columns=[target_column])
        y = df[target_column]
        is_classification = task_type == "Classification"

        stratify = None
        if is_classification:
            model = RandomForestClassifier(n_estimators=180, random_state=random_state)
            model_name = "RandomForestClassifier"
            # A stratified split raises when any class has a single member,
            # so only stratify when every class can appear on both sides.
            class_counts = y.value_counts()
            if len(class_counts) > 1 and class_counts.min() >= 2:
                stratify = y
        else:
            model = RandomForestRegressor(n_estimators=220, random_state=random_state)
            model_name = "RandomForestRegressor"

        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            test_size=test_size,
            random_state=random_state,
            stratify=stratify,
        )

        pipeline = Pipeline(
            steps=[
                ("preprocessor", self._build_preprocessor(x)),
                ("model", model),
            ]
        )
        pipeline.fit(x_train, y_train)
        preds = pipeline.predict(x_test)

        if is_classification:
            report = [
                f"Accuracy: {accuracy_score(y_test, preds):.4f}",
                f"Macro F1: {f1_score(y_test, preds, average='macro'):.4f}",
                f"Train Rows: {len(x_train)}",
                f"Test Rows: {len(x_test)}",
                f"Classes: {y.nunique()}",
            ]
        else:
            report = [
                f"R2: {r2_score(y_test, preds):.4f}",
                f"MAE: {mean_absolute_error(y_test, preds):.4f}",
                f"Train Rows: {len(x_train)}",
                f"Test Rows: {len(x_test)}",
            ]

        metrics = "\n".join(report)
        preview = x.head(8).to_string(index=False)
        return model_name, metrics, preview
|