from __future__ import annotations

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


class TabularMLService:
    """Train a baseline random-forest model on a CSV file and describe it as text.

    ``run`` is the entry point. It never raises: every outcome is returned as a
    tuple of four strings ``(model_name, metrics, preview, status)``; on
    failure the first three are empty and ``status`` holds the error message.
    """

    # Numeric targets with at most this many distinct values are treated as
    # class labels when the task type is inferred automatically.
    _MAX_AUTO_CLASSES = 20

    # Canonical task names; ``_train`` compares against these exact strings.
    _TASKS = ("Classification", "Regression")

    # Number of feature rows rendered in the preview text.
    _PREVIEW_ROWS = 8

    def run(
        self,
        csv_path,
        target_column: str,
        task_type: str | None,
        test_size: float,
        random_state: int | float | None,
    ) -> tuple[str, str, str, str]:
        """Load a CSV, train a model on it and return a text report.

        Args:
            csv_path: Path (or any object ``pandas.read_csv`` accepts) of the
                dataset.
            target_column: Name of the column to predict.
            task_type: ``"Auto"``, ``"Classification"`` or ``"Regression"``
                (case-insensitive); ``None`` or an empty string means Auto.
            test_size: Forwarded to ``train_test_split`` as ``test_size``.
            random_state: Seed for the split and the forest. An integral float
                such as ``42.0`` is converted to ``int``.

        Returns:
            ``(model_name, metrics, preview, status)``. When validation,
            loading or training fails, the first three items are empty strings
            and ``status`` explains the failure.
        """
        if not csv_path:
            return "", "", "", "Upload a CSV file first."
        if not target_column:
            return "", "", "", "Enter the target column name."

        try:
            df = pd.read_csv(csv_path)
        except Exception as exc:  # boundary: any read/parse error becomes a message
            return "", "", "", f"Could not read CSV: {type(exc).__name__}: {exc}"

        if target_column not in df.columns:
            return "", "", "", f"Target column `{target_column}` was not found in the dataset."
        if len(df.columns) < 2:
            return "", "", "", "Dataset must have at least one feature column and one target column."

        # Rows without a label can neither be trained on nor scored, and NaN
        # labels make both the stratified split and the estimators fail.
        labelled = df.dropna(subset=[target_column])
        dropped = len(df) - len(labelled)
        if labelled.empty:
            return "", "", "", f"Target column `{target_column}` has no non-missing values."

        try:
            inferred_task = self._resolve_task(labelled[target_column], task_type)
            model_name, metrics, preview = self._train(
                labelled,
                target_column,
                inferred_task,
                test_size,
                self._normalize_seed(random_state),
            )
        except Exception as exc:  # boundary: surface training errors as text
            return "", "", "", f"Training failed: {type(exc).__name__}: {exc}"

        status = (
            f"Trained a {inferred_task.lower()} model on {len(labelled)} rows "
            f"and {len(labelled.columns) - 1} feature columns."
        )
        if dropped:
            status += f" Skipped {dropped} row(s) with a missing target value."
        return model_name, metrics, preview, status

    @staticmethod
    def _normalize_seed(random_state):
        """Return ``random_state`` with integral floats (e.g. ``42.0``) as ``int``.

        Any other value is returned unchanged.
        """
        if isinstance(random_state, float) and random_state.is_integer():
            return int(random_state)
        return random_state

    def _resolve_task(self, target_series: pd.Series, task_type: str | None) -> str:
        """Return the canonical task name for ``task_type`` and the target data.

        Args:
            target_series: The target column.
            task_type: Requested task. Matching is case-insensitive and ignores
                surrounding whitespace; ``None``, ``""`` and ``"Auto"`` trigger
                inference from ``target_series``.

        Returns:
            ``"Classification"`` or ``"Regression"``.

        Raises:
            ValueError: If ``task_type`` is not one of the recognised names.
        """
        requested = "auto" if task_type is None else str(task_type).strip().lower()
        if requested not in ("", "auto"):
            for task in self._TASKS:
                if requested == task.lower():
                    return task
            raise ValueError(
                f"Unknown task type `{task_type}`; expected Auto, Classification or Regression."
            )

        dtype = target_series.dtype
        # Booleans count as numeric in pandas, so test for them first. Anything
        # non-numeric (object, string, category, ...) can only be class labels.
        if pd.api.types.is_bool_dtype(dtype) or not pd.api.types.is_numeric_dtype(dtype):
            return "Classification"

        observed = target_series.dropna()
        # Fractional floats are continuous values even when only a few distinct
        # ones occur, so they are not treated as class labels.
        if pd.api.types.is_float_dtype(dtype) and not (observed % 1 == 0).all():
            return "Regression"
        if observed.nunique() <= self._MAX_AUTO_CLASSES:
            return "Classification"
        return "Regression"

    @staticmethod
    def _build_preprocessor(numeric_cols: list, categorical_cols: list) -> ColumnTransformer:
        """Build the imputing/scaling/encoding transformer for the given columns."""
        numeric_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )
        categorical_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]
        )
        return ColumnTransformer(
            transformers=[
                ("num", numeric_pipeline, numeric_cols),
                ("cat", categorical_pipeline, categorical_cols),
            ]
        )

    def _train(
        self,
        df: pd.DataFrame,
        target_column: str,
        task_type: str,
        test_size: float,
        random_state: int | None,
    ) -> tuple[str, str, str]:
        """Fit a random forest on ``df`` and evaluate it on a held-out split.

        ``run`` removes rows with a missing target before calling this method.

        Args:
            df: Dataset containing the features and ``target_column``.
            target_column: Name of the column to predict.
            task_type: ``"Classification"`` selects the classifier; any other
                value selects the regressor.
            test_size: Forwarded to ``train_test_split``.
            random_state: Seed used for both the split and the forest.

        Returns:
            ``(model_name, metrics, preview)`` where ``metrics`` is a
            newline-separated report and ``preview`` renders the first feature
            rows as text.

        Errors raised by pandas or scikit-learn are not caught here.
        """
        features = df.drop(columns=[target_column])
        y = df[target_column]

        # Render the preview from the untouched features, before any casting.
        preview = features.head(self._PREVIEW_ROWS).to_string(index=False)

        # The "number" selector excludes bool, and the categorical imputer
        # rejects bool arrays, so feed boolean features through as 0.0/1.0.
        bool_cols = features.select_dtypes(include=["bool"]).columns.tolist()
        if bool_cols:
            features = features.astype({col: "float64" for col in bool_cols})

        numeric_cols = features.select_dtypes(include=["number"]).columns.tolist()
        numeric_lookup = set(numeric_cols)
        categorical_cols = [col for col in features.columns if col not in numeric_lookup]

        is_classification = task_type == "Classification"
        stratify = None
        if is_classification:
            model = RandomForestClassifier(n_estimators=180, random_state=random_state)
            # A stratified split needs at least two rows of every class;
            # otherwise fall back to a plain random split.
            class_counts = y.value_counts()
            if len(class_counts) > 1 and class_counts.min() >= 2:
                stratify = y
        else:
            model = RandomForestRegressor(n_estimators=220, random_state=random_state)

        x_train, x_test, y_train, y_test = train_test_split(
            features,
            y,
            test_size=test_size,
            random_state=random_state,
            stratify=stratify,
        )

        pipeline = Pipeline(
            steps=[
                ("preprocessor", self._build_preprocessor(numeric_cols, categorical_cols)),
                ("model", model),
            ]
        )
        pipeline.fit(x_train, y_train)
        preds = pipeline.predict(x_test)

        if is_classification:
            model_name = "RandomForestClassifier"
            report = [
                f"Accuracy: {accuracy_score(y_test, preds):.4f}",
                f"Macro F1: {f1_score(y_test, preds, average='macro'):.4f}",
                f"Train Rows: {len(x_train)}",
                f"Test Rows: {len(x_test)}",
                f"Classes: {y.nunique()}",
            ]
        else:
            model_name = "RandomForestRegressor"
            report = [
                f"R2: {r2_score(y_test, preds):.4f}",
                f"MAE: {mean_absolute_error(y_test, preds):.4f}",
                f"Train Rows: {len(x_train)}",
                f"Test Rows: {len(x_test)}",
            ]

        return model_name, "\n".join(report), preview