from typing import Tuple

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


class TabularMLService:
    """Train a random-forest baseline on a CSV file and report hold-out metrics.

    The public entry point is :meth:`run`, which never raises: every failure
    is reported through the status string of the returned 4-tuple.
    """

    # Auto mode treats a numeric target with at most this many distinct
    # values as class labels.
    AUTO_CLASS_LIMIT = 20
    # Forest sizes used for the two estimators.
    CLASSIFIER_TREES = 180
    REGRESSOR_TREES = 220
    # Number of feature rows rendered in the preview text.
    PREVIEW_ROWS = 8

    def run(
        self, csv_path, target_column, task_type, test_size, random_state
    ) -> Tuple[str, str, str, str]:
        """Load a CSV, train a model on it and summarise the outcome.

        Args:
            csv_path: Anything ``pandas.read_csv`` accepts; falsy values are
                rejected with a message.
            target_column: Name of the column to predict.
            task_type: ``"Auto"`` to infer the task from the target column;
                any other value is used as the task as-is
                (``"Classification"`` selects the classifier, everything
                else the regressor).
            test_size: Forwarded to ``train_test_split``.
            random_state: Seed for both the split and the forest.

        Returns:
            ``(model_name, metrics, preview, status)``. On any failure the
            first three items are empty strings and ``status`` holds the
            error message.
        """
        if not csv_path:
            return "", "", "", "Upload a CSV file first."
        if not target_column:
            return "", "", "", "Enter the target column name."

        try:
            df = pd.read_csv(csv_path)
        except Exception as exc:
            return "", "", "", f"Could not read CSV: {type(exc).__name__}: {exc}"

        if target_column not in df.columns:
            return "", "", "", f"Target column `{target_column}` was not found in the dataset."
        if len(df.columns) < 2:
            return "", "", "", "Dataset must have at least one feature column and one target column."

        try:
            # Rows without a target value cannot be learned from or scored,
            # and the estimators reject missing targets outright.
            labelled = df.dropna(subset=[target_column])
            if labelled.empty:
                return "", "", "", f"Target column `{target_column}` has no non-missing values."

            inferred_task = self._resolve_task(labelled[target_column], task_type)
            model_name, metrics, preview = self._train(
                labelled, target_column, inferred_task, test_size, random_state
            )
            status = (
                f"Trained a {inferred_task.lower()} model on {len(labelled)} rows "
                f"and {len(df.columns) - 1} feature columns."
            )
            return model_name, metrics, preview, status
        except Exception as exc:
            # Top-level boundary: report the failure as text instead of raising.
            return "", "", "", f"Training failed: {type(exc).__name__}: {exc}"

    def _resolve_task(self, target_series, task_type):
        """Return the task to run, inferring it from the target when "Auto".

        Inference rules, applied in order:

        * boolean or non-numeric targets (object, string, category, ...) are
          classification;
        * float targets holding any non-integer value are regression, since
          such values cannot be class labels;
        * remaining numeric targets are classification when they have at
          most ``AUTO_CLASS_LIMIT`` distinct values, otherwise regression.
        """
        if task_type != "Auto":
            return task_type

        dtype_checks = pd.api.types
        # Dtype predicates instead of dtype-name strings, so pandas string
        # and nullable extension dtypes are recognised too.
        if dtype_checks.is_bool_dtype(target_series) or not dtype_checks.is_numeric_dtype(
            target_series
        ):
            return "Classification"

        observed = target_series.dropna()
        if dtype_checks.is_float_dtype(observed) and not bool((observed % 1 == 0).all()):
            return "Regression"
        if observed.nunique() <= self.AUTO_CLASS_LIMIT:
            return "Classification"
        return "Regression"

    def _train(self, df, target_column, task_type, test_size, random_state):
        """Fit a preprocessing + random-forest pipeline and score the hold-out.

        Returns:
            ``(model_name, metrics, preview)`` where ``metrics`` is a
            newline-joined text report and ``preview`` is the first
            ``PREVIEW_ROWS`` feature rows rendered as text.

        Raises:
            Whatever pandas / scikit-learn raise for unusable input (for
            example an invalid ``test_size``); :meth:`run` turns those into a
            status message.
        """
        # Defensive for direct callers; a no-op when ``run`` already filtered.
        df = df.dropna(subset=[target_column])
        features = df.drop(columns=[target_column])
        target = df[target_column]
        is_classification = task_type == "Classification"

        if is_classification:
            model = RandomForestClassifier(
                n_estimators=self.CLASSIFIER_TREES, random_state=random_state
            )
            model_name = "RandomForestClassifier"
        else:
            model = RandomForestRegressor(
                n_estimators=self.REGRESSOR_TREES, random_state=random_state
            )
            model_name = "RandomForestRegressor"

        x_train, x_test, y_train, y_test = self._split(
            features, target, test_size, random_state, stratified=is_classification
        )

        pipeline = Pipeline(
            steps=[
                ("preprocessor", self._build_preprocessor(features)),
                ("model", model),
            ]
        )
        pipeline.fit(x_train, y_train)
        preds = pipeline.predict(x_test)

        metrics = self._format_metrics(
            is_classification, y_test, preds, len(x_train), len(x_test), target.nunique()
        )
        preview = features.head(self.PREVIEW_ROWS).to_string(index=False)
        return model_name, metrics, preview

    def _split(self, x, y, test_size, random_state, stratified):
        """Split into train/test parts, stratifying by class when feasible."""
        split_args = {"test_size": test_size, "random_state": random_state}
        # Stratification needs at least two members in every class.
        can_stratify = stratified and y.nunique() > 1 and y.value_counts().min() >= 2
        if can_stratify:
            try:
                return train_test_split(x, y, stratify=y, **split_args)
            except ValueError:
                # Stratification can still be impossible (e.g. fewer test
                # rows than classes). Fall through to a plain split, which
                # re-raises if the arguments themselves are invalid.
                pass
        return train_test_split(x, y, **split_args)

    def _build_preprocessor(self, x):
        """Build the column transformer: numeric vs. all other feature columns."""
        numeric_cols = x.select_dtypes(include=["number"]).columns.tolist()
        numeric_set = set(numeric_cols)
        categorical_cols = [col for col in x.columns if col not in numeric_set]

        numeric_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )
        categorical_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                # Categories unseen during fit are ignored at predict time.
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]
        )
        return ColumnTransformer(
            transformers=[
                ("num", numeric_pipeline, numeric_cols),
                ("cat", categorical_pipeline, categorical_cols),
            ]
        )

    def _format_metrics(self, is_classification, y_test, preds, train_rows, test_rows, class_count):
        """Render the newline-joined metrics report for the hold-out set."""
        if is_classification:
            lines = [
                f"Accuracy: {accuracy_score(y_test, preds):.4f}",
                f"Macro F1: {f1_score(y_test, preds, average='macro'):.4f}",
                f"Train Rows: {train_rows}",
                f"Test Rows: {test_rows}",
                f"Classes: {class_count}",
            ]
        else:
            lines = [
                f"R2: {r2_score(y_test, preds):.4f}",
                f"MAE: {mean_absolute_error(y_test, preds):.4f}",
                f"Train Rows: {train_rows}",
                f"Test Rows: {test_rows}",
            ]
        return "\n".join(lines)