Spaces:

abersbail
/

machine-learning-tabular-lab

Sleeping

App Files Files Community

machine-learning-tabular-lab / ml_tabular /service.py

abersbail

Add machine learning tabular lab Space

cdc04d9 verified 2 months ago

raw

history blame contribute delete

4.86 kB

	import pandas as pd
	from sklearn.compose import ColumnTransformer
	from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
	from sklearn.impute import SimpleImputer
	from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, r2_score
	from sklearn.model_selection import train_test_split
	from sklearn.pipeline import Pipeline
	from sklearn.preprocessing import OneHotEncoder, StandardScaler


	class TabularMLService:
	    """Train a baseline random-forest model on a CSV file and report metrics.

	    The public entry point is :meth:`run`. It never raises for bad input:
	    every failure is reported through the status string of the returned tuple.
	    """

	    # A numeric target with at most this many distinct values is treated as
	    # class labels when the task type is "Auto".
	    AUTO_CLASSIFICATION_MAX_UNIQUE = 20

	    def run(self, csv_path, target_column, task_type, test_size, random_state):
	        """Load a CSV, train a model on it and summarise the outcome.

	        Args:
	            csv_path: Path (or file-like object) handed to ``pandas.read_csv``.
	            target_column: Name of the column to predict. If the exact name is
	                not a column, the whitespace-trimmed name is tried as well.
	            task_type: ``"Auto"`` to infer the task from the target column;
	                any other value is used as the task name as-is.
	            test_size: Forwarded to ``train_test_split``.
	            random_state: Seed forwarded to the split and to the model.

	        Returns:
	            A ``(model_name, metrics, preview, status)`` tuple of strings. On
	            any failure the first three entries are empty and ``status``
	            carries the explanation.
	        """
	        if not csv_path:
	            return "", "", "", "Upload a CSV file first."

	        if not target_column:
	            return "", "", "", "Enter the target column name."

	        try:
	            df = pd.read_csv(csv_path)
	        except Exception as exc:
	            return "", "", "", f"Could not read CSV: {type(exc).__name__}: {exc}"

	        if target_column not in df.columns:
	            # Hand-typed names often carry stray spaces; fall back to the
	            # trimmed name before giving up.
	            trimmed = target_column.strip() if isinstance(target_column, str) else target_column
	            if trimmed not in df.columns:
	                return "", "", "", f"Target column `{target_column}` was not found in the dataset."
	            target_column = trimmed

	        if len(df.columns) < 2:
	            return "", "", "", "Dataset must have at least one feature column and one target column."

	        # Estimators reject missing values in the target, so rows without a
	        # label cannot be used for training or scoring.
	        df = df.dropna(subset=[target_column])
	        if df.empty:
	            return "", "", "", f"Target column `{target_column}` has no non-missing values."

	        try:
	            inferred_task = self._resolve_task(df[target_column], task_type)
	            model_name, metrics, preview = self._train(
	                df, target_column, inferred_task, test_size, random_state
	            )
	            status = (
	                f"Trained a {inferred_task.lower()} model on {len(df)} rows and {len(df.columns) - 1} feature columns."
	            )
	            return model_name, metrics, preview, status
	        except Exception as exc:
	            return "", "", "", f"Training failed: {type(exc).__name__}: {exc}"

	    def _resolve_task(self, target_series, task_type):
	        """Return the task to run, inferring it from the target when "Auto".

	        Inference rules, applied in order:

	        * boolean or non-numeric targets (text, categorical, ...) are
	          classification;
	        * float targets holding fractional values are regression, because
	          classifiers do not accept continuous labels;
	        * remaining numeric targets are classification when they have at most
	          ``AUTO_CLASSIFICATION_MAX_UNIQUE`` distinct values, else regression.

	        Args:
	            target_series: The target column as a ``pandas.Series``.
	            task_type: ``"Auto"`` or an explicit task name, returned unchanged.

	        Returns:
	            ``"Classification"``, ``"Regression"``, or ``task_type`` itself when
	            it is not ``"Auto"``.
	        """
	        if task_type != "Auto":
	            return task_type

	        dtype = target_series.dtype
	        # Ask pandas about the dtype instead of comparing dtype names, so the
	        # dedicated string dtypes and nullable dtypes are recognised too.
	        if pd.api.types.is_bool_dtype(dtype) or not pd.api.types.is_numeric_dtype(dtype):
	            return "Classification"

	        if pd.api.types.is_float_dtype(dtype):
	            observed = target_series.dropna()
	            if not (observed % 1 == 0).all():
	                return "Regression"

	        if target_series.nunique(dropna=True) <= self.AUTO_CLASSIFICATION_MAX_UNIQUE:
	            return "Classification"
	        return "Regression"

	    @staticmethod
	    def _build_preprocessor(features):
	        """Build the imputing/scaling/encoding transformer for ``features``."""
	        numeric_cols = features.select_dtypes(include=["number"]).columns.tolist()
	        categorical_cols = [col for col in features.columns if col not in numeric_cols]

	        numeric_pipeline = Pipeline(
	            steps=[
	                ("imputer", SimpleImputer(strategy="median")),
	                ("scaler", StandardScaler()),
	            ]
	        )
	        categorical_pipeline = Pipeline(
	            steps=[
	                ("imputer", SimpleImputer(strategy="most_frequent")),
	                # Categories unseen during fit must not break prediction.
	                ("onehot", OneHotEncoder(handle_unknown="ignore")),
	            ]
	        )
	        return ColumnTransformer(
	            transformers=[
	                ("num", numeric_pipeline, numeric_cols),
	                ("cat", categorical_pipeline, categorical_cols),
	            ]
	        )

	    def _train(self, df, target_column, task_type, test_size, random_state):
	        """Fit a random forest on a train split and score it on the test split.

	        Args:
	            df: Full dataset, including the target column.
	            target_column: Name of the column to predict.
	            task_type: ``"Classification"`` selects the classifier; any other
	                value selects the regressor.
	            test_size: Forwarded to ``train_test_split``.
	            random_state: Seed for the split and the model.

	        Returns:
	            ``(model_name, metrics, preview)``: the estimator class name, a
	            newline-separated metrics report, and the first eight feature rows
	            rendered as text.

	        Raises:
	            Exception: Errors from pandas or scikit-learn propagate to the
	                caller unchanged.
	        """
	        x = df.drop(columns=[target_column])
	        y = df[target_column]
	        is_classification = task_type == "Classification"

	        stratify = None
	        if is_classification:
	            model = RandomForestClassifier(n_estimators=180, random_state=random_state)
	            # A stratified split needs at least two samples of every class;
	            # fall back to a plain shuffle split when that does not hold.
	            class_counts = y.value_counts()
	            if len(class_counts) > 1 and class_counts.min() >= 2:
	                stratify = y
	        else:
	            model = RandomForestRegressor(n_estimators=220, random_state=random_state)

	        x_train, x_test, y_train, y_test = train_test_split(
	            x,
	            y,
	            test_size=test_size,
	            random_state=random_state,
	            stratify=stratify,
	        )

	        pipeline = Pipeline(
	            steps=[
	                ("preprocessor", self._build_preprocessor(x)),
	                ("model", model),
	            ]
	        )
	        pipeline.fit(x_train, y_train)
	        preds = pipeline.predict(x_test)

	        if is_classification:
	            report = [
	                f"Accuracy: {accuracy_score(y_test, preds):.4f}",
	                f"Macro F1: {f1_score(y_test, preds, average='macro'):.4f}",
	                f"Train Rows: {len(x_train)}",
	                f"Test Rows: {len(x_test)}",
	                f"Classes: {y.nunique()}",
	            ]
	            model_name = "RandomForestClassifier"
	        else:
	            report = [
	                f"R2: {r2_score(y_test, preds):.4f}",
	                f"MAE: {mean_absolute_error(y_test, preds):.4f}",
	                f"Train Rows: {len(x_train)}",
	                f"Test Rows: {len(x_test)}",
	            ]
	            model_name = "RandomForestRegressor"

	        metrics = "\n".join(report)
	        preview = x.head(8).to_string(index=False)
	        return model_name, metrics, preview