Spaces:

mnoorchenar
/

AutoMLOps

Sleeping

App Files Files Community

AutoMLOps / mlops /algorithms.py

mnoorchenar

Update 2026-03-26 18:00:31

fb9037e about 2 months ago

raw

history blame contribute delete

21.8 kB

	"""Algorithm registry for AutoMLOps — multiple categories for classification & regression."""
	from sklearn.linear_model import (
	LogisticRegression, RidgeClassifier, SGDClassifier,
	PassiveAggressiveClassifier, LinearRegression, Ridge, Lasso,
	ElasticNet, BayesianRidge, HuberRegressor, SGDRegressor,
	)
	from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
	from sklearn.ensemble import (
	RandomForestClassifier, ExtraTreesClassifier,
	GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier,
	RandomForestRegressor, ExtraTreesRegressor,
	GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor,
	)
	from sklearn.svm import SVC, SVR, LinearSVC
	from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB
	from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
	from sklearn.neural_network import MLPClassifier, MLPRegressor
	from sklearn.discriminant_analysis import (
	LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis,
	)
	from xgboost import XGBClassifier, XGBRegressor
	from lightgbm import LGBMClassifier, LGBMRegressor


	# ── Verbosity overrides shared by the boosting entries ────────────────────────
	# Each mapping is unpacked (``**``) into the "params" of the matching
	# registry entries, so the setting lives in one place.
	_SILENT: dict[str, int] = {"verbosity": 0}  # used by the XGBoost entries
	_LGBM_SILENT: dict[str, int] = {"verbose": -1}  # used by the LightGBM entries


	def _algo_entry(cls, description, color, /, **params):
	    """Build one registry record for ALGORITHMS.

	    Args:
	        cls: Estimator class stored under the ``"class"`` key.
	        description: One-line summary stored under ``"description"``.
	        color: Hex colour string stored under ``"color"``.
	        **params: Constructor keyword arguments stored, in the order given,
	            under ``"params"`` (a fresh dict per record).

	    Returns:
	        A dict with the keys ``"class"``, ``"params"``, ``"description"``
	        and ``"color"``, in that order.
	    """
	    return {
	        "class": cls,
	        "params": params,
	        "description": description,
	        "color": color,
	    }


	# Registry layout: task -> category -> display name -> record.
	ALGORITHMS = {
	    # ------------------------------------------------------------------
	    # Classification estimators
	    # ------------------------------------------------------------------
	    "classification": {
	        "Linear Models": {
	            "Logistic Regression": _algo_entry(
	                LogisticRegression,
	                "L2-regularised linear classifier, interpretable baseline.",
	                "#3b82f6",
	                max_iter=1000, random_state=42,
	            ),
	            "Logistic Regression (L1)": _algo_entry(
	                LogisticRegression,
	                "Sparse logistic regression via L1 regularisation.",
	                "#60a5fa",
	                penalty="l1", solver="saga", max_iter=1000, random_state=42,
	            ),
	            "Ridge Classifier": _algo_entry(
	                RidgeClassifier,
	                "Ridge-regression-based classifier; fast on high-dim data.",
	                "#93c5fd",
	                alpha=1.0,
	            ),
	            "SGD Classifier": _algo_entry(
	                SGDClassifier,
	                "Stochastic Gradient Descent for large-scale linear classification.",
	                "#bfdbfe",
	                max_iter=1000, random_state=42,
	            ),
	            "Passive Aggressive": _algo_entry(
	                PassiveAggressiveClassifier,
	                "Online learning algorithm suited to text/streaming data.",
	                "#dbeafe",
	                max_iter=1000, random_state=42,
	            ),
	            "Linear Discriminant Analysis": _algo_entry(
	                LinearDiscriminantAnalysis,
	                "Finds linear combinations that maximise class separation.",
	                "#eff6ff",
	            ),
	        },
	        "Tree-Based": {
	            "Decision Tree": _algo_entry(
	                DecisionTreeClassifier,
	                "Interpretable tree of if-else rules.",
	                "#22c55e",
	                max_depth=10, random_state=42,
	            ),
	            "Random Forest": _algo_entry(
	                RandomForestClassifier,
	                "Bagging of decision trees; robust, low variance.",
	                "#4ade80",
	                n_estimators=100, random_state=42,
	            ),
	            "Extra Trees": _algo_entry(
	                ExtraTreesClassifier,
	                "Extremely randomised trees; faster than Random Forest.",
	                "#86efac",
	                n_estimators=100, random_state=42,
	            ),
	            # NOTE(review): QDA is not a tree model; it looks misfiled in this
	            # category. Left in place because lookups are keyed by category.
	            "Quadratic Discriminant Analysis": _algo_entry(
	                QuadraticDiscriminantAnalysis,
	                "Non-linear discriminant analysis with quadratic boundary.",
	                "#bbf7d0",
	            ),
	        },
	        "Ensemble / Boosting": {
	            "Gradient Boosting": _algo_entry(
	                GradientBoostingClassifier,
	                "Sequential boosting of shallow trees; high accuracy.",
	                "#f59e0b",
	                n_estimators=100, learning_rate=0.1, random_state=42,
	            ),
	            "AdaBoost": _algo_entry(
	                AdaBoostClassifier,
	                "Adaptive boosting; up-weights misclassified samples.",
	                "#fbbf24",
	                n_estimators=100, random_state=42,
	            ),
	            "Bagging Classifier": _algo_entry(
	                BaggingClassifier,
	                "Bootstrap aggregating of any base estimator.",
	                "#fcd34d",
	                n_estimators=50, random_state=42,
	            ),
	            "XGBoost": _algo_entry(
	                XGBClassifier,
	                "Optimised gradient boosting with regularisation; competition favourite.",
	                "#d97706",
	                n_estimators=100, learning_rate=0.1, random_state=42, **_SILENT,
	            ),
	            "LightGBM": _algo_entry(
	                LGBMClassifier,
	                "Leaf-wise boosting; extremely fast on large datasets.",
	                "#b45309",
	                n_estimators=100, learning_rate=0.1, random_state=42, **_LGBM_SILENT,
	            ),
	        },
	        "Support Vector Machines": {
	            "SVC (RBF Kernel)": _algo_entry(
	                SVC,
	                "Non-linear SVM with radial basis function kernel.",
	                "#a855f7",
	                kernel="rbf", probability=True, random_state=42,
	            ),
	            "SVC (Polynomial)": _algo_entry(
	                SVC,
	                "SVM with polynomial kernel; captures feature interactions.",
	                "#c084fc",
	                kernel="poly", degree=3, probability=True, random_state=42,
	            ),
	            "SVC (Linear)": _algo_entry(
	                SVC,
	                "Linear SVM; interpretable weights, good on text features.",
	                "#d8b4fe",
	                kernel="linear", probability=True, random_state=42,
	            ),
	            "LinearSVC": _algo_entry(
	                LinearSVC,
	                "Faster linear SVM implementation via liblinear.",
	                "#ede9fe",
	                max_iter=2000, random_state=42,
	            ),
	        },
	        "Probabilistic": {
	            "Gaussian Naive Bayes": _algo_entry(
	                GaussianNB,
	                "Assumes Gaussian feature distribution; very fast baseline.",
	                "#ec4899",
	            ),
	            "Bernoulli Naive Bayes": _algo_entry(
	                BernoulliNB,
	                "NB for binary/boolean features; popular in text classification.",
	                "#f472b6",
	            ),
	            "Complement Naive Bayes": _algo_entry(
	                ComplementNB,
	                "Improved NB variant, particularly strong on imbalanced text data.",
	                "#fbcfe8",
	            ),
	        },
	        "Instance-Based (KNN)": {
	            "KNN (k=3)": _algo_entry(
	                KNeighborsClassifier,
	                "Majority vote from 3 nearest neighbours.",
	                "#06b6d4",
	                n_neighbors=3,
	            ),
	            "KNN (k=5)": _algo_entry(
	                KNeighborsClassifier,
	                "Majority vote from 5 nearest neighbours.",
	                "#22d3ee",
	                n_neighbors=5,
	            ),
	            "KNN (k=9)": _algo_entry(
	                KNeighborsClassifier,
	                "Majority vote from 9 nearest neighbours; smoother boundary.",
	                "#67e8f9",
	                n_neighbors=9,
	            ),
	        },
	        "Neural Networks": {
	            "MLP (Small)": _algo_entry(
	                MLPClassifier,
	                "Single hidden-layer neural network.",
	                "#f43f5e",
	                hidden_layer_sizes=(64,), max_iter=500, random_state=42,
	            ),
	            "MLP (Medium)": _algo_entry(
	                MLPClassifier,
	                "Two hidden-layer neural network.",
	                "#fb7185",
	                hidden_layer_sizes=(128, 64), max_iter=500, random_state=42,
	            ),
	            "MLP (Deep)": _algo_entry(
	                MLPClassifier,
	                "Three hidden-layer neural network with ReLU activations.",
	                "#fda4af",
	                hidden_layer_sizes=(256, 128, 64), max_iter=500, random_state=42,
	            ),
	        },
	    },
	    # ------------------------------------------------------------------
	    # Regression estimators
	    # ------------------------------------------------------------------
	    "regression": {
	        "Linear Models": {
	            "Linear Regression": _algo_entry(
	                LinearRegression,
	                "Ordinary least-squares; interpretable baseline.",
	                "#3b82f6",
	            ),
	            "Ridge Regression": _algo_entry(
	                Ridge,
	                "L2-regularised linear regression; handles multicollinearity.",
	                "#60a5fa",
	                alpha=1.0,
	            ),
	            "Lasso": _algo_entry(
	                Lasso,
	                "L1 regularisation produces sparse feature weights.",
	                "#93c5fd",
	                alpha=0.1, max_iter=2000,
	            ),
	            "ElasticNet": _algo_entry(
	                ElasticNet,
	                "Combines L1 and L2 regularisation.",
	                "#bfdbfe",
	                alpha=0.1, l1_ratio=0.5, max_iter=2000,
	            ),
	            "Bayesian Ridge": _algo_entry(
	                BayesianRidge,
	                "Probabilistic Bayesian linear regression with automatic regularisation.",
	                "#dbeafe",
	            ),
	            "Huber Regressor": _algo_entry(
	                HuberRegressor,
	                "Robust to outliers via Huber loss function.",
	                "#eff6ff",
	                max_iter=200,
	            ),
	        },
	        "Tree-Based": {
	            "Decision Tree Regressor": _algo_entry(
	                DecisionTreeRegressor,
	                "Recursive partitioning for regression.",
	                "#22c55e",
	                max_depth=10, random_state=42,
	            ),
	            "Random Forest Regressor": _algo_entry(
	                RandomForestRegressor,
	                "Averaged predictions of many trees; low variance.",
	                "#4ade80",
	                n_estimators=100, random_state=42,
	            ),
	            "Extra Trees Regressor": _algo_entry(
	                ExtraTreesRegressor,
	                "Extremely randomised regression trees; fast.",
	                "#86efac",
	                n_estimators=100, random_state=42,
	            ),
	        },
	        "Ensemble / Boosting": {
	            "Gradient Boosting Regressor": _algo_entry(
	                GradientBoostingRegressor,
	                "Sequential boosting minimising regression loss.",
	                "#f59e0b",
	                n_estimators=100, learning_rate=0.1, random_state=42,
	            ),
	            "AdaBoost Regressor": _algo_entry(
	                AdaBoostRegressor,
	                "Adaptive boosting for regression.",
	                "#fbbf24",
	                n_estimators=100, random_state=42,
	            ),
	            "Bagging Regressor": _algo_entry(
	                BaggingRegressor,
	                "Bootstrap aggregating for regression.",
	                "#fcd34d",
	                n_estimators=50, random_state=42,
	            ),
	            "XGBoost Regressor": _algo_entry(
	                XGBRegressor,
	                "Regularised gradient boosting; excellent out-of-the-box performance.",
	                "#d97706",
	                n_estimators=100, learning_rate=0.1, random_state=42, **_SILENT,
	            ),
	            "LightGBM Regressor": _algo_entry(
	                LGBMRegressor,
	                "Leaf-wise boosting regressor; fast and memory-efficient.",
	                "#b45309",
	                n_estimators=100, learning_rate=0.1, random_state=42, **_LGBM_SILENT,
	            ),
	        },
	        "Support Vector Machines": {
	            "SVR (RBF)": _algo_entry(
	                SVR,
	                "Non-linear support vector regression.",
	                "#a855f7",
	                kernel="rbf",
	            ),
	            "SVR (Linear)": _algo_entry(
	                SVR,
	                "Linear support vector regression.",
	                "#c084fc",
	                kernel="linear",
	            ),
	        },
	        "Instance-Based (KNN)": {
	            "KNN Regressor (k=3)": _algo_entry(
	                KNeighborsRegressor,
	                "Average of 3 nearest neighbours.",
	                "#06b6d4",
	                n_neighbors=3,
	            ),
	            "KNN Regressor (k=5)": _algo_entry(
	                KNeighborsRegressor,
	                "Average of 5 nearest neighbours.",
	                "#22d3ee",
	                n_neighbors=5,
	            ),
	        },
	        "Neural Networks": {
	            "MLP Regressor (Small)": _algo_entry(
	                MLPRegressor,
	                "Single hidden-layer neural network for regression.",
	                "#f43f5e",
	                hidden_layer_sizes=(64,), max_iter=500, random_state=42,
	            ),
	            "MLP Regressor (Medium)": _algo_entry(
	                MLPRegressor,
	                "Two hidden-layer neural network for regression.",
	                "#fb7185",
	                hidden_layer_sizes=(128, 64), max_iter=500, random_state=42,
	            ),
	        },
	    },
	}

	# The builder is only needed while the registry is assembled; dropping it
	# keeps the module namespace the same as with a plain dict literal.
	del _algo_entry


	# ── Hyperparameter search grids (keyed by model class name) ───────────────────
	# Each value maps a constructor argument name to the list of candidate values
	# to try. Classes without an entry here have no grid defined.
	HPO_GRIDS: dict[str, dict] = {
	    # Linear Models
	    # NOTE(review): this grid is shared by every registry entry built on
	    # LogisticRegression. scikit-learn's "lbfgs" solver does not support an L1
	    # penalty, so if the grid is layered over penalty="l1" base params those
	    # candidates will fail to fit — confirm the search tolerates failed fits.
	    "LogisticRegression": {
	        "C": [0.001, 0.01, 0.1, 1, 10, 100],
	        "solver": ["lbfgs", "saga"],
	        "max_iter": [500, 1000],
	    },
	    "RidgeClassifier": {"alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
	    "SGDClassifier": {
	        "loss": ["hinge", "log_loss", "modified_huber"],
	        "alpha": [0.0001, 0.001, 0.01],
	    },
	    "Ridge": {"alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
	    "Lasso": {"alpha": [0.001, 0.01, 0.1, 1.0, 10.0]},
	    "ElasticNet": {
	        "alpha": [0.001, 0.01, 0.1, 1.0],
	        "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
	    },
	    "HuberRegressor": {
	        "epsilon": [1.1, 1.35, 1.5, 2.0],
	        "alpha": [0.0001, 0.001, 0.01, 0.1],
	    },
	    # Tree-Based
	    "DecisionTreeClassifier": {
	        "max_depth": [3, 5, 7, 10, None],
	        "min_samples_split": [2, 5, 10],
	        "min_samples_leaf": [1, 2, 4],
	        "criterion": ["gini", "entropy"],
	    },
	    "DecisionTreeRegressor": {
	        "max_depth": [3, 5, 7, 10, None],
	        "min_samples_split": [2, 5, 10],
	        "min_samples_leaf": [1, 2, 4],
	    },
	    "RandomForestClassifier": {
	        "n_estimators": [50, 100, 200, 300],
	        "max_depth": [None, 5, 10, 20],
	        "min_samples_split": [2, 5, 10],
	        "max_features": ["sqrt", "log2"],
	    },
	    "RandomForestRegressor": {
	        "n_estimators": [50, 100, 200, 300],
	        "max_depth": [None, 5, 10, 20],
	        "min_samples_split": [2, 5, 10],
	        "max_features": ["sqrt", "log2", None],
	    },
	    "ExtraTreesClassifier": {
	        "n_estimators": [50, 100, 200],
	        "max_depth": [None, 5, 10, 20],
	        "min_samples_split": [2, 5, 10],
	    },
	    "ExtraTreesRegressor": {
	        "n_estimators": [50, 100, 200],
	        "max_depth": [None, 5, 10, 20],
	        "min_samples_split": [2, 5, 10],
	    },
	    # Boosting
	    "GradientBoostingClassifier": {
	        "n_estimators": [50, 100, 200],
	        "learning_rate": [0.01, 0.05, 0.1, 0.2],
	        "max_depth": [3, 4, 5, 6],
	        "subsample": [0.7, 0.8, 0.9, 1.0],
	    },
	    "GradientBoostingRegressor": {
	        "n_estimators": [50, 100, 200],
	        "learning_rate": [0.01, 0.05, 0.1, 0.2],
	        "max_depth": [3, 4, 5, 6],
	        "subsample": [0.7, 0.8, 0.9, 1.0],
	    },
	    "AdaBoostClassifier": {
	        "n_estimators": [50, 100, 200],
	        "learning_rate": [0.01, 0.1, 0.5, 1.0],
	    },
	    "AdaBoostRegressor": {
	        "n_estimators": [50, 100, 200],
	        "learning_rate": [0.01, 0.1, 0.5, 1.0],
	        "loss": ["linear", "square", "exponential"],
	    },
	    "XGBClassifier": {
	        "n_estimators": [50, 100, 200],
	        "learning_rate": [0.01, 0.05, 0.1, 0.2],
	        "max_depth": [3, 4, 5, 6, 7],
	        "subsample": [0.7, 0.8, 0.9],
	        "colsample_bytree": [0.7, 0.8, 0.9],
	    },
	    "XGBRegressor": {
	        "n_estimators": [50, 100, 200],
	        "learning_rate": [0.01, 0.05, 0.1, 0.2],
	        "max_depth": [3, 4, 5, 6, 7],
	        "subsample": [0.7, 0.8, 0.9],
	        "colsample_bytree": [0.7, 0.8, 0.9],
	    },
	    # LightGBM ignores ``subsample`` while ``subsample_freq`` is at its default
	    # of 0 (bagging disabled), so the frequency is pinned to 1 to make the
	    # ``subsample`` candidates take effect.
	    "LGBMClassifier": {
	        "n_estimators": [50, 100, 200],
	        "learning_rate": [0.01, 0.05, 0.1, 0.2],
	        "max_depth": [-1, 5, 10, 20],
	        "num_leaves": [15, 31, 63, 127],
	        "subsample": [0.7, 0.8, 0.9, 1.0],
	        "subsample_freq": [1],
	    },
	    "LGBMRegressor": {
	        "n_estimators": [50, 100, 200],
	        "learning_rate": [0.01, 0.05, 0.1, 0.2],
	        "max_depth": [-1, 5, 10, 20],
	        "num_leaves": [15, 31, 63, 127],
	        "subsample": [0.7, 0.8, 0.9, 1.0],
	        "subsample_freq": [1],
	    },
	    # SVM
	    "SVC": {
	        "C": [0.1, 1, 10, 100],
	        "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
	    },
	    "SVR": {
	        "C": [0.1, 1, 10, 100],
	        "gamma": ["scale", "auto"],
	        "epsilon": [0.01, 0.1, 0.5, 1.0],
	    },
	    # KNN
	    "KNeighborsClassifier": {
	        "n_neighbors": [3, 5, 7, 9, 11, 15],
	        "weights": ["uniform", "distance"],
	        "metric": ["euclidean", "manhattan"],
	    },
	    "KNeighborsRegressor": {
	        "n_neighbors": [3, 5, 7, 9, 11, 15],
	        "weights": ["uniform", "distance"],
	        "metric": ["euclidean", "manhattan"],
	    },
	    # MLP
	    "MLPClassifier": {
	        "hidden_layer_sizes": [(64,), (128,), (64, 32), (128, 64), (256, 128)],
	        "learning_rate_init": [0.001, 0.005, 0.01],
	        "alpha": [0.0001, 0.001, 0.01],
	        "activation": ["relu", "tanh"],
	    },
	    "MLPRegressor": {
	        "hidden_layer_sizes": [(64,), (128,), (64, 32), (128, 64), (256, 128)],
	        "learning_rate_init": [0.001, 0.005, 0.01],
	        "alpha": [0.0001, 0.001, 0.01],
	        "activation": ["relu", "tanh"],
	    },
	}


	def get_hpo_grid(cls) -> dict:
	    """Look up the hyperparameter search grid registered for a model class.

	    Args:
	        cls: Model class; its ``__name__`` is the lookup key in HPO_GRIDS.

	    Returns:
	        The grid stored in HPO_GRIDS (the registry object itself, not a
	        copy), or a new empty dict when the class has no grid.
	    """
	    class_name = cls.__name__
	    if class_name in HPO_GRIDS:
	        return HPO_GRIDS[class_name]
	    return {}


	def get_algorithm(task: str, category: str, name: str) -> dict:
	    """Retrieve one algorithm config by task / category / name.

	    Args:
	        task: Top-level key of ALGORITHMS.
	        category: Category key within the task.
	        name: Algorithm display name within the category.

	    Returns:
	        The record stored at ``ALGORITHMS[task][category][name]``.

	    Raises:
	        ValueError: If any of the three keys is missing. The underlying
	            ``KeyError`` is attached as ``__cause__``.
	    """
	    try:
	        return ALGORITHMS[task][category][name]
	    except KeyError as exc:
	        # Chain explicitly so the traceback shows which key was missing.
	        raise ValueError(
	            f"Algorithm not found: task={task}, category={category}, name={name}"
	        ) from exc


	def list_algorithms(task: str) -> dict:
	    """Return the category -> algorithm tree registered for ``task``.

	    Args:
	        task: Top-level key of ALGORITHMS.

	    Returns:
	        The sub-dict stored under ``task`` (the registry object itself).

	    Raises:
	        ValueError: If ``task`` is not a key of ALGORITHMS.
	    """
	    if task in ALGORITHMS:
	        return ALGORITHMS[task]
	    raise ValueError(f"Unknown task: {task}")


	def all_algorithm_names(task: str) -> list[str]:
	    """Collect every algorithm name registered for ``task`` into one flat list.

	    Names appear in registry order: category by category, then entry by
	    entry within each category.

	    Args:
	        task: Top-level key of ALGORITHMS.

	    Returns:
	        A new list of algorithm display names.

	    Raises:
	        KeyError: If ``task`` is not a key of ALGORITHMS.
	    """
	    return [
	        algo_name
	        for members in ALGORITHMS[task].values()
	        for algo_name in members
	    ]


	def algorithms_for_json(task: str \| None = None) -> dict:
	"""Return ALGORITHMS (or a task subset) as a JSON-serializable dict.

	Removes the non-serializable ``"class"`` key and converts tuples to lists.
	"""
	def _clean(obj):
	if isinstance(obj, dict):
	return {k: _clean(v) for k, v in obj.items() if k != "class"}
	if isinstance(obj, (list, tuple)):
	return [_clean(i) for i in obj]
	return obj

	src = ALGORITHMS if task is None else ALGORITHMS[task]
	return _clean(src)