AutoMLOps / mlops /algorithms.py
mnoorchenar's picture
Update 2026-03-26 18:00:31
fb9037e
"""Algorithm registry for AutoMLOps β€” multiple categories for classification & regression."""
from sklearn.linear_model import (
LogisticRegression, RidgeClassifier, SGDClassifier,
PassiveAggressiveClassifier, LinearRegression, Ridge, Lasso,
ElasticNet, BayesianRidge, HuberRegressor, SGDRegressor,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
RandomForestClassifier, ExtraTreesClassifier,
GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier,
RandomForestRegressor, ExtraTreesRegressor,
GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor,
)
from sklearn.svm import SVC, SVR, LinearSVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.discriminant_analysis import (
LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis,
)
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
# ── Shared verbosity helper ────────────────────────────────────────────────────
_SILENT = {"verbosity": 0} # XGBoost
_LGBM_SILENT = {"verbose": -1} # LightGBM
ALGORITHMS = {
# ══════════════════════════════════════════════════════════════════════
# CLASSIFICATION
# ══════════════════════════════════════════════════════════════════════
"classification": {
"Linear Models": {
"Logistic Regression": {
"class": LogisticRegression,
"params": {"max_iter": 1000, "random_state": 42},
"description": "L2-regularised linear classifier, interpretable baseline.",
"color": "#3b82f6",
},
"Logistic Regression (L1)": {
"class": LogisticRegression,
"params": {"penalty": "l1", "solver": "saga", "max_iter": 1000, "random_state": 42},
"description": "Sparse logistic regression via L1 regularisation.",
"color": "#60a5fa",
},
"Ridge Classifier": {
"class": RidgeClassifier,
"params": {"alpha": 1.0},
"description": "Ridge-regression-based classifier; fast on high-dim data.",
"color": "#93c5fd",
},
"SGD Classifier": {
"class": SGDClassifier,
"params": {"max_iter": 1000, "random_state": 42},
"description": "Stochastic Gradient Descent for large-scale linear classification.",
"color": "#bfdbfe",
},
"Passive Aggressive": {
"class": PassiveAggressiveClassifier,
"params": {"max_iter": 1000, "random_state": 42},
"description": "Online learning algorithm suited to text/streaming data.",
"color": "#dbeafe",
},
"Linear Discriminant Analysis": {
"class": LinearDiscriminantAnalysis,
"params": {},
"description": "Finds linear combinations that maximise class separation.",
"color": "#eff6ff",
},
},
"Tree-Based": {
"Decision Tree": {
"class": DecisionTreeClassifier,
"params": {"max_depth": 10, "random_state": 42},
"description": "Interpretable tree of if-else rules.",
"color": "#22c55e",
},
"Random Forest": {
"class": RandomForestClassifier,
"params": {"n_estimators": 100, "random_state": 42},
"description": "Bagging of decision trees; robust, low variance.",
"color": "#4ade80",
},
"Extra Trees": {
"class": ExtraTreesClassifier,
"params": {"n_estimators": 100, "random_state": 42},
"description": "Extremely randomised trees; faster than Random Forest.",
"color": "#86efac",
},
"Quadratic Discriminant Analysis": {
"class": QuadraticDiscriminantAnalysis,
"params": {},
"description": "Non-linear discriminant analysis with quadratic boundary.",
"color": "#bbf7d0",
},
},
"Ensemble / Boosting": {
"Gradient Boosting": {
"class": GradientBoostingClassifier,
"params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42},
"description": "Sequential boosting of shallow trees; high accuracy.",
"color": "#f59e0b",
},
"AdaBoost": {
"class": AdaBoostClassifier,
"params": {"n_estimators": 100, "random_state": 42},
"description": "Adaptive boosting; up-weights misclassified samples.",
"color": "#fbbf24",
},
"Bagging Classifier": {
"class": BaggingClassifier,
"params": {"n_estimators": 50, "random_state": 42},
"description": "Bootstrap aggregating of any base estimator.",
"color": "#fcd34d",
},
"XGBoost": {
"class": XGBClassifier,
"params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42, **_SILENT},
"description": "Optimised gradient boosting with regularisation; competition favourite.",
"color": "#d97706",
},
"LightGBM": {
"class": LGBMClassifier,
"params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42, **_LGBM_SILENT},
"description": "Leaf-wise boosting; extremely fast on large datasets.",
"color": "#b45309",
},
},
"Support Vector Machines": {
"SVC (RBF Kernel)": {
"class": SVC,
"params": {"kernel": "rbf", "probability": True, "random_state": 42},
"description": "Non-linear SVM with radial basis function kernel.",
"color": "#a855f7",
},
"SVC (Polynomial)": {
"class": SVC,
"params": {"kernel": "poly", "degree": 3, "probability": True, "random_state": 42},
"description": "SVM with polynomial kernel; captures feature interactions.",
"color": "#c084fc",
},
"SVC (Linear)": {
"class": SVC,
"params": {"kernel": "linear", "probability": True, "random_state": 42},
"description": "Linear SVM; interpretable weights, good on text features.",
"color": "#d8b4fe",
},
"LinearSVC": {
"class": LinearSVC,
"params": {"max_iter": 2000, "random_state": 42},
"description": "Faster linear SVM implementation via liblinear.",
"color": "#ede9fe",
},
},
"Probabilistic": {
"Gaussian Naive Bayes": {
"class": GaussianNB,
"params": {},
"description": "Assumes Gaussian feature distribution; very fast baseline.",
"color": "#ec4899",
},
"Bernoulli Naive Bayes": {
"class": BernoulliNB,
"params": {},
"description": "NB for binary/boolean features; popular in text classification.",
"color": "#f472b6",
},
"Complement Naive Bayes": {
"class": ComplementNB,
"params": {},
"description": "Improved NB variant, particularly strong on imbalanced text data.",
"color": "#fbcfe8",
},
},
"Instance-Based (KNN)": {
"KNN (k=3)": {
"class": KNeighborsClassifier,
"params": {"n_neighbors": 3},
"description": "Majority vote from 3 nearest neighbours.",
"color": "#06b6d4",
},
"KNN (k=5)": {
"class": KNeighborsClassifier,
"params": {"n_neighbors": 5},
"description": "Majority vote from 5 nearest neighbours.",
"color": "#22d3ee",
},
"KNN (k=9)": {
"class": KNeighborsClassifier,
"params": {"n_neighbors": 9},
"description": "Majority vote from 9 nearest neighbours; smoother boundary.",
"color": "#67e8f9",
},
},
"Neural Networks": {
"MLP (Small)": {
"class": MLPClassifier,
"params": {"hidden_layer_sizes": (64,), "max_iter": 500, "random_state": 42},
"description": "Single hidden-layer neural network.",
"color": "#f43f5e",
},
"MLP (Medium)": {
"class": MLPClassifier,
"params": {"hidden_layer_sizes": (128, 64), "max_iter": 500, "random_state": 42},
"description": "Two hidden-layer neural network.",
"color": "#fb7185",
},
"MLP (Deep)": {
"class": MLPClassifier,
"params": {"hidden_layer_sizes": (256, 128, 64), "max_iter": 500, "random_state": 42},
"description": "Three hidden-layer neural network with ReLU activations.",
"color": "#fda4af",
},
},
},
# ══════════════════════════════════════════════════════════════════════
# REGRESSION
# ══════════════════════════════════════════════════════════════════════
"regression": {
"Linear Models": {
"Linear Regression": {
"class": LinearRegression,
"params": {},
"description": "Ordinary least-squares; interpretable baseline.",
"color": "#3b82f6",
},
"Ridge Regression": {
"class": Ridge,
"params": {"alpha": 1.0},
"description": "L2-regularised linear regression; handles multicollinearity.",
"color": "#60a5fa",
},
"Lasso": {
"class": Lasso,
"params": {"alpha": 0.1, "max_iter": 2000},
"description": "L1 regularisation produces sparse feature weights.",
"color": "#93c5fd",
},
"ElasticNet": {
"class": ElasticNet,
"params": {"alpha": 0.1, "l1_ratio": 0.5, "max_iter": 2000},
"description": "Combines L1 and L2 regularisation.",
"color": "#bfdbfe",
},
"Bayesian Ridge": {
"class": BayesianRidge,
"params": {},
"description": "Probabilistic Bayesian linear regression with automatic regularisation.",
"color": "#dbeafe",
},
"Huber Regressor": {
"class": HuberRegressor,
"params": {"max_iter": 200},
"description": "Robust to outliers via Huber loss function.",
"color": "#eff6ff",
},
},
"Tree-Based": {
"Decision Tree Regressor": {
"class": DecisionTreeRegressor,
"params": {"max_depth": 10, "random_state": 42},
"description": "Recursive partitioning for regression.",
"color": "#22c55e",
},
"Random Forest Regressor": {
"class": RandomForestRegressor,
"params": {"n_estimators": 100, "random_state": 42},
"description": "Averaged predictions of many trees; low variance.",
"color": "#4ade80",
},
"Extra Trees Regressor": {
"class": ExtraTreesRegressor,
"params": {"n_estimators": 100, "random_state": 42},
"description": "Extremely randomised regression trees; fast.",
"color": "#86efac",
},
},
"Ensemble / Boosting": {
"Gradient Boosting Regressor": {
"class": GradientBoostingRegressor,
"params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42},
"description": "Sequential boosting minimising regression loss.",
"color": "#f59e0b",
},
"AdaBoost Regressor": {
"class": AdaBoostRegressor,
"params": {"n_estimators": 100, "random_state": 42},
"description": "Adaptive boosting for regression.",
"color": "#fbbf24",
},
"Bagging Regressor": {
"class": BaggingRegressor,
"params": {"n_estimators": 50, "random_state": 42},
"description": "Bootstrap aggregating for regression.",
"color": "#fcd34d",
},
"XGBoost Regressor": {
"class": XGBRegressor,
"params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42, **_SILENT},
"description": "Regularised gradient boosting; excellent out-of-the-box performance.",
"color": "#d97706",
},
"LightGBM Regressor": {
"class": LGBMRegressor,
"params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42, **_LGBM_SILENT},
"description": "Leaf-wise boosting regressor; fast and memory-efficient.",
"color": "#b45309",
},
},
"Support Vector Machines": {
"SVR (RBF)": {
"class": SVR,
"params": {"kernel": "rbf"},
"description": "Non-linear support vector regression.",
"color": "#a855f7",
},
"SVR (Linear)": {
"class": SVR,
"params": {"kernel": "linear"},
"description": "Linear support vector regression.",
"color": "#c084fc",
},
},
"Instance-Based (KNN)": {
"KNN Regressor (k=3)": {
"class": KNeighborsRegressor,
"params": {"n_neighbors": 3},
"description": "Average of 3 nearest neighbours.",
"color": "#06b6d4",
},
"KNN Regressor (k=5)": {
"class": KNeighborsRegressor,
"params": {"n_neighbors": 5},
"description": "Average of 5 nearest neighbours.",
"color": "#22d3ee",
},
},
"Neural Networks": {
"MLP Regressor (Small)": {
"class": MLPRegressor,
"params": {"hidden_layer_sizes": (64,), "max_iter": 500, "random_state": 42},
"description": "Single hidden-layer neural network for regression.",
"color": "#f43f5e",
},
"MLP Regressor (Medium)": {
"class": MLPRegressor,
"params": {"hidden_layer_sizes": (128, 64), "max_iter": 500, "random_state": 42},
"description": "Two hidden-layer neural network for regression.",
"color": "#fb7185",
},
},
},
}
# ── Hyperparameter search grids (keyed by model class name) ───────────────────
HPO_GRIDS: dict[str, dict] = {
# Linear Models
"LogisticRegression": {"C": [0.001, 0.01, 0.1, 1, 10, 100], "solver": ["lbfgs", "saga"], "max_iter": [500, 1000]},
"RidgeClassifier": {"alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
"SGDClassifier": {"loss": ["hinge", "log_loss", "modified_huber"], "alpha": [0.0001, 0.001, 0.01]},
"Ridge": {"alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
"Lasso": {"alpha": [0.001, 0.01, 0.1, 1.0, 10.0]},
"ElasticNet": {"alpha": [0.001, 0.01, 0.1, 1.0], "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9]},
"HuberRegressor": {"epsilon": [1.1, 1.35, 1.5, 2.0], "alpha": [0.0001, 0.001, 0.01, 0.1]},
# Tree-Based
"DecisionTreeClassifier":{"max_depth": [3, 5, 7, 10, None], "min_samples_split": [2, 5, 10], "min_samples_leaf": [1, 2, 4], "criterion": ["gini", "entropy"]},
"DecisionTreeRegressor": {"max_depth": [3, 5, 7, 10, None], "min_samples_split": [2, 5, 10], "min_samples_leaf": [1, 2, 4]},
"RandomForestClassifier":{"n_estimators": [50, 100, 200, 300], "max_depth": [None, 5, 10, 20], "min_samples_split": [2, 5, 10], "max_features": ["sqrt", "log2"]},
"RandomForestRegressor": {"n_estimators": [50, 100, 200, 300], "max_depth": [None, 5, 10, 20], "min_samples_split": [2, 5, 10], "max_features": ["sqrt", "log2", None]},
"ExtraTreesClassifier": {"n_estimators": [50, 100, 200], "max_depth": [None, 5, 10, 20], "min_samples_split": [2, 5, 10]},
"ExtraTreesRegressor": {"n_estimators": [50, 100, 200], "max_depth": [None, 5, 10, 20], "min_samples_split": [2, 5, 10]},
# Boosting
"GradientBoostingClassifier": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.05, 0.1, 0.2], "max_depth": [3, 4, 5, 6], "subsample": [0.7, 0.8, 0.9, 1.0]},
"GradientBoostingRegressor": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.05, 0.1, 0.2], "max_depth": [3, 4, 5, 6], "subsample": [0.7, 0.8, 0.9, 1.0]},
"AdaBoostClassifier": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.5, 1.0]},
"AdaBoostRegressor": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.5, 1.0], "loss": ["linear", "square", "exponential"]},
"XGBClassifier": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.05, 0.1, 0.2], "max_depth": [3, 4, 5, 6, 7], "subsample": [0.7, 0.8, 0.9], "colsample_bytree": [0.7, 0.8, 0.9]},
"XGBRegressor": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.05, 0.1, 0.2], "max_depth": [3, 4, 5, 6, 7], "subsample": [0.7, 0.8, 0.9], "colsample_bytree": [0.7, 0.8, 0.9]},
"LGBMClassifier": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.05, 0.1, 0.2], "max_depth": [-1, 5, 10, 20], "num_leaves": [15, 31, 63, 127], "subsample": [0.7, 0.8, 0.9, 1.0]},
"LGBMRegressor": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.05, 0.1, 0.2], "max_depth": [-1, 5, 10, 20], "num_leaves": [15, 31, 63, 127], "subsample": [0.7, 0.8, 0.9, 1.0]},
# SVM
"SVC": {"C": [0.1, 1, 10, 100], "gamma": ["scale", "auto", 0.001, 0.01, 0.1]},
"SVR": {"C": [0.1, 1, 10, 100], "gamma": ["scale", "auto"], "epsilon": [0.01, 0.1, 0.5, 1.0]},
# KNN
"KNeighborsClassifier": {"n_neighbors": [3, 5, 7, 9, 11, 15], "weights": ["uniform", "distance"], "metric": ["euclidean", "manhattan"]},
"KNeighborsRegressor": {"n_neighbors": [3, 5, 7, 9, 11, 15], "weights": ["uniform", "distance"], "metric": ["euclidean", "manhattan"]},
# MLP
"MLPClassifier": {"hidden_layer_sizes": [(64,), (128,), (64, 32), (128, 64), (256, 128)], "learning_rate_init": [0.001, 0.005, 0.01], "alpha": [0.0001, 0.001, 0.01], "activation": ["relu", "tanh"]},
"MLPRegressor": {"hidden_layer_sizes": [(64,), (128,), (64, 32), (128, 64), (256, 128)], "learning_rate_init": [0.001, 0.005, 0.01], "alpha": [0.0001, 0.001, 0.01], "activation": ["relu", "tanh"]},
}
def get_hpo_grid(cls) -> dict:
"""Return the hyperparameter search grid for a model class, or {} if none defined."""
return HPO_GRIDS.get(cls.__name__, {})
def get_algorithm(task: str, category: str, name: str) -> dict:
"""Retrieve algorithm config by task / category / name."""
try:
return ALGORITHMS[task][category][name]
except KeyError:
raise ValueError(f"Algorithm not found: task={task}, category={category}, name={name}")
def list_algorithms(task: str) -> dict:
"""Return the algorithm tree for the given task type."""
if task not in ALGORITHMS:
raise ValueError(f"Unknown task: {task}")
return ALGORITHMS[task]
def all_algorithm_names(task: str) -> list[str]:
"""Flat list of all algorithm names for a given task."""
names = []
for cat in ALGORITHMS[task].values():
names.extend(cat.keys())
return names
def algorithms_for_json(task: str | None = None) -> dict:
"""Return ALGORITHMS (or a task subset) as a JSON-serializable dict.
Removes the non-serializable ``"class"`` key and converts tuples to lists.
"""
def _clean(obj):
if isinstance(obj, dict):
return {k: _clean(v) for k, v in obj.items() if k != "class"}
if isinstance(obj, (list, tuple)):
return [_clean(i) for i in obj]
return obj
src = ALGORITHMS if task is None else ALGORITHMS[task]
return _clean(src)