"""Algorithm registry for AutoMLOps — multiple categories for classification & regression."""

from itertools import chain

from sklearn.linear_model import (
    LogisticRegression,
    RidgeClassifier,
    SGDClassifier,
    PassiveAggressiveClassifier,
    LinearRegression,
    Ridge,
    Lasso,
    ElasticNet,
    BayesianRidge,
    HuberRegressor,
    SGDRegressor,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
    BaggingRegressor,
)
from sklearn.svm import SVC, SVR, LinearSVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor

# ── Shared verbosity helpers ──────────────────────────────────────────────────
# Keyword arguments merged into the boosting libraries' default params to
# suppress their console output.
_SILENT = {"verbosity": 0}  # XGBoost
_LGBM_SILENT = {"verbose": -1}  # LightGBM

# Registry layout: task -> category -> display name -> config, where each
# config holds the estimator "class", its constructor "params", a
# human-readable "description" and a hex "color" string.
ALGORITHMS = {
    # ══════════════════════════════════════════════════════════════════════
    # CLASSIFICATION
    # ══════════════════════════════════════════════════════════════════════
    "classification": {
        "Linear Models": {
            "Logistic Regression": {
                "class": LogisticRegression,
                "params": {"max_iter": 1000, "random_state": 42},
                "description": "L2-regularised linear classifier, interpretable baseline.",
                "color": "#3b82f6",
            },
            "Logistic Regression (L1)": {
                "class": LogisticRegression,
                "params": {"penalty": "l1", "solver": "saga", "max_iter": 1000, "random_state": 42},
                "description": "Sparse logistic regression via L1 regularisation.",
                "color": "#60a5fa",
            },
            "Ridge Classifier": {
                "class": RidgeClassifier,
                "params": {"alpha": 1.0},
                "description": "Ridge-regression-based classifier; fast on high-dim data.",
                "color": "#93c5fd",
            },
            "SGD Classifier": {
                "class": SGDClassifier,
                "params": {"max_iter": 1000, "random_state": 42},
                "description": "Stochastic Gradient Descent for large-scale linear classification.",
                "color": "#bfdbfe",
            },
            "Passive Aggressive": {
                "class": PassiveAggressiveClassifier,
                "params": {"max_iter": 1000, "random_state": 42},
                "description": "Online learning algorithm suited to text/streaming data.",
                "color": "#dbeafe",
            },
            "Linear Discriminant Analysis": {
                "class": LinearDiscriminantAnalysis,
                "params": {},
                "description": "Finds linear combinations that maximise class separation.",
                "color": "#eff6ff",
            },
        },
        "Tree-Based": {
            "Decision Tree": {
                "class": DecisionTreeClassifier,
                "params": {"max_depth": 10, "random_state": 42},
                "description": "Interpretable tree of if-else rules.",
                "color": "#22c55e",
            },
            "Random Forest": {
                "class": RandomForestClassifier,
                "params": {"n_estimators": 100, "random_state": 42},
                "description": "Bagging of decision trees; robust, low variance.",
                "color": "#4ade80",
            },
            "Extra Trees": {
                "class": ExtraTreesClassifier,
                "params": {"n_estimators": 100, "random_state": 42},
                "description": "Extremely randomised trees; faster than Random Forest.",
                "color": "#86efac",
            },
            # NOTE(review): QDA is not a tree model; it is kept in this
            # category because moving it would change the lookup key used
            # by get_algorithm() callers.
            "Quadratic Discriminant Analysis": {
                "class": QuadraticDiscriminantAnalysis,
                "params": {},
                "description": "Non-linear discriminant analysis with quadratic boundary.",
                "color": "#bbf7d0",
            },
        },
        "Ensemble / Boosting": {
            "Gradient Boosting": {
                "class": GradientBoostingClassifier,
                "params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42},
                "description": "Sequential boosting of shallow trees; high accuracy.",
                "color": "#f59e0b",
            },
            "AdaBoost": {
                "class": AdaBoostClassifier,
                "params": {"n_estimators": 100, "random_state": 42},
                "description": "Adaptive boosting; up-weights misclassified samples.",
                "color": "#fbbf24",
            },
            "Bagging Classifier": {
                "class": BaggingClassifier,
                "params": {"n_estimators": 50, "random_state": 42},
                "description": "Bootstrap aggregating of any base estimator.",
                "color": "#fcd34d",
            },
            "XGBoost": {
                "class": XGBClassifier,
                "params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42, **_SILENT},
                "description": "Optimised gradient boosting with regularisation; competition favourite.",
                "color": "#d97706",
            },
            "LightGBM": {
                "class": LGBMClassifier,
                "params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42, **_LGBM_SILENT},
                "description": "Leaf-wise boosting; extremely fast on large datasets.",
                "color": "#b45309",
            },
        },
        "Support Vector Machines": {
            "SVC (RBF Kernel)": {
                "class": SVC,
                "params": {"kernel": "rbf", "probability": True, "random_state": 42},
                "description": "Non-linear SVM with radial basis function kernel.",
                "color": "#a855f7",
            },
            "SVC (Polynomial)": {
                "class": SVC,
                "params": {"kernel": "poly", "degree": 3, "probability": True, "random_state": 42},
                "description": "SVM with polynomial kernel; captures feature interactions.",
                "color": "#c084fc",
            },
            "SVC (Linear)": {
                "class": SVC,
                "params": {"kernel": "linear", "probability": True, "random_state": 42},
                "description": "Linear SVM; interpretable weights, good on text features.",
                "color": "#d8b4fe",
            },
            "LinearSVC": {
                "class": LinearSVC,
                "params": {"max_iter": 2000, "random_state": 42},
                "description": "Faster linear SVM implementation via liblinear.",
                "color": "#ede9fe",
            },
        },
        "Probabilistic": {
            "Gaussian Naive Bayes": {
                "class": GaussianNB,
                "params": {},
                "description": "Assumes Gaussian feature distribution; very fast baseline.",
                "color": "#ec4899",
            },
            "Bernoulli Naive Bayes": {
                "class": BernoulliNB,
                "params": {},
                "description": "NB for binary/boolean features; popular in text classification.",
                "color": "#f472b6",
            },
            "Complement Naive Bayes": {
                "class": ComplementNB,
                "params": {},
                "description": "Improved NB variant, particularly strong on imbalanced text data.",
                "color": "#fbcfe8",
            },
        },
        "Instance-Based (KNN)": {
            "KNN (k=3)": {
                "class": KNeighborsClassifier,
                "params": {"n_neighbors": 3},
                "description": "Majority vote from 3 nearest neighbours.",
                "color": "#06b6d4",
            },
            "KNN (k=5)": {
                "class": KNeighborsClassifier,
                "params": {"n_neighbors": 5},
                "description": "Majority vote from 5 nearest neighbours.",
                "color": "#22d3ee",
            },
            "KNN (k=9)": {
                "class": KNeighborsClassifier,
                "params": {"n_neighbors": 9},
                "description": "Majority vote from 9 nearest neighbours; smoother boundary.",
                "color": "#67e8f9",
            },
        },
        "Neural Networks": {
            "MLP (Small)": {
                "class": MLPClassifier,
                "params": {"hidden_layer_sizes": (64,), "max_iter": 500, "random_state": 42},
                "description": "Single hidden-layer neural network.",
                "color": "#f43f5e",
            },
            "MLP (Medium)": {
                "class": MLPClassifier,
                "params": {"hidden_layer_sizes": (128, 64), "max_iter": 500, "random_state": 42},
                "description": "Two hidden-layer neural network.",
                "color": "#fb7185",
            },
            "MLP (Deep)": {
                "class": MLPClassifier,
                "params": {"hidden_layer_sizes": (256, 128, 64), "max_iter": 500, "random_state": 42},
                "description": "Three hidden-layer neural network with ReLU activations.",
                "color": "#fda4af",
            },
        },
    },
    # ══════════════════════════════════════════════════════════════════════
    # REGRESSION
    # ══════════════════════════════════════════════════════════════════════
    "regression": {
        "Linear Models": {
            "Linear Regression": {
                "class": LinearRegression,
                "params": {},
                "description": "Ordinary least-squares; interpretable baseline.",
                "color": "#3b82f6",
            },
            "Ridge Regression": {
                "class": Ridge,
                "params": {"alpha": 1.0},
                "description": "L2-regularised linear regression; handles multicollinearity.",
                "color": "#60a5fa",
            },
            "Lasso": {
                "class": Lasso,
                "params": {"alpha": 0.1, "max_iter": 2000},
                "description": "L1 regularisation produces sparse feature weights.",
                "color": "#93c5fd",
            },
            "ElasticNet": {
                "class": ElasticNet,
                "params": {"alpha": 0.1, "l1_ratio": 0.5, "max_iter": 2000},
                "description": "Combines L1 and L2 regularisation.",
                "color": "#bfdbfe",
            },
            "Bayesian Ridge": {
                "class": BayesianRidge,
                "params": {},
                "description": "Probabilistic Bayesian linear regression with automatic regularisation.",
                "color": "#dbeafe",
            },
            "Huber Regressor": {
                "class": HuberRegressor,
                "params": {"max_iter": 200},
                "description": "Robust to outliers via Huber loss function.",
                "color": "#eff6ff",
            },
        },
        "Tree-Based": {
            "Decision Tree Regressor": {
                "class": DecisionTreeRegressor,
                "params": {"max_depth": 10, "random_state": 42},
                "description": "Recursive partitioning for regression.",
                "color": "#22c55e",
            },
            "Random Forest Regressor": {
                "class": RandomForestRegressor,
                "params": {"n_estimators": 100, "random_state": 42},
                "description": "Averaged predictions of many trees; low variance.",
                "color": "#4ade80",
            },
            "Extra Trees Regressor": {
                "class": ExtraTreesRegressor,
                "params": {"n_estimators": 100, "random_state": 42},
                "description": "Extremely randomised regression trees; fast.",
                "color": "#86efac",
            },
        },
        "Ensemble / Boosting": {
            "Gradient Boosting Regressor": {
                "class": GradientBoostingRegressor,
                "params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42},
                "description": "Sequential boosting minimising regression loss.",
                "color": "#f59e0b",
            },
            "AdaBoost Regressor": {
                "class": AdaBoostRegressor,
                "params": {"n_estimators": 100, "random_state": 42},
                "description": "Adaptive boosting for regression.",
                "color": "#fbbf24",
            },
            "Bagging Regressor": {
                "class": BaggingRegressor,
                "params": {"n_estimators": 50, "random_state": 42},
                "description": "Bootstrap aggregating for regression.",
                "color": "#fcd34d",
            },
            "XGBoost Regressor": {
                "class": XGBRegressor,
                "params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42, **_SILENT},
                "description": "Regularised gradient boosting; excellent out-of-the-box performance.",
                "color": "#d97706",
            },
            "LightGBM Regressor": {
                "class": LGBMRegressor,
                "params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42, **_LGBM_SILENT},
                "description": "Leaf-wise boosting regressor; fast and memory-efficient.",
                "color": "#b45309",
            },
        },
        "Support Vector Machines": {
            "SVR (RBF)": {
                "class": SVR,
                "params": {"kernel": "rbf"},
                "description": "Non-linear support vector regression.",
                "color": "#a855f7",
            },
            "SVR (Linear)": {
                "class": SVR,
                "params": {"kernel": "linear"},
                "description": "Linear support vector regression.",
                "color": "#c084fc",
            },
        },
        "Instance-Based (KNN)": {
            "KNN Regressor (k=3)": {
                "class": KNeighborsRegressor,
                "params": {"n_neighbors": 3},
                "description": "Average of 3 nearest neighbours.",
                "color": "#06b6d4",
            },
            "KNN Regressor (k=5)": {
                "class": KNeighborsRegressor,
                "params": {"n_neighbors": 5},
                "description": "Average of 5 nearest neighbours.",
                "color": "#22d3ee",
            },
        },
        "Neural Networks": {
            "MLP Regressor (Small)": {
                "class": MLPRegressor,
                "params": {"hidden_layer_sizes": (64,), "max_iter": 500, "random_state": 42},
                "description": "Single hidden-layer neural network for regression.",
                "color": "#f43f5e",
            },
            "MLP Regressor (Medium)": {
                "class": MLPRegressor,
                "params": {"hidden_layer_sizes": (128, 64), "max_iter": 500, "random_state": 42},
                "description": "Two hidden-layer neural network for regression.",
                "color": "#fb7185",
            },
        },
    },
}

# ── Hyperparameter search grids (keyed by model class name) ───────────────────
# Because the key is the class name, every registry entry built on the same
# class (e.g. all three SVC kernels) shares one grid.
HPO_GRIDS: dict[str, dict] = {
    # Linear Models
    # NOTE(review): this grid is also used for "Logistic Regression (L1)"
    # (penalty="l1"); the "lbfgs" solver is not expected to accept an L1
    # penalty, so those candidates will presumably fail to fit — confirm
    # against the scikit-learn version in use before relying on it.
    "LogisticRegression": {
        "C": [0.001, 0.01, 0.1, 1, 10, 100],
        "solver": ["lbfgs", "saga"],
        "max_iter": [500, 1000],
    },
    "RidgeClassifier": {"alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
    "SGDClassifier": {
        "loss": ["hinge", "log_loss", "modified_huber"],
        "alpha": [0.0001, 0.001, 0.01],
    },
    "Ridge": {"alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
    "Lasso": {"alpha": [0.001, 0.01, 0.1, 1.0, 10.0]},
    "ElasticNet": {
        "alpha": [0.001, 0.01, 0.1, 1.0],
        "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
    },
    "HuberRegressor": {
        "epsilon": [1.1, 1.35, 1.5, 2.0],
        "alpha": [0.0001, 0.001, 0.01, 0.1],
    },
    # Tree-Based
    "DecisionTreeClassifier": {
        "max_depth": [3, 5, 7, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "criterion": ["gini", "entropy"],
    },
    "DecisionTreeRegressor": {
        "max_depth": [3, 5, 7, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },
    "RandomForestClassifier": {
        "n_estimators": [50, 100, 200, 300],
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
        "max_features": ["sqrt", "log2"],
    },
    "RandomForestRegressor": {
        "n_estimators": [50, 100, 200, 300],
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
        "max_features": ["sqrt", "log2", None],
    },
    "ExtraTreesClassifier": {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
    },
    "ExtraTreesRegressor": {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
    },
    # Boosting
    "GradientBoostingClassifier": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "max_depth": [3, 4, 5, 6],
        "subsample": [0.7, 0.8, 0.9, 1.0],
    },
    "GradientBoostingRegressor": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "max_depth": [3, 4, 5, 6],
        "subsample": [0.7, 0.8, 0.9, 1.0],
    },
    "AdaBoostClassifier": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.5, 1.0],
    },
    "AdaBoostRegressor": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.5, 1.0],
        "loss": ["linear", "square", "exponential"],
    },
    "XGBClassifier": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "max_depth": [3, 4, 5, 6, 7],
        "subsample": [0.7, 0.8, 0.9],
        "colsample_bytree": [0.7, 0.8, 0.9],
    },
    "XGBRegressor": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "max_depth": [3, 4, 5, 6, 7],
        "subsample": [0.7, 0.8, 0.9],
        "colsample_bytree": [0.7, 0.8, 0.9],
    },
    # LightGBM only applies `subsample` (bagging_fraction) when
    # `subsample_freq` (bagging_freq) is > 0; its default of 0 would make
    # every `subsample` candidate below behave identically. The single-value
    # list enables bagging without enlarging the search space.
    "LGBMClassifier": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "max_depth": [-1, 5, 10, 20],
        "num_leaves": [15, 31, 63, 127],
        "subsample": [0.7, 0.8, 0.9, 1.0],
        "subsample_freq": [1],
    },
    "LGBMRegressor": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "max_depth": [-1, 5, 10, 20],
        "num_leaves": [15, 31, 63, 127],
        "subsample": [0.7, 0.8, 0.9, 1.0],
        "subsample_freq": [1],
    },
    # SVM
    "SVC": {
        "C": [0.1, 1, 10, 100],
        "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
    },
    "SVR": {
        "C": [0.1, 1, 10, 100],
        "gamma": ["scale", "auto"],
        "epsilon": [0.01, 0.1, 0.5, 1.0],
    },
    # KNN
    "KNeighborsClassifier": {
        "n_neighbors": [3, 5, 7, 9, 11, 15],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"],
    },
    "KNeighborsRegressor": {
        "n_neighbors": [3, 5, 7, 9, 11, 15],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"],
    },
    # MLP
    "MLPClassifier": {
        "hidden_layer_sizes": [(64,), (128,), (64, 32), (128, 64), (256, 128)],
        "learning_rate_init": [0.001, 0.005, 0.01],
        "alpha": [0.0001, 0.001, 0.01],
        "activation": ["relu", "tanh"],
    },
    "MLPRegressor": {
        "hidden_layer_sizes": [(64,), (128,), (64, 32), (128, 64), (256, 128)],
        "learning_rate_init": [0.001, 0.005, 0.01],
        "alpha": [0.0001, 0.001, 0.01],
        "activation": ["relu", "tanh"],
    },
}


def get_hpo_grid(cls: type) -> dict:
    """Return the hyperparameter search grid for a model class, or {} if none defined.

    The lookup key is ``cls.__name__``. When a grid exists, the returned dict
    is the object stored in ``HPO_GRIDS`` itself (not a copy), so callers
    should treat it as read-only.
    """
    return HPO_GRIDS.get(cls.__name__, {})


def get_algorithm(task: str, category: str, name: str) -> dict:
    """Retrieve algorithm config by task / category / name.

    Raises:
        ValueError: if any of the three keys is missing from ``ALGORITHMS``.
    """
    try:
        return ALGORITHMS[task][category][name]
    except KeyError as exc:
        # Chain explicitly so the traceback shows which key lookup failed.
        raise ValueError(
            f"Algorithm not found: task={task}, category={category}, name={name}"
        ) from exc


def list_algorithms(task: str) -> dict:
    """Return the algorithm tree (category -> name -> config) for the given task type.

    Raises:
        ValueError: if ``task`` is not a key of ``ALGORITHMS``.
    """
    if task not in ALGORITHMS:
        raise ValueError(f"Unknown task: {task}")
    return ALGORITHMS[task]


def all_algorithm_names(task: str) -> list[str]:
    """Flat list of all algorithm names for a given task.

    Names are returned in registry order, category by category.

    Raises:
        KeyError: if ``task`` is not a key of ``ALGORITHMS``.
    """
    # Iterating each category dict yields its keys, i.e. the algorithm names.
    return list(chain.from_iterable(ALGORITHMS[task].values()))


def algorithms_for_json(task: str | None = None) -> dict:
    """Return ALGORITHMS (or a task subset) as a JSON-serializable dict.

    Removes the non-serializable ``"class"`` key (at every nesting level)
    and converts tuples to lists.

    Raises:
        KeyError: if ``task`` is given but is not a key of ``ALGORITHMS``.
    """

    def _clean(obj):
        # Rebuild containers recursively; leaf values pass through unchanged.
        if isinstance(obj, dict):
            return {k: _clean(v) for k, v in obj.items() if k != "class"}
        if isinstance(obj, (list, tuple)):
            return [_clean(i) for i in obj]
        return obj

    src = ALGORITHMS if task is None else ALGORITHMS[task]
    return _clean(src)