from typing import Any, Dict, Optional, Union

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import (
    BaggingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Optional gradient-boosting backends: each name resolves to None when the
# corresponding library is not installed, and the factories below check for that.
XGBClassifier = None
CatBoostClassifier = None
LGBMClassifier = None

try:
    from xgboost import XGBClassifier
except ImportError:
    pass

try:
    from catboost import CatBoostClassifier
except ImportError:
    pass

try:
    from lightgbm import LGBMClassifier
except ImportError:
    pass


def get_logistic_regression(
    penalty: Optional[str] = "l2",
    C: float = 1.0,
    max_iter: int = 1000,
    solver: str = "liblinear",  # supports l1 as well as l2
    l1_ratio: Optional[float] = None,  # only meaningful with penalty="elasticnet"
    random_state: int = 42,
) -> LogisticRegression:
    """Build a LogisticRegression, coercing the solver when the penalty requires it."""
    if penalty not in ("l1", "l2", "elasticnet", "none"):
        raise ValueError("penalty must be 'l1', 'l2', 'elasticnet', or 'none'")
    if penalty == "l1" and solver not in ("liblinear", "saga"):
        solver = "liblinear"
    if penalty == "elasticnet":
        solver = "saga"  # saga is the only solver that supports elasticnet
    if penalty == "none":
        penalty = None  # scikit-learn >= 1.2 expects None rather than "none"
        if solver == "liblinear":
            solver = "lbfgs"  # liblinear cannot fit an unpenalized model
    return LogisticRegression(
        penalty=penalty,
        C=C,
        max_iter=max_iter,
        solver=solver,
        l1_ratio=l1_ratio,
        random_state=random_state,
    )


def get_svm_linear(C: float = 1.0, random_state: int = 42) -> SVC:
    # probability=True enables predict_proba (needed for soft voting and
    # stacking) at the cost of an internal cross-validation during fit.
    return SVC(kernel="linear", C=C, probability=True, random_state=random_state)


def get_random_forest(
    n_estimators: int = 100,
    max_depth: Optional[int] = None,
    random_state: int = 42,
) -> RandomForestClassifier:
    return RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )


def get_gradient_boosting(model_type: str = "xgb", **kwargs) -> Any:
    """Build an XGBoost, CatBoost, or LightGBM classifier, if installed.

    The return type is Any because each backend is optional; the imported
    names are None when the corresponding library is missing.
    """
    if model_type == "xgb":
        if XGBClassifier is None:
            raise ImportError("XGBoost not installed. Run: pip install xgboost")
        kwargs.setdefault("random_state", 42)
        return XGBClassifier(**kwargs)
    elif model_type == "cat":
        if CatBoostClassifier is None:
            raise ImportError("CatBoost not installed. Run: pip install catboost")
        kwargs.setdefault("verbose", False)
        kwargs.setdefault("random_seed", 42)
        return CatBoostClassifier(**kwargs)
    elif model_type == "lgb":
        if LGBMClassifier is None:
            raise ImportError("LightGBM not installed. Run: pip install lightgbm")
        kwargs.setdefault("random_state", 42)
        return LGBMClassifier(**kwargs)
    raise ValueError("model_type must be 'xgb', 'cat', or 'lgb'")
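
# A minimal usage sketch for the base-model factories above, kept in a private
# helper so importing the module has no side effects. The synthetic data and
# parameter values are illustrative assumptions, not part of the module's API.
def _demo_base_models() -> None:
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=10, random_state=42)
    for name, clf in [
        ("logreg-l1", get_logistic_regression(penalty="l1")),
        ("linear-svm", get_svm_linear(C=0.5)),
        ("forest", get_random_forest(n_estimators=50)),
    ]:
        clf.fit(X, y)
        # Training accuracy only, for illustration; use held-out data in practice.
        print(f"{name}: train accuracy = {clf.score(X, y):.3f}")
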
def get_bagging_classifier(
    base_estimator: str = "tree",
    n_estimators: int = 10,
    random_state: int = 42,
) -> BaggingClassifier:
    if base_estimator == "tree":
        from sklearn.tree import DecisionTreeClassifier

        estimator = DecisionTreeClassifier(random_state=random_state)
    elif base_estimator == "lr":
        estimator = get_logistic_regression(random_state=random_state)
    else:
        raise ValueError("base_estimator must be 'tree' or 'lr'")
    # scikit-learn >= 1.2 spells this keyword `estimator` (formerly `base_estimator`).
    return BaggingClassifier(
        estimator=estimator,
        n_estimators=n_estimators,
        random_state=random_state,
    )


def get_stacking_classifier(
    final_estimator: Optional[BaseEstimator] = None,
    cv: int = 5,
    random_state: int = 42,
) -> StackingClassifier:
    estimators = [
        ("lr", get_logistic_regression(random_state=random_state)),
        ("svm", get_svm_linear(random_state=random_state)),
    ]
    if CatBoostClassifier is not None:
        estimators.append(("cat", get_gradient_boosting("cat", iterations=100)))
    if final_estimator is None:
        final_estimator = get_logistic_regression(random_state=random_state)
    return StackingClassifier(
        estimators=estimators,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=False,
    )


def get_voting_classifier(
    voting: str = "soft",
    use_catboost: bool = True,
) -> VotingClassifier:
    clfs = [
        ("lr", get_logistic_regression()),
        ("svm", get_svm_linear()),
        ("rf", get_random_forest(n_estimators=50)),
    ]
    if use_catboost and CatBoostClassifier is not None:
        clfs.append(("cat", get_gradient_boosting("cat", iterations=50, verbose=False)))
    return VotingClassifier(estimators=clfs, voting=voting)


def tpot_classifier(
    generations: int = 5,
    population_size: int = 20,
    cv: int = 5,
    random_state: int = 42,
    verbosity: int = 0,
) -> Any:
    try:
        from tpot import TPOTClassifier
    except ImportError:
        raise ImportError("TPOT not installed. Run: pip install tpot")
    return TPOTClassifier(
        generations=generations,
        population_size=population_size,
        cv=cv,
        random_state=random_state,
        verbosity=verbosity,
        n_jobs=-1,
    )


def h2o_classifier(
    max_runtime_secs: int = 300,
    seed: int = 42,
    exclude_algos: Optional[list] = None,
) -> Any:
    try:
        from h2o.automl import H2OAutoML
    except ImportError:
        raise ImportError("H2O not installed. Run: pip install h2o")
    return H2OAutoML(
        max_runtime_secs=max_runtime_secs,
        seed=seed,
        exclude_algos=exclude_algos,
    )


def train_and_evaluate(
    model: Union[BaseEstimator, Any],
    X_train: Union[np.ndarray, pd.DataFrame],
    y_train: Union[np.ndarray, pd.Series],
    X_test: Union[np.ndarray, pd.DataFrame],
    y_test: Union[np.ndarray, pd.Series],
    is_h2o: bool = False,
) -> Dict[str, Any]:
    if is_h2o:
        # H2O path: the inputs are H2OFrames and `model` is an H2OAutoML run.
        train_frame = X_train.cbind(y_train)
        test_frame = X_test.cbind(y_test)
        y_col = y_train.columns[0]
        # H2OFrame.columns is already a list of column names.
        model.train(x=X_train.columns, y=y_col, training_frame=train_frame)
        # Performance is evaluated on the leader model, not the AutoML object.
        perf = model.leader.model_performance(test_frame)
        try:
            auc = perf.auc()
        except Exception:
            auc = None  # AUC is undefined for some problem types
        return {
            # accuracy() returns [[threshold, value]] pairs; take the value.
            "accuracy": perf.accuracy()[0][1],
            "auc": auc,
            "best_model": model.leader,
        }
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "report": classification_report(y_test, y_pred, output_dict=True),
        "model": model,
    }
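
# An end-to-end sketch under stated assumptions: synthetic data from
# scikit-learn's make_classification and a soft-voting ensemble with CatBoost
# disabled, so the example runs without any optional dependency. The sizes and
# split ratio are illustrative, not tuned.
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=500, n_features=20, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    voter = get_voting_classifier(voting="soft", use_catboost=False)
    results = train_and_evaluate(voter, X_train, y_train, X_test, y_test)
    print(f"voting accuracy: {results['accuracy']:.3f}")

    stack = get_stacking_classifier()
    results = train_and_evaluate(stack, X_train, y_train, X_test, y_test)
    print(f"stacking accuracy: {results['accuracy']:.3f}")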