"""Factory helpers for building classical scikit-learn classifiers, ensembles,
and optional AutoML runners (XGBoost/CatBoost/LightGBM/TPOT/H2O)."""
from typing import Dict, Any, Optional, Union, Tuple

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import (
    BaggingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# Optional gradient-boosting backends. Each name stays None when its package
# is absent so the factories below can feature-detect instead of crashing at
# import time (see get_gradient_boosting / get_stacking_classifier).
XGBClassifier = None
CatBoostClassifier = None
LGBMClassifier = None
try:
    from xgboost import XGBClassifier
except ImportError:
    pass
try:
    from catboost import CatBoostClassifier
except ImportError:
    pass
try:
    from lightgbm import LGBMClassifier
except ImportError:
    pass
def get_logistic_regression(
    penalty: str = "l2",
    C: float = 1.0,
    max_iter: int = 1000,
    solver: str = "liblinear",  # supports l1
    random_state: int = 42
) -> LogisticRegression:
    """Build a LogisticRegression with a solver compatible with the penalty.

    Parameters
    ----------
    penalty : one of "l1", "l2", "elasticnet", "none".
    C : inverse regularization strength.
    max_iter : maximum solver iterations.
    solver : preferred solver; silently replaced when it cannot handle
        the requested penalty (liblinear supports l1/l2 only).
    random_state : seed for reproducibility.

    Raises
    ------
    ValueError : if penalty is not one of the accepted strings.
    """
    if penalty not in ("l1", "l2", "elasticnet", "none"):
        raise ValueError("penalty must be 'l1', 'l2', 'elasticnet', or 'none'")
    extra = {}
    if penalty == "l1" and solver not in ("liblinear", "saga"):
        solver = "liblinear"
    elif penalty == "elasticnet":
        # BUGFIX: elasticnet is only supported by saga and requires l1_ratio;
        # previously the default liblinear solver was passed through and the
        # estimator raised at fit time.
        solver = "saga"
        extra["l1_ratio"] = 0.5
    elif penalty == "none" and solver == "liblinear":
        # BUGFIX: liblinear cannot fit an unpenalized model; fall back to lbfgs.
        solver = "lbfgs"
    return LogisticRegression(
        penalty=penalty,
        C=C,
        max_iter=max_iter,
        solver=solver,
        random_state=random_state,
        **extra
    )
def get_svm_linear(C: float = 1.0, random_state: int = 42) -> SVC:
    """Linear-kernel SVC with probability estimates enabled.

    Probabilities are required so this estimator can participate in
    soft voting and stacking ensembles.
    """
    return SVC(
        kernel="linear",
        C=C,
        probability=True,
        random_state=random_state,
    )
def get_random_forest(
    n_estimators: int = 100,
    max_depth: Optional[int] = None,
    random_state: int = 42
) -> RandomForestClassifier:
    """Random forest with a fixed seed; max_depth=None grows trees fully."""
    params = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    return RandomForestClassifier(**params)
def get_gradient_boosting(
    model_type: str = "xgb",
    **kwargs
) -> Any:
    """Build a gradient-boosting classifier from an optional backend.

    Parameters
    ----------
    model_type : one of "xgb", "cat", "lgb".
    **kwargs : forwarded to the backend constructor; seed/verbosity
        defaults are filled in when the caller did not supply them.

    Returns
    -------
    An XGBClassifier, CatBoostClassifier, or LGBMClassifier instance.
    (Annotated as Any: the concrete classes may be absent at import time,
    so naming them in the annotation would evaluate the None sentinel.)

    Raises
    ------
    ValueError : unknown model_type (checked before any backend lookup).
    ImportError : the requested backend package is not installed.
    """
    if model_type not in ("xgb", "cat", "lgb"):
        raise ValueError("model_type must be 'xgb', 'cat', or 'lgb'")
    if model_type == "xgb":
        if XGBClassifier is None:
            raise ImportError("XGBoost not installed. Run: pip install xgboost")
        kwargs.setdefault("random_state", 42)
        return XGBClassifier(**kwargs)
    if model_type == "cat":
        if CatBoostClassifier is None:
            raise ImportError("CatBoost not installed. Run: pip install catboost")
        kwargs.setdefault("verbose", False)
        kwargs.setdefault("random_seed", 42)  # CatBoost's seed kwarg differs
        return CatBoostClassifier(**kwargs)
    # model_type == "lgb"
    if LGBMClassifier is None:
        raise ImportError("LightGBM not installed. Run: pip install lightgbm")
    kwargs.setdefault("random_state", 42)
    return LGBMClassifier(**kwargs)
def get_bagging_classifier(
    base_estimator: str = "tree",
    n_estimators: int = 10,
    random_state: int = 42
) -> BaggingClassifier:
    """Bagging ensemble over a named base learner.

    base_estimator selects the member model: "tree" for a decision tree,
    "lr" for logistic regression; any other value raises ValueError.
    """
    if base_estimator == "lr":
        member = get_logistic_regression()
    elif base_estimator == "tree":
        from sklearn.tree import DecisionTreeClassifier
        member = DecisionTreeClassifier(random_state=random_state)
    else:
        raise ValueError("base_estimator must be 'tree' or 'lr'")
    return BaggingClassifier(
        estimator=member,
        n_estimators=n_estimators,
        random_state=random_state,
    )
def get_stacking_classifier(
    final_estimator: Optional[BaseEstimator] = None,
    cv: int = 5,
    random_state: int = 42
) -> StackingClassifier:
    """Stack LR + linear SVM (+ CatBoost when installed) under a meta-learner.

    Parameters
    ----------
    final_estimator : meta-learner; defaults to logistic regression.
    cv : number of cross-validation folds for generating meta-features.
    random_state : seed forwarded to every base and final estimator.
        BUGFIX: previously accepted but silently ignored (estimators kept
        their internal default of 42); default unchanged, so behavior is
        backward compatible.
    """
    estimators = [
        ("lr", get_logistic_regression(random_state=random_state)),
        ("svm", get_svm_linear(random_state=random_state)),
    ]
    # CatBoost joins the stack only when its package is available.
    if CatBoostClassifier is not None:
        estimators.append(
            ("cat", get_gradient_boosting("cat", iterations=100, random_seed=random_state))
        )
    if final_estimator is None:
        final_estimator = get_logistic_regression(random_state=random_state)
    return StackingClassifier(
        estimators=estimators,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=False
    )
def get_voting_classifier(
    voting: str = "soft",
    use_catboost: bool = True
) -> VotingClassifier:
    """Vote over LR, linear SVM, RF, and optionally CatBoost.

    voting="soft" averages predicted probabilities (all members expose
    predict_proba); "hard" uses majority class vote.
    """
    members = [
        ("lr", get_logistic_regression()),
        ("svm", get_svm_linear()),
        ("rf", get_random_forest(n_estimators=50)),
    ]
    # CatBoost is included only when requested AND actually installed.
    if use_catboost and CatBoostClassifier is not None:
        members.append(("cat", get_gradient_boosting("cat", iterations=50, verbose=False)))
    return VotingClassifier(estimators=members, voting=voting)
def tpot_classifier(
    generations: int = 5,
    population_size: int = 20,
    cv: int = 5,
    random_state: int = 42,
    verbosity: int = 0
) -> Any:
    """Configure a TPOT AutoML classifier (requires the optional 'tpot' package).

    Raises ImportError with an install hint when tpot is not available.
    """
    try:
        from tpot import TPOTClassifier
    except ImportError:
        raise ImportError("TPOT not installed. Run: pip install tpot")
    config = dict(
        generations=generations,
        population_size=population_size,
        cv=cv,
        random_state=random_state,
        verbosity=verbosity,
        n_jobs=-1,  # use all cores for the pipeline search
    )
    return TPOTClassifier(**config)
def h2o_classifier(
    max_runtime_secs: int = 300,
    seed: int = 42,
    exclude_algos: Optional[list] = None
) -> Any:
    """Configure an H2O AutoML run (requires the optional 'h2o' package).

    Returns an unfitted H2OAutoML instance; raises ImportError with an
    install hint when h2o is not available.
    """
    try:
        import h2o  # noqa: F401 -- imported only to verify installation
        from h2o.automl import H2OAutoML
    except ImportError:
        raise ImportError("H2O not installed. Run: pip install h2o")
    return H2OAutoML(
        max_runtime_secs=max_runtime_secs,
        seed=seed,
        exclude_algos=exclude_algos,
    )
def train_and_evaluate(
    model: Union[BaseEstimator, Any],
    X_train: Union[np.ndarray, pd.DataFrame],
    y_train: Union[np.ndarray, pd.Series],
    X_test: Union[np.ndarray, pd.DataFrame],
    y_test: Union[np.ndarray, pd.Series],
    is_h2o: bool = False
) -> Dict[str, Any]:
    """Fit `model` on the training split and report test-set metrics.

    When is_h2o is False, `model` is any sklearn-style estimator and the
    result dict holds "accuracy", a "report" dict from
    classification_report, and the fitted "model".

    When is_h2o is True, the result dict holds "accuracy", "auc" (or None),
    and "best_model" (the AutoML leader).
    NOTE(review): the h2o branch calls .cbind()/.columns on X/y, so it
    expects H2OFrame inputs despite the numpy/pandas annotations, and
    `model` must be an H2OAutoML instance (it reads model.leader) —
    confirm with callers.
    """
    if is_h2o:
        import h2o
        # Join features and target into single frames for h2o's train API.
        train_frame = X_train.cbind(y_train)
        test_frame = X_test.cbind(y_test)
        # Target column name is taken from the (single-column) y frame.
        y_col = y_train.columns[0]
        model.train(x=X_train.columns.tolist(), y=y_col, training_frame=train_frame)
        perf = model.model_performance(test_frame)
        return {
            # perf.accuracy() returns a threshold/accuracy structure; [0]
            # keeps the first entry as-is.
            "accuracy": perf.accuracy()[0],
            # _has_auc() is a private H2O API — guards against non-binomial
            # models, but may break across h2o versions.
            "auc": perf.auc() if perf._has_auc() else None,
            "best_model": model.leader
        }
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {
            "accuracy": accuracy_score(y_test, y_pred),
            "report": classification_report(y_test, y_pred, output_dict=True),
            "model": model
        }