Spaces:
Running
Running
"""
Baseline Model Wrappers
========================

Sklearn-compatible wrappers for traditional gradient boosting models:
- XGBoost
- CatBoost
- LightGBM

Author: UW MSIM Team
Date: November 2025
"""
| import time | |
| import logging | |
| from typing import Optional, Union, Dict, Any | |
| import numpy as np | |
| import pandas as pd | |
| from .base_wrapper import BaseModelWrapper | |
| logger = logging.getLogger(__name__) | |
class XGBoostWrapper(BaseModelWrapper):
    """
    XGBoost wrapper.

    Builds an ``xgboost.XGBClassifier`` or ``xgboost.XGBRegressor`` depending
    on ``task_type``. For classification the targets are label-encoded before
    training and predictions are decoded back to the original labels.

    Parameters
    ----------
    task_type : str, default='classification'
        Task type: 'classification' or 'regression'
    n_estimators : int, default=100
        Number of boosting rounds
    learning_rate : float, default=0.1
        Step size shrinkage
    max_depth : int, default=6
        Maximum tree depth
    random_state : int, default=42
        Random seed
    **kwargs : dict
        Additional XGBoost parameters
    """

    def __init__(
        self,
        task_type: str = 'classification',
        n_estimators: int = 100,
        learning_rate: float = 0.1,
        max_depth: int = 6,
        random_state: int = 42,
        **kwargs
    ):
        super().__init__(task_type=task_type, random_state=random_state)
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.kwargs = kwargs
        # Populated by fit() for classification tasks only.
        self._label_encoder = None

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.Series, np.ndarray],
    ) -> 'XGBoostWrapper':
        """
        Fit XGBoost model.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Training features.
        y : pd.Series or np.ndarray
            Training targets. For classification they are label-encoded
            before being handed to XGBoost.

        Returns
        -------
        XGBoostWrapper
            ``self``, to allow call chaining.

        Raises
        ------
        ImportError
            If the ``xgboost`` package cannot be imported.
        Exception
            Any error raised while building or training the model is logged
            and re-raised unchanged.
        """
        self._validate_input(X, y)
        logger.info("Fitting XGBoost on %s samples...", X.shape[0])
        start_time = time.time()

        # Keep the import guard narrow so that only a missing xgboost package
        # produces the install hint, not an ImportError raised during training.
        try:
            import xgboost as xgb
        except ImportError as err:
            raise ImportError("Install XGBoost with: pip install xgboost") from err

        is_classification = self.task_type == 'classification'
        estimator_cls = xgb.XGBClassifier if is_classification else xgb.XGBRegressor

        # Train into locals first: a failed (re)fit must not leave the wrapper
        # with is_fitted still true but a half-built model or missing encoder.
        try:
            model = estimator_cls(
                n_estimators=self.n_estimators,
                learning_rate=self.learning_rate,
                max_depth=self.max_depth,
                random_state=self.random_state,
                **self.kwargs
            )
            label_encoder = None
            if is_classification:
                from sklearn.preprocessing import LabelEncoder

                label_encoder = LabelEncoder()
                model.fit(X, label_encoder.fit_transform(y))
            else:
                model.fit(X, y)
        except Exception as e:
            logger.error("Error fitting XGBoost: %s", e)
            raise

        # Commit the new state only after training succeeded.
        self.model = model
        self._label_encoder = label_encoder
        self.is_fitted = True
        self.fit_time = time.time() - start_time
        logger.info("XGBoost fitted in %.2f seconds", self.fit_time)
        return self

    def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Make predictions with XGBoost.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Features to predict on.

        Returns
        -------
        np.ndarray
            Model predictions; for classification these are decoded back to
            the original labels seen in ``fit``.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model not fitted. Call fit() first.")
        self._validate_input(X)

        start_time = time.time()
        predictions = self.model.predict(X)
        label_encoder = getattr(self, '_label_encoder', None)
        if self.task_type == 'classification' and label_encoder is not None:
            predictions = label_encoder.inverse_transform(predictions)
        # Timing deliberately includes the label decoding step.
        self.predict_time = time.time() - start_time
        return predictions

    def _predict_proba_impl(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Predict class probabilities.

        Delegates to the underlying model's ``predict_proba``.
        NOTE(review): columns presumably follow the encoded class order, i.e.
        ``self._label_encoder.classes_`` -- confirm against callers.
        """
        return self.model.predict_proba(X)

    def get_params(self, deep: bool = True) -> dict:
        """
        Get parameters.

        Returns the base-class parameters extended with this wrapper's
        hyperparameters and any extra keyword arguments given at construction.
        """
        params = super().get_params(deep)
        params.update({
            'n_estimators': self.n_estimators,
            'learning_rate': self.learning_rate,
            'max_depth': self.max_depth,
            **self.kwargs
        })
        return params
class CatBoostWrapper(BaseModelWrapper):
    """
    CatBoost wrapper.

    Builds a ``CatBoostClassifier`` or ``CatBoostRegressor`` depending on
    ``task_type``. Training output is silenced by default; pass ``verbose``,
    ``logging_level`` or ``silent`` through ``**kwargs`` to override that.

    NOTE(review): CatBoost's own ``task_type`` option cannot be forwarded
    through ``**kwargs`` because the wrapper's ``task_type`` parameter
    captures that name.

    Parameters
    ----------
    task_type : str, default='classification'
        Task type: 'classification' or 'regression'
    iterations : int, default=100
        Number of boosting iterations
    learning_rate : float, default=0.1
        Step size shrinkage
    depth : int, default=6
        Tree depth
    random_state : int, default=42
        Random seed
    **kwargs : dict
        Additional CatBoost parameters
    """

    # CatBoost accepts only one of these verbosity options at a time, so the
    # wrapper's quiet default is applied only when the caller set none of them.
    _VERBOSITY_KEYS = frozenset({'verbose', 'logging_level', 'silent'})

    def __init__(
        self,
        task_type: str = 'classification',
        iterations: int = 100,
        learning_rate: float = 0.1,
        depth: int = 6,
        random_state: int = 42,
        **kwargs
    ):
        super().__init__(task_type=task_type, random_state=random_state)
        self.iterations = iterations
        self.learning_rate = learning_rate
        self.depth = depth
        self.kwargs = kwargs

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.Series, np.ndarray],
    ) -> 'CatBoostWrapper':
        """
        Fit CatBoost model.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Training features.
        y : pd.Series or np.ndarray
            Training targets.

        Returns
        -------
        CatBoostWrapper
            ``self``, to allow call chaining.

        Raises
        ------
        ImportError
            If the ``catboost`` package cannot be imported.
        Exception
            Any error raised while building or training the model is logged
            and re-raised unchanged.
        """
        self._validate_input(X, y)
        logger.info("Fitting CatBoost on %s samples...", X.shape[0])
        start_time = time.time()

        # Keep the import guard narrow so that only a missing catboost package
        # produces the install hint, not an ImportError raised during training.
        try:
            from catboost import CatBoostClassifier, CatBoostRegressor
        except ImportError as err:
            raise ImportError("Install CatBoost with: pip install catboost") from err

        if self.task_type == 'classification':
            estimator_cls = CatBoostClassifier
        else:
            estimator_cls = CatBoostRegressor

        # Passing verbose=False next to a user-supplied verbosity option would
        # raise (duplicate keyword / conflicting options), so default it here.
        extra_params = dict(self.kwargs)
        if self._VERBOSITY_KEYS.isdisjoint(extra_params):
            extra_params['verbose'] = False

        # Train into a local first: a failed (re)fit must not replace a
        # previously fitted model while is_fitted is still true.
        try:
            model = estimator_cls(
                iterations=self.iterations,
                learning_rate=self.learning_rate,
                depth=self.depth,
                random_state=self.random_state,
                **extra_params
            )
            model.fit(X, y)
        except Exception as e:
            logger.error("Error fitting CatBoost: %s", e)
            raise

        self.model = model
        self.is_fitted = True
        self.fit_time = time.time() - start_time
        logger.info("CatBoost fitted in %.2f seconds", self.fit_time)
        return self

    def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Make predictions with CatBoost.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Features to predict on.

        Returns
        -------
        np.ndarray
            Model predictions. For classification, a single-column 2-D result
            is flattened to 1-D so the output matches the other wrappers.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model not fitted. Call fit() first.")
        self._validate_input(X)

        start_time = time.time()
        predictions = self.model.predict(X)
        # CatBoost returns multiclass labels as an (n, 1) column; left as-is,
        # `predictions == y` would broadcast to (n, n) and corrupt metrics.
        if (
            self.task_type == 'classification'
            and isinstance(predictions, np.ndarray)
            and predictions.ndim == 2
            and predictions.shape[1] == 1
        ):
            predictions = predictions.ravel()
        self.predict_time = time.time() - start_time
        return predictions

    def _predict_proba_impl(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Predict class probabilities.

        Delegates to the underlying model's ``predict_proba``.
        """
        return self.model.predict_proba(X)

    def get_params(self, deep: bool = True) -> dict:
        """
        Get parameters.

        Returns the base-class parameters extended with this wrapper's
        hyperparameters and any extra keyword arguments given at construction.
        """
        params = super().get_params(deep)
        params.update({
            'iterations': self.iterations,
            'learning_rate': self.learning_rate,
            'depth': self.depth,
            **self.kwargs
        })
        return params
class LightGBMWrapper(BaseModelWrapper):
    """
    LightGBM wrapper.

    Builds an ``lightgbm.LGBMClassifier`` or ``lightgbm.LGBMRegressor``
    depending on ``task_type``. Training output is silenced by default
    (``verbose=-1``); pass ``verbose`` or ``verbosity`` through ``**kwargs``
    to override that.

    Parameters
    ----------
    task_type : str, default='classification'
        Task type: 'classification' or 'regression'
    n_estimators : int, default=100
        Number of boosting rounds
    learning_rate : float, default=0.1
        Step size shrinkage
    max_depth : int, default=-1
        Maximum tree depth (-1 for unlimited)
    random_state : int, default=42
        Random seed
    **kwargs : dict
        Additional LightGBM parameters
    """

    # `verbosity` is LightGBM's alias of `verbose`; the wrapper's quiet
    # default is applied only when the caller set neither.
    _VERBOSITY_KEYS = frozenset({'verbose', 'verbosity'})

    def __init__(
        self,
        task_type: str = 'classification',
        n_estimators: int = 100,
        learning_rate: float = 0.1,
        max_depth: int = -1,
        random_state: int = 42,
        **kwargs
    ):
        super().__init__(task_type=task_type, random_state=random_state)
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.kwargs = kwargs

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.Series, np.ndarray],
    ) -> 'LightGBMWrapper':
        """
        Fit LightGBM model.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Training features.
        y : pd.Series or np.ndarray
            Training targets.

        Returns
        -------
        LightGBMWrapper
            ``self``, to allow call chaining.

        Raises
        ------
        ImportError
            If the ``lightgbm`` package cannot be imported.
        Exception
            Any error raised while building or training the model is logged
            and re-raised unchanged.
        """
        self._validate_input(X, y)
        logger.info("Fitting LightGBM on %s samples...", X.shape[0])
        start_time = time.time()

        # Keep the import guard narrow so that only a missing lightgbm package
        # produces the install hint, not an ImportError raised during training.
        try:
            import lightgbm as lgb
        except ImportError as err:
            raise ImportError("Install LightGBM with: pip install lightgbm") from err

        if self.task_type == 'classification':
            estimator_cls = lgb.LGBMClassifier
        else:
            estimator_cls = lgb.LGBMRegressor

        # Passing verbose=-1 next to a user-supplied `verbose` would raise a
        # duplicate-keyword TypeError, so only default it when unset.
        extra_params = dict(self.kwargs)
        if self._VERBOSITY_KEYS.isdisjoint(extra_params):
            extra_params['verbose'] = -1

        # Train into a local first: a failed (re)fit must not replace a
        # previously fitted model while is_fitted is still true.
        try:
            model = estimator_cls(
                n_estimators=self.n_estimators,
                learning_rate=self.learning_rate,
                max_depth=self.max_depth,
                random_state=self.random_state,
                **extra_params
            )
            model.fit(X, y)
        except Exception as e:
            logger.error("Error fitting LightGBM: %s", e)
            raise

        self.model = model
        self.is_fitted = True
        self.fit_time = time.time() - start_time
        logger.info("LightGBM fitted in %.2f seconds", self.fit_time)
        return self

    def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Make predictions with LightGBM.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Features to predict on.

        Returns
        -------
        np.ndarray
            Predictions returned by the underlying model's ``predict``.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model not fitted. Call fit() first.")
        self._validate_input(X)

        start_time = time.time()
        predictions = self.model.predict(X)
        self.predict_time = time.time() - start_time
        return predictions

    def _predict_proba_impl(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Predict class probabilities.

        Delegates to the underlying model's ``predict_proba``.
        """
        return self.model.predict_proba(X)

    def get_params(self, deep: bool = True) -> dict:
        """
        Get parameters.

        Returns the base-class parameters extended with this wrapper's
        hyperparameters and any extra keyword arguments given at construction.
        """
        params = super().get_params(deep)
        params.update({
            'n_estimators': self.n_estimators,
            'learning_rate': self.learning_rate,
            'max_depth': self.max_depth,
            **self.kwargs
        })
        return params