ModelMatrix / matrix /code /models /baseline_wrappers.py
Akshay4506's picture
Fix deployment entry point and merge requirements
c4ff02d
"""
Baseline Model Wrappers
========================
Sklearn-compatible wrappers for traditional gradient boosting models:
- XGBoost
- CatBoost
- LightGBM
Author: UW MSIM Team
Date: November 2025
"""
import time
import logging
from typing import Optional, Union, Dict, Any
import numpy as np
import pandas as pd
from .base_wrapper import BaseModelWrapper
logger = logging.getLogger(__name__)
class XGBoostWrapper(BaseModelWrapper):
"""
XGBoost wrapper.
Parameters
----------
task_type : str, default='classification'
Task type: 'classification' or 'regression'
n_estimators : int, default=100
Number of boosting rounds
learning_rate : float, default=0.1
Step size shrinkage
max_depth : int, default=6
Maximum tree depth
random_state : int, default=42
Random seed
**kwargs : dict
Additional XGBoost parameters
"""
def __init__(
self,
task_type: str = 'classification',
n_estimators: int = 100,
learning_rate: float = 0.1,
max_depth: int = 6,
random_state: int = 42,
**kwargs
):
super().__init__(task_type=task_type, random_state=random_state)
self.n_estimators = n_estimators
self.learning_rate = learning_rate
self.max_depth = max_depth
self.kwargs = kwargs
def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]) -> 'XGBoostWrapper':
"""Fit XGBoost model."""
from sklearn.preprocessing import LabelEncoder
self._label_encoder = None
self._validate_input(X, y)
logger.info(f"Fitting XGBoost on {X.shape[0]} samples...")
start_time = time.time()
try:
import xgboost as xgb
if self.task_type == 'classification':
self.model = xgb.XGBClassifier(
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
max_depth=self.max_depth,
random_state=self.random_state,
**self.kwargs
)
else:
self.model = xgb.XGBRegressor(
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
max_depth=self.max_depth,
random_state=self.random_state,
**self.kwargs
)
if self.task_type == 'classification':
self._label_encoder = LabelEncoder()
y_encoded = self._label_encoder.fit_transform(y)
self.model.fit(X, y_encoded)
else:
self.model.fit(X, y)
self.is_fitted = True
self.fit_time = time.time() - start_time
logger.info(f"XGBoost fitted in {self.fit_time:.2f} seconds")
except ImportError:
raise ImportError("Install XGBoost with: pip install xgboost")
except Exception as e:
logger.error(f"Error fitting XGBoost: {e}")
raise
return self
def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
"""Make predictions with XGBoost."""
if not self.is_fitted:
raise ValueError("Model not fitted. Call fit() first.")
self._validate_input(X)
start_time = time.time()
predictions = self.model.predict(X)
if self.task_type == 'classification' and self._label_encoder is not None:
predictions = self._label_encoder.inverse_transform(predictions)
self.predict_time = time.time() - start_time
return predictions
def _predict_proba_impl(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
"""Predict class probabilities."""
return self.model.predict_proba(X)
def get_params(self, deep: bool = True) -> dict:
"""Get parameters."""
params = super().get_params(deep)
params.update({
'n_estimators': self.n_estimators,
'learning_rate': self.learning_rate,
'max_depth': self.max_depth,
**self.kwargs
})
return params
class CatBoostWrapper(BaseModelWrapper):
"""
CatBoost wrapper.
Parameters
----------
task_type : str, default='classification'
Task type: 'classification' or 'regression'
iterations : int, default=100
Number of boosting iterations
learning_rate : float, default=0.1
Step size shrinkage
depth : int, default=6
Tree depth
random_state : int, default=42
Random seed
**kwargs : dict
Additional CatBoost parameters
"""
def __init__(
self,
task_type: str = 'classification',
iterations: int = 100,
learning_rate: float = 0.1,
depth: int = 6,
random_state: int = 42,
**kwargs
):
super().__init__(task_type=task_type, random_state=random_state)
self.iterations = iterations
self.learning_rate = learning_rate
self.depth = depth
self.kwargs = kwargs
def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]) -> 'CatBoostWrapper':
"""Fit CatBoost model."""
self._validate_input(X, y)
logger.info(f"Fitting CatBoost on {X.shape[0]} samples...")
start_time = time.time()
try:
from catboost import CatBoostClassifier, CatBoostRegressor
if self.task_type == 'classification':
self.model = CatBoostClassifier(
iterations=self.iterations,
learning_rate=self.learning_rate,
depth=self.depth,
random_state=self.random_state,
verbose=False,
**self.kwargs
)
else:
self.model = CatBoostRegressor(
iterations=self.iterations,
learning_rate=self.learning_rate,
depth=self.depth,
random_state=self.random_state,
verbose=False,
**self.kwargs
)
self.model.fit(X, y)
self.is_fitted = True
self.fit_time = time.time() - start_time
logger.info(f"CatBoost fitted in {self.fit_time:.2f} seconds")
except ImportError:
raise ImportError("Install CatBoost with: pip install catboost")
except Exception as e:
logger.error(f"Error fitting CatBoost: {e}")
raise
return self
def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
"""Make predictions with CatBoost."""
if not self.is_fitted:
raise ValueError("Model not fitted. Call fit() first.")
self._validate_input(X)
start_time = time.time()
predictions = self.model.predict(X)
self.predict_time = time.time() - start_time
return predictions
def _predict_proba_impl(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
"""Predict class probabilities."""
return self.model.predict_proba(X)
def get_params(self, deep: bool = True) -> dict:
"""Get parameters."""
params = super().get_params(deep)
params.update({
'iterations': self.iterations,
'learning_rate': self.learning_rate,
'depth': self.depth,
**self.kwargs
})
return params
class LightGBMWrapper(BaseModelWrapper):
"""
LightGBM wrapper.
Parameters
----------
task_type : str, default='classification'
Task type: 'classification' or 'regression'
n_estimators : int, default=100
Number of boosting rounds
learning_rate : float, default=0.1
Step size shrinkage
max_depth : int, default=-1
Maximum tree depth (-1 for unlimited)
random_state : int, default=42
Random seed
**kwargs : dict
Additional LightGBM parameters
"""
def __init__(
self,
task_type: str = 'classification',
n_estimators: int = 100,
learning_rate: float = 0.1,
max_depth: int = -1,
random_state: int = 42,
**kwargs
):
super().__init__(task_type=task_type, random_state=random_state)
self.n_estimators = n_estimators
self.learning_rate = learning_rate
self.max_depth = max_depth
self.kwargs = kwargs
def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]) -> 'LightGBMWrapper':
"""Fit LightGBM model."""
self._validate_input(X, y)
logger.info(f"Fitting LightGBM on {X.shape[0]} samples...")
start_time = time.time()
try:
import lightgbm as lgb
if self.task_type == 'classification':
self.model = lgb.LGBMClassifier(
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
max_depth=self.max_depth,
random_state=self.random_state,
verbose=-1,
**self.kwargs
)
else:
self.model = lgb.LGBMRegressor(
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
max_depth=self.max_depth,
random_state=self.random_state,
verbose=-1,
**self.kwargs
)
self.model.fit(X, y)
self.is_fitted = True
self.fit_time = time.time() - start_time
logger.info(f"LightGBM fitted in {self.fit_time:.2f} seconds")
except ImportError:
raise ImportError("Install LightGBM with: pip install lightgbm")
except Exception as e:
logger.error(f"Error fitting LightGBM: {e}")
raise
return self
def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
"""Make predictions with LightGBM."""
if not self.is_fitted:
raise ValueError("Model not fitted. Call fit() first.")
self._validate_input(X)
start_time = time.time()
predictions = self.model.predict(X)
self.predict_time = time.time() - start_time
return predictions
def _predict_proba_impl(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
"""Predict class probabilities."""
return self.model.predict_proba(X)
def get_params(self, deep: bool = True) -> dict:
"""Get parameters."""
params = super().get_params(deep)
params.update({
'n_estimators': self.n_estimators,
'learning_rate': self.learning_rate,
'max_depth': self.max_depth,
**self.kwargs
})
return params