""" TabICL Wrapper ============== Sklearn-compatible wrapper for TabICL (Tabular In-Context Learning). TabICL uses language models for tabular prediction via in-context learning. Author: UW MSIM Team Date: November 2025 """ import time import logging from typing import Optional, Union import numpy as np import pandas as pd from .base_wrapper import BaseModelWrapper logger = logging.getLogger(__name__) class TabICLWrapper(BaseModelWrapper): """ TabICL (Tabular In-Context Learning) wrapper. Parameters ---------- task_type : str, default='classification' Task type: 'classification' or 'regression' model_name : str, default='gpt2' Base language model to use max_samples : int, default=100 Maximum number of in-context examples device : str, default='auto' Device: 'cpu', 'cuda', or 'auto' random_state : int, default=42 Random seed """ def __init__( self, task_type: str = 'classification', model_name: str = 'gpt2', max_samples: int = 100, device: str = 'auto', random_state: int = 42 ): super().__init__(task_type=task_type, random_state=random_state) self.model_name = model_name self.max_samples = max_samples self.device = device def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]) -> 'TabICLWrapper': """ Fit TabICL (stores training data for in-context learning). Parameters ---------- X : pd.DataFrame or np.ndarray, shape (n_samples, n_features) Training features y : pd.Series or np.ndarray, shape (n_samples,) Training target Returns ------- self : TabICLWrapper Fitted model """ self._validate_input(X, y) logger.info(f"Fitting TabICL with {self.model_name} on {X.shape[0]} samples...") start_time = time.time() try: # Note: Actual TabICL implementation may vary # This is a template; adjust imports based on actual TabICL package # Store training data for in-context learning if isinstance(X, pd.DataFrame): self.X_train_ = X.copy() else: self.X_train_ = pd.DataFrame(X) if isinstance(y, pd.Series): self.y_train_ = y.copy() else: self.y_train_ = pd.Series(y) # Limit to max_samples for efficiency if len(self.X_train_) > self.max_samples: logger.info(f"Sampling {self.max_samples} from {len(self.X_train_)} training samples") sample_idx = np.random.RandomState(self.random_state).choice( len(self.X_train_), self.max_samples, replace=False ) self.X_train_ = self.X_train_.iloc[sample_idx] self.y_train_ = self.y_train_.iloc[sample_idx] # Initialize TabICL model (placeholder - adjust for actual implementation) # from tabicl import TabICLModel # self.model = TabICLModel(model_name=self.model_name, device=self.device) self.is_fitted = True self.fit_time = time.time() - start_time logger.info(f"TabICL fitted in {self.fit_time:.2f} seconds") logger.warning("TabICL wrapper is a template. Adjust for actual TabICL implementation.") except Exception as e: logger.error(f"Error fitting TabICL: {e}") raise return self def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: """ Make predictions with TabICL. Parameters ---------- X : pd.DataFrame or np.ndarray, shape (n_samples, n_features) Test features Returns ------- predictions : np.ndarray, shape (n_samples,) Predicted values or class labels """ if not self.is_fitted: raise ValueError("Model not fitted. Call fit() first.") self._validate_input(X) logger.info(f"Predicting on {X.shape[0]} samples with TabICL...") start_time = time.time() try: # Placeholder implementation # In actual TabICL, this would use the language model with in-context examples logger.warning("Using placeholder predictions. Integrate actual TabICL model.") # Fallback: predict the majority class for classification to ensure valid type if self.task_type == 'classification': majority_class = self.y_train_.mode()[0] predictions = np.full(len(X), majority_class) else: predictions = np.zeros(len(X)) self.predict_time = time.time() - start_time logger.info(f"Predictions complete in {self.predict_time:.2f} seconds") return predictions except Exception as e: logger.error(f"Error during prediction: {e}") raise def _predict_proba_impl(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: """ Predict class probabilities with TabICL. Parameters ---------- X : pd.DataFrame or np.ndarray, shape (n_samples, n_features) Test features Returns ------- probabilities : np.ndarray, shape (n_samples, n_classes) Class probabilities """ # Placeholder implementation n_samples = len(X) n_classes = len(np.unique(self.y_train_)) proba = np.ones((n_samples, n_classes)) / n_classes logger.warning("Using uniform probability distribution. Integrate actual TabICL model.") return proba def get_params(self, deep: bool = True) -> dict: """Get parameters for this estimator.""" params = super().get_params(deep) params.update({ 'model_name': self.model_name, 'max_samples': self.max_samples, 'device': self.device }) return params