# ModelMatrix / matrix / code / models / tabicl_wrapper.py
# Last commit c4ff02d (Akshay4506): "Fix deployment entry point and merge requirements"
"""
TabICL Wrapper
==============
Sklearn-compatible wrapper for TabICL (Tabular In-Context Learning).
TabICL uses language models for tabular prediction via in-context learning.
Author: UW MSIM Team
Date: November 2025
"""
import time
import logging
from typing import Optional, Union
import numpy as np
import pandas as pd
from .base_wrapper import BaseModelWrapper
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class TabICLWrapper(BaseModelWrapper):
    """
    TabICL (Tabular In-Context Learning) wrapper.

    NOTE: this class is currently a template. ``fit`` only stores (a sample
    of) the training data, and ``predict`` / ``_predict_proba_impl`` return
    placeholder outputs until an actual TabICL model is integrated.

    Parameters
    ----------
    task_type : str, default='classification'
        Task type: 'classification' or 'regression'
    model_name : str, default='gpt2'
        Base language model to use
    max_samples : int or None, default=100
        Maximum number of in-context examples. ``None`` disables the limit
        and keeps every training row.
    device : str, default='auto'
        Device: 'cpu', 'cuda', or 'auto'. Currently only stored; the
        placeholder implementation does not use it.
    random_state : int, default=42
        Random seed

    Attributes
    ----------
    X_train_ : pd.DataFrame
        Training features kept as in-context examples (set by ``fit``).
    y_train_ : pd.Series
        Training targets aligned positionally with ``X_train_``.
    classes_ : np.ndarray
        Sorted unique labels of the *full* training target, recorded before
        any subsampling. Only set for classification tasks.
    is_fitted : bool
        Set to True once ``fit`` has completed.
    fit_time : float
        Wall-clock duration of the last ``fit`` call, in seconds.
    predict_time : float
        Wall-clock duration of the last ``predict`` call, in seconds.
    """

    def __init__(
        self,
        task_type: str = 'classification',
        model_name: str = 'gpt2',
        max_samples: Optional[int] = 100,
        device: str = 'auto',
        random_state: int = 42
    ):
        super().__init__(task_type=task_type, random_state=random_state)
        self.model_name = model_name
        self.max_samples = max_samples
        self.device = device

    def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]) -> 'TabICLWrapper':
        """
        Fit TabICL (stores training data for in-context learning).

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
            Training features
        y : pd.Series or np.ndarray, shape (n_samples,)
            Training target

        Returns
        -------
        self : TabICLWrapper
            Fitted model
        """
        # Validation is delegated to the base class helper.
        self._validate_input(X, y)
        logger.info(f"Fitting TabICL with {self.model_name} on {X.shape[0]} samples...")
        start_time = time.time()
        try:
            # Note: Actual TabICL implementation may vary
            # This is a template; adjust imports based on actual TabICL package

            # Work on private copies so later changes to the caller's
            # objects cannot alter the stored in-context examples.
            X_train = X.copy() if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
            y_train = y.copy() if isinstance(y, pd.Series) else pd.Series(y)

            # Record the label set from the FULL target before subsampling:
            # a random subsample can miss rare classes, which would otherwise
            # shrink the number of probability columns returned later.
            if self.task_type == 'classification':
                self.classes_ = np.unique(y_train)

            # Limit to max_samples for efficiency (None means "no limit").
            n_available = len(X_train)
            if self.max_samples is not None and n_available > self.max_samples:
                logger.info(f"Sampling {self.max_samples} from {n_available} training samples")
                rng = np.random.RandomState(self.random_state)
                sample_idx = rng.choice(n_available, self.max_samples, replace=False)
                # Positional indexing keeps X and y aligned even when their
                # pandas indexes differ (e.g. DataFrame X with ndarray y).
                X_train = X_train.iloc[sample_idx]
                y_train = y_train.iloc[sample_idx]

            # Assign only after everything above succeeded, so a failed fit
            # never leaves half-updated training state behind.
            self.X_train_ = X_train
            self.y_train_ = y_train

            # Initialize TabICL model (placeholder - adjust for actual implementation)
            # from tabicl import TabICLModel
            # self.model = TabICLModel(model_name=self.model_name, device=self.device)
            self.is_fitted = True
            self.fit_time = time.time() - start_time
            logger.info(f"TabICL fitted in {self.fit_time:.2f} seconds")
            logger.warning("TabICL wrapper is a template. Adjust for actual TabICL implementation.")
        except Exception as e:
            logger.error(f"Error fitting TabICL: {e}")
            raise
        return self

    def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Make predictions with TabICL.

        Placeholder behaviour: classification returns the most frequent
        label among the stored training targets for every row; regression
        returns zeros.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
            Test features

        Returns
        -------
        predictions : np.ndarray, shape (n_samples,)
            Predicted values or class labels

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model not fitted. Call fit() first.")
        self._validate_input(X)
        logger.info(f"Predicting on {X.shape[0]} samples with TabICL...")
        start_time = time.time()
        try:
            # Placeholder implementation
            # In actual TabICL, this would use the language model with in-context examples
            logger.warning("Using placeholder predictions. Integrate actual TabICL model.")
            n_rows = len(X)
            if self.task_type == 'classification':
                # Predict the majority class so the output has a valid label type.
                majority_class = self.y_train_.mode()[0]
                predictions = np.full(n_rows, majority_class)
            else:
                predictions = np.zeros(n_rows)
            self.predict_time = time.time() - start_time
            logger.info(f"Predictions complete in {self.predict_time:.2f} seconds")
            return predictions
        except Exception as e:
            logger.error(f"Error during prediction: {e}")
            raise

    def _predict_proba_impl(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Predict class probabilities with TabICL.

        Placeholder behaviour: returns a uniform distribution over the
        classes seen in the full training target.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
            Test features

        Returns
        -------
        probabilities : np.ndarray, shape (n_samples, n_classes)
            Class probabilities
        """
        # Placeholder implementation
        n_samples = len(X)
        # Prefer the label set recorded from the full training target; fall
        # back to the stored (possibly subsampled) targets when it is absent.
        classes = getattr(self, 'classes_', None)
        if classes is None:
            classes = np.unique(self.y_train_)
        n_classes = len(classes)
        proba = np.full((n_samples, n_classes), 1.0 / n_classes)
        logger.warning("Using uniform probability distribution. Integrate actual TabICL model.")
        return proba

    def get_params(self, deep: bool = True) -> dict:
        """Get parameters for this estimator."""
        # Start from the base-class parameters, then add the ones this wrapper owns.
        params = super().get_params(deep)
        params.update({
            'model_name': self.model_name,
            'max_samples': self.max_samples,
            'device': self.device
        })
        return params