| """ | |
| TabICL Wrapper | |
| ============== | |
| Sklearn-compatible wrapper for TabICL (Tabular In-Context Learning). | |
| TabICL uses language models for tabular prediction via in-context learning. | |
| Author: UW MSIM Team | |
| Date: November 2025 | |
| """ | |
| import time | |
| import logging | |
| from typing import Optional, Union | |
| import numpy as np | |
| import pandas as pd | |
| from .base_wrapper import BaseModelWrapper | |
| logger = logging.getLogger(__name__) | |


class TabICLWrapper(BaseModelWrapper):
    """
    TabICL (Tabular In-Context Learning) wrapper.

    Parameters
    ----------
    task_type : str, default='classification'
        Task type: 'classification' or 'regression'
    model_name : str, default='gpt2'
        Base language model to use
    max_samples : int, default=100
        Maximum number of in-context examples
    device : str, default='auto'
        Device: 'cpu', 'cuda', or 'auto'
    random_state : int, default=42
        Random seed
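
    Examples
    --------
    A minimal usage sketch. ``X_train``/``X_test`` are illustrative names for
    any feature matrices; until a real TabICL backend is integrated, the
    wrapper returns placeholder predictions. Assumes ``BaseModelWrapper``
    exposes ``predict_proba`` delegating to ``_predict_proba_impl``::

        model = TabICLWrapper(task_type='classification', max_samples=50)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        proba = model.predict_proba(X_test)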
| """ | |
| def __init__( | |
| self, | |
| task_type: str = 'classification', | |
| model_name: str = 'gpt2', | |
| max_samples: int = 100, | |
| device: str = 'auto', | |
| random_state: int = 42 | |
| ): | |
| super().__init__(task_type=task_type, random_state=random_state) | |
| self.model_name = model_name | |
| self.max_samples = max_samples | |
| self.device = device | |
| def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]) -> 'TabICLWrapper': | |
| """ | |
| Fit TabICL (stores training data for in-context learning). | |
| Parameters | |
| ---------- | |
| X : pd.DataFrame or np.ndarray, shape (n_samples, n_features) | |
| Training features | |
| y : pd.Series or np.ndarray, shape (n_samples,) | |
| Training target | |
| Returns | |
| ------- | |
| self : TabICLWrapper | |
| Fitted model | |
| """ | |
| self._validate_input(X, y) | |
| logger.info(f"Fitting TabICL with {self.model_name} on {X.shape[0]} samples...") | |
| start_time = time.time() | |
| try: | |
| # Note: Actual TabICL implementation may vary | |
| # This is a template; adjust imports based on actual TabICL package | |
| # Store training data for in-context learning | |
| if isinstance(X, pd.DataFrame): | |
| self.X_train_ = X.copy() | |
| else: | |
| self.X_train_ = pd.DataFrame(X) | |
| if isinstance(y, pd.Series): | |
| self.y_train_ = y.copy() | |
| else: | |
| self.y_train_ = pd.Series(y) | |
| # Limit to max_samples for efficiency | |
| if len(self.X_train_) > self.max_samples: | |
| logger.info(f"Sampling {self.max_samples} from {len(self.X_train_)} training samples") | |
| sample_idx = np.random.RandomState(self.random_state).choice( | |
| len(self.X_train_), self.max_samples, replace=False | |
| ) | |
| self.X_train_ = self.X_train_.iloc[sample_idx] | |
| self.y_train_ = self.y_train_.iloc[sample_idx] | |
| # Initialize TabICL model (placeholder - adjust for actual implementation) | |
| # from tabicl import TabICLModel | |
| # self.model = TabICLModel(model_name=self.model_name, device=self.device) | |
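            # A sketch for resolving device='auto' before constructing the
            # model (assumes torch ships alongside the real TabICL package):
            # import torch
            # device = self.device
            # if device == 'auto':
            #     device = 'cuda' if torch.cuda.is_available() else 'cpu'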
            self.is_fitted = True
            self.fit_time = time.time() - start_time
            logger.info(f"TabICL fitted in {self.fit_time:.2f} seconds")
            logger.warning("TabICL wrapper is a template. Adjust for actual TabICL implementation.")
        except Exception as e:
            logger.error(f"Error fitting TabICL: {e}")
            raise

        return self

    def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Make predictions with TabICL.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
            Test features

        Returns
        -------
        predictions : np.ndarray, shape (n_samples,)
            Predicted values or class labels
        """
        if not self.is_fitted:
            raise ValueError("Model not fitted. Call fit() first.")
        self._validate_input(X)
        logger.info(f"Predicting on {X.shape[0]} samples with TabICL...")
        start_time = time.time()

        try:
            # Placeholder implementation
            # In actual TabICL, this would use the language model with in-context examples
            logger.warning("Using placeholder predictions. Integrate actual TabICL model.")
            # Fallback: majority class for classification (keeps the output
            # dtype consistent with the training labels); training mean for
            # regression
            if self.task_type == 'classification':
                majority_class = self.y_train_.mode()[0]
                predictions = np.full(len(X), majority_class)
            else:
                predictions = np.full(len(X), self.y_train_.mean(), dtype=float)
            self.predict_time = time.time() - start_time
            logger.info(f"Predictions complete in {self.predict_time:.2f} seconds")
            return predictions
        except Exception as e:
            logger.error(f"Error during prediction: {e}")
            raise

    def _predict_proba_impl(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Predict class probabilities with TabICL.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
            Test features

        Returns
        -------
        probabilities : np.ndarray, shape (n_samples, n_classes)
            Class probabilities
        """
        # Placeholder implementation: uniform distribution over the classes
        # seen in training
        n_samples = len(X)
        n_classes = len(np.unique(self.y_train_))
        proba = np.ones((n_samples, n_classes)) / n_classes
        logger.warning("Using uniform probability distribution. Integrate actual TabICL model.")
        return proba

    def get_params(self, deep: bool = True) -> dict:
        """Get parameters for this estimator."""
        params = super().get_params(deep)
        params.update({
            'model_name': self.model_name,
            'max_samples': self.max_samples,
            'device': self.device
        })
        return params
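

# ---------------------------------------------------------------------------
# Minimal smoke test: a sketch, not part of the public API. Run it with
# `python -m <package>.tabicl_wrapper` so the relative import of
# BaseModelWrapper resolves; it exercises only the placeholder paths above
# and assumes BaseModelWrapper._validate_input accepts pandas inputs.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    rng = np.random.RandomState(0)
    X_demo = pd.DataFrame(rng.randn(200, 4), columns=[f'f{i}' for i in range(4)])
    y_demo = pd.Series(rng.randint(0, 2, size=200))

    model = TabICLWrapper(task_type='classification', max_samples=50)
    model.fit(X_demo, y_demo)
    preds = model.predict(X_demo)
    print(f"Predicted {len(preds)} labels; unique values: {np.unique(preds)}")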