"""
SAP RPT-1 OSS Wrapper (Hugging Face Authenticated)
====================================================
Sklearn-compatible wrapper for SAP RPT-1-OSS via Hugging Face.
This wrapper uses the official `sap_rpt_oss` package with HF token
authentication for downloading gated model weights.
SAP RPT-1 OSS is a tabular in-context learning model — it does NOT
use text generation. It accepts DataFrames/arrays and produces
predictions directly on structured tabular data.
Requirements:
- Python >= 3.11
- pip install git+https://github.com/SAP-samples/sap-rpt-1-oss.git
- Hugging Face token with access to SAP/sap-rpt-1-oss
Author: UW MSIM Team
Date: April 2026
"""
import os
import time
import logging
from typing import Optional, Union
import numpy as np
import pandas as pd
from .base_wrapper import BaseModelWrapper
logger = logging.getLogger(__name__)
def _authenticate_huggingface(token: Optional[str] = None) -> str:
    """
    Authenticate with Hugging Face Hub using a token.

    Token resolution order:
      1. Explicit `token` parameter
      2. HUGGING_FACE_HUB_TOKEN environment variable
      3. HF_TOKEN environment variable
      4. Previously saved token via `huggingface-cli login`

    Parameters
    ----------
    token : str, optional
        Explicit HF token to use.

    Returns
    -------
    str
        The resolved token, or "" if already authenticated via cached
        CLI credentials.

    Raises
    ------
    RuntimeError
        If no valid token is found or authentication fails.
    """
    # Imported lazily so environments without huggingface_hub can still
    # import this module.
    from huggingface_hub import login, HfApi

    # Resolve token from multiple sources (first non-empty wins)
    resolved_token = (
        token
        or os.getenv("HUGGING_FACE_HUB_TOKEN")
        or os.getenv("HF_TOKEN")
    )
    if resolved_token:
        try:
            login(token=resolved_token, add_to_git_credential=False)
            logger.info("Hugging Face authentication successful (via token)")
            return resolved_token
        except Exception as e:
            # Chain the original error so the root cause stays visible
            raise RuntimeError(
                f"Hugging Face authentication failed: {e}\n"
                "Ensure your token is valid and you have accepted the license at:\n"
                "  https://huggingface.co/SAP/sap-rpt-1-oss"
            ) from e

    # No token supplied: check if already logged in via the CLI cache
    try:
        api = HfApi()
        user_info = api.whoami()
        logger.info(
            "Hugging Face authenticated as: %s",
            user_info.get('name', 'unknown'),
        )
        return ""  # Already authenticated; no token string to return
    except Exception:
        # whoami() failing just means no cached credentials; fall through
        pass

    raise RuntimeError(
        "No Hugging Face token found. Please set one of:\n"
        "  1. Environment variable: set HUGGING_FACE_HUB_TOKEN=hf_xxx\n"
        "  2. Environment variable: set HF_TOKEN=hf_xxx\n"
        "  3. Run: huggingface-cli login\n\n"
        "You must also accept the model license at:\n"
        "  https://huggingface.co/SAP/sap-rpt-1-oss"
    )
class SAPRPT1HFWrapper(BaseModelWrapper):
    """
    SAP RPT-1 OSS (Hugging Face) wrapper for tabular prediction.

    Uses the official `sap_rpt_oss` package with in-context learning.
    The model automatically handles:
    - Column/cell embeddings via built-in LLM
    - Missing values
    - CPU/GPU auto-detection (GPU not required)

    Parameters
    ----------
    task_type : str, default='classification'
        Task type: 'classification' or 'regression'
    max_context_size : int, default=4096
        Maximum number of context rows for in-context learning.
        Higher = better accuracy but more memory/time.
        Recommended: 2048 (light), 4096 (balanced), 8192 (best)
    bagging : int or 'auto', default=4
        Number of bagging iterations for prediction stability.
        Use 1 for fast inference, 4-8 for best accuracy.
        'auto' = automatically determined based on dataset size.
    hf_token : str, optional
        Explicit Hugging Face token. If not provided, reads from
        HUGGING_FACE_HUB_TOKEN or HF_TOKEN environment variable.
    random_state : int, default=42
        Random seed for reproducibility
    """

    def __init__(
        self,
        task_type: str = 'classification',
        max_context_size: int = 4096,
        bagging: Union[int, str] = 4,
        hf_token: Optional[str] = None,
        random_state: int = 42
    ):
        super().__init__(task_type=task_type, random_state=random_state)
        self.max_context_size = max_context_size
        self.bagging = bagging
        self.hf_token = hf_token

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.Series, np.ndarray]
    ) -> 'SAPRPT1HFWrapper':
        """
        Fit SAP RPT-1 OSS model.

        Note: SAP RPT-1 uses in-context learning, so "fitting" stores
        the training data for retrieval during inference. The model
        weights are pretrained and NOT updated.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
            Training features
        y : pd.Series or np.ndarray, shape (n_samples,)
            Training target

        Returns
        -------
        self : SAPRPT1HFWrapper
            Fitted model

        Raises
        ------
        ImportError
            If the `sap_rpt_oss` package is not installed.
        """
        self._validate_input(X, y)
        logger.info(
            "Fitting SAP RPT-1 OSS on %d samples, %d features "
            "(max_context=%s, bagging=%s)...",
            X.shape[0], X.shape[1], self.max_context_size, self.bagging,
        )
        start_time = time.time()
        try:
            # Authenticate with Hugging Face (downloads gated model weights)
            _authenticate_huggingface(self.hf_token)
            # Import here to avoid import errors in environments without sap_rpt_oss
            from sap_rpt_oss import SAP_RPT_OSS_Classifier, SAP_RPT_OSS_Regressor
            # Initialize appropriate model based on task type
            if self.task_type == 'classification':
                self.model = SAP_RPT_OSS_Classifier(
                    max_context_size=self.max_context_size,
                    bagging=self.bagging
                )
            else:
                self.model = SAP_RPT_OSS_Regressor(
                    max_context_size=self.max_context_size,
                    bagging=self.bagging
                )
            # Fit model (stores training data for in-context learning)
            self.model.fit(X, y)
            self.is_fitted = True
            self.fit_time = time.time() - start_time
            logger.info("SAP RPT-1 OSS fitted in %.2f seconds", self.fit_time)
        except ImportError as e:
            logger.error(f"SAP RPT-1 OSS package not installed: {e}")
            # Chain the original error so the missing-module detail survives
            raise ImportError(
                "sap-rpt-1-oss not found. Install with:\n"
                "  pip install git+https://github.com/SAP-samples/sap-rpt-1-oss.git\n\n"
                "Requires Python >= 3.11"
            ) from e
        except Exception as e:
            logger.error(f"Error fitting SAP RPT-1 OSS: {e}")
            raise
        return self

    def predict(
        self,
        X: Union[pd.DataFrame, np.ndarray]
    ) -> np.ndarray:
        """
        Make predictions with SAP RPT-1 OSS.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
            Test features

        Returns
        -------
        predictions : np.ndarray, shape (n_samples,)
            Predicted values or class labels

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if not self.is_fitted:
            raise ValueError("Model not fitted. Call fit() first.")
        self._validate_input(X)
        logger.info("Predicting on %d samples with SAP RPT-1 OSS...", X.shape[0])
        start_time = time.time()
        try:
            predictions = self.model.predict(X)
            # Convert list to numpy array if needed
            if isinstance(predictions, list):
                predictions = np.array(predictions)
            self.predict_time = time.time() - start_time
            logger.info(
                "Predictions complete in %.2f seconds", self.predict_time
            )
            return predictions
        except Exception as e:
            logger.error(f"Error during prediction: {e}")
            raise

    def _predict_proba_impl(
        self,
        X: Union[pd.DataFrame, np.ndarray]
    ) -> np.ndarray:
        """
        Predict class probabilities with SAP RPT-1 OSS.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
            Test features

        Returns
        -------
        probabilities : np.ndarray, shape (n_samples, n_classes)
            Class probabilities

        Raises
        ------
        ValueError
            If called on a non-classification task.
        """
        if self.task_type != 'classification':
            raise ValueError("predict_proba only available for classification")
        try:
            proba = self.model.predict_proba(X)
            # Convert to numpy if needed
            if not isinstance(proba, np.ndarray):
                proba = np.array(proba)
            return proba
        except AttributeError:
            # Fallback: one-hot encode predictions if predict_proba unavailable
            logger.warning(
                "predict_proba not available, using one-hot encoding of predictions"
            )
            predictions = self.model.predict(X)
            if isinstance(predictions, list):
                predictions = np.array(predictions)
            classes = np.unique(predictions)  # sorted unique labels
            n_samples = len(predictions)
            proba = np.zeros((n_samples, len(classes)))
            # np.unique returns sorted labels, so searchsorted maps each
            # prediction to its column index without a Python-level loop
            col_idx = np.searchsorted(classes, predictions)
            proba[np.arange(n_samples), col_idx] = 1.0
            return proba

    def get_params(self, deep: bool = True) -> dict:
        """Get parameters for this estimator (sklearn compatibility)."""
        params = super().get_params(deep)
        params.update({
            'max_context_size': self.max_context_size,
            'bagging': self.bagging,
            'hf_token': self.hf_token
        })
        return params