File size: 7,180 Bytes
704d932
 
 
 
 
 
cde5368
704d932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b34148
704d932
 
3b34148
704d932
cde5368
 
 
704d932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d95e2f8
704d932
 
 
 
 
 
 
cde5368
 
704d932
cde5368
704d932
d95e2f8
704d932
 
 
 
 
 
d95e2f8
 
 
 
704d932
 
 
 
 
 
 
d95e2f8
704d932
 
 
 
 
 
 
cde5368
 
704d932
 
 
 
 
 
cde5368
 
704d932
 
 
cde5368
 
 
 
 
 
 
 
 
 
 
 
704d932
 
 
 
cde5368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d95e2f8
 
 
 
704d932
 
 
 
d95e2f8
704d932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
"""
Borealis Processor for HuggingFace/vLLM compatibility.

Handles audio feature extraction and tokenization.
"""

import numpy as np
from typing import List, Optional, Union

import torch
from transformers import ProcessorMixin, BatchFeature
from transformers.models.whisper import WhisperFeatureExtractor
from transformers import AutoTokenizer


class BorealisProcessor(ProcessorMixin):
    """
    Processor for Borealis audio-language model.

    Combines WhisperFeatureExtractor for audio and Qwen3 tokenizer for text.
    Text prompts use ``<|AUDIO|>`` placeholders; ``__call__`` expands each
    placeholder into a run of audio tokens whose count matches the number of
    audio embeddings the model will produce for that clip.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    # Audio tokens (checkpoint has only 2 special tokens: 151669 and 151670)
    audio_token = "<|AUDIO|>"
    audio_bos_token = "<|start_of_audio|>"
    audio_eos_token = "<|start_of_audio|>"  # Reuse bos token since only 2 audio tokens in vocab

    # Borealis architecture parameters
    downsample_factor = 4  # Audio embedding downsampling factor

    def __init__(
        self,
        feature_extractor: Optional[WhisperFeatureExtractor] = None,
        tokenizer: Optional[AutoTokenizer] = None,
        **kwargs
    ):
        """
        Build the processor, downloading default components if not supplied.

        Args:
            feature_extractor: Whisper mel-spectrogram extractor. Defaults to
                ``openai/whisper-large-v3``.
            tokenizer: Text tokenizer. Defaults to ``Qwen/Qwen3-4B``.
        """
        if feature_extractor is None:
            feature_extractor = WhisperFeatureExtractor.from_pretrained(
                "openai/whisper-large-v3"
            )
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")

        super().__init__(feature_extractor, tokenizer)

    @property
    def sampling_rate(self) -> int:
        """Sampling rate (Hz) expected by the feature extractor (16 kHz for Whisper)."""
        return self.feature_extractor.sampling_rate

    @staticmethod
    def _to_float32_array(a) -> np.ndarray:
        """Convert one audio waveform (tensor or array) to a float32 numpy array."""
        if isinstance(a, torch.Tensor):
            # detach + cpu: plain .numpy() raises for CUDA tensors or tensors
            # that require grad.
            a = a.detach().cpu().numpy()
        if isinstance(a, np.ndarray):
            a = a.astype(np.float32)
        return a

    def _num_audio_tokens(self, mel_frames: int) -> int:
        """
        Number of audio placeholder tokens for a clip of `mel_frames` mel frames.

        Whisper's encoder halves the frame count (stride-2 conv):
        3000 mel frames -> 1500 encoder frames; Borealis then downsamples by
        `downsample_factor` (4) -> ~375 tokens.
        """
        whisper_frames = (mel_frames - 1) // 2 + 1  # ceil division by 2
        # Clamp to >= 1 so pathologically short clips still occupy one slot.
        return max(1, whisper_frames // self.downsample_factor)

    def __call__(
        self,
        text: Optional[Union[str, List[str]]] = None,
        audio: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
        audios: Optional[List] = None,  # vLLM uses plural
        sampling_rate: Optional[int] = None,
        return_tensors: Optional[str] = "pt",
        **kwargs,
    ) -> BatchFeature:
        """
        Process text and/or audio inputs.

        Expands <|AUDIO|> tokens in text to match the number of audio embeddings.

        Args:
            text: Text prompt(s) containing <|AUDIO|> placeholders
            audio: Audio waveform(s) at 16kHz
            audios: Audio waveform(s) at 16kHz (vLLM style)
            sampling_rate: Audio sampling rate (default: 16000)
            return_tensors: Return tensor type

        Returns:
            BatchFeature with input_ids and optionally input_features
        """
        # vLLM uses 'audios' (plural)
        if audios is not None and audio is None:
            audio = audios

        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        data = {}
        audio_lengths: List[int] = []

        # ---- Audio branch: extract mel features and per-clip frame counts ----
        if audio is not None:
            if not isinstance(audio, list):
                audio = [audio]

            audio_arrays = [self._to_float32_array(a) for a in audio]

            audio_features = self.feature_extractor(
                audio_arrays,
                sampling_rate=sampling_rate,
                return_tensors=return_tensors,
                padding="max_length",
                return_attention_mask=True,
            )
            data["input_features"] = audio_features.input_features

            # Actual (unpadded) length of each clip in mel frames, needed to
            # size the placeholder-token expansion below.
            attention_mask = audio_features.get("attention_mask")
            if attention_mask is None:
                # Default: assume full 30s audio (3000 mel frames)
                audio_lengths = [3000] * len(audio_arrays)
            elif isinstance(attention_mask, torch.Tensor):
                audio_lengths = attention_mask.sum(dim=-1).tolist()
            else:
                # e.g. return_tensors="np": numpy arrays take `axis`, not `dim`
                audio_lengths = np.asarray(attention_mask).sum(axis=-1).tolist()

        # ---- Text branch: expand audio placeholders, then tokenize ----
        if text is not None:
            if isinstance(text, str):
                text = [text]

            if audio is not None:
                expanded_text = []
                audio_idx = 0  # consumes audio_lengths across all samples in order

                for sample in text:
                    # Replace each <|AUDIO|> occurrence (one per clip) with
                    # <bos> + N audio tokens + <eos>.
                    while self.audio_token in sample:
                        if audio_idx >= len(audio_lengths):
                            # More placeholders than clips: leave the rest as-is.
                            break
                        num_audio_tokens = self._num_audio_tokens(
                            int(audio_lengths[audio_idx])
                        )
                        expanded = (
                            self.audio_bos_token +
                            self.audio_token * num_audio_tokens +
                            self.audio_eos_token
                        )
                        sample = sample.replace(self.audio_token, expanded, 1)
                        audio_idx += 1
                    expanded_text.append(sample)
                text = expanded_text

            # Forward only kwargs the tokenizer is known to accept.
            tok_kwargs = {k: v for k, v in kwargs.items()
                         if k in ['padding', 'truncation', 'max_length', 'add_special_tokens']}

            text_inputs = self.tokenizer(
                text,
                return_tensors=return_tensors,
                padding=True,
                **tok_kwargs,
            )
            data["input_ids"] = text_inputs.input_ids
            if "attention_mask" in text_inputs:
                data["attention_mask"] = text_inputs.attention_mask

        return BatchFeature(data=data, tensor_type=return_tensors)

    def batch_decode(self, *args, **kwargs):
        """Decode batches of token IDs to text (delegates to the tokenizer)."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Decode token IDs to text (delegates to the tokenizer)."""
        return self.tokenizer.decode(*args, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Load processor from pretrained.

        NOTE(review): the feature extractor is always loaded from
        ``openai/whisper-large-v3`` regardless of the given path — confirm
        this is intentional (a checkpoint-local preprocessor_config is ignored).
        """
        feature_extractor = WhisperFeatureExtractor.from_pretrained(
            "openai/whisper-large-v3"
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            **kwargs
        )
        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def save_pretrained(self, save_directory, **kwargs):
        """Save both sub-components (feature extractor + tokenizer) to `save_directory`."""
        self.feature_extractor.save_pretrained(save_directory, **kwargs)
        self.tokenizer.save_pretrained(save_directory, **kwargs)