"""
Borealis Processor for HuggingFace/vLLM compatibility.

Handles audio feature extraction and tokenization.
"""

from typing import List, Optional, Union

import torch
from transformers import ProcessorMixin, BatchFeature
from transformers.models.whisper import WhisperFeatureExtractor
from transformers import AutoTokenizer

# Checkpoints used when the caller does not supply (or the target location
# does not contain) a feature extractor / tokenizer.
_DEFAULT_FEATURE_EXTRACTOR_ID = "openai/whisper-large-v3"
_DEFAULT_TOKENIZER_ID = "Qwen/Qwen3-4B"


class BorealisProcessor(ProcessorMixin):
    """
    Processor for Borealis audio-language model.

    Combines WhisperFeatureExtractor for audio and Qwen3 tokenizer for text.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    # Audio tokens
    audio_token = "<|AUDIO|>"
    audio_bos_token = "<|start_of_audio|>"
    audio_eos_token = "<|end_of_audio|>"

    def __init__(
        self,
        feature_extractor: Optional[WhisperFeatureExtractor] = None,
        tokenizer: Optional[AutoTokenizer] = None,
        **kwargs
    ):
        """
        Build the processor from its two components.

        Args:
            feature_extractor: Audio feature extractor. When ``None``, the
                ``openai/whisper-large-v3`` extractor is loaded.
            tokenizer: Text tokenizer. When ``None``, the ``Qwen/Qwen3-4B``
                tokenizer is loaded.
            **kwargs: Accepted for call-site compatibility; not used.
        """
        if feature_extractor is None:
            feature_extractor = WhisperFeatureExtractor.from_pretrained(
                _DEFAULT_FEATURE_EXTRACTOR_ID
            )
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(_DEFAULT_TOKENIZER_ID)

        super().__init__(feature_extractor, tokenizer)

    @property
    def sampling_rate(self) -> int:
        """Sampling rate reported by the wrapped feature extractor."""
        return self.feature_extractor.sampling_rate

    def __call__(
        self,
        text: Optional[Union[str, List[str]]] = None,
        audio: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
        sampling_rate: Optional[int] = None,
        return_tensors: Optional[str] = "pt",
        **kwargs,
    ) -> BatchFeature:
        """
        Process text and/or audio inputs.

        Args:
            text: Text prompt(s). A single string is treated as a batch of one.
            audio: Audio waveform(s). A single tensor is treated as a batch
                of one; tensors are converted to NumPy arrays before being
                handed to the feature extractor, other items are passed
                through unchanged.
            sampling_rate: Audio sampling rate. Defaults to the feature
                extractor's own sampling rate when ``None``.
            return_tensors: Return tensor type
            **kwargs: Forwarded to the tokenizer call only.

        Returns:
            BatchFeature with ``input_ids`` (and ``attention_mask`` when the
            tokenizer produces one) if text was given, and ``input_features``
            if audio was given. It is empty when neither is supplied.
        """
        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        data = {}

        # Process audio if provided
        if audio is not None:
            if isinstance(audio, torch.Tensor):
                audio = [audio]

            # Convert to numpy for feature extractor. detach()/cpu() are
            # required because Tensor.numpy() raises for tensors that track
            # gradients or live on a non-CPU device.
            audio_arrays = [
                a.detach().cpu().numpy() if isinstance(a, torch.Tensor) else a
                for a in audio
            ]

            audio_features = self.feature_extractor(
                audio_arrays,
                sampling_rate=sampling_rate,
                return_tensors=return_tensors,
            )
            data["input_features"] = audio_features.input_features

        # Process text if provided
        if text is not None:
            if isinstance(text, str):
                text = [text]

            text_inputs = self.tokenizer(
                text,
                return_tensors=return_tensors,
                padding=True,
                **kwargs,
            )
            data["input_ids"] = text_inputs.input_ids
            if "attention_mask" in text_inputs:
                data["attention_mask"] = text_inputs.attention_mask

        return BatchFeature(data=data, tensor_type=return_tensors)

    def batch_decode(self, *args, **kwargs):
        """Decode batches of token IDs to text via the tokenizer."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Decode token IDs to text via the tokenizer."""
        return self.tokenizer.decode(*args, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Load processor from pretrained.

        The feature extractor is first looked up at
        ``pretrained_model_name_or_path`` so that a processor written by
        ``save_pretrained`` round-trips; if no feature extractor config can be
        loaded from there, the ``openai/whisper-large-v3`` extractor is used.

        Args:
            pretrained_model_name_or_path: Location to load from.
            **kwargs: Forwarded to ``AutoTokenizer.from_pretrained`` only.

        Returns:
            A new processor instance.
        """
        try:
            feature_extractor = WhisperFeatureExtractor.from_pretrained(
                pretrained_model_name_or_path
            )
        except OSError:
            # No usable feature extractor config at the given location
            # (e.g. a tokenizer-only checkpoint): use the default one.
            feature_extractor = WhisperFeatureExtractor.from_pretrained(
                _DEFAULT_FEATURE_EXTRACTOR_ID
            )

        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, **kwargs
        )
        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def save_pretrained(self, save_directory, **kwargs):
        """
        Save processor.

        Both the feature extractor and the tokenizer are written to
        ``save_directory``; ``**kwargs`` is forwarded to each of them.
        """
        self.feature_extractor.save_pretrained(save_directory, **kwargs)
        self.tokenizer.save_pretrained(save_directory, **kwargs)