|
|
""" |
|
|
Borealis Processor for HuggingFace/vLLM compatibility. |
|
|
|
|
|
Handles audio feature extraction and tokenization. |
|
|
""" |
|
|
|
|
|
from typing import List, Optional, Union |
|
|
|
|
|
import torch |
|
|
from transformers import ProcessorMixin, BatchFeature |
|
|
from transformers.models.whisper import WhisperFeatureExtractor |
|
|
from transformers import AutoTokenizer |
|
|
|
|
|
|
|
|
class BorealisProcessor(ProcessorMixin):
    """
    Processor for the Borealis audio-language model.

    Combines a ``WhisperFeatureExtractor`` (log-mel audio features) with a
    Qwen3 tokenizer (text), exposing the usual HuggingFace processor API
    (`__call__`, `decode`, `batch_decode`, `from_pretrained`,
    `save_pretrained`) for vLLM/Transformers compatibility.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    # Special tokens delimiting the audio placeholder span inside text
    # prompts. NOTE(review): assumed to match the tokenizer's added
    # vocabulary — confirm against the model's tokenizer config.
    audio_token = "<|AUDIO|>"
    audio_bos_token = "<|start_of_audio|>"
    audio_eos_token = "<|end_of_audio|>"

    def __init__(
        self,
        feature_extractor: Optional[WhisperFeatureExtractor] = None,
        tokenizer: Optional[AutoTokenizer] = None,
        **kwargs
    ):
        """
        Args:
            feature_extractor: Audio feature extractor. Defaults to the
                base ``openai/whisper-large-v3`` extractor when omitted.
            tokenizer: Text tokenizer. Defaults to ``Qwen/Qwen3-4B``
                when omitted.
            **kwargs: Ignored; accepted for ProcessorMixin compatibility.
        """
        # Fall back to the components Borealis was built from so the
        # processor remains usable without an explicit checkpoint.
        if feature_extractor is None:
            feature_extractor = WhisperFeatureExtractor.from_pretrained(
                "openai/whisper-large-v3"
            )
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")

        super().__init__(feature_extractor, tokenizer)

    @property
    def sampling_rate(self) -> int:
        """Expected audio sampling rate (Hz) of the feature extractor."""
        return self.feature_extractor.sampling_rate

    def __call__(
        self,
        text: Optional[Union[str, List[str]]] = None,
        audio: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
        sampling_rate: Optional[int] = None,
        return_tensors: Optional[str] = "pt",
        **kwargs,
    ) -> BatchFeature:
        """
        Process text and/or audio inputs.

        Args:
            text: Text prompt(s).
            audio: Audio waveform(s); expected at the extractor's
                sampling rate (16 kHz for Whisper).
            sampling_rate: Audio sampling rate. Defaults to the
                extractor's configured rate.
            return_tensors: Return tensor type (e.g. ``"pt"``).
            **kwargs: Forwarded to the tokenizer.

        Returns:
            BatchFeature with ``input_ids`` (and ``attention_mask``) when
            text is given, and ``input_features`` when audio is given.

        Raises:
            ValueError: If neither ``text`` nor ``audio`` is provided.
        """
        if text is None and audio is None:
            raise ValueError(
                "At least one of `text` or `audio` must be provided."
            )

        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        data = {}

        if audio is not None:
            if isinstance(audio, torch.Tensor):
                audio = [audio]

            # WhisperFeatureExtractor expects numpy arrays. detach() and
            # cpu() are required for grad-tracking / CUDA tensors, where
            # a bare .numpy() raises.
            audio_arrays = []
            for a in audio:
                if isinstance(a, torch.Tensor):
                    a = a.detach().cpu().numpy()
                audio_arrays.append(a)

            audio_features = self.feature_extractor(
                audio_arrays,
                sampling_rate=sampling_rate,
                return_tensors=return_tensors,
            )
            data["input_features"] = audio_features.input_features

        if text is not None:
            if isinstance(text, str):
                text = [text]

            text_inputs = self.tokenizer(
                text,
                return_tensors=return_tensors,
                padding=True,
                **kwargs,
            )
            data["input_ids"] = text_inputs.input_ids
            if "attention_mask" in text_inputs:
                data["attention_mask"] = text_inputs.attention_mask

        return BatchFeature(data=data, tensor_type=return_tensors)

    def batch_decode(self, *args, **kwargs):
        """Decode batches of token IDs to text (delegates to tokenizer)."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Decode token IDs to text (delegates to tokenizer)."""
        return self.tokenizer.decode(*args, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Load the processor from a checkpoint.

        Prefers the feature-extractor config saved at
        ``pretrained_model_name_or_path`` (written by ``save_pretrained``)
        and falls back to the base ``openai/whisper-large-v3`` extractor
        when the checkpoint carries no preprocessor config. Previously the
        saved extractor config was silently ignored.
        """
        try:
            feature_extractor = WhisperFeatureExtractor.from_pretrained(
                pretrained_model_name_or_path
            )
        except (OSError, ValueError):
            # No preprocessor_config.json at the path — use the base
            # extractor, matching the original behavior.
            feature_extractor = WhisperFeatureExtractor.from_pretrained(
                "openai/whisper-large-v3"
            )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            **kwargs
        )
        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def save_pretrained(self, save_directory, **kwargs):
        """Save both sub-components' configs into ``save_directory``."""
        self.feature_extractor.save_pretrained(save_directory, **kwargs)
        self.tokenizer.save_pretrained(save_directory, **kwargs)
|
|
|