# processing_borealis.py — from the Borealis-5b-it repository
# (uploaded via huggingface_hub; web-page chrome removed so the module parses).
"""
Borealis Processor for HuggingFace/vLLM compatibility.
Handles audio feature extraction and tokenization.
"""
from typing import List, Optional, Union
import torch
from transformers import ProcessorMixin, BatchFeature
from transformers.models.whisper import WhisperFeatureExtractor
from transformers import AutoTokenizer
class BorealisProcessor(ProcessorMixin):
    """
    Processor for the Borealis audio-language model.

    Combines a ``WhisperFeatureExtractor`` (audio waveform -> log-mel
    ``input_features``) with a Qwen3 tokenizer (text -> ``input_ids``)
    behind the standard HuggingFace ``ProcessorMixin`` interface, for
    HuggingFace/vLLM compatibility.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    # Special tokens that delimit audio content inside the text stream.
    audio_token = "<|AUDIO|>"
    audio_bos_token = "<|start_of_audio|>"
    audio_eos_token = "<|end_of_audio|>"

    def __init__(
        self,
        feature_extractor: Optional[WhisperFeatureExtractor] = None,
        tokenizer: Optional[AutoTokenizer] = None,
        **kwargs,
    ):
        """
        Build the processor, downloading defaults for any component
        not supplied.

        Args:
            feature_extractor: Audio front-end; defaults to
                ``openai/whisper-large-v3``.
            tokenizer: Text tokenizer; defaults to ``Qwen/Qwen3-4B``.
            **kwargs: Accepted for interface compatibility; currently unused.
        """
        if feature_extractor is None:
            feature_extractor = WhisperFeatureExtractor.from_pretrained(
                "openai/whisper-large-v3"
            )
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
        super().__init__(feature_extractor, tokenizer)

    @property
    def sampling_rate(self) -> int:
        """Sampling rate (Hz) expected by the audio feature extractor."""
        return self.feature_extractor.sampling_rate

    def __call__(
        self,
        text: Optional[Union[str, List[str]]] = None,
        audio: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
        sampling_rate: Optional[int] = None,
        return_tensors: Optional[str] = "pt",
        **kwargs,
    ) -> BatchFeature:
        """
        Process text and/or audio inputs.

        Args:
            text: Text prompt(s).
            audio: Audio waveform(s); expected at the extractor's
                sampling rate (16 kHz for Whisper).
            sampling_rate: Audio sampling rate; defaults to the
                feature extractor's rate.
            return_tensors: Return tensor type (e.g. ``"pt"``).
            **kwargs: Forwarded to the tokenizer.

        Returns:
            BatchFeature with ``input_ids`` (and ``attention_mask``)
            when text is given, and ``input_features`` when audio is given.
        """
        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        data = {}

        # Process audio if provided.
        if audio is not None:
            if isinstance(audio, torch.Tensor):
                audio = [audio]
            # Convert to numpy for the feature extractor. detach().cpu()
            # is required so CUDA tensors and tensors with
            # requires_grad=True convert cleanly (bare .numpy() raises).
            audio_arrays = []
            for a in audio:
                if isinstance(a, torch.Tensor):
                    a = a.detach().cpu().numpy()
                audio_arrays.append(a)
            audio_features = self.feature_extractor(
                audio_arrays,
                sampling_rate=sampling_rate,
                return_tensors=return_tensors,
            )
            data["input_features"] = audio_features.input_features

        # Process text if provided.
        if text is not None:
            if isinstance(text, str):
                text = [text]
            text_inputs = self.tokenizer(
                text,
                return_tensors=return_tensors,
                padding=True,
                **kwargs,
            )
            data["input_ids"] = text_inputs.input_ids
            if "attention_mask" in text_inputs:
                data["attention_mask"] = text_inputs.attention_mask

        return BatchFeature(data=data, tensor_type=return_tensors)

    def batch_decode(self, *args, **kwargs):
        """Decode batches of token IDs to text (delegates to the tokenizer)."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Decode token IDs to text (delegates to the tokenizer)."""
        return self.tokenizer.decode(*args, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Load the processor from a pretrained checkpoint.

        Prefers the feature extractor saved alongside the checkpoint
        (written by ``save_pretrained``) so a save/load round trip
        preserves any customized audio config; falls back to the
        Whisper default for checkpoints that only ship a tokenizer.
        """
        try:
            feature_extractor = WhisperFeatureExtractor.from_pretrained(
                pretrained_model_name_or_path
            )
        except (OSError, ValueError):
            feature_extractor = WhisperFeatureExtractor.from_pretrained(
                "openai/whisper-large-v3"
            )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            **kwargs,
        )
        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def save_pretrained(self, save_directory, **kwargs):
        """Save both components into *save_directory*."""
        self.feature_extractor.save_pretrained(save_directory, **kwargs)
        self.tokenizer.save_pretrained(save_directory, **kwargs)