Create audio_only_processor.py
Browse files- audio_only_processor.py +81 -0
audio_only_processor.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# audio_only_processor.py
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
from typing import List, Optional, Union
|
| 5 |
+
from transformers import WhisperFeatureExtractor, Qwen2TokenizerFast
|
| 6 |
+
from transformers.processing_utils import ProcessorMixin
|
| 7 |
+
from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput
|
| 8 |
+
from transformers.feature_extraction_utils import BatchFeature
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AudioOnlyProcessor(ProcessorMixin):
    """
    A processor class for AudioOnlyThinker. Handles only text + audio input (no image/video support).

    Combines a `WhisperFeatureExtractor` (raw waveform -> log-mel `input_features`)
    with a Qwen2 tokenizer (text -> `input_ids`). Each ``<|AUDIO|>`` marker in the
    text is expanded to one ``<|AUDIO|>`` token per encoder output frame of the
    corresponding audio clip, so the language model sees the right number of
    audio positions.
    """

    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
    model_input_names = ["input_features", "attention_mask", "input_ids", "feature_attention_mask"]

    def __init__(self, feature_extractor=None, tokenizer=None, chat_template=None):
        # Special tokens marking audio content in the text stream.
        self.audio_token = "<|AUDIO|>"
        self.audio_bos_token = "<|audio_bos|>"
        self.audio_eos_token = "<|audio_eos|>"
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.current_processor = self.tokenizer
        self.chat_template = chat_template

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        audios: Union[np.ndarray, List[np.ndarray]],
        sampling_rate: Optional[int] = 16000,
        padding: Union[bool, str, PaddingStrategy] = False,
        **kwargs,
    ) -> BatchFeature:
        """Prepare model inputs from text (with ``<|AUDIO|>`` markers) and raw audio.

        Args:
            text: One string or a batch of strings; each ``<|AUDIO|>`` marker is
                expanded in order using the next audio clip's frame count.
            audios: One waveform or a list of waveforms (1-D float arrays).
            sampling_rate: Sample rate of `audios`, forwarded to the feature extractor.
            padding: Tokenizer padding strategy for the text side.
            **kwargs: Forwarded to the feature extractor; ``return_tensors`` is also
                forwarded to the tokenizer and the returned `BatchFeature`.

        Returns:
            BatchFeature with ``input_ids``, ``attention_mask`` (text),
            ``input_features`` and ``feature_attention_mask`` (audio).

        Raises:
            ValueError: if the text contains more ``<|AUDIO|>`` markers than
                there are audio clips.
        """
        if not isinstance(text, list):
            text = [text]

        audios_inputs = self.feature_extractor(
            audios, sampling_rate=sampling_rate, return_attention_mask=True, padding="max_length", **kwargs
        )
        # Rename so the audio mask doesn't collide with the tokenizer's attention_mask.
        audios_inputs["feature_attention_mask"] = audios_inputs.pop("attention_mask")

        # Frames emitted per clip: conv front-end downsamples by 2, then the
        # audio tower pools by 2 again (mirrors the Qwen2-Audio length formula).
        # np.asarray handles list / numpy / torch outputs alike (the previous
        # `.numpy()` call crashed unless the features were torch tensors).
        feature_lengths = np.asarray(audios_inputs["feature_attention_mask"]).sum(-1)
        input_lengths = (feature_lengths - 1) // 2 + 1
        audio_lengths = (input_lengths - 2) // 2 + 1

        # Expand each <|AUDIO|> marker using the *next* clip's length, consuming
        # clips in order across the batch. (The previous code always used
        # audio_lengths[0] and expanded only the first marker per sample.)
        clip_lengths = iter(np.asarray(audio_lengths).reshape(-1).tolist())
        expanded_text = []
        for sample in text:
            while self.audio_token in sample:
                try:
                    num_tokens = next(clip_lengths)
                except StopIteration:
                    raise ValueError(
                        "Text contains more '<|AUDIO|>' markers than provided audio clips."
                    ) from None
                sample = sample.replace(self.audio_token, "<|audio_placeholder|>" * int(num_tokens), 1)
            # Turn placeholders back into real audio tokens all at once.
            expanded_text.append(sample.replace("<|audio_placeholder|>", self.audio_token))
        text = expanded_text

        text_inputs = self.tokenizer(text, padding=padding, return_tensors=kwargs.get("return_tensors", None))

        return BatchFeature(data={**text_inputs, **audios_inputs}, tensor_type=kwargs.get("return_tensors"))

    def apply_chat_template(self, conversations, chat_template=None, **kwargs):
        """Delegate chat templating to the tokenizer.

        Accepts either a single conversation (list of message dicts) or a batch
        of conversations; a single conversation is wrapped into a batch.
        """
        # Guard the [0] access so an empty batch no longer raises IndexError.
        if conversations and isinstance(conversations[0], dict):
            conversations = [conversations]
        return self.tokenizer.apply_chat_template(conversations, chat_template=chat_template, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """Forward to the tokenizer's `batch_decode`."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward to the tokenizer's `decode`."""
        return self.tokenizer.decode(*args, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load tokenizer + feature extractor from one checkpoint directory/repo."""
        tokenizer = Qwen2TokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        feature_extractor = WhisperFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def save_pretrained(self, save_directory):
        """Save both sub-processors into `save_directory`."""
        self.tokenizer.save_pretrained(save_directory)
        self.feature_extractor.save_pretrained(save_directory)