| import numpy as np |
| from transformers import ProcessorMixin |
|
|
class SagaProcessor(ProcessorMixin):
    """Processor pairing a feature extractor and tokenizer for ASR prompting.

    Responsibilities visible here:
      * resample incoming 1-D audio to a fixed target sampling rate
        (``self.target_sr``, 16 kHz) via linear interpolation, and
      * build the chat-template prompt string that triggers transcription.
    """

    # Names of the sub-processors ProcessorMixin wires up from __init__ args.
    attributes = ["feature_extractor", "tokenizer"]

    def __init__(self, feature_extractor, tokenizer, **kwargs):
        """Store the two sub-processors and the fixed target sampling rate.

        Args:
            feature_extractor: audio feature extractor (handled by ProcessorMixin).
            tokenizer: text tokenizer with ``apply_chat_template`` support.
            **kwargs: forwarded to ``ProcessorMixin.__init__``.
        """
        super().__init__(feature_extractor, tokenizer, **kwargs)
        # The downstream model expects 16 kHz audio.
        self.target_sr = 16000

    def process_audio(self, audio, sampling_rate):
        """Resample ``audio`` to ``self.target_sr`` using linear interpolation.

        Args:
            audio: 1-D numpy array of samples (indexed via ``audio.shape[0]``).
            sampling_rate: source sampling rate in Hz.

        Returns:
            A float32 numpy array at the target rate. If the input is already
            at the target rate it is returned unchanged (NOTE(review): dtype is
            passed through as-is on this path — confirm callers tolerate a
            non-float32 array here). If the resampled length would be <= 1
            sample, an empty float32 array is returned.
        """
        if int(sampling_rate) == self.target_sr:
            return audio

        src_len = audio.shape[0]
        dst_len = int(round(src_len * (float(self.target_sr) / float(sampling_rate))))

        # Too little signal to resample meaningfully (also covers empty input,
        # where src_len == 0 implies dst_len == 0).
        if dst_len <= 1:
            return np.zeros((0,), dtype=np.float32)

        # Sample both grids on [0, 1) so the resampled positions line up with
        # the source positions; np.interp clamps at the boundaries.
        src_x = np.linspace(0.0, 1.0, num=src_len, endpoint=False)
        dst_x = np.linspace(0.0, 1.0, num=dst_len, endpoint=False)
        return np.interp(dst_x, src_x, audio).astype(np.float32)

    def get_prompt(self, language="Danish"):
        """Build the ASR generation prompt via the tokenizer's chat template.

        Args:
            language: transcription language name appended to the prompt.
                Defaults to ``"Danish"`` (the previously hard-coded value,
                so zero-argument calls are unchanged).

        Returns:
            The chat-template string (with generation prompt) followed by the
            ``language <name><asr_text>`` transcription trigger.
        """
        messages = [
            {"role": "system", "content": ""},
            {"role": "user", "content": [{"type": "audio", "audio": ""}]},
        ]
        prompt = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
        )
        return prompt + f"language {language}<asr_text>"