import shutil
import tempfile
import unittest

from parameterized import parameterized

from transformers import (
    AudioFlamingo3Processor,
    AutoProcessor,
    AutoTokenizer,
    WhisperFeatureExtractor,
)
from transformers.testing_utils import require_librosa, require_torch, require_torchaudio

from ...test_processing_common import MODALITY_INPUT_DATA, ProcessorTesterMixin


class AudioFlamingo3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
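    """Tests for AudioFlamingo3Processor: component round-trips, tokenization, chat templating and audio inputs."""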
    processor_class = AudioFlamingo3Processor

    @classmethod
    @require_torch
    @require_torchaudio
    def setUpClass(cls):
        cls.checkpoint = "nvidia/audio-flamingo-3-hf"
        cls.tmpdirname = tempfile.mkdtemp()

        processor = AudioFlamingo3Processor.from_pretrained(cls.checkpoint)
        processor.save_pretrained(cls.tmpdirname)
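
    # Convenience accessors that reload individual components from the processor saved in setUpClass.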
    @require_torch
    @require_torchaudio
    def get_tokenizer(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer

    @require_torch
    @require_torchaudio
    def get_feature_extractor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).feature_extractor

    @require_torch
    @require_torchaudio
    def get_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdirname, ignore_errors=True)
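
    # Loading through the processor should resolve the same tokenizer class as AutoTokenizer does.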
    @require_torch
    @require_torchaudio
    def test_can_load_various_tokenizers(self):
        processor = AudioFlamingo3Processor.from_pretrained(self.checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__)
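
    # Round-trip through save_pretrained/from_pretrained and check that the tokenizer vocabulary
    # and the Whisper feature-extractor configuration survive unchanged.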
    @require_torch
    @require_torchaudio
    def test_save_load_pretrained_default(self):
        tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        processor = AudioFlamingo3Processor.from_pretrained(self.checkpoint)
        feature_extractor = processor.feature_extractor

        processor = AudioFlamingo3Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        with tempfile.TemporaryDirectory() as tmpdir:
            processor.save_pretrained(tmpdir)
            reloaded = AudioFlamingo3Processor.from_pretrained(tmpdir)

        self.assertEqual(reloaded.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertEqual(reloaded.feature_extractor.to_json_string(), feature_extractor.to_json_string())
        self.assertIsInstance(reloaded.feature_extractor, WhisperFeatureExtractor)
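
    # Slow and fast tokenizers should tokenize the chat-formatted prompt identically, keeping the
    # <|im_start|>/<|im_end|> markers and the <sound> audio placeholder as single tokens.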
    @require_torch
    @require_torchaudio
    def test_tokenizer_integration(self):
        slow_tokenizer = AutoTokenizer.from_pretrained(self.checkpoint, use_fast=False)
        fast_tokenizer = AutoTokenizer.from_pretrained(self.checkpoint, from_slow=True, legacy=False)

        prompt = (
            "<|im_start|>system\nAnswer the questions.<|im_end|>"
            "<|im_start|>user\n<sound>What is it?<|im_end|>"
            "<|im_start|>assistant\n"
        )
        EXPECTED_OUTPUT = [
            "<|im_start|>",
            "system",
            "Ċ",
            "Answer",
            "Ġthe",
            "Ġquestions",
            ".",
            "<|im_end|>",
            "<|im_start|>",
            "user",
            "Ċ",
            "<sound>",
            "What",
            "Ġis",
            "Ġit",
            "?",
            "<|im_end|>",
            "<|im_start|>",
            "assistant",
            "Ċ",
        ]

        self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
        self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
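
    # The chat template should render a text + audio user turn into the expected prompt string,
    # with a single <sound> placeholder marking where the audio goes.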
    @require_torch
    @require_torchaudio
    def test_chat_template(self):
        processor = AutoProcessor.from_pretrained(self.checkpoint)
        expected_prompt = (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            "<|im_start|>user\n<sound>What is surprising about the relationship between the barking and the music?<|im_end|>\n"
            "<|im_start|>assistant\n"
        )

        conversations = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What is surprising about the relationship between the barking and the music?",
                    },
                    {
                        "type": "audio",
                        "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
                    },
                ],
            }
        ]

        formatted = processor.tokenizer.apply_chat_template(conversations, tokenize=False, add_generation_prompt=True)
        self.assertEqual(expected_prompt, formatted)
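
    # apply_transcription_request should produce the same model inputs as manually building an
    # equivalent transcription conversation and running it through apply_chat_template.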
    @require_torch
    @require_torchaudio
    def test_apply_transcription_request_single(self):
        processor = AutoProcessor.from_pretrained(self.checkpoint)

        audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav"
        helper_outputs = processor.apply_transcription_request(audio=audio_url)

        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Transcribe the input speech."},
                    {"type": "audio", "audio": audio_url},
                ],
            }
        ]
        manual_outputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
        )

        for key in ("input_ids", "attention_mask", "input_features", "input_features_mask"):
            self.assertIn(key, helper_outputs)
            self.assertTrue(helper_outputs[key].equal(manual_outputs[key]))
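
    # Shared audio chat-template test; "np" cases are skipped because this processor only returns
    # PyTorch tensors.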
    @require_librosa
    @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
    def test_apply_chat_template_audio(self, batch_size: int, return_tensors: str):
        if return_tensors == "np":
            self.skipTest("AudioFlamingo3 only supports PyTorch tensors")
        self._test_apply_chat_template(
            "audio", batch_size, return_tensors, "audio_input_name", "feature_extractor", MODALITY_INPUT_DATA["audio"]
        )