| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import unittest |
| |
|
| | from transformers import Cohere2VisionProcessor |
| | from transformers.testing_utils import require_vision |
| | from transformers.utils import is_torch_available, is_torchvision_available |
| |
|
| | from ...test_processing_common import ProcessorTesterMixin, url_to_local_path |
| |
|
| |
|
# Conditional backend imports: torch is only needed for tensor comparisons below.
if is_torch_available():
    import torch

if is_torchvision_available():
    # NOTE(review): vestigial guard — nothing is imported under it anymore; likely
    # left over from an earlier version of this file. Safe to drop once confirmed.
    pass
| |
|
| |
|
@require_vision
@unittest.skip("Model not released yet!")
class Cohere2VisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Processor tests for Cohere2Vision.

    Currently skipped in full (see class decorator) until the model checkpoint
    is publicly released. Inherits the generic processor test suite from
    ``ProcessorTesterMixin``.
    """

    processor_class = Cohere2VisionProcessor

    @classmethod
    def _setup_tokenizer(cls):
        # Resolve the tokenizer class declared by the processor, then load it
        # from the published checkpoint.
        tokenizer_cls = cls._get_component_class_from_processor("tokenizer")
        return tokenizer_cls.from_pretrained("CohereLabs/command-a-vision-07-2025")

    @classmethod
    def _setup_image_processor(cls):
        # A tiny target size and patch budget keep the shared mixin tests fast.
        image_processor_cls = cls._get_component_class_from_processor("image_processor")
        return image_processor_cls(size={"height": 20, "width": 20}, max_patches=3)

    def test_process_interleaved_images_videos(self):
        """Batched chat-template processing must match per-conversation processing.

        Runs the processor once over a batch of two conversations and once per
        conversation, then checks that each conversation's token ids appear as
        the (right-aligned, left-padded) suffix of the batched ids, and that the
        batched ``pixel_values`` rows are the per-conversation patches stacked
        in order.
        """
        processor = self.get_processor()

        messages = [
            [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "url": url_to_local_path(
                                "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                            ),
                        },
                        {
                            "type": "image",
                            "url": url_to_local_path(
                                "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
                            ),
                        },
                        {"type": "text", "text": "What are the differences between these two images?"},
                    ],
                },
            ],
            [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "url": url_to_local_path("https://llava-vl.github.io/static/images/view.jpg"),
                        },
                        {"type": "text", "text": "Write a haiku for this image"},
                    ],
                }
            ],
        ]

        # Identical keyword arguments for the batched and per-conversation calls.
        template_kwargs = dict(
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            padding=True,
        )

        batched = processor.apply_chat_template(messages, **template_kwargs)

        # Running offset into the batched pixel_values patch dimension.
        patch_offset = 0
        for idx, conversation in enumerate(messages):
            single = processor.apply_chat_template(conversation, **template_kwargs)

            # With left padding, the un-padded ids are the suffix of the batched row.
            seq_len = single["input_ids"].shape[1]
            torch.testing.assert_close(
                single["input_ids"][0], batched["input_ids"][idx][-seq_len:]
            )

            # Batched patches for this conversation occupy a contiguous slice.
            num_patches = single["pixel_values"].shape[0]
            torch.testing.assert_close(
                single["pixel_values"],
                batched["pixel_values"][patch_offset : patch_offset + num_patches],
            )
            patch_offset += num_patches
| |
|