transformers / tests /models /cohere2_vision /test_processing_cohere2_vision.py
AbdulElahGwaith's picture
Upload folder using huggingface_hub
a9bd396 verified
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import Cohere2VisionProcessor
from transformers.testing_utils import require_vision
from transformers.utils import is_torch_available, is_torchvision_available
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
if is_torch_available():
import torch
if is_torchvision_available():
pass
@require_vision
@unittest.skip("Model not released yet!")
class Cohere2VisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = Cohere2VisionProcessor
@classmethod
def _setup_tokenizer(cls):
tokenizer_class = cls._get_component_class_from_processor("tokenizer")
return tokenizer_class.from_pretrained("CohereLabs/command-a-vision-07-2025")
@classmethod
def _setup_image_processor(cls):
image_processor_class = cls._get_component_class_from_processor("image_processor")
return image_processor_class(
size={"height": 20, "width": 20},
max_patches=3,
)
def test_process_interleaved_images_videos(self):
processor = self.get_processor()
messages = [
[
{
"role": "user",
"content": [
{
"type": "image",
"url": url_to_local_path(
"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
),
},
{
"type": "image",
"url": url_to_local_path(
"https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
),
},
{"type": "text", "text": "What are the differences between these two images?"},
],
},
],
[
{
"role": "user",
"content": [
{
"type": "image",
"url": url_to_local_path("https://llava-vl.github.io/static/images/view.jpg"),
},
{"type": "text", "text": "Write a haiku for this image"},
],
}
],
]
inputs_batched = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
padding=True,
)
# Process non batched inputs to check if the pixel_values and input_ids are reconstructed in the correct order when batched together
images_patches_index = 0
for i, message in enumerate(messages):
inputs = processor.apply_chat_template(
message,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
padding=True,
)
# We slice with [-inputs["input_ids"].shape[1] :] as the input_ids are left padded
torch.testing.assert_close(
inputs["input_ids"][0], inputs_batched["input_ids"][i][-inputs["input_ids"].shape[1] :]
)
torch.testing.assert_close(
inputs["pixel_values"],
inputs_batched["pixel_values"][
images_patches_index : images_patches_index + inputs["pixel_values"].shape[0]
],
)
images_patches_index += inputs["pixel_values"].shape[0]