| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import unittest |
| |
|
| | from transformers import Cohere2VisionProcessor |
| | from transformers.testing_utils import require_vision |
| | from transformers.utils import is_torch_available, is_torchvision_available |
| |
|
| | from ...test_processing_common import ProcessorTesterMixin, url_to_local_path |
| |
|
| |
|
# Conditional backend imports: torch is only needed for tensor comparisons below.
if is_torch_available():
    import torch

if is_torchvision_available():
    # NOTE(review): vestigial guard — nothing is imported under it anymore; likely
    # left over from an earlier version of this file. Safe to drop once confirmed.
    pass
| |
|
| |
|
@require_vision
@unittest.skip("Model not released yet!")
class Cohere2VisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Processor tests for Cohere2Vision.

    Currently skipped in full (see class decorator) until the model checkpoint
    is publicly released. Inherits the generic processor test suite from
    ``ProcessorTesterMixin``.
    """

    processor_class = Cohere2VisionProcessor

    @classmethod
    def _setup_tokenizer(cls):
        # Resolve the tokenizer class declared by the processor, then load it
        # from the published checkpoint.
        tokenizer_cls = cls._get_component_class_from_processor("tokenizer")
        return tokenizer_cls.from_pretrained("CohereLabs/command-a-vision-07-2025")

    @classmethod
    def _setup_image_processor(cls):
        # A tiny target size and patch budget keep the shared mixin tests fast.
        image_processor_cls = cls._get_component_class_from_processor("image_processor")
        return image_processor_cls(size={"height": 20, "width": 20}, max_patches=3)

    def test_process_interleaved_images_videos(self):
        """Batched chat-template processing must match per-conversation processing.

        Runs the processor once over a batch of two conversations and once per
        conversation, then checks that each conversation's token ids appear as
        the (right-aligned, left-padded) suffix of the batched ids, and that the
        batched ``pixel_values`` rows are the per-conversation patches stacked
        in order.
        """
        processor = self.get_processor()

        messages = [
            [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "url": url_to_local_path(
                                "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                            ),
                        },
                        {
                            "type": "image",
                            "url": url_to_local_path(
                                "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
                            ),
                        },
                        {"type": "text", "text": "What are the differences between these two images?"},
                    ],
                },
            ],
            [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "url": url_to_local_path("https://llava-vl.github.io/static/images/view.jpg"),
                        },
                        {"type": "text", "text": "Write a haiku for this image"},
                    ],
                }
            ],
        ]

        # Identical keyword arguments for the batched and per-conversation calls.
        template_kwargs = dict(
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            padding=True,
        )

        batched = processor.apply_chat_template(messages, **template_kwargs)

        # Running offset into the batched pixel_values patch dimension.
        patch_offset = 0
        for idx, conversation in enumerate(messages):
            single = processor.apply_chat_template(conversation, **template_kwargs)

            # With left padding, the un-padded ids are the suffix of the batched row.
            seq_len = single["input_ids"].shape[1]
            torch.testing.assert_close(
                single["input_ids"][0], batched["input_ids"][idx][-seq_len:]
            )

            # Batched patches for this conversation occupy a contiguous slice.
            num_patches = single["pixel_values"].shape[0]
            torch.testing.assert_close(
                single["pixel_values"],
                batched["pixel_values"][patch_offset : patch_offset + num_patches],
            )
            patch_offset += num_patches
| |
|