| | """Image processor class for BridgeTower.""" |
| |
|
| | from typing import Any, Dict, Iterable, List, Optional, Tuple, Union |
| |
|
| | import numpy as np |
| |
|
| | from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict |
| | from ...image_transforms import PaddingMode, center_crop, pad, resize, to_channel_dimension_format |
| | from ...image_utils import ( |
| | OPENAI_CLIP_MEAN, |
| | OPENAI_CLIP_STD, |
| | ChannelDimension, |
| | ImageInput, |
| | PILImageResampling, |
| | get_image_size, |
| | infer_channel_dimension_format, |
| | is_batched, |
| | is_scaled_image, |
| | to_numpy_array, |
| | valid_images, |
| | ) |
| | from ...utils import TensorType, is_vision_available, logging |
| |
|
| |
|
| | if is_vision_available(): |
| | import PIL |
| |
|
| | logger = logging.get_logger(__name__) |
| |
|
| |
|


def max_across_indices(values: Iterable[Any]) -> List[Any]:
    """
    Return the maximum value across all indices of an iterable of values.
    """
    return [max(values_i) for values_i in zip(*values)]
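
# Illustration (not executed): `max_across_indices` takes the element-wise maximum across a batch of
# shape tuples, e.g. max_across_indices([(3, 480, 640), (3, 512, 600)]) -> [3, 512, 640].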


def make_pixel_mask(
    image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
    """
    Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.

    Args:
        image (`np.ndarray`):
            Image to make the pixel mask for.
        output_size (`Tuple[int, int]`):
            Output size of the mask.
    """
    input_height, input_width = get_image_size(image, channel_dim=input_data_format)
    mask = np.zeros(output_size, dtype=np.int64)
    mask[:input_height, :input_width] = 1
    return mask
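
# Illustration (not executed): for a channels-last image of shape (480, 640, 3) and output_size=(512, 704),
# the returned mask is a 512x704 int64 array with ones in rows 0-479 and columns 0-639, and zeros over the
# region that will be filled by padding.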


def get_max_height_width(
    images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> Tuple[int, int]:
    """
    Get the maximum height and width across all images in a batch.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(images[0])

    if input_data_format == ChannelDimension.FIRST:
        _, max_height, max_width = max_across_indices([img.shape for img in images])
    elif input_data_format == ChannelDimension.LAST:
        max_height, max_width, _ = max_across_indices([img.shape for img in images])
    else:
        raise ValueError(f"Invalid channel dimension format: {input_data_format}")
    return (max_height, max_width)


def get_resize_output_image_size(
    input_image: np.ndarray,
    shorter: int = 800,
    longer: int = 1333,
    size_divisor: int = 32,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    input_height, input_width = get_image_size(input_image, input_data_format)
    min_size, max_size = shorter, longer

    scale = min_size / min(input_height, input_width)

    if input_height < input_width:
        new_height = min_size
        new_width = scale * input_width
    else:
        new_height = scale * input_height
        new_width = min_size

    if max(new_height, new_width) > max_size:
        scale = max_size / max(new_height, new_width)
        new_height = scale * new_height
        new_width = scale * new_width

    new_height, new_width = int(new_height + 0.5), int(new_width + 0.5)
    new_height = new_height // size_divisor * size_divisor
    new_width = new_width // size_divisor * size_divisor

    return new_height, new_width
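
# Illustration (not executed): with the processor's default size of 288, `resize` calls this helper with
# shorter=288 and longer=int(1333 / 800 * 288) = 479. A 480x640 channels-last image is scaled by
# 288/480 = 0.6 to 288x384; neither side exceeds 479 and both are already multiples of size_divisor=32,
# so the returned output size is (288, 384).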


class BridgeTowerImageProcessor(BaseImageProcessor):
    r"""
    Constructs a BridgeTower image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 288}`):
            Resize the shorter side of the input to `size["shortest_edge"]`. The longer side will be limited to under
            `int((1333 / 800) * size["shortest_edge"])` while preserving the aspect ratio. Only has an effect if
            `do_resize` is set to `True`. Can be overridden by the `size` parameter in the `preprocess` method.
        size_divisor (`int`, *optional*, defaults to 32):
            The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize`
            is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
            overridden by the `resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
            parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Whether to center crop the image. Can be overridden by the `do_center_crop` parameter in the `preprocess`
            method.
        do_pad (`bool`, *optional*, defaults to `True`):
            Whether to pad the image to the `(max_height, max_width)` of the images in the batch. Can be overridden by
            the `do_pad` parameter in the `preprocess` method.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = 288,
        size_divisor: int = 32,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_center_crop: bool = True,
        do_pad: bool = True,
        **kwargs,
    ) -> None:
        if "pad_and_return_pixel_mask" in kwargs:
            do_pad = kwargs.pop("pad_and_return_pixel_mask")

        super().__init__(**kwargs)
        size = size if size is not None else {"shortest_edge": 288}
        size = get_size_dict(size, default_to_square=False)

        self.do_resize = do_resize
        self.size = size
        self.size_divisor = size_divisor
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_pad = do_pad
        self.do_center_crop = do_center_crop

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        size_divisor: int = 32,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image.

        Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the
        longer side is larger than the max size `int(size["shortest_edge"] * 1333 / 800)`, the longer side is then
        resized to the max size while preserving the aspect ratio.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Controls the size of the output image. Should be of the form `{"shortest_edge": int}`.
            size_divisor (`int`, defaults to 32):
                The image is resized to a size that is a multiple of this value.
            resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        size = get_size_dict(size, default_to_square=False)
        if "shortest_edge" not in size:
            raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}")
        shorter = size["shortest_edge"]
        longer = int(1333 / 800 * shorter)
        output_size = get_resize_output_image_size(
            image, shorter=shorter, longer=longer, size_divisor=size_divisor, input_data_format=input_data_format
        )
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def center_crop(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Center crop an image to a square of side `size["shortest_edge"]`. If the input size is smaller than the crop
        size along any edge, the image is padded with 0's and then center cropped.

        Args:
            image (`np.ndarray`):
                Image to center crop.
            size (`Dict[str, int]`):
                Size of the output image in the form `{"shortest_edge": int}`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred from the input
                image.
        """
        output_size = size["shortest_edge"]
        return center_crop(
            image,
            size=(output_size, output_size),
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
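
    # Illustration (not executed): with the default size={"shortest_edge": 288}, an image that was
    # resized to 288x384 is center-cropped here to a 288x288 square.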

    def _pad_image(
        self,
        image: np.ndarray,
        output_size: Tuple[int, int],
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Pad an image with zeros to the given size.
        """
        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        output_height, output_width = output_size

        pad_bottom = output_height - input_height
        pad_right = output_width - input_width
        padding = ((0, pad_bottom), (0, pad_right))
        padded_image = pad(
            image,
            padding,
            mode=PaddingMode.CONSTANT,
            constant_values=constant_values,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        return padded_image

    def pad(
        self,
        images: List[np.ndarray],
        constant_values: Union[float, Iterable[float]] = 0,
        return_pixel_mask: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> BatchFeature:
        """
        Pads a batch of images on the bottom and right with zeros to the size of the largest height and width in the
        batch and optionally returns their corresponding pixel mask.

        Args:
            images (`List[np.ndarray]`):
                Images to pad.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
                Whether to return a pixel mask.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        pad_size = get_max_height_width(images, input_data_format=input_data_format)

        padded_images = [
            self._pad_image(
                image,
                pad_size,
                constant_values=constant_values,
                data_format=data_format,
                input_data_format=input_data_format,
            )
            for image in images
        ]
        data = {"pixel_values": padded_images}

        if return_pixel_mask:
            masks = [
                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
                for image in images
            ]
            data["pixel_mask"] = masks

        return BatchFeature(data=data, tensor_type=return_tensors)
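
    # Illustration (not executed): padding two channels-first images of shapes (3, 288, 384) and
    # (3, 256, 416) pads both to the per-batch maximum (3, 288, 416); with return_pixel_mask=True, the
    # 288x416 masks are 1 over each image's original extent and 0 over the padded region.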

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        size_divisor: Optional[int] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_pad: Optional[bool] = None,
        do_center_crop: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single image or batch of images with pixel values ranging from 0 to
                255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Controls the size of the image after `resize`. The shortest edge of the image is resized to
                `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
                is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
                edge equal to `int(size["shortest_edge"] * (1333 / 800))`.
            size_divisor (`int`, *optional*, defaults to `self.size_divisor`):
                The image is resized to a size that is a multiple of this value.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values to the range [0, 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to normalize the image by if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
                Whether to pad the image to the (max_height, max_width) in the batch. If `True`, a pixel mask is also
                created and returned.
            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
                Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the
                image is padded with 0's and then center cropped.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                    - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                    - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size_divisor = size_divisor if size_divisor is not None else self.size_divisor
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_pad = do_pad if do_pad is not None else self.do_pad
        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop

        size = size if size is not None else self.size
        size = get_size_dict(size, default_to_square=False)

        if not is_batched(images):
            images = [images]

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        if do_resize and (size is None or resample is None):
            raise ValueError("Size and resample must be specified if do_resize is True.")

        if do_rescale and rescale_factor is None:
            raise ValueError("Rescale factor must be specified if do_rescale is True.")

        if do_normalize and (image_mean is None or image_std is None):
            raise ValueError("Image mean and std must be specified if do_normalize is True.")

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if do_resize:
            images = [
                self.resize(
                    image=image,
                    size=size,
                    size_divisor=size_divisor,
                    resample=resample,
                    input_data_format=input_data_format,
                )
                for image in images
            ]

        if do_center_crop:
            images = [
                self.center_crop(image=image, size=size, input_data_format=input_data_format) for image in images
            ]

        if do_rescale:
            images = [
                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
                for image in images
            ]

        if do_normalize:
            images = [
                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
                for image in images
            ]

        images = [
            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
        ]

        if do_pad:
            # The images are already in `data_format` channel order at this point, so it is passed as the input format.
            encoded_outputs = self.pad(
                images, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=data_format
            )
        else:
            encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)

        return encoded_outputs
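

# Minimal usage sketch (illustrative only, not executed at import time; the image path below is
# hypothetical). Assumes `transformers` is installed with vision support and, for "pt" tensors, `torch`:
#
#     from PIL import Image
#     from transformers import BridgeTowerImageProcessor
#
#     processor = BridgeTowerImageProcessor(size={"shortest_edge": 288})
#     inputs = processor(images=Image.open("example.jpg"), return_tensors="pt")
#     # inputs["pixel_values"]: padded image batch; inputs["pixel_mask"]: 1 for valid pixels, 0 for padding.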