|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| """Preprocessing ops."""
|
|
|
| import math
|
| from typing import Optional, Sequence, Tuple, Union
|
| from six.moves import range
|
| import tensorflow as tf, tf_keras
|
|
|
| from official.vision.ops import augment
|
| from official.vision.ops import box_ops
|
|
|
# Fraction of the shorter image side kept by the classic ImageNet center crop
# (224 / 256 = 0.875); see center_crop_image below.
CENTER_CROP_FRACTION = 0.875


# Per-channel normalization statistics (the usual ImageNet values).
# MEAN_NORM/STDDEV_NORM apply to images scaled to [0, 1]; MEAN_RGB/STDDEV_RGB
# are the same statistics rescaled for images in [0, 255].
MEAN_NORM = (0.485, 0.456, 0.406)
STDDEV_NORM = (0.229, 0.224, 0.225)
MEAN_RGB = tuple(255 * i for i in MEAN_NORM)
STDDEV_RGB = tuple(255 * i for i in STDDEV_NORM)
# Mid-gray per-channel value; usage is outside this chunk.
MEDIAN_RGB = (128.0, 128.0, 128.0)


# Re-export the box flipping helpers so callers of this module can use them
# without importing box_ops directly.
horizontal_flip_boxes = box_ops.horizontal_flip_boxes
vertical_flip_boxes = box_ops.vertical_flip_boxes
|
|
|
|
|
def clip_or_pad_to_fixed_size(input_tensor, size, constant_values=0):
  """Clips or pads data to a fixed length at the first dimension.

  Rows beyond `size` are dropped; when there are fewer than `size` rows, the
  tensor is padded with `constant_values` along the first dimension.

  Args:
    input_tensor: `Tensor` with any dimension.
    size: `int` number for the first dimension of output Tensor.
    constant_values: `int` value assigned to the paddings.

  Returns:
    `Tensor` with the first dimension padded to `size`.
  """
  static_shape = input_tensor.get_shape().as_list()

  # Keep at most `size` rows.
  num_rows = tf.clip_by_value(tf.shape(input_tensor)[0], 0, size)
  clipped = input_tensor[:num_rows]

  # Dynamic shape of the padding block: the missing rows followed by the
  # remaining (unchanged) trailing dimensions.
  pad_shape = [tf.maximum(0, size - num_rows)]
  for axis in range(1, len(static_shape)):
    pad_shape.append(tf.shape(clipped)[axis])

  padding = tf.cast(constant_values * tf.ones(pad_shape), clipped.dtype)
  padded = tf.concat([clipped, padding], axis=0)

  # Restore as much static shape information as possible.
  static_shape[0] = size
  padded.set_shape(static_shape)
  return padded
|
|
|
|
|
def normalize_image(image: tf.Tensor,
                    offset: Sequence[float] = MEAN_NORM,
                    scale: Sequence[float] = STDDEV_NORM) -> tf.Tensor:
  """Normalizes the image to zero mean and unit variance.

  If the input image dtype is float, it is expected to either have values in
  [0, 1) and offset is MEAN_NORM, or have values in [0, 255] and offset is
  MEAN_RGB.

  Args:
    image: A tf.Tensor in either (1) float dtype with values in range [0, 1) or
      [0, 255], or (2) int type with values in range [0, 255].
    offset: A tuple of mean values to be subtracted from the image.
    scale: A tuple of normalization factors.

  Returns:
    A normalized image tensor.
  """
  with tf.name_scope('normalize_image'):
    # convert_image_dtype rescales integer inputs to [0, 1); float inputs
    # pass through unchanged.
    float_image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    return normalize_scaled_float_image(float_image, offset, scale)
|
|
|
|
|
def normalize_scaled_float_image(image: tf.Tensor,
                                 offset: Sequence[float] = MEAN_NORM,
                                 scale: Sequence[float] = STDDEV_NORM):
  """Normalizes a scaled float image to zero mean and unit variance.

  It assumes the input image is float dtype with values in [0, 1) if offset is
  MEAN_NORM, values in [0, 255] if offset is MEAN_RGB.

  Args:
    image: A tf.Tensor in float32 dtype with values in range [0, 1) or [0, 255].
    offset: A tuple of mean values to be subtracted from the image.
    scale: A tuple of normalization factors.

  Returns:
    A normalized image tensor.
  """
  # Shape the per-channel statistics to [1, 1, C] so they broadcast over the
  # spatial dimensions.
  mean = tf.expand_dims(tf.expand_dims(tf.constant(offset), axis=0), axis=0)
  stddev = tf.expand_dims(tf.expand_dims(tf.constant(scale), axis=0), axis=0)
  image -= mean
  image /= stddev
  return image
|
|
|
|
|
def compute_padded_size(desired_size, stride):
  """Compute the padded size given the desired size and the stride.

  The padded size will be the smallest rectangle, such that each dimension is
  the smallest multiple of the stride which is larger than the desired
  dimension. For example, if desired_size = (100, 200) and stride = 32,
  the output padded_size = (128, 224).

  Args:
    desired_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the target output image size.
    stride: an integer, the stride of the backbone network.

  Returns:
    padded_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the padded output image size.
  """
  # Plain Python path: round each dimension up to the next stride multiple.
  if isinstance(desired_size, (list, tuple)):
    return [int(math.ceil(d / stride) * stride) for d in desired_size]
  # Tensor path: same rounding done in float, then cast back to int32.
  return tf.cast(
      tf.math.ceil(tf.cast(desired_size, dtype=tf.float32) / stride) * stride,
      tf.int32)
|
|
|
|
|
def resize_and_crop_image(image,
                          desired_size,
                          padded_size,
                          aug_scale_min=1.0,
                          aug_scale_max=1.0,
                          seed=1,
                          method=tf.image.ResizeMethod.BILINEAR,
                          keep_aspect_ratio=True):
  """Resizes the input image to output size (RetinaNet style).

  Resize and pad images given the desired output size of the image and
  stride size.

  Here are the preprocessing steps.
  1. For a given image, keep its aspect ratio and rescale the image to make it
     the largest rectangle to be bounded by the rectangle specified by the
     `desired_size`.
  2. Pad the rescaled image to the padded_size.

  Args:
    image: a `Tensor` of shape [height, width, 3] representing an image.
    desired_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the desired actual output image size.
    padded_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the padded output image size. Padding will be applied
      after scaling the image to the desired_size. Can be None to disable
      padding.
    aug_scale_min: a `float` with range between [0, 1.0] representing minimum
      random scale applied to desired_size for training scale jittering.
    aug_scale_max: a `float` with range between [1.0, inf] representing maximum
      random scale applied to desired_size for training scale jittering.
    seed: seed for random scale jittering.
    method: function to resize input image to scaled image.
    keep_aspect_ratio: whether or not to keep the aspect ratio when resizing.

  Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals to `output_size`.
    image_info: a 2D `Tensor` that encodes the information of the image and the
      applied preprocessing. It is in the format of
      [[original_height, original_width], [desired_height, desired_width],
       [y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
      desired_width] is the actual scaled image size, and [y_scale, x_scale] is
      the scaling factor, which is the ratio of
      scaled dimension / original dimension.
  """
  with tf.name_scope('resize_and_crop_image'):
    image_size = tf.cast(tf.shape(image)[0:2], tf.float32)

    # Jittering is on when either bound is a Tensor (value unknown at trace
    # time) or either float bound differs from 1.0.
    random_jittering = (
        isinstance(aug_scale_min, tf.Tensor)
        or isinstance(aug_scale_max, tf.Tensor)
        or not math.isclose(aug_scale_min, 1.0)
        or not math.isclose(aug_scale_max, 1.0)
    )

    # Target size to scale to: a randomly-scaled `desired_size` when
    # jittering, otherwise `desired_size` itself.
    if random_jittering:
      random_scale = tf.random.uniform(
          [], aug_scale_min, aug_scale_max, seed=seed)
      scaled_size = tf.round(random_scale * tf.cast(desired_size, tf.float32))
    else:
      scaled_size = tf.cast(desired_size, tf.float32)

    # Shrink the target so the scaled image is the largest rectangle (with
    # the original aspect ratio) that fits inside `scaled_size`.
    if keep_aspect_ratio:
      scale = tf.minimum(
          scaled_size[0] / image_size[0], scaled_size[1] / image_size[1])
      scaled_size = tf.round(image_size * scale)

    # Per-axis scale actually applied: scaled / original.
    image_scale = scaled_size / image_size

    # When jittering, pick a random top-left corner so that a
    # `desired_size` window fits inside the (possibly larger) scaled image.
    if random_jittering:
      max_offset = scaled_size - tf.cast(desired_size, tf.float32)
      # Negative offsets (scaled image smaller than desired) are clamped to 0.
      max_offset = tf.where(
          tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
      offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed)
      offset = tf.cast(offset, tf.int32)
    else:
      offset = tf.zeros((2,), tf.int32)

    scaled_image = tf.image.resize(
        image, tf.cast(scaled_size, tf.int32), method=method)

    # Crop the randomly-offset `desired_size` window out of the scaled image.
    if random_jittering:
      scaled_image = scaled_image[
          offset[0]:offset[0] + desired_size[0],
          offset[1]:offset[1] + desired_size[1], :]

    # Pad (bottom/right) to `padded_size`, if requested.
    output_image = scaled_image
    if padded_size is not None:
      output_image = tf.image.pad_to_bounding_box(
          scaled_image, 0, 0, padded_size[0], padded_size[1])

    image_info = tf.stack([
        image_size,
        tf.cast(desired_size, dtype=tf.float32),
        image_scale,
        tf.cast(offset, tf.float32)])
    return output_image, image_info
|
|
|
|
|
def resize_and_crop_image_v2(image,
                             short_side,
                             long_side,
                             padded_size,
                             aug_scale_min=1.0,
                             aug_scale_max=1.0,
                             seed=1,
                             method=tf.image.ResizeMethod.BILINEAR):
  """Resizes the input image to output size (Faster R-CNN style).

  Resize and pad images given the specified short / long side length and the
  stride size.

  Here are the preprocessing steps.
  1. For a given image, keep its aspect ratio and first try to rescale the
     short side of the original image to `short_side`.
  2. If the scaled image after 1 has a long side that exceeds `long_side`, keep
     the aspect ratio and rescale the long side of the image to `long_side`.
  3. (Optional) Apply random jittering according to `aug_scale_min` and
     `aug_scale_max`. By default this step is skipped.
  4. Pad the rescaled image to the padded_size.

  Args:
    image: a `Tensor` of shape [height, width, 3] representing an image.
    short_side: a scalar `Tensor` or `int` representing the desired short side
      to be rescaled to.
    long_side: a scalar `Tensor` or `int` representing the desired long side to
      be rescaled to.
    padded_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the padded output image size.
    aug_scale_min: a `float` with range between [0, 1.0] representing minimum
      random scale applied for training scale jittering.
    aug_scale_max: a `float` with range between [1.0, inf] representing maximum
      random scale applied for training scale jittering.
    seed: seed for random scale jittering.
    method: function to resize input image to scaled image.

  Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals to `output_size`.
    image_info: a 2D `Tensor` that encodes the information of the image and the
      applied preprocessing. It is in the format of
      [[original_height, original_width], [desired_height, desired_width],
       [y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
      desired_width] is the actual scaled image size, and [y_scale, x_scale] is
      the scaling factor, which is the ratio of
      scaled dimension / original dimension.
  """
  with tf.name_scope('resize_and_crop_image_v2'):
    image_size = tf.cast(tf.shape(image)[0:2], tf.float32)

    # Candidate scales: fit the short side to `short_side`, or the long side
    # to `long_side`.
    scale_using_short_side = (
        short_side / tf.math.minimum(image_size[0], image_size[1]))
    scale_using_long_side = (
        long_side / tf.math.maximum(image_size[0], image_size[1]))

    # Prefer the short-side scale; fall back to the long-side scale when the
    # resulting long side would exceed `long_side`.
    scaled_size = tf.math.round(image_size * scale_using_short_side)
    scaled_size = tf.where(
        tf.math.greater(
            tf.math.maximum(scaled_size[0], scaled_size[1]), long_side),
        tf.math.round(image_size * scale_using_long_side),
        scaled_size)
    # Snapshot the pre-jitter size; this is both the crop window size below
    # and the [desired_height, desired_width] row of `image_info`.
    desired_size = scaled_size

    # Jittering is on when either bound is a Tensor (value unknown at trace
    # time) or either float bound differs from 1.0.
    random_jittering = (
        isinstance(aug_scale_min, tf.Tensor)
        or isinstance(aug_scale_max, tf.Tensor)
        or not math.isclose(aug_scale_min, 1.0)
        or not math.isclose(aug_scale_max, 1.0)
    )

    if random_jittering:
      random_scale = tf.random.uniform(
          [], aug_scale_min, aug_scale_max, seed=seed)
      scaled_size = tf.math.round(random_scale * scaled_size)

    # Per-axis scale actually applied: scaled / original.
    image_scale = scaled_size / image_size

    # When jittering, pick a random top-left corner so a `desired_size`
    # window fits inside the (possibly larger) jittered image.
    if random_jittering:
      max_offset = scaled_size - desired_size
      # Negative offsets (jittered image smaller than desired) clamp to 0.
      max_offset = tf.where(
          tf.math.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
      offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed)
      offset = tf.cast(offset, tf.int32)
    else:
      offset = tf.zeros((2,), tf.int32)

    scaled_image = tf.image.resize(
        image, tf.cast(scaled_size, tf.int32), method=method)

    if random_jittering:
      scaled_image = scaled_image[
          offset[0]:offset[0] + desired_size[0],
          offset[1]:offset[1] + desired_size[1], :]

    # Pad (bottom/right) to the fixed `padded_size`.
    output_image = tf.image.pad_to_bounding_box(
        scaled_image, 0, 0, padded_size[0], padded_size[1])

    image_info = tf.stack([
        image_size,
        tf.cast(desired_size, dtype=tf.float32),
        image_scale,
        tf.cast(offset, tf.float32)])
    return output_image, image_info
|
|
|
|
|
def resize_image(
    image: tf.Tensor,
    size: Union[Tuple[int, int], int],
    max_size: Optional[int] = None,
    method: tf.image.ResizeMethod = tf.image.ResizeMethod.BILINEAR):
  """Resize image with size and max_size.

  Args:
    image: the image to be resized.
    size: if list or tuple, resize to it. If scalar, we keep the same
      aspect ratio and resize the short side to the value.
    max_size: only used when size is a scalar. When the larger side is larger
      than max_size after resized with size we used max_size to keep the aspect
      ratio instead.
    method: the method argument passed to tf.image.resize.

  Returns:
    the resized image and image_info to be used for downstream processing.
    image_info: a 2D `Tensor` that encodes the information of the image and the
      applied preprocessing. It is in the format of
      [[original_height, original_width], [resized_height, resized_width],
       [y_scale, x_scale], [0, 0]], where [resized_height, resized_width]
      is the actual scaled image size, and [y_scale, x_scale] is the
      scaling factor, which is the ratio of
      scaled dimension / original dimension.
  """

  def get_size_with_aspect_ratio(image_size, size, max_size=None):
    # Scalar-`size` path: resize the short side to `size`, optionally capping
    # the long side at `max_size` while keeping the aspect ratio.
    h = image_size[0]
    w = image_size[1]
    if max_size is not None:
      min_original_size = tf.cast(tf.math.minimum(w, h), dtype=tf.float32)
      max_original_size = tf.cast(tf.math.maximum(w, h), dtype=tf.float32)
      # If scaling the short side to `size` would push the long side past
      # `max_size`, shrink `size` so the long side lands exactly on it.
      if max_original_size / min_original_size * size > max_size:
        size = tf.cast(
            tf.math.floor(max_size * min_original_size / max_original_size),
            dtype=tf.int32)
      else:
        size = tf.cast(size, tf.int32)

    else:
      size = tf.cast(size, tf.int32)
    # Short side already equals `size`: nothing to do.
    if (w <= h and w == size) or (h <= w and h == size):
      return tf.stack([h, w])

    # Scale the long side proportionally to the short side's new length.
    if w < h:
      ow = size
      oh = tf.cast(
          (tf.cast(size, dtype=tf.float32) * tf.cast(h, dtype=tf.float32) /
           tf.cast(w, dtype=tf.float32)),
          dtype=tf.int32)
    else:
      oh = size
      ow = tf.cast(
          (tf.cast(size, dtype=tf.float32) * tf.cast(w, dtype=tf.float32) /
           tf.cast(h, dtype=tf.float32)),
          dtype=tf.int32)

    return tf.stack([oh, ow])

  def get_size(image_size, size, max_size=None):
    if isinstance(size, (list, tuple)):
      # NOTE(review): the reversal suggests a list/tuple `size` is expected as
      # [width, height] and is returned as [height, width] — confirm against
      # callers before relying on this.
      return size[::-1]
    else:
      return get_size_with_aspect_ratio(image_size, size, max_size)

  # NOTE: "orignal" is a long-standing typo of "original"; kept as-is here.
  orignal_size = tf.shape(image)[0:2]
  size = get_size(orignal_size, size, max_size)
  rescaled_image = tf.image.resize(
      image, tf.cast(size, tf.int32), method=method)
  image_scale = size / orignal_size
  image_info = tf.stack([
      tf.cast(orignal_size, dtype=tf.float32),
      tf.cast(size, dtype=tf.float32),
      tf.cast(image_scale, tf.float32),
      tf.constant([0.0, 0.0], dtype=tf.float32)
  ])
  return rescaled_image, image_info
|
|
|
|
|
def center_crop_image(
    image, center_crop_fraction: float = CENTER_CROP_FRACTION):
  """Center crop a square shape slice from the input image.

  It crops a square shape slice from the image. The side of the actual crop
  is 224 / 256 = 0.875 of the short side of the original image. References:
  [1] Very Deep Convolutional Networks for Large-Scale Image Recognition
      https://arxiv.org/abs/1409.1556
  [2] Deep Residual Learning for Image Recognition
      https://arxiv.org/abs/1512.03385

  Args:
    image: a Tensor of shape [height, width, 3] representing the input image.
    center_crop_fraction: a float of ratio between the side of the cropped image
      and the short side of the original image

  Returns:
    cropped_image: a Tensor representing the center cropped image.
  """
  with tf.name_scope('center_crop_image'):
    image_hw = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
    # Square side: fraction of the shorter image dimension.
    side = center_crop_fraction * tf.math.minimum(image_hw[0], image_hw[1])
    # Centered top-left corner, rounded down to integer pixels.
    corner = tf.cast((image_hw - side) / 2.0, dtype=tf.int32)
    side = tf.cast(side, dtype=tf.int32)
    return image[corner[0]:corner[0] + side,
                 corner[1]:corner[1] + side, :]
|
|
|
|
|
def center_crop_image_v2(
    image_bytes, image_shape, center_crop_fraction: float = CENTER_CROP_FRACTION
):
  """Center crop a square shape slice from the input image.

  It crops a square shape slice from the image. The side of the actual crop
  is 224 / 256 = 0.875 of the short side of the original image. References:
  [1] Very Deep Convolutional Networks for Large-Scale Image Recognition
      https://arxiv.org/abs/1409.1556
  [2] Deep Residual Learning for Image Recognition
      https://arxiv.org/abs/1512.03385

  This is a faster version of `center_crop_image` which takes the original
  image bytes and image size as the inputs, and partially decode the JPEG
  bytes according to the center crop.

  Args:
    image_bytes: a Tensor of type string representing the raw image bytes.
    image_shape: a Tensor specifying the shape of the raw image.
    center_crop_fraction: a float of ratio between the side of the cropped image
      and the short side of the original image

  Returns:
    cropped_image: a Tensor representing the center cropped image.
  """
  with tf.name_scope('center_image_crop_v2'):
    shape_f = tf.cast(image_shape, tf.float32)
    # Square side: fraction of the shorter image dimension.
    side = center_crop_fraction * tf.math.minimum(shape_f[0], shape_f[1])
    # Centered top-left corner, rounded down to integer pixels.
    corner = tf.cast((shape_f - side) / 2.0, dtype=tf.int32)
    side = tf.cast(side, dtype=tf.int32)
    # Decode only the crop window instead of the full JPEG.
    crop_window = tf.stack([corner[0], corner[1], side, side])
    return tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
|
|
|
|
|
def random_crop_image(image,
                      aspect_ratio_range=(3. / 4., 4. / 3.),
                      area_range=(0.08, 1.0),
                      max_attempts=10,
                      seed=1):
  """Randomly crop an arbitrary shaped slice from the input image.

  Args:
    image: a Tensor of shape [height, width, 3] representing the input image.
    aspect_ratio_range: a list of floats. The cropped area of the image must
      have an aspect ratio = width / height within this range.
    area_range: a list of floats. The cropped area of the image must contain
      a fraction of the input image within this range.
    max_attempts: the number of attempts at generating a cropped region of the
      image of the specified constraints. After max_attempts failures, return
      the entire image.
    seed: the seed of the random generator.

  Returns:
    cropped_image: a Tensor representing the random cropped image. Can be the
      original image if max_attempts is exhausted.
  """
  with tf.name_scope('random_crop_image'):
    # The sampled box must cover the whole image "object" (the full-image
    # bounding box) by at least area_range[0].
    whole_image_bbox = tf.constant(
        [0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
    crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box(
        tf.shape(image),
        whole_image_bbox,
        seed=seed,
        min_object_covered=area_range[0],
        aspect_ratio_range=aspect_ratio_range,
        area_range=area_range,
        max_attempts=max_attempts)
    return tf.slice(image, crop_offset, crop_size)
|
|
|
|
|
def random_crop_image_v2(image_bytes,
                         image_shape,
                         aspect_ratio_range=(3. / 4., 4. / 3.),
                         area_range=(0.08, 1.0),
                         max_attempts=10,
                         seed=1):
  """Randomly crop an arbitrary shaped slice from the input image.

  This is a faster version of `random_crop_image` which takes the original
  image bytes and image size as the inputs, and partially decode the JPEG
  bytes according to the generated crop.

  Args:
    image_bytes: a Tensor of type string representing the raw image bytes.
    image_shape: a Tensor specifying the shape of the raw image.
    aspect_ratio_range: a list of floats. The cropped area of the image must
      have an aspect ratio = width / height within this range.
    area_range: a list of floats. The cropped area of the image must contain
      a fraction of the input image within this range.
    max_attempts: the number of attempts at generating a cropped region of the
      image of the specified constraints. After max_attempts failures, return
      the entire image.
    seed: the seed of the random generator.

  Returns:
    cropped_image: a Tensor representing the random cropped image. Can be the
      original image if max_attempts is exhausted.
  """
  with tf.name_scope('random_crop_image_v2'):
    # The sampled box must cover the whole image "object" (the full-image
    # bounding box) by at least area_range[0].
    whole_image_bbox = tf.constant(
        [0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
    crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box(
        image_shape,
        whole_image_bbox,
        seed=seed,
        min_object_covered=area_range[0],
        aspect_ratio_range=aspect_ratio_range,
        area_range=area_range,
        max_attempts=max_attempts)
    # Decode only the sampled window instead of the full JPEG.
    offset_y, offset_x, _ = tf.unstack(crop_offset)
    crop_height, crop_width, _ = tf.unstack(crop_size)
    crop_window = tf.stack([offset_y, offset_x, crop_height, crop_width])
    return tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
|
|
|
|
|
def resize_and_crop_boxes(boxes,
                          image_scale,
                          output_size,
                          offset):
  """Resizes boxes to output size with scale and offset.

  Args:
    boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
    image_scale: 2D float `Tensor` representing scale factors that apply to
      [height, width] of input image.
    output_size: 2D `Tensor` or `int` representing [height, width] of target
      output image size.
    offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
      boxes.

  Returns:
    boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
  """
  with tf.name_scope('resize_and_crop_boxes'):
    # Broadcast [y, x] factors to [y, x, y, x] so they line up with the
    # (ymin, xmin, ymax, xmax) box layout.
    scale_yxyx = tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
    offset_yxyx = tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
    # Scale into the resized image frame, then shift by the crop origin.
    boxes = boxes * scale_yxyx - offset_yxyx
    # Clamp coordinates to the output image bounds.
    boxes = box_ops.clip_boxes(boxes, output_size)
    return boxes
|
|
|
|
|
def resize_and_crop_masks(masks, image_scale, output_size, offset):
  """Resizes masks to output size with scale and offset.

  Args:
    masks: `Tensor` of shape [N, H, W, C] representing ground truth masks.
    image_scale: 2D float `Tensor` representing scale factors that apply to
      [height, width] of input image.
    output_size: 2D `Tensor` or `int` representing [height, width] of target
      output image size.
    offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
      boxes.

  Returns:
    masks: `Tensor` of shape [N, H, W, C] representing the scaled masks.
  """
  with tf.name_scope('resize_and_crop_masks'):
    mask_size = tf.cast(tf.shape(masks)[1:3], tf.float32)
    num_channels = tf.shape(masks)[3]

    # Prepend one all-zero dummy mask, presumably so the batched ops below
    # stay well-defined when N == 0 — TODO confirm. It is stripped off again
    # at the end.
    masks = tf.concat([
        tf.zeros([1, mask_size[0], mask_size[1], num_channels],
                 dtype=masks.dtype), masks
    ],
                      axis=0)

    # Resize with nearest neighbor to keep mask values discrete.
    scaled_size = tf.cast(image_scale * mask_size, tf.int32)
    scaled_masks = tf.image.resize(
        masks, scaled_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    # Crop an `output_size` window starting at `offset`.
    offset = tf.cast(offset, tf.int32)
    scaled_masks = scaled_masks[
        :,
        offset[0]:offset[0] + output_size[0],
        offset[1]:offset[1] + output_size[1],
        :]

    # Pad (bottom/right) to exactly `output_size`.
    output_masks = tf.image.pad_to_bounding_box(
        scaled_masks, 0, 0, output_size[0], output_size[1])

    # Drop the dummy mask added above.
    output_masks = output_masks[1::]
    return output_masks
|
|
|
|
|
def horizontal_flip_image(image):
  """Mirrors `image` along its width axis."""
  flipped = tf.image.flip_left_right(image)
  return flipped
|
|
|
|
|
def horizontal_flip_masks(masks):
  """Mirrors masks along the width axis (axis 2)."""
  flipped = masks[:, :, ::-1]
  return flipped
|
|
|
|
|
def random_horizontal_flip(
    image, normalized_boxes=None, masks=None, seed=1, prob=0.5
):
  """Randomly flips input image and bounding boxes horizontally.

  With probability `prob`, the image — and, when provided, the normalized
  boxes and masks — are all flipped with the same coin toss.
  """
  with tf.name_scope('random_horizontal_flip'):
    coin = tf.random.uniform([], seed=seed)
    do_flip = tf.less(coin, prob)

    image = tf.cond(do_flip, lambda: horizontal_flip_image(image),
                    lambda: image)

    if normalized_boxes is not None:
      normalized_boxes = tf.cond(
          do_flip, lambda: horizontal_flip_boxes(normalized_boxes),
          lambda: normalized_boxes)

    if masks is not None:
      masks = tf.cond(do_flip, lambda: horizontal_flip_masks(masks),
                      lambda: masks)

    return image, normalized_boxes, masks
|
|
|
|
|
def random_horizontal_flip_with_roi(
    image: tf.Tensor,
    boxes: Optional[tf.Tensor] = None,
    masks: Optional[tf.Tensor] = None,
    roi_boxes: Optional[tf.Tensor] = None,
    seed: int = 1
) -> Tuple[tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor],
           Optional[tf.Tensor]]:
  """Randomly flips input image and bounding boxes horizontally.

  Extends preprocess_ops.random_horizontal_flip to also flip roi_boxes used
  by ViLD.

  Args:
    image: `tf.Tensor`, the image to apply the random flip.
    boxes: `tf.Tensor` or `None`, boxes corresponding to the image.
    masks: `tf.Tensor` or `None`, masks corresponding to the image.
    roi_boxes: `tf.Tensor` or `None`, RoIs corresponding to the image.
    seed: Seed for Tensorflow's random number generator.

  Returns:
    image: `tf.Tensor`, flipped image.
    boxes: `tf.Tensor` or `None`, flipped boxes corresponding to the image.
    masks: `tf.Tensor` or `None`, flipped masks corresponding to the image.
    roi_boxes: `tf.Tensor` or `None`, flipped RoIs corresponding to the image.
  """
  with tf.name_scope('random_horizontal_flip'):
    # Note: this variant flips when the draw exceeds 0.5 (tf.greater), unlike
    # random_horizontal_flip which uses tf.less; both give probability 0.5.
    coin = tf.random.uniform([], seed=seed)
    do_flip = tf.greater(coin, 0.5)

    image = tf.cond(do_flip, lambda: horizontal_flip_image(image),
                    lambda: image)

    if boxes is not None:
      boxes = tf.cond(do_flip, lambda: horizontal_flip_boxes(boxes),
                      lambda: boxes)

    if masks is not None:
      masks = tf.cond(do_flip, lambda: horizontal_flip_masks(masks),
                      lambda: masks)

    if roi_boxes is not None:
      roi_boxes = tf.cond(do_flip, lambda: horizontal_flip_boxes(roi_boxes),
                          lambda: roi_boxes)

    return image, boxes, masks, roi_boxes
|
|
|
|
|
def random_vertical_flip(
    image, normalized_boxes=None, masks=None, seed=1, prob=0.5
):
  """Randomly flips input image and bounding boxes vertically.

  With probability `prob`, the image — and, when provided, the normalized
  boxes and masks — are all flipped with the same coin toss.
  """
  with tf.name_scope('random_vertical_flip'):
    coin = tf.random.uniform([], seed=seed)
    do_flip = tf.less(coin, prob)

    image = tf.cond(do_flip, lambda: tf.image.flip_up_down(image),
                    lambda: image)

    if normalized_boxes is not None:
      normalized_boxes = tf.cond(
          do_flip, lambda: vertical_flip_boxes(normalized_boxes),
          lambda: normalized_boxes)

    if masks is not None:
      # flip_up_down needs a channel axis; add one and strip it afterwards.
      masks = tf.cond(
          do_flip, lambda: tf.image.flip_up_down(masks[..., None])[..., 0],
          lambda: masks)

    return image, normalized_boxes, masks
|
|
|
|
|
def color_jitter(image: tf.Tensor,
                 brightness: Optional[float] = 0.,
                 contrast: Optional[float] = 0.,
                 saturation: Optional[float] = 0.,
                 seed: Optional[int] = None) -> tf.Tensor:
  """Applies color jitter to an image, similarly to torchvision`s ColorJitter.

  Args:
    image (tf.Tensor): Of shape [height, width, 3] and type uint8.
    brightness (float, optional): Magnitude for brightness jitter. Defaults to
      0.
    contrast (float, optional): Magnitude for contrast jitter. Defaults to 0.
    saturation (float, optional): Magnitude for saturation jitter. Defaults to
      0.
    seed (int, optional): Random seed. Defaults to None.

  Returns:
    tf.Tensor: The augmented `image` of type uint8.
  """
  # Apply the three jitters in sequence on a uint8 image.
  jittered = tf.cast(image, dtype=tf.uint8)
  jittered = random_brightness(jittered, brightness, seed=seed)
  jittered = random_contrast(jittered, contrast, seed=seed)
  jittered = random_saturation(jittered, saturation, seed=seed)
  return jittered
|
|
|
|
|
def random_brightness(image: tf.Tensor,
                      brightness: float = 0.,
                      seed: Optional[int] = None) -> tf.Tensor:
  """Jitters brightness of an image.

  Args:
    image (tf.Tensor): Of shape [height, width, 3] and type uint8.
    brightness (float, optional): Magnitude for brightness jitter. Defaults to
      0.
    seed (int, optional): Random seed. Defaults to None.

  Returns:
    tf.Tensor: The augmented `image` of type uint8.
  """
  assert brightness >= 0, '`brightness` must be positive'
  # Sample a factor in [max(0, 1 - brightness), 1 + brightness].
  factor = tf.random.uniform(
      [], max(0, 1 - brightness), 1 + brightness, seed=seed, dtype=tf.float32)
  return augment.brightness(image, factor)
|
|
|
|
|
def random_contrast(image: tf.Tensor,
                    contrast: float = 0.,
                    seed: Optional[int] = None) -> tf.Tensor:
  """Jitters contrast of an image, similarly to torchvision`s ColorJitter.

  Args:
    image (tf.Tensor): Of shape [height, width, 3] and type uint8.
    contrast (float, optional): Magnitude for contrast jitter. Defaults to 0.
    seed (int, optional): Random seed. Defaults to None.

  Returns:
    tf.Tensor: The augmented `image` of type uint8.
  """
  assert contrast >= 0, '`contrast` must be positive'
  # Sample a factor in [max(0, 1 - contrast), 1 + contrast].
  factor = tf.random.uniform(
      [], max(0, 1 - contrast), 1 + contrast, seed=seed, dtype=tf.float32)
  return augment.contrast(image, factor)
|
|
|
|
|
def random_saturation(image: tf.Tensor,
                      saturation: float = 0.,
                      seed: Optional[int] = None) -> tf.Tensor:
  """Jitters saturation of an image, similarly to torchvision`s ColorJitter.

  Args:
    image (tf.Tensor): Of shape [height, width, 3] and type uint8.
    saturation (float, optional): Magnitude for saturation jitter. Defaults to
      0.
    seed (int, optional): Random seed. Defaults to None.

  Returns:
    tf.Tensor: The augmented `image` of type uint8.
  """
  assert saturation >= 0, '`saturation` must be positive'
  # Sample a factor in [max(0, 1 - saturation), 1 + saturation].
  factor = tf.random.uniform(
      [], max(0, 1 - saturation), 1 + saturation, seed=seed, dtype=tf.float32)
  return _saturation(image, factor)
|
|
|
|
|
def _saturation(image: tf.Tensor,
                saturation: Optional[float] = 0.) -> tf.Tensor:
  """Blends `image` with its grayscale version; `saturation` is the weight."""
  grayscale = tf.repeat(tf.image.rgb_to_grayscale(image), 3, axis=-1)
  return augment.blend(grayscale, image, saturation)
|
|
|
|
|
def random_crop_image_with_boxes_and_labels(img, boxes, labels, min_scale,
                                            aspect_ratio_range,
                                            min_overlap_params, max_retry):
  """Crops a random slice from the input image.

  The function will correspondingly recompute the bounding boxes and filter out
  outside boxes and their labels.

  References:
  [1] End-to-End Object Detection with Transformers
  https://arxiv.org/abs/2005.12872

  The preprocessing steps:
  1. Sample a minimum IoU overlap.
  2. For each trial, sample the new image width, height, and top-left corner.
  3. Compute the IoUs of bounding boxes with the cropped image and retry if
    the maximum IoU is below the sampled threshold.
  4. Find boxes whose centers are in the cropped image.
  5. Compute new bounding boxes in the cropped region and only select those
    boxes' labels.

  Args:
    img: a 'Tensor' of shape [height, width, 3] representing the input image.
    boxes: a 'Tensor' of shape [N, 4] representing the ground-truth bounding
      boxes with (ymin, xmin, ymax, xmax).
    labels: a 'Tensor' of shape [N,] representing the class labels of the boxes.
    min_scale: a 'float' in [0.0, 1.0) indicating the lower bound of the random
      scale variable.
    aspect_ratio_range: a list of two 'float' that specifies the lower and upper
      bound of the random aspect ratio.
    min_overlap_params: a list of four 'float' representing the min value, max
      value, step size, and offset for the minimum overlap sample.
    max_retry: an 'int' representing the number of trials for cropping. If it is
      exhausted, no cropping will be performed.

  Returns:
    img: a Tensor representing the random cropped image. Can be the
      original image if max_retry is exhausted.
    boxes: a Tensor representing the bounding boxes in the cropped image.
    labels: a Tensor representing the new bounding boxes' labels.
  """

  shape = tf.shape(img)
  original_h = shape[0]
  original_w = shape[1]

  minval, maxval, step, offset = min_overlap_params

  # Sample the minimum-IoU threshold on a discrete grid: draw uniformly in
  # [minval, maxval), snap down to a multiple of `step`, then subtract
  # `offset`.
  min_overlap = tf.math.floordiv(
      tf.random.uniform([], minval=minval, maxval=maxval), step) * step - offset

  # Clamp into [0.0, 1.1]; values may still exceed 1.0 after clamping.
  min_overlap = tf.clip_by_value(min_overlap, 0.0, 1.1)

  # A threshold above 1.0 is unsatisfiable by any crop, so this branch acts as
  # a "skip cropping" outcome of the sampling above.
  if min_overlap > 1.0:
    return img, boxes, labels

  aspect_ratio_low = aspect_ratio_range[0]
  aspect_ratio_high = aspect_ratio_range[1]

  # NOTE: tensor-dependent `continue`/`break` inside `tf.range` relies on
  # AutoGraph when this runs in graph mode.
  for _ in tf.range(max_retry):
    # Sample crop height/width independently as fractions of the original.
    scale_h = tf.random.uniform([], min_scale, 1.0)
    scale_w = tf.random.uniform([], min_scale, 1.0)
    new_h = tf.cast(
        scale_h * tf.cast(original_h, dtype=tf.float32), dtype=tf.int32)
    new_w = tf.cast(
        scale_w * tf.cast(original_w, dtype=tf.float32), dtype=tf.int32)

    # Reject crops whose aspect ratio (h / w) falls outside the allowed range.
    aspect_ratio = new_h / new_w
    if aspect_ratio_low > aspect_ratio or aspect_ratio > aspect_ratio_high:
      continue

    # Sample the top-left corner so the crop stays inside the image.
    left = tf.random.uniform([], 0, original_w - new_w, dtype=tf.int32)
    right = left + new_w
    top = tf.random.uniform([], 0, original_h - new_h, dtype=tf.int32)
    bottom = top + new_h

    # Crop edges in normalized [0, 1] coordinates, matching the normalized
    # (ymin, xmin, ymax, xmax) convention of `boxes`.
    normalized_left = tf.cast(
        left, dtype=tf.float32) / tf.cast(
            original_w, dtype=tf.float32)
    normalized_right = tf.cast(
        right, dtype=tf.float32) / tf.cast(
            original_w, dtype=tf.float32)
    normalized_top = tf.cast(
        top, dtype=tf.float32) / tf.cast(
            original_h, dtype=tf.float32)
    normalized_bottom = tf.cast(
        bottom, dtype=tf.float32) / tf.cast(
            original_h, dtype=tf.float32)

    # The crop window as a [1, 4] box for the pairwise-overlap computation.
    cropped_box = tf.expand_dims(
        tf.stack([
            normalized_top,
            normalized_left,
            normalized_bottom,
            normalized_right,
        ]),
        axis=0)
    # IoU of the crop window against every ground-truth box; batch dims of 1
    # are added for bbox_overlap and squeezed back out.
    iou = box_ops.bbox_overlap(
        tf.expand_dims(cropped_box, axis=0),
        tf.expand_dims(boxes, axis=0))
    iou = tf.squeeze(iou, axis=[0, 1])

    # Retry unless at least one box overlaps the crop by the sampled minimum.
    if tf.reduce_max(iou) < min_overlap:
      continue

    # Keep only boxes whose center lies strictly inside the crop window.
    centroids = box_ops.yxyx_to_cycxhw(boxes)
    mask = tf.math.logical_and(
        tf.math.logical_and(centroids[:, 0] > normalized_top,
                            centroids[:, 0] < normalized_bottom),
        tf.math.logical_and(centroids[:, 1] > normalized_left,
                            centroids[:, 1] < normalized_right))

    if tf.reduce_sum(tf.cast(mask, dtype=tf.int32)) > 0:
      indices = tf.squeeze(tf.where(mask), axis=1)

      filtered_boxes = tf.gather(boxes, indices)

      # Re-express the surviving boxes in the crop's coordinate frame:
      # denormalize to pixels, translate by the crop origin, renormalize by
      # the crop size, and clip to [0, 1].
      boxes = tf.clip_by_value(
          (filtered_boxes[..., :] * tf.cast(
              tf.stack([original_h, original_w, original_h, original_w]),
              dtype=tf.float32) -
           tf.cast(tf.stack([top, left, top, left]), dtype=tf.float32)) /
          tf.cast(tf.stack([new_h, new_w, new_h, new_w]), dtype=tf.float32),
          0.0, 1.0)

      img = tf.image.crop_to_bounding_box(img, top, left, bottom - top,
                                          right - left)

      labels = tf.gather(labels, indices)
      break

  return img, boxes, labels
|
|
|
|
|
def random_crop(image,
                boxes,
                labels,
                min_scale=0.3,
                aspect_ratio_range=(0.5, 2.0),
                min_overlap_params=(0.0, 1.4, 0.2, 0.1),
                max_retry=50,
                seed=None):
  """Randomly crop the image and boxes, filtering labels.

  With probability 0.5 the input is returned unchanged; otherwise the crop is
  delegated to `random_crop_image_with_boxes_and_labels`.

  Args:
    image: a 'Tensor' of shape [height, width, 3] representing the input image.
    boxes: a 'Tensor' of shape [N, 4] representing the ground-truth bounding
      boxes with (ymin, xmin, ymax, xmax).
    labels: a 'Tensor' of shape [N,] representing the class labels of the boxes.
    min_scale: a 'float' in [0.0, 1.0) indicating the lower bound of the random
      scale variable.
    aspect_ratio_range: a list of two 'float' that specifies the lower and upper
      bound of the random aspect ratio.
    min_overlap_params: a list of four 'float' representing the min value, max
      value, step size, and offset for the minimum overlap sample.
    max_retry: an 'int' representing the number of trials for cropping. If it is
      exhausted, no cropping will be performed.
    seed: the random number seed of int, but could be None.

  Returns:
    image: a Tensor representing the random cropped image. Can be the
      original image if max_retry is exhausted.
    boxes: a Tensor representing the bounding boxes in the cropped image.
    labels: a Tensor representing the new bounding boxes' labels.
  """
  with tf.name_scope('random_crop'):
    # Flip a fair coin; cropping happens only on heads.
    should_crop = tf.greater(tf.random.uniform([], seed=seed), 0.5)
    if not should_crop:
      return image, boxes, labels
    return random_crop_image_with_boxes_and_labels(image, boxes, labels,
                                                   min_scale,
                                                   aspect_ratio_range,
                                                   min_overlap_params,
                                                   max_retry)
|
|
|