Spaces:
Sleeping
Sleeping
| # Copyright 2023 The TensorFlow Authors. All Rights Reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """Preprocess images and bounding boxes for detection. | |
| We perform two sets of operations in preprocessing stage: | |
| (a) operations that are applied to both training and testing data, | |
| (b) operations that are applied only to training data for the purpose of | |
| data augmentation. | |
| A preprocessing function receives a set of inputs, | |
| e.g. an image and bounding boxes, | |
| performs an operation on them, and returns them. | |
| Some examples are: randomly cropping the image, randomly mirroring the image, | |
| randomly changing the brightness, contrast, hue and | |
| randomly jittering the bounding boxes. | |
| The image is a rank 4 tensor: [1, height, width, channels] with | |
| dtype=tf.float32. The groundtruth_boxes is a rank 2 tensor: [N, 4] where | |
| in each row there is a box with [ymin xmin ymax xmax]. | |
| Boxes are in normalized coordinates meaning | |
| their coordinate values range in [0, 1] | |
| Important Note: In tensor_dict, images is a rank 4 tensor, but preprocessing | |
| functions receive a rank 3 tensor for processing the image. Thus, inside the | |
| preprocess function we squeeze the image to become a rank 3 tensor and then | |
| we pass it to the functions. At the end of the preprocess we expand the image | |
| back to rank 4. | |
| """ | |
| import numpy as np | |
| import tensorflow as tf, tf_keras | |
| from official.vision.utils.object_detection import box_list | |
def _flip_boxes_left_right(boxes):
  """Left-right flip the boxes.

  Args:
    boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. Boxes
      are in normalized form meaning their coordinates vary between [0, 1]. Each
      row is in the form of [ymin, xmin, ymax, xmax].

  Returns:
    Flipped boxes.
  """
  ymin, xmin, ymax, xmax = tf.split(value=boxes, num_or_size_splits=4, axis=1)
  # Mirroring about x = 0.5 maps xmin -> 1 - xmax and xmax -> 1 - xmin;
  # y coordinates are unaffected by a horizontal flip.
  return tf.concat([ymin, 1.0 - xmax, ymax, 1.0 - xmin], 1)
def _flip_masks_left_right(masks):
  """Left-right flip masks.

  Args:
    masks: rank 3 float32 tensor with shape [num_instances, height, width]
      representing instance masks.

  Returns:
    flipped masks: rank 3 float32 tensor with shape
      [num_instances, height, width] representing instance masks.
  """
  # Reversing the width axis (axis 2) mirrors every mask horizontally.
  return tf.reverse(masks, axis=[2])
def keypoint_flip_horizontal(keypoints,
                             flip_point,
                             flip_permutation,
                             scope=None):
  """Flips the keypoints horizontally around the flip_point.

  This operation flips the x coordinate for each keypoint around the flip_point
  and also permutes the keypoints in a manner specified by flip_permutation.

  Args:
    keypoints: a tensor of shape [num_instances, num_keypoints, 2]
    flip_point: (float) scalar tensor representing the x coordinate to flip the
      keypoints around.
    flip_permutation: rank 1 int32 tensor containing the keypoint flip
      permutation. This specifies the mapping from original keypoint indices to
      the flipped keypoint indices. This is used primarily for keypoints that
      are not reflection invariant. E.g. Suppose there are 3 keypoints
      representing ['head', 'right_eye', 'left_eye'], then a logical choice for
      flip_permutation might be [0, 2, 1] since we want to swap the 'left_eye'
      and 'right_eye' after a horizontal flip.
    scope: name scope.

  Returns:
    new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
  """
  with tf.name_scope(scope or 'FlipHorizontal'):
    # Transpose to [num_keypoints, num_instances, 2] so that gathering along
    # axis 0 applies the permutation to the keypoint dimension.
    transposed = tf.transpose(a=keypoints, perm=[1, 0, 2])
    permuted = tf.gather(transposed, flip_permutation)
    y_coords, x_coords = tf.split(value=permuted, num_or_size_splits=2, axis=2)
    # Reflect x about the flip point: x' = flip_point - (x - flip_point).
    reflected_x = flip_point * 2.0 - x_coords
    flipped = tf.concat([y_coords, reflected_x], 2)
    # Restore the original [num_instances, num_keypoints, 2] layout.
    return tf.transpose(a=flipped, perm=[1, 0, 2])
def keypoint_change_coordinate_frame(keypoints, window, scope=None):
  """Changes coordinate frame of the keypoints to be relative to window's frame.

  Given a window of the form [y_min, x_min, y_max, x_max], changes keypoint
  coordinates from keypoints of shape [num_instances, num_keypoints, 2]
  to be relative to this window.

  An example use case is data augmentation: where we are given groundtruth
  keypoints and would like to randomly crop the image to some window. In this
  case we need to change the coordinate frame of each groundtruth keypoint to be
  relative to this new window.

  Args:
    keypoints: a tensor of shape [num_instances, num_keypoints, 2]
    window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max]
      window we should change the coordinate frame to.
    scope: name scope.

  Returns:
    new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
  """
  if not scope:
    scope = 'ChangeCoordinateFrame'
  with tf.name_scope(scope):
    win_height = window[2] - window[0]
    win_width = window[3] - window[1]
    # Translate keypoints so the window's top-left corner becomes the origin,
    # then rescale so coordinates are normalized to the window size.
    # BUG FIX: the original called `box_list_ops.scale`, but `box_list_ops` is
    # never imported in this module (NameError at runtime) and keypoints are
    # plain tensors, not BoxLists. `keypoint_scale`, defined in this module,
    # is the correct scaling op for keypoint tensors.
    new_keypoints = keypoint_scale(keypoints - [window[0], window[1]],
                                   1.0 / win_height, 1.0 / win_width)
    return new_keypoints
def keypoint_prune_outside_window(keypoints, window, scope=None):
  """Prunes keypoints that fall outside a given window.

  This function replaces keypoints that fall outside the given window with nan.
  See also clip_to_window which clips any keypoints that fall outside the given
  window.

  Args:
    keypoints: a tensor of shape [num_instances, num_keypoints, 2]
    window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max]
      window outside of which the op should prune the keypoints.
    scope: name scope.

  Returns:
    new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
  """
  if not scope:
    scope = 'PruneOutsideWindow'
  with tf.name_scope(scope):
    y_coords, x_coords = tf.split(value=keypoints, num_or_size_splits=2, axis=2)
    win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
    # A keypoint is kept only when both its coordinates lie inside the window
    # (inclusive bounds); everything else is replaced with NaN.
    inside_y = tf.logical_and(y_coords >= win_y_min, y_coords <= win_y_max)
    inside_x = tf.logical_and(x_coords >= win_x_min, x_coords <= win_x_max)
    inside_window = tf.logical_and(inside_y, inside_x)
    pruned_y = tf.where(inside_window, y_coords, np.nan * tf.ones_like(y_coords))
    pruned_x = tf.where(inside_window, x_coords, np.nan * tf.ones_like(x_coords))
    return tf.concat([pruned_y, pruned_x], 2)
def random_horizontal_flip(image,
                           boxes=None,
                           masks=None,
                           keypoints=None,
                           keypoint_flip_permutation=None,
                           seed=None):
  """Randomly flips the image and detections horizontally.

  The probability of flipping the image is 50%.

  Args:
    image: rank 3 float32 tensor with shape [height, width, channels].
    boxes: (optional) rank 2 float32 tensor with shape [N, 4] containing the
      bounding boxes. Boxes are in normalized form meaning their coordinates
      vary between [0, 1]. Each row is in the form of [ymin, xmin, ymax, xmax].
    masks: (optional) rank 3 float32 tensor with shape [num_instances, height,
      width] containing instance masks. The masks are of the same height, width
      as the input `image`.
    keypoints: (optional) rank 3 float32 tensor with shape [num_instances,
      num_keypoints, 2]. The keypoints are in y-x normalized coordinates.
    keypoint_flip_permutation: rank 1 int32 tensor containing the keypoint flip
      permutation.
    seed: random seed

  Returns:
    image: image which is the same shape as input image.
    If boxes, masks, keypoints, and keypoint_flip_permutation are not None,
    the function also returns the following tensors.
    boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4].
      Boxes are in normalized form meaning their coordinates vary
      between [0, 1].
    masks: rank 3 float32 tensor with shape [num_instances, height, width]
      containing instance masks.
    keypoints: rank 3 float32 tensor with shape
      [num_instances, num_keypoints, 2]

  Raises:
    ValueError: if keypoints are provided but keypoint_flip_permutation is not.
  """
  if keypoints is not None and keypoint_flip_permutation is None:
    raise ValueError(
        'keypoints are provided but keypoints_flip_permutation is not provided')

  with tf.name_scope('RandomHorizontalFlip'):
    # A single coin flip is shared by image, boxes, masks and keypoints so that
    # all tensors stay mutually consistent.
    do_flip = tf.greater(tf.random.uniform([], seed=seed), 0.5)

    def _maybe_flip(flip_fn, tensor):
      # Apply flip_fn under the shared predicate, otherwise pass through.
      return tf.cond(
          pred=do_flip,
          true_fn=lambda: flip_fn(tensor),
          false_fn=lambda: tensor)

    outputs = [_maybe_flip(tf.image.flip_left_right, image)]
    if boxes is not None:
      outputs.append(_maybe_flip(_flip_boxes_left_right, boxes))
    if masks is not None:
      outputs.append(_maybe_flip(_flip_masks_left_right, masks))
    if keypoints is not None and keypoint_flip_permutation is not None:
      permutation = keypoint_flip_permutation
      outputs.append(
          _maybe_flip(
              lambda kp: keypoint_flip_horizontal(kp, 0.5, permutation),
              keypoints))
    return tuple(outputs)
def _compute_new_static_size(image, min_dimension, max_dimension):
  """Compute new static shape for resize_to_range method.

  Args:
    image: A 3D tensor whose shape [height, width, channels] is fully defined
      statically.
    min_dimension: (int) desired size of the smaller image dimension.
    max_dimension: maximum allowed size of the larger image dimension; a falsy
      value (None/0) disables the cap.

  Returns:
    A rank-1 int32 constant tensor [new_height, new_width, channels].
  """
  orig_height, orig_width, num_channels = image.get_shape().as_list()
  orig_min_dim = min(orig_height, orig_width)
  # Scaling orig_(height|width) by large_scale_factor makes the smaller
  # dimension equal to min_dimension, save for floating point rounding errors.
  # For reasonably-sized images, taking the nearest integer reliably
  # eliminates this error.
  large_scale_factor = min_dimension / float(orig_min_dim)
  new_size = [
      int(round(orig_height * large_scale_factor)),
      int(round(orig_width * large_scale_factor)),
  ]
  # The original code unconditionally computed the capped size and assigned
  # `new_size` three times; the cap only matters when it is both requested and
  # exceeded, so compute the fallback size lazily in that single case.
  if max_dimension and max(new_size) > max_dimension:
    # Rescale so the larger dimension equals max_dimension instead (same
    # nearest-integer rounding rationale as above).
    small_scale_factor = max_dimension / float(max(orig_height, orig_width))
    new_size = [
        int(round(orig_height * small_scale_factor)),
        int(round(orig_width * small_scale_factor)),
    ]
  return tf.constant(new_size + [num_channels])
def _compute_new_dynamic_size(image, min_dimension, max_dimension):
  """Compute new dynamic shape for resize_to_range method."""
  image_shape = tf.shape(input=image)
  orig_height = tf.cast(image_shape[0], dtype=tf.float32)
  orig_width = tf.cast(image_shape[1], dtype=tf.float32)
  num_channels = image_shape[2]
  min_dimension = tf.constant(min_dimension, dtype=tf.float32)
  # Scaling orig_(height|width) by this factor makes the smaller dimension
  # equal to min_dimension, save for floating point rounding errors; rounding
  # to the nearest integer reliably eliminates that error for reasonably-sized
  # images.
  large_scale = min_dimension / tf.minimum(orig_height, orig_width)
  large_size = tf.stack([
      tf.cast(tf.round(orig_height * large_scale), dtype=tf.int32),
      tf.cast(tf.round(orig_width * large_scale), dtype=tf.int32),
  ])
  if not max_dimension:
    # No cap requested: always use the min_dimension-driven size.
    return tf.stack(tf.unstack(large_size) + [num_channels])
  # A cap is requested: also compute the size whose larger dimension equals
  # max_dimension (same rounding rationale), and choose it when the
  # min_dimension scaling would overflow the cap.
  max_dimension = tf.constant(max_dimension, dtype=tf.float32)
  small_scale = max_dimension / tf.maximum(orig_height, orig_width)
  small_size = tf.stack([
      tf.cast(tf.round(orig_height * small_scale), dtype=tf.int32),
      tf.cast(tf.round(orig_width * small_scale), dtype=tf.int32),
  ])
  new_size = tf.cond(
      pred=tf.cast(tf.reduce_max(input_tensor=large_size), dtype=tf.float32) >
      max_dimension,
      true_fn=lambda: small_size,
      false_fn=lambda: large_size)
  return tf.stack(tf.unstack(new_size) + [num_channels])
def resize_to_range(image,
                    masks=None,
                    min_dimension=None,
                    max_dimension=None,
                    method=tf.image.ResizeMethod.BILINEAR,
                    align_corners=False,
                    pad_to_max_dimension=False):
  """Resizes an image so its dimensions are within the provided value.

  The output size can be described by two cases:
  1. If the image can be rescaled so its minimum dimension is equal to the
     provided value without the other dimension exceeding max_dimension,
     then do so.
  2. Otherwise, resize so the largest dimension is equal to max_dimension.

  Args:
    image: A 3D tensor of shape [height, width, channels]
    masks: (optional) rank 3 float32 tensor with shape [num_instances, height,
      width] containing instance masks.
    min_dimension: (optional) (scalar) desired size of the smaller image
      dimension.
    max_dimension: (optional) (scalar) maximum allowed size of the larger image
      dimension.
    method: (optional) interpolation method used in resizing. Defaults to
      BILINEAR.
    align_corners: accepted for API compatibility; this implementation never
      reads it (tf.image.resize has no such argument).
    pad_to_max_dimension: Whether to resize the image and pad it with zeros so
      the resulting image is of the spatial size [max_dimension, max_dimension].
      If masks are included they are padded similarly.

  Returns:
    Note that the position of the resized_image_shape changes based on whether
    masks are present.
    resized_image: A 3D tensor of shape [new_height, new_width, channels],
      where the image has been resized (with bilinear interpolation) so that
      min(new_height, new_width) == min_dimension or
      max(new_height, new_width) == max_dimension.
    resized_masks: If masks is not None, also outputs masks. A 3D tensor of
      shape [num_instances, new_height, new_width].
    resized_image_shape: A 1D tensor of shape [3] containing shape of the
      resized image.

  Raises:
    ValueError: if the image is not a 3D tensor.
  """
  if len(image.get_shape()) != 3:
    raise ValueError('Image should be 3D tensor')
  with tf.name_scope('ResizeToRange'):
    # Static shapes can be resolved in Python; dynamic ones need graph ops.
    if image.get_shape().is_fully_defined():
      new_size = _compute_new_static_size(image, min_dimension, max_dimension)
    else:
      new_size = _compute_new_dynamic_size(image, min_dimension, max_dimension)
    # new_size is [height, width, channels]; resize wants only the spatial part.
    resized_image = tf.image.resize(image, new_size[:-1], method=method)
    if pad_to_max_dimension:
      resized_image = tf.image.pad_to_bounding_box(
          resized_image, 0, 0, max_dimension, max_dimension)
    outputs = [resized_image]
    if masks is not None:
      # Add a channel axis so the image resize op accepts the masks, use
      # nearest neighbor to keep mask values binary, then drop the axis again.
      resized_masks = tf.squeeze(
          tf.image.resize(
              tf.expand_dims(masks, 3),
              new_size[:-1],
              method=tf.image.ResizeMethod.NEAREST_NEIGHBOR), 3)
      if pad_to_max_dimension:
        resized_masks = tf.image.pad_to_bounding_box(
            resized_masks, 0, 0, max_dimension, max_dimension)
      outputs.append(resized_masks)
    outputs.append(new_size)
    return outputs
| def _copy_extra_fields(boxlist_to_copy_to, boxlist_to_copy_from): | |
| """Copies the extra fields of boxlist_to_copy_from to boxlist_to_copy_to. | |
| Args: | |
| boxlist_to_copy_to: BoxList to which extra fields are copied. | |
| boxlist_to_copy_from: BoxList from which fields are copied. | |
| Returns: | |
| boxlist_to_copy_to with extra fields. | |
| """ | |
| for field in boxlist_to_copy_from.get_extra_fields(): | |
| boxlist_to_copy_to.add_field(field, boxlist_to_copy_from.get_field(field)) | |
| return boxlist_to_copy_to | |
def box_list_scale(boxlist, y_scale, x_scale, scope=None):
  """scale box coordinates in x and y dimensions.

  Args:
    boxlist: BoxList holding N boxes
    y_scale: (float) scalar tensor
    x_scale: (float) scalar tensor
    scope: name scope.

  Returns:
    boxlist: BoxList holding N boxes
  """
  if not scope:
    scope = 'Scale'
  with tf.name_scope(scope):
    y_scale = tf.cast(y_scale, tf.float32)
    x_scale = tf.cast(x_scale, tf.float32)
    ymin, xmin, ymax, xmax = tf.split(
        value=boxlist.get(), num_or_size_splits=4, axis=1)
    scaled_coords = tf.concat(
        [y_scale * ymin, x_scale * xmin, y_scale * ymax, x_scale * xmax], 1)
    scaled_boxlist = box_list.BoxList(scaled_coords)
    # Carry over any extra fields (scores, classes, ...) unchanged.
    return _copy_extra_fields(scaled_boxlist, boxlist)
def keypoint_scale(keypoints, y_scale, x_scale, scope=None):
  """Scales keypoint coordinates in x and y dimensions.

  Args:
    keypoints: a tensor of shape [num_instances, num_keypoints, 2]
    y_scale: (float) scalar tensor
    x_scale: (float) scalar tensor
    scope: name scope.

  Returns:
    new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
  """
  if not scope:
    scope = 'Scale'
  with tf.name_scope(scope):
    # Broadcast a [1, 1, 2] scale over the trailing (y, x) coordinate axis.
    scale_yx = [[[tf.cast(y_scale, tf.float32), tf.cast(x_scale, tf.float32)]]]
    return keypoints * scale_yx
def scale_boxes_to_pixel_coordinates(image, boxes, keypoints=None):
  """Scales boxes from normalized to pixel coordinates.

  Args:
    image: A 3D float32 tensor of shape [height, width, channels].
    boxes: A 2D float32 tensor of shape [num_boxes, 4] containing the bounding
      boxes in normalized coordinates. Each row is of the form [ymin, xmin,
      ymax, xmax].
    keypoints: (optional) rank 3 float32 tensor with shape [num_instances,
      num_keypoints, 2]. The keypoints are in y-x normalized coordinates.

  Returns:
    image: unchanged input image.
    scaled_boxes: a 2D float32 tensor of shape [num_boxes, 4] containing the
      bounding boxes in pixel coordinates.
    scaled_keypoints: a 3D float32 tensor with shape
      [num_instances, num_keypoints, 2] containing the keypoints in pixel
      coordinates.
  """
  image_shape = tf.shape(input=image)
  height, width = image_shape[0], image_shape[1]
  # Multiply normalized y/x coordinates by the image's pixel dimensions.
  scaled_boxes = box_list_scale(box_list.BoxList(boxes), height, width).get()
  outputs = [image, scaled_boxes]
  if keypoints is not None:
    outputs.append(keypoint_scale(keypoints, height, width))
  return tuple(outputs)