| """Functions to manipulate feature map pyramids, such as for FPNs and BiFPNs.
|
|
|
| Includes utility functions to facilitate feature pyramid map manipulations,
|
| such as combining multiple feature maps, upsampling or downsampling feature
|
| maps, and applying blocks of convolution, batchnorm, and activation layers.
|
| """
|
| from six.moves import range
|
| import tensorflow as tf
|
| from object_detection.utils import ops
|
| from object_detection.utils import shape_utils


def create_conv_block(name, num_filters, kernel_size, strides, padding,
                      use_separable, apply_batchnorm, apply_activation,
                      conv_hyperparams, is_training, freeze_batchnorm,
                      conv_bn_act_pattern=True):
  """Creates Keras layers for regular or separable convolutions.

  Args:
    name: String. The name of the layer.
    num_filters: Number of filters (channels) for the output feature maps.
    kernel_size: A list of length 2: [kernel_height, kernel_width] of the
      filters, or a single int if both values are the same.
    strides: A list of length 2: [stride_height, stride_width], specifying the
      convolution stride, or a single int if both strides are the same.
    padding: One of 'VALID' or 'SAME'.
    use_separable: Bool. Whether to use depthwise separable convolution instead
      of regular convolution.
    apply_batchnorm: Bool. Whether to apply a batch normalization layer after
      convolution, constructed according to the conv_hyperparams.
    apply_activation: Bool. Whether to apply an activation layer after
      convolution, constructed according to the conv_hyperparams.
    conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
      containing hyperparameters for convolution ops.
    is_training: Bool. Whether the feature generator is in training mode.
    freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
      training or not. When training with a small batch size (e.g. 1), it is
      desirable to freeze batch norm update and use pretrained batch norm
      params.
    conv_bn_act_pattern: Bool. By default, when True, the layers returned by
      this function are in the order [conv, batchnorm, activation]. Otherwise,
      when False, the order of the layers is [activation, conv, batchnorm].

  Returns:
    A list of Keras layers, including (regular or separable) convolution, and
    optionally batch normalization and activation layers.
  """
  layers = []
  if use_separable:
    kwargs = conv_hyperparams.params()
    # Remap the kernel regularizer and initializer from the conv_hyperparams
    # to the depthwise and pointwise kwargs expected by SeparableConv2D.
    kwargs['depthwise_regularizer'] = kwargs['kernel_regularizer']
    kwargs['depthwise_initializer'] = kwargs['kernel_initializer']
    kwargs['pointwise_regularizer'] = kwargs['kernel_regularizer']
    kwargs['pointwise_initializer'] = kwargs['kernel_initializer']
    layers.append(
        tf.keras.layers.SeparableConv2D(
            filters=num_filters,
            kernel_size=kernel_size,
            depth_multiplier=1,
            padding=padding,
            strides=strides,
            name=name + 'separable_conv',
            **kwargs))
  else:
    layers.append(
        tf.keras.layers.Conv2D(
            filters=num_filters,
            kernel_size=kernel_size,
            padding=padding,
            strides=strides,
            name=name + 'conv',
            **conv_hyperparams.params()))

  if apply_batchnorm:
    layers.append(
        conv_hyperparams.build_batch_norm(
            training=(is_training and not freeze_batchnorm),
            name=name + 'batchnorm'))

  if apply_activation:
    activation_layer = conv_hyperparams.build_activation_layer(
        name=name + 'activation')
    if conv_bn_act_pattern:
      layers.append(activation_layer)
    else:
      layers = [activation_layer] + layers

  return layers
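
# Example usage (a minimal, hedged sketch, not part of this module's API:
# assumes `conv_hyperparams` is a hyperparams_builder.KerasLayerHyperparams
# built from a valid hyperparams proto, and `features` is a
# [batch, height, width, channels] float tensor):
#
#   block = create_conv_block(
#       name='bifpn_node/', num_filters=64, kernel_size=3, strides=1,
#       padding='SAME', use_separable=True, apply_batchnorm=True,
#       apply_activation=True, conv_hyperparams=conv_hyperparams,
#       is_training=True, freeze_batchnorm=False)
#   for layer in block:
#     features = layer(features)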


def create_downsample_feature_map_ops(scale, downsample_method,
                                      conv_hyperparams, is_training,
                                      freeze_batchnorm, name):
  """Creates Keras layers for downsampling feature maps.

  Args:
    scale: Int. The scale factor by which to downsample input feature maps. For
      example, in the case of a typical feature map pyramid, the scale factor
      between level_i and level_i+1 is 2.
    downsample_method: String. The method used for downsampling. Currently
      supported methods include 'max_pooling', 'avg_pooling', and
      'depthwise_conv'.
    conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
      containing hyperparameters for convolution ops.
    is_training: Bool. Whether the feature generator is in training mode.
    freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
      training or not. When training with a small batch size (e.g. 1), it is
      desirable to freeze batch norm update and use pretrained batch norm
      params.
    name: String. The name used to prefix the constructed layers.

  Returns:
    A list of Keras layers which will downsample input feature maps by the
    desired scale factor.

  Raises:
    ValueError: if an unknown downsample method is specified.
  """
  layers = []
  padding = 'SAME'
  stride = int(scale)
  # A kernel of size stride + 1 with 'SAME' padding makes adjacent windows
  # overlap slightly, smoothing the downsampled output.
  kernel_size = stride + 1
  if downsample_method == 'max_pooling':
    layers.append(
        tf.keras.layers.MaxPooling2D(
            pool_size=kernel_size,
            strides=stride,
            padding=padding,
            name=name + 'downsample_max_x{}'.format(stride)))
  elif downsample_method == 'avg_pooling':
    layers.append(
        tf.keras.layers.AveragePooling2D(
            pool_size=kernel_size,
            strides=stride,
            padding=padding,
            name=name + 'downsample_avg_x{}'.format(stride)))
  elif downsample_method == 'depthwise_conv':
    layers.append(
        tf.keras.layers.DepthwiseConv2D(
            kernel_size=kernel_size,
            strides=stride,
            padding=padding,
            name=name + 'downsample_depthwise_x{}'.format(stride)))
    layers.append(
        conv_hyperparams.build_batch_norm(
            training=(is_training and not freeze_batchnorm),
            name=name + 'downsample_batchnorm'))
    layers.append(
        conv_hyperparams.build_activation_layer(name=name +
                                                'downsample_activation'))
  else:
    raise ValueError('Unknown downsample method: {}'.format(downsample_method))

  return layers
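
# Example usage (a hedged sketch: `conv_hyperparams` and `features` are
# assumed as in the create_conv_block example above):
#
#   downsample = create_downsample_feature_map_ops(
#       scale=2, downsample_method='max_pooling',
#       conv_hyperparams=conv_hyperparams, is_training=True,
#       freeze_batchnorm=False, name='node_0/')
#   for layer in downsample:
#     features = layer(features)  # Halves the spatial resolution.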


def create_upsample_feature_map_ops(scale, use_native_resize_op, name):
  """Creates Keras layers for upsampling feature maps.

  Args:
    scale: Int. The scale factor by which to upsample input feature maps. For
      example, in the case of a typical feature map pyramid, the scale factor
      between level_i and level_i-1 is 2.
    use_native_resize_op: Bool. If True, uses tf.image.resize_nearest_neighbor
      for upsampling instead of the reshape-and-broadcast implementation in
      ops.nearest_neighbor_upsampling.
    name: String. The name used to prefix the constructed layers.

  Returns:
    A list of Keras layers which will upsample input feature maps by the
    desired scale factor.
  """
  layers = []
  if use_native_resize_op:

    def resize_nearest_neighbor(image):
      image_shape = shape_utils.combined_static_and_dynamic_shape(image)
      return tf.compat.v1.image.resize_nearest_neighbor(
          image, [image_shape[1] * scale, image_shape[2] * scale])

    layers.append(
        tf.keras.layers.Lambda(
            resize_nearest_neighbor,
            name=name + 'nearest_neighbor_upsampling_x{}'.format(scale)))
  else:

    def nearest_neighbor_upsampling(image):
      return ops.nearest_neighbor_upsampling(image, scale=scale)

    layers.append(
        tf.keras.layers.Lambda(
            nearest_neighbor_upsampling,
            name=name + 'nearest_neighbor_upsampling_x{}'.format(scale)))

  return layers
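
# Example usage (a hedged sketch, continuing the examples above):
#
#   upsample = create_upsample_feature_map_ops(
#       scale=2, use_native_resize_op=False, name='node_0/')
#   for layer in upsample:
#     features = layer(features)  # Doubles the spatial resolution.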


def create_resample_feature_map_ops(input_scale_factor, output_scale_factor,
                                    downsample_method, use_native_resize_op,
                                    conv_hyperparams, is_training,
                                    freeze_batchnorm, name):
  """Creates Keras layers for downsampling or upsampling feature maps.

  Args:
    input_scale_factor: Int. Scale factor of the input feature map. For
      example, for a feature pyramid where each successive level halves its
      spatial resolution, the scale factor of a level is 2^level. The input and
      output scale factors are used to compute the scale for upsampling or
      downsampling, so one should be evenly divisible by the other.
    output_scale_factor: Int. Scale factor of the output feature map. See
      input_scale_factor for additional details.
    downsample_method: String. The method used for downsampling. See
      create_downsample_feature_map_ops for details on supported methods.
    use_native_resize_op: Bool. If True, uses tf.image.resize_nearest_neighbor
      for upsampling instead of the reshape-and-broadcast implementation. See
      create_upsample_feature_map_ops for details.
    conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
      containing hyperparameters for convolution ops.
    is_training: Bool. Whether the feature generator is in training mode.
    freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
      training or not. When training with a small batch size (e.g. 1), it is
      desirable to freeze batch norm update and use pretrained batch norm
      params.
    name: String. The name used to prefix the constructed layers.

  Returns:
    A list of Keras layers which will downsample or upsample input feature maps
    to match the desired output feature map scale.

  Raises:
    ValueError: if the input and output scale factors are not evenly divisible.
  """
  if input_scale_factor < output_scale_factor:
    if output_scale_factor % input_scale_factor != 0:
      raise ValueError('Invalid scale factor: input scale 1/{} not divisible '
                       'by output scale 1/{}'.format(input_scale_factor,
                                                     output_scale_factor))
    scale = output_scale_factor // input_scale_factor
    return create_downsample_feature_map_ops(scale, downsample_method,
                                             conv_hyperparams, is_training,
                                             freeze_batchnorm, name)
  elif input_scale_factor > output_scale_factor:
    if input_scale_factor % output_scale_factor != 0:
      raise ValueError('Invalid scale factor: input scale 1/{} not a divisor '
                       'of output scale 1/{}'.format(input_scale_factor,
                                                     output_scale_factor))
    scale = input_scale_factor // output_scale_factor
    return create_upsample_feature_map_ops(scale, use_native_resize_op, name)
  else:
    return []
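
# Example usage (a hedged sketch: resampling pyramid level 2 to level 4, i.e.
# scale factors 2**2 = 4 and 2**4 = 16, downsamples by 16 // 4 = 4):
#
#   resample = create_resample_feature_map_ops(
#       input_scale_factor=4, output_scale_factor=16,
#       downsample_method='max_pooling', use_native_resize_op=False,
#       conv_hyperparams=conv_hyperparams, is_training=True,
#       freeze_batchnorm=False, name='node_0/')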


class BiFPNCombineLayer(tf.keras.layers.Layer):
  """Combines multiple input feature maps into a single output feature map.

  A Keras layer which combines multiple input feature maps into a single output
  feature map, according to the desired combination method. Options for
  combining feature maps include simple summation and several types of weighted
  sums using learned weights for each input feature map: 'weighted_sum',
  'attention', and 'fast_attention'. For more details, see the EfficientDet
  paper by Tan et al. (arxiv.org/abs/1911.09070).

  Specifically, this layer takes a list of tensors as input, all of the same
  shape, and returns a single tensor, also of the same shape.
  """

  def __init__(self, combine_method, **kwargs):
    """Constructor.

    Args:
      combine_method: String. The method used to combine the input feature maps
        into a single output feature map. One of 'sum', 'weighted_sum',
        'attention', or 'fast_attention'.
      **kwargs: Additional Keras layer arguments.
    """
    super(BiFPNCombineLayer, self).__init__(**kwargs)
    self.combine_method = combine_method

  def _combine_weighted_sum(self, inputs):
    # Stacking the inputs along a new trailing axis and multiplying by the
    # (num_inputs, 1) weight matrix computes a per-pixel weighted sum.
    return tf.squeeze(
        tf.linalg.matmul(tf.stack(inputs, axis=-1), self.per_input_weights),
        axis=[-1])

  def _combine_attention(self, inputs):
    # Softmax-normalized weights, as in the 'attention' fusion of EfficientDet.
    normalized_weights = tf.nn.softmax(self.per_input_weights)
    return tf.squeeze(
        tf.linalg.matmul(tf.stack(inputs, axis=-1), normalized_weights),
        axis=[-1])

  def _combine_fast_attention(self, inputs):
    # Fast normalized fusion from EfficientDet: w_i' = relu(w_i) /
    # (sum_j relu(w_j) + epsilon), which avoids the cost of a softmax.
    weights_non_neg = tf.nn.relu(self.per_input_weights)
    normalizer = tf.reduce_sum(weights_non_neg) + 0.0001
    normalized_weights = weights_non_neg / normalizer
    return tf.squeeze(
        tf.linalg.matmul(tf.stack(inputs, axis=-1), normalized_weights),
        axis=[-1])

  def build(self, input_shape):
    if not isinstance(input_shape, list):
      raise ValueError('A BiFPN combine layer should be called '
                       'on a list of inputs.')
    if len(input_shape) < 2:
      raise ValueError('A BiFPN combine layer should be called '
                       'on a list of at least 2 inputs. '
                       'Got ' + str(len(input_shape)) + ' inputs.')
    if self.combine_method == 'sum':
      self._combine_op = tf.keras.layers.Add()
    elif self.combine_method == 'weighted_sum':
      self._combine_op = self._combine_weighted_sum
    elif self.combine_method == 'attention':
      self._combine_op = self._combine_attention
    elif self.combine_method == 'fast_attention':
      self._combine_op = self._combine_fast_attention
    else:
      raise ValueError('Unknown combine type: {}'.format(self.combine_method))
    if self.combine_method in {'weighted_sum', 'attention', 'fast_attention'}:
      # One scalar weight per input feature map, learned jointly with the rest
      # of the model.
      self.per_input_weights = self.add_weight(
          name='bifpn_combine_weights',
          shape=(len(input_shape), 1),
          initializer='ones',
          trainable=True)
    super(BiFPNCombineLayer, self).build(input_shape)

  def call(self, inputs):
    """Combines multiple input feature maps into a single output feature map.

    Executed when calling the `.__call__` method on input.

    Args:
      inputs: A list of tensors where all tensors have the same shape, [batch,
        height_i, width_i, depth_i].

    Returns:
      A single tensor, with the same shape as the input tensors,
      [batch, height_i, width_i, depth_i].
    """
    return self._combine_op(inputs)

  def compute_output_shape(self, input_shape):
    output_shape = input_shape[0]
    for i in range(1, len(input_shape)):
      if input_shape[i] != output_shape:
        raise ValueError(
            'Inputs could not be combined. Shapes should match, '
            'but input_shape[0] is {} while input_shape[{}] is {}'.format(
                output_shape, i, input_shape[i]))
    # All input shapes match, so the combined output shares that shape.
    return output_shape
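
# Example usage (a hedged sketch: fuses two same-shape feature maps with
# EfficientDet-style fast attention; `feature_map_a` and `feature_map_b` are
# assumed to be [batch, height, width, depth] tensors of identical shape):
#
#   combine = BiFPNCombineLayer(
#       combine_method='fast_attention', name='node_0/combine')
#   fused = combine([feature_map_a, feature_map_b])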