| """Contains building blocks for various versions of Residual Networks.
|
|
|
| Residual networks (ResNets) were proposed in:
|
| Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
|
| Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015
|
|
|
| More variants were introduced in:
|
| Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
|
| Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016
|
|
|
| We can obtain different ResNet variants by changing the network depth, width,
|
| and form of residual unit. This module implements the infrastructure for
|
| building them. Concrete ResNet units and full ResNet networks are implemented in
|
| the accompanying resnet_v1.py and resnet_v2.py modules.
|
|
|
| Compared to https://github.com/KaimingHe/deep-residual-networks, in the current
|
| implementation we subsample the output activations in the last residual unit of
|
| each block, instead of subsampling the input activations in the first residual
|
| unit of each block. The two implementations give identical results but our
|
| implementation is more memory efficient.
|
| """

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections

import tensorflow.compat.v1 as tf
import tf_slim as slim


class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])):
  """A named tuple describing a ResNet block.

  Its parts are:
    scope: The scope of the `Block`.
    unit_fn: The ResNet unit function which takes as input a `Tensor` and
      returns another `Tensor` with the output of the ResNet unit.
    args: A list of length equal to the number of units in the `Block`. The
      list contains one dictionary per unit, with keys such as 'depth',
      'depth_bottleneck' and 'stride', which serve as keyword arguments to
      `unit_fn`.
  """
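

# A minimal, hypothetical sketch (not part of the library) of how a `Block`
# might be described. Real unit functions such as `bottleneck` live in the
# accompanying resnet_v1.py/resnet_v2.py modules; `_toy_unit` below is a
# stand-in defined only for illustration.
def _example_block():
  def _toy_unit(net, depth, depth_bottleneck, stride, rate=1):
    # A placeholder unit that only mimics the subsampling behavior.
    del depth, depth_bottleneck, rate  # Unused in this toy example.
    return subsample(net, stride)

  # Three units; only the last one subsamples, matching the convention
  # described in the module docstring.
  return Block('block1', _toy_unit, [
      {'depth': 256, 'depth_bottleneck': 64, 'stride': 1},
      {'depth': 256, 'depth_bottleneck': 64, 'stride': 1},
      {'depth': 256, 'depth_bottleneck': 64, 'stride': 2},
  ])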


def subsample(inputs, factor, scope=None):
  """Subsamples the input along the spatial dimensions.

  Args:
    inputs: A `Tensor` of size [batch, height_in, width_in, channels].
    factor: The subsampling factor.
    scope: Optional variable_scope.

  Returns:
    output: A `Tensor` of size [batch, height_out, width_out, channels] with
      the input, either intact (if factor == 1) or subsampled (if factor > 1).
  """
  if factor == 1:
    return inputs
  else:
    return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope)
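

# A usage sketch (assuming TF1 graph mode): a 1x1 max-pool with stride 2
# keeps every other activation, halving each spatial dimension.
def _example_subsample():
  x = tf.ones([1, 8, 8, 16])
  return subsample(x, factor=2)  # Result shape: [1, 4, 4, 16].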


def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None):
  """Strided 2-D convolution with 'SAME' padding.

  When stride > 1, then we do explicit zero-padding, followed by conv2d with
  'VALID' padding.

  Note that

     net = conv2d_same(inputs, num_outputs, 3, stride=stride)

  is equivalent to

     net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME')
     net = subsample(net, factor=stride)

  whereas

     net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME')

  is different when the input's height or width is even, which is why we add
  the current function. For more details, see
  ResnetUtilsTest.testConv2DSameEven().

  Args:
    inputs: A 4-D tensor of size [batch, height_in, width_in, channels].
    num_outputs: An integer, the number of output filters.
    kernel_size: An int with the kernel_size of the filters.
    stride: An integer, the output stride.
    rate: An integer, rate for atrous convolution.
    scope: Scope.

  Returns:
    output: A 4-D tensor of size [batch, height_out, width_out, channels] with
      the convolution output.
  """
  if stride == 1:
    return slim.conv2d(inputs, num_outputs, kernel_size, stride=1, rate=rate,
                       padding='SAME', scope=scope)
  else:
    # Explicitly zero-pad the inputs so that a 'VALID' strided convolution
    # produces the same output size as 'SAME' padding would, independent of
    # the input dimensions.
    kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1)
    pad_total = kernel_size_effective - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    inputs = tf.pad(
        tensor=inputs,
        paddings=[[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
    return slim.conv2d(inputs, num_outputs, kernel_size, stride=stride,
                       rate=rate, padding='VALID', scope=scope)
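

# A worked sketch of the padding arithmetic: with kernel_size=3 and rate=1,
# the effective kernel is 3, so pad_total=2 and one row/column of zeros is
# added on each side. For a 7x7 input and stride 2, both conv2d_same and a
# 'SAME' strided convolution yield a 4x4 output; for even input sizes the
# two would differ, which is the case conv2d_same exists to handle.
def _example_conv2d_same():
  x = tf.ones([1, 7, 7, 3])
  return conv2d_same(x, num_outputs=8, kernel_size=3, stride=2)  # [1, 4, 4, 8]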


@slim.add_arg_scope
def stack_blocks_dense(net, blocks, output_stride=None,
                       store_non_strided_activations=False,
                       outputs_collections=None):
  """Stacks ResNet `Blocks` and controls output feature density.

  First, this function creates scopes for the ResNet in the form of
  'block_name/unit_1', 'block_name/unit_2', etc.

  Second, this function allows the user to explicitly control the ResNet
  output_stride, which is the ratio of the input to output spatial resolution.
  This is useful for dense prediction tasks such as semantic segmentation or
  object detection.

  Most ResNets consist of 4 ResNet blocks and subsample the activations by a
  factor of 2 when transitioning between consecutive ResNet blocks. This
  results in a nominal ResNet output_stride equal to 8. If we set the
  output_stride to half the nominal network stride (e.g., output_stride=4),
  then we compute responses twice as densely.

  Control of the output feature density is implemented by atrous convolution.

  Args:
    net: A `Tensor` of size [batch, height, width, channels].
    blocks: A list of length equal to the number of ResNet `Blocks`. Each
      element is a ResNet `Block` object describing the units in the `Block`.
    output_stride: If `None`, then the output will be computed at the nominal
      network stride. If output_stride is not `None`, it specifies the
      requested ratio of input to output spatial resolution, which needs to be
      equal to the product of unit strides from the start up to some level of
      the ResNet. For example, if the ResNet employs units with strides
      1, 2, 1, 3, 4, 1, then valid values for the output_stride are
      1, 2, 6, 24 or None (which is equivalent to output_stride=24).
    store_non_strided_activations: If True, we compute non-strided
      (undecimated) activations at the last unit of each block and store them
      in the `outputs_collections` before subsampling them. This gives us
      access to higher resolution intermediate activations which are useful in
      some dense prediction problems but increases the computation and memory
      cost at the last unit of each block by 4x.
    outputs_collections: Collection to add the ResNet block outputs.

  Returns:
    net: Output tensor with stride equal to the specified output_stride.

  Raises:
    ValueError: If the target output_stride is not valid.
  """
  # The current_stride variable keeps track of the effective stride of the
  # activations. This allows us to invoke atrous convolution whenever applying
  # the next residual unit would result in the activations having stride
  # larger than the target output_stride.
  current_stride = 1

  # The atrous convolution rate parameter.
  rate = 1

  for block in blocks:
    with tf.variable_scope(block.scope, 'block', [net]) as sc:
      block_stride = 1
      for i, unit in enumerate(block.args):
        if store_non_strided_activations and i == len(block.args) - 1:
          # Move stride from the block's last unit to the end of the block.
          block_stride = unit.get('stride', 1)
          unit = dict(unit, stride=1)

        with tf.variable_scope('unit_%d' % (i + 1), values=[net]):
          # If we have reached the target output_stride, then we need to
          # employ atrous convolution with stride=1 and multiply the atrous
          # rate by the current unit's stride for use in subsequent layers.
          if output_stride is not None and current_stride == output_stride:
            net = block.unit_fn(net, rate=rate, **dict(unit, stride=1))
            rate *= unit.get('stride', 1)
          else:
            net = block.unit_fn(net, rate=1, **unit)
            current_stride *= unit.get('stride', 1)
            if output_stride is not None and current_stride > output_stride:
              raise ValueError('The target output_stride cannot be reached.')

      # Collect activations at the block's end before performing subsampling.
      net = slim.utils.collect_named_outputs(outputs_collections, sc.name, net)

      # Subsampling of the block's output activations.
      if output_stride is not None and current_stride == output_stride:
        rate *= block_stride
      else:
        net = subsample(net, block_stride)
        current_stride *= block_stride
        if output_stride is not None and current_stride > output_stride:
          raise ValueError('The target output_stride cannot be reached.')

  if output_stride is not None and current_stride != output_stride:
    raise ValueError('The target output_stride cannot be reached.')

  return net
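

# A minimal sketch (assuming TF1 graph mode) of how `stack_blocks_dense`
# trades stride for atrous rate. `_toy_unit` is a stand-in for a real ResNet
# unit; it only mimics the subsampling behavior. With two stride-2 blocks the
# nominal output_stride is 4; requesting output_stride=2 makes the second
# block run its strided unit with stride 1 and doubles the atrous `rate`
# passed to subsequent units instead.
def _example_stack_blocks_dense():
  def _toy_unit(net, stride, rate):
    del rate  # A real unit would use this as its atrous convolution rate.
    return subsample(net, stride)

  blocks = [
      Block('block1', _toy_unit, [{'stride': 1}, {'stride': 2}]),
      Block('block2', _toy_unit, [{'stride': 1}, {'stride': 2}]),
  ]
  x = tf.ones([1, 32, 32, 8])
  return stack_blocks_dense(x, blocks, output_stride=2)  # [1, 16, 16, 8]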


def resnet_arg_scope(
    weight_decay=0.0001,
    batch_norm_decay=0.997,
    batch_norm_epsilon=1e-5,
    batch_norm_scale=True,
    activation_fn=tf.nn.relu,
    use_batch_norm=True,
    batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS):
  """Defines the default ResNet arg scope.

  TODO(gpapan): The batch-normalization related default values above are
    appropriate for use in conjunction with the reference ResNet models
    released at https://github.com/KaimingHe/deep-residual-networks. When
    training ResNets from scratch, they might need to be tuned.

  Args:
    weight_decay: The weight decay to use for regularizing the model.
    batch_norm_decay: The moving average decay when estimating layer activation
      statistics in batch normalization.
    batch_norm_epsilon: Small constant to prevent division by zero when
      normalizing activations by their variance in batch normalization.
    batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
      activations in the batch normalization layer.
    activation_fn: The activation function which is used in ResNet.
    use_batch_norm: Whether or not to use batch normalization.
    batch_norm_updates_collections: Collection for the update ops for
      batch norm.

  Returns:
    An `arg_scope` to use for the resnet models.
  """
  batch_norm_params = {
      'decay': batch_norm_decay,
      'epsilon': batch_norm_epsilon,
      'scale': batch_norm_scale,
      'updates_collections': batch_norm_updates_collections,
      'fused': None,  # Use fused batch norm if possible.
  }

  with slim.arg_scope(
      [slim.conv2d],
      weights_regularizer=slim.l2_regularizer(weight_decay),
      weights_initializer=slim.variance_scaling_initializer(),
      activation_fn=activation_fn,
      normalizer_fn=slim.batch_norm if use_batch_norm else None,
      normalizer_params=batch_norm_params):
    with slim.arg_scope([slim.batch_norm], **batch_norm_params):
      # The following implies padding='SAME' for pool1, which makes feature
      # alignment easier for dense prediction tasks. This is also used in
      # https://github.com/facebook/fb.resnet.torch. However the accompanying
      # code of 'Deep Residual Learning for Image Recognition' uses
      # padding='VALID' for pool1. You can switch to that choice by setting
      # slim.arg_scope([slim.max_pool2d], padding='VALID').
      with slim.arg_scope([slim.max_pool2d], padding='SAME') as arg_sc:
        return arg_sc
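

# A usage sketch: build layers inside the returned arg_scope so that every
# slim.conv2d (and slim.batch_norm) picks up the ResNet defaults defined
# above. A full model from the accompanying resnet_v1.py/resnet_v2.py modules
# would normally be constructed inside the scope; here we use this module's
# own conv2d_same to keep the example self-contained.
def _example_resnet_arg_scope():
  images = tf.placeholder(tf.float32, [None, 224, 224, 3])
  with slim.arg_scope(resnet_arg_scope(weight_decay=1e-4)):
    # Inside the scope, conv2d defaults to batch norm, ReLU and an L2
    # weight regularizer.
    return conv2d_same(images, num_outputs=64, kernel_size=7, stride=2)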