|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| """Tests for RetinaNet models."""
|
|
|
|
|
| from absl.testing import parameterized
|
| import numpy as np
|
| import tensorflow as tf, tf_keras
|
|
|
| from tensorflow.python.distribute import combinations
|
| from tensorflow.python.distribute import strategy_combinations
|
| from official.vision.modeling import retinanet_model
|
| from official.vision.modeling.backbones import resnet
|
| from official.vision.modeling.decoders import fpn
|
| from official.vision.modeling.heads import dense_prediction_heads
|
| from official.vision.modeling.layers import detection_generator
|
| from official.vision.ops import anchor
|
|
|
|
|
class RetinaNetTest(parameterized.TestCase, tf.test.TestCase):
  """Unit tests for RetinaNet: construction, forward pass, serialization."""

  @parameterized.parameters(
      dict(
          use_separable_conv=True,
          build_anchor_boxes=True,
          is_training=False,
          has_att_heads=False,
      ),
      dict(
          use_separable_conv=False,
          build_anchor_boxes=True,
          is_training=False,
          has_att_heads=False,
      ),
      dict(
          use_separable_conv=False,
          build_anchor_boxes=False,
          is_training=False,
          has_att_heads=False,
      ),
      dict(
          use_separable_conv=False,
          build_anchor_boxes=False,
          is_training=True,
          has_att_heads=False,
      ),
      dict(
          use_separable_conv=False,
          build_anchor_boxes=True,
          is_training=True,
          has_att_heads=True,
      ),
      dict(
          use_separable_conv=False,
          build_anchor_boxes=True,
          is_training=False,
          has_att_heads=True,
      ),
  )
  def test_build_model(self, use_separable_conv, build_anchor_boxes,
                       is_training, has_att_heads):
    """Builds a RetinaNet under each config and runs one forward pass."""
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    anchor_size = 3
    fpn_num_filters = 256
    head_num_convs = 4
    head_num_filters = 256
    num_anchors_per_location = num_scales * len(aspect_ratios)
    image_size = 384
    images = np.random.rand(2, image_size, image_size, 3)
    image_shape = np.array([[image_size, image_size], [image_size, image_size]])

    anchor_boxes = None
    if build_anchor_boxes:
      per_image_boxes = anchor.Anchor(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=anchor_size,
          image_size=(image_size, image_size)).multilevel_boxes
      # Replicate per-image anchors across the batch dimension (batch = 2).
      anchor_boxes = {
          level: tf.tile(tf.expand_dims(boxes, axis=0), [2, 1, 1, 1])
          for level, boxes in per_image_boxes.items()
      }

    attribute_heads = None
    if has_att_heads:
      attribute_heads = [{
          'name': 'depth',
          'type': 'regression',
          'size': 1,
          'prediction_tower_name': '',
      }]

    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level,
        num_filters=fpn_num_filters,
        use_separable_conv=use_separable_conv)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=min_level,
        max_level=max_level,
        num_classes=num_classes,
        attribute_heads=attribute_heads,
        num_anchors_per_location=num_anchors_per_location,
        use_separable_conv=use_separable_conv,
        num_convs=head_num_convs,
        num_filters=head_num_filters)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)

    # Only the absence of exceptions is checked here.
    _ = model(images, image_shape, anchor_boxes, training=is_training)

  @combinations.generate(
      combinations.combine(
          strategy=[
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          image_size=[
              (128, 128),
          ],
          training=[True, False],
          has_att_heads=[True, False],
          output_intermediate_features=[True, False],
          soft_nms_sigma=[None, 0.0, 0.1],
      ))
  def test_forward(self, strategy, image_size, training, has_att_heads,
                   output_intermediate_features, soft_nms_sigma):
    """Test for creation of a R50-FPN RetinaNet."""
    tf_keras.backend.set_image_data_format('channels_last')
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    num_anchors_per_location = num_scales * len(aspect_ratios)

    images = np.random.rand(2, image_size[0], image_size[1], 3)
    image_shape = np.array(
        [[image_size[0], image_size[1]], [image_size[0], image_size[1]]])

    with strategy.scope():
      anchor_gen = anchor.build_anchor_generator(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=3)
      # Replicate per-image anchors across the batch dimension (batch = 2).
      anchor_boxes = {
          level: tf.tile(tf.expand_dims(boxes, axis=0), [2, 1, 1, 1])
          for level, boxes in anchor_gen(image_size).items()
      }

      backbone = resnet.ResNet(model_id=50)
      decoder = fpn.FPN(
          input_specs=backbone.output_specs,
          min_level=min_level,
          max_level=max_level)

      attribute_heads = None
      if has_att_heads:
        attribute_heads = [
            dict(
                name='depth',
                type='regression',
                size=1,
                prediction_tower_name='')
        ]
      head = dense_prediction_heads.RetinaNetHead(
          min_level=min_level,
          max_level=max_level,
          num_classes=num_classes,
          attribute_heads=attribute_heads,
          num_anchors_per_location=num_anchors_per_location)
      # Soft-NMS (v1) only runs on CPU, so force CPU NMS when sigma is set.
      generator = detection_generator.MultilevelDetectionGenerator(
          max_num_detections=10,
          nms_version='v1',
          use_cpu_nms=soft_nms_sigma is not None,
          soft_nms_sigma=soft_nms_sigma)
      model = retinanet_model.RetinaNetModel(
          backbone=backbone,
          decoder=decoder,
          head=head,
          detection_generator=generator)

      outputs = model(
          images,
          image_shape,
          anchor_boxes,
          output_intermediate_features=output_intermediate_features,
          training=training)

    if training:
      # Training mode returns raw per-level logits and box regressions.
      cls_outputs = outputs['cls_outputs']
      box_outputs = outputs['box_outputs']
      for level in range(min_level, max_level + 1):
        key = str(level)
        self.assertIn(key, cls_outputs)
        self.assertIn(key, box_outputs)
        feat_h = image_size[0] // 2**level
        feat_w = image_size[1] // 2**level
        self.assertAllEqual(
            [2, feat_h, feat_w, num_classes * num_anchors_per_location],
            cls_outputs[key].numpy().shape)
        self.assertAllEqual(
            [2, feat_h, feat_w, 4 * num_anchors_per_location],
            box_outputs[key].numpy().shape)
        if has_att_heads:
          for att in outputs['attribute_outputs'].values():
            self.assertAllEqual(
                [2, feat_h, feat_w, 1 * num_anchors_per_location],
                att[key].numpy().shape)
    else:
      # Eval mode returns post-processed detections.
      for key in ('detection_boxes', 'detection_scores', 'detection_classes',
                  'num_detections'):
        self.assertIn(key, outputs)
      self.assertAllEqual(
          [2, 10, 4], outputs['detection_boxes'].numpy().shape)
      self.assertAllEqual(
          [2, 10], outputs['detection_scores'].numpy().shape)
      self.assertAllEqual(
          [2, 10], outputs['detection_classes'].numpy().shape)
      self.assertAllEqual(
          [2,], outputs['num_detections'].numpy().shape)
      if has_att_heads:
        self.assertIn('detection_attributes', outputs)
        self.assertAllEqual(
            [2, 10, 1],
            outputs['detection_attributes']['depth'].numpy().shape)

    if output_intermediate_features:
      # Backbone exposes levels 2-5; decoder exposes the FPN level range.
      for level in range(2, 6):
        key = 'backbone_{}'.format(level)
        self.assertIn(key, outputs)
        self.assertAllEqual([
            2, image_size[0] // 2**level, image_size[1] // 2**level,
            backbone.output_specs[str(level)].as_list()[-1]
        ], outputs[key].numpy().shape)
      for level in range(min_level, max_level + 1):
        key = 'decoder_{}'.format(level)
        self.assertIn(key, outputs)
        self.assertAllEqual([
            2, image_size[0] // 2**level, image_size[1] // 2**level,
            decoder.output_specs[str(level)].as_list()[-1]
        ], outputs[key].numpy().shape)

  def test_serialize_deserialize(self):
    """Validate the network can be serialized and deserialized."""
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    num_anchors_per_location = num_scales * len(aspect_ratios)

    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=min_level,
        max_level=max_level,
        num_classes=num_classes,
        num_anchors_per_location=num_anchors_per_location)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=3)

    # Round-trip the model through its config and verify nothing is lost.
    config = model.get_config()
    new_model = retinanet_model.RetinaNetModel.from_config(config)

    # If the serialization was successful, the new model should have the
    # same weights structure and be exportable to JSON.
    _ = new_model.to_json()

    self.assertAllEqual(model.get_config(), new_model.get_config())
|
|
|
|
|
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
  tf.test.main()
|
|
|