|
|
|
|
|
import platform
|
|
|
|
|
|
import pytest
|
|
|
import torch
|
|
|
|
|
|
from mmaction.registry import MODELS
|
|
|
from mmaction.structures import ActionDataSample
|
|
|
from mmaction.testing import get_recognizer_cfg
|
|
|
from mmaction.utils import register_all_modules
|
|
|
from mmaction.utils.gradcam_utils import GradCAM
|
|
|
|
|
|
# Runs once at import time, before any test builds a model. Presumably this
# is what lets ``MODELS.build`` resolve the types named in the recognizer
# configs -- TODO confirm against ``mmaction.utils.register_all_modules``.
register_all_modules()
|
|
|
|
|
|
|
|
|
def _get_target_shapes(input_shape, num_classes=400, model_type='2D'):
|
|
|
if model_type not in ['2D', '3D']:
|
|
|
raise ValueError(f'Data type {model_type} is not available')
|
|
|
|
|
|
preds_target_shape = (input_shape[0], num_classes)
|
|
|
if model_type == '3D':
|
|
|
|
|
|
|
|
|
blended_imgs_target_shape = (input_shape[0] * input_shape[1],
|
|
|
input_shape[3], input_shape[4],
|
|
|
input_shape[5], input_shape[2])
|
|
|
else:
|
|
|
|
|
|
|
|
|
blended_imgs_target_shape = (input_shape[0], input_shape[1],
|
|
|
input_shape[3], input_shape[4],
|
|
|
input_shape[2])
|
|
|
|
|
|
return blended_imgs_target_shape, preds_target_shape
|
|
|
|
|
|
|
|
|
def _do_test_2D_models(recognizer,
                       target_layer_name,
                       input_shape,
                       num_classes=400,
                       device='cpu'):
    """Run GradCAM on a 2D recognizer and check the shapes it returns.

    Args:
        recognizer: Built recognizer; it is moved to ``device`` before being
            wrapped by ``GradCAM``.
        target_layer_name (str): Layer name handed to ``GradCAM``.
        input_shape (tuple[int]): Batched input shape. The random sample is
            generated with ``input_shape[1:]``, i.e. without the first axis.
        num_classes (int): Forwarded to ``_get_target_shapes``.
            Defaults to 400.
        device (str): Target of ``recognizer.to``. Defaults to ``'cpu'``.
    """
    # One random integer sample in [0, 256) plus a data sample labelled 2.
    sample = torch.randint(0, 256, input_shape[1:])
    data_sample = ActionDataSample().set_gt_label(2)
    demo_data = dict(inputs=[sample], data_samples=[data_sample])

    gradcam = GradCAM(recognizer.to(device), target_layer_name)

    expected_imgs_shape, expected_preds_shape = _get_target_shapes(
        input_shape, num_classes=num_classes, model_type='2D')

    # First pass relies on GradCAM's default for its second argument, the
    # second pass sets that argument to True; both must yield the same
    # output shapes.
    for extra_args in ((), (True, )):
        blended_imgs, preds = gradcam(demo_data, *extra_args)
        assert blended_imgs.size() == expected_imgs_shape
        assert preds.size() == expected_preds_shape
|
|
|
|
|
|
|
|
|
def _do_test_3D_models(recognizer,
                       target_layer_name,
                       input_shape,
                       num_classes=400,
                       device=None):
    """Run GradCAM on a 3D recognizer and check the shapes it returns.

    Args:
        recognizer: Built recognizer to wrap with ``GradCAM``.
        target_layer_name (str): Layer name handed to ``GradCAM``.
        input_shape (tuple[int]): Batched input shape. The random sample is
            generated with ``input_shape[1:]``, i.e. without the first axis.
        num_classes (int): Forwarded to ``_get_target_shapes``.
            Defaults to 400.
        device (str | None): If given, the recognizer is moved to this
            device first, mirroring ``_do_test_2D_models``. ``None`` (the
            default) leaves the recognizer where it is, which is what this
            helper did before the parameter existed.
    """
    expected_imgs_shape, expected_preds_shape = _get_target_shapes(
        input_shape, num_classes=num_classes, model_type='3D')

    # One random integer sample in [0, 256) plus a data sample labelled 2.
    demo_data = {
        'inputs': [torch.randint(0, 256, input_shape[1:])],
        'data_samples': [ActionDataSample().set_gt_label(2)]
    }

    if device is not None:
        recognizer = recognizer.to(device)
    gradcam = GradCAM(recognizer, target_layer_name)

    # First pass relies on GradCAM's default for its second argument, the
    # second pass sets that argument to True; both must yield the same
    # output shapes.
    for extra_args in ((), (True, )):
        blended_imgs, preds = gradcam(demo_data, *extra_args)
        assert blended_imgs.size() == expected_imgs_shape
        assert preds.size() == expected_preds_shape
|
|
|
|
|
|
|
|
|
def test_tsn():
    """Check GradCAM output shapes for a TSN recognizer.

    The backbone's ``pretrained`` entry is cleared before the model is
    built.
    """
    cfg = get_recognizer_cfg(
        'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
    backbone_cfg = cfg.model['backbone']
    backbone_cfg['pretrained'] = None

    model = MODELS.build(cfg.model)
    model.cfg = cfg

    _do_test_2D_models(
        model,
        target_layer_name='backbone/layer4/1/relu',
        input_shape=(1, 25, 3, 32, 32))
|
|
|
|
|
|
|
|
|
def test_i3d():
    """Check GradCAM output shapes for an I3D recognizer.

    ``pretrained2d`` is switched off and ``pretrained`` is cleared on the
    backbone config before the model is built.
    """
    cfg = get_recognizer_cfg(
        'i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py')
    backbone_cfg = cfg.model['backbone']
    backbone_cfg['pretrained2d'] = False
    backbone_cfg['pretrained'] = None

    model = MODELS.build(cfg.model)
    model.cfg = cfg

    _do_test_3D_models(
        model,
        target_layer_name='backbone/layer4/1/relu',
        input_shape=(1, 1, 3, 32, 32, 32))
|
|
|
|
|
|
|
|
|
def test_r2plus1d():
    """Check GradCAM output shapes for an R(2+1)D recognizer.

    Besides disabling ``pretrained2d`` and clearing ``pretrained``, the
    backbone's ``norm_cfg`` is replaced wholesale by ``dict(type='BN3d')``.
    """
    cfg = get_recognizer_cfg(
        'r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py')
    backbone_cfg = cfg.model['backbone']
    backbone_cfg['pretrained2d'] = False
    backbone_cfg['pretrained'] = None
    # Plain item assignment on purpose: the whole entry is replaced, not
    # merged with whatever the config file declared.
    backbone_cfg['norm_cfg'] = dict(type='BN3d')

    model = MODELS.build(cfg.model)
    model.cfg = cfg

    _do_test_3D_models(
        model,
        target_layer_name='backbone/layer4/1/relu',
        input_shape=(1, 3, 3, 8, 16, 16))
|
|
|
|
|
|
|
|
|
def test_slowfast():
    """Check GradCAM output shapes for a SlowFast recognizer.

    The config is used exactly as loaded; the hooked layer lives under the
    backbone's ``slow_path``.
    """
    cfg = get_recognizer_cfg(
        'slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py')

    model = MODELS.build(cfg.model)
    model.cfg = cfg

    _do_test_3D_models(
        model,
        target_layer_name='backbone/slow_path/layer4/1/relu',
        input_shape=(1, 1, 3, 32, 32, 32))
|
|
|
|
|
|
|
|
|
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_tsm():
    """Check GradCAM output shapes for TSM under two test configurations.

    The recognizer is built once from the config as loaded (with the
    backbone's ``pretrained`` entry cleared) and once more after
    ``test_cfg`` is replaced by ``dict(average_clips='prob')``. The second
    build is fed an input with 48 instead of 8 entries along axis 1.
    """
    layer_name = 'backbone/layer4/1/relu'
    cfg = get_recognizer_cfg(
        'tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py')
    cfg.model['backbone']['pretrained'] = None

    # Round 1: test_cfg as shipped in the config file.
    model = MODELS.build(cfg.model)
    model.cfg = cfg
    _do_test_2D_models(model, layer_name, (1, 8, 3, 32, 32))

    # Round 2: rebuild with the replaced test_cfg and the longer input.
    cfg.model.test_cfg = dict(average_clips='prob')
    model = MODELS.build(cfg.model)
    model.cfg = cfg
    _do_test_2D_models(model, layer_name, (1, 48, 3, 32, 32))
|
|
|
|
|
|
|
|
|
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_csn():
    """Check GradCAM output shapes for an ip-CSN recognizer.

    ``pretrained2d`` is switched off and ``pretrained`` is cleared on the
    backbone config before the model is built.
    """
    cfg = get_recognizer_cfg(
        'csn/ipcsn_ig65m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py'
    )
    backbone_cfg = cfg.model['backbone']
    backbone_cfg['pretrained2d'] = False
    backbone_cfg['pretrained'] = None

    model = MODELS.build(cfg.model)
    model.cfg = cfg

    _do_test_3D_models(
        model,
        target_layer_name='backbone/layer4/1/relu',
        input_shape=(1, 1, 3, 32, 16, 16))
|
|
|
|
|
|
|
|
|
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_tpn():
    """Check GradCAM output shapes for both TPN variants.

    The TSM-based config goes through the 2D helper with 174 classes and
    ``num_segments`` set to 4; the SlowOnly-based config goes through the
    3D helper with the default class count. Both have the backbone's
    ``pretrained`` entry cleared and ``fcn_test`` disabled in ``test_cfg``.
    """
    layer_name = 'backbone/layer4/1/relu'

    # --- TPN on top of TSM (2D path) ---
    tsm_cfg = get_recognizer_cfg(
        'tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py')
    tsm_cfg.model['backbone']['pretrained'] = None
    tsm_cfg.model['backbone']['num_segments'] = 4
    tsm_cfg.model.test_cfg['fcn_test'] = False
    tsm_model = MODELS.build(tsm_cfg.model)
    tsm_model.cfg = tsm_cfg
    _do_test_2D_models(
        tsm_model, layer_name, (1, 4, 3, 16, 16), num_classes=174)

    # --- TPN on top of SlowOnly (3D path) ---
    slowonly_cfg = get_recognizer_cfg(
        'tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.py')
    slowonly_cfg.model['backbone']['pretrained'] = None
    slowonly_cfg.model.test_cfg['fcn_test'] = False
    slowonly_model = MODELS.build(slowonly_cfg.model)
    slowonly_model.cfg = slowonly_cfg
    _do_test_3D_models(slowonly_model, layer_name, (1, 3, 3, 4, 16, 16))
|
|
|
|
|
|
|
|
|
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_c3d():
    """Check GradCAM output shapes for a C3D recognizer with 101 classes.

    The backbone's ``pretrained`` entry is cleared before the model is
    built; the hooked layer is ``backbone/conv5a/activate``.
    """
    cfg = get_recognizer_cfg(
        'c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py')
    cfg.model['backbone']['pretrained'] = None

    model = MODELS.build(cfg.model)
    model.cfg = cfg

    _do_test_3D_models(
        model,
        target_layer_name='backbone/conv5a/activate',
        input_shape=(1, 1, 3, 16, 112, 112),
        num_classes=101)
|
|
|
|
|
|
|
|
|
@pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
def test_tin():
    """Check GradCAM output shapes for a TIN recognizer on ``cuda:0``.

    Skipped when CUDA is unavailable. The backbone's ``pretrained`` entry
    is cleared before the model is built.
    """
    layer_name = 'backbone/layer4/1/relu'
    cfg = get_recognizer_cfg(
        'tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.py')
    cfg.model['backbone']['pretrained'] = None

    model = MODELS.build(cfg.model)
    model.cfg = cfg

    _do_test_2D_models(
        model,
        target_layer_name=layer_name,
        input_shape=(1, 8, 3, 64, 64),
        device='cuda:0')
|
|
|
|
|
|
|
|
|
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_x3d():
    """Check GradCAM output shapes for an X3D-S recognizer.

    The backbone's ``pretrained`` entry is cleared before the model is
    built.
    """
    cfg = get_recognizer_cfg('x3d/x3d_s_13x6x1_facebook-kinetics400-rgb.py')
    cfg.model['backbone']['pretrained'] = None

    model = MODELS.build(cfg.model)
    model.cfg = cfg

    _do_test_3D_models(
        model,
        target_layer_name='backbone/layer4/1/relu',
        input_shape=(1, 1, 3, 13, 16, 16))
|
|
|
|