import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, NonLocal3d
from mmengine.logging import MMLogger
from mmengine.runner.checkpoint import _load_checkpoint
from torch.nn.modules.utils import _ntuple

from mmaction.registry import MODELS
from .resnet import ResNet


class NL3DWrapper(nn.Module):
    """3D Non-local wrapper for ResNet50.

    Wrap ResNet layers with 3D NonLocal modules.

    Args:
        block (nn.Module): Residual blocks to be built.
        num_segments (int): Number of frame segments.
        non_local_cfg (dict): Config for non-local layers.
            Defaults to ``dict()``.
    """

    def __init__(self, block, num_segments, non_local_cfg=dict()):
        super().__init__()
        self.block = block
        self.non_local_cfg = non_local_cfg
        # The non-local block operates on the output of the residual block,
        # so it takes the channel number of the block's last conv layer.
        self.non_local_block = NonLocal3d(self.block.conv3.norm.num_features,
                                          **self.non_local_cfg)
        self.num_segments = num_segments

    def forward(self, x):
        """Defines the computation performed at every call."""
        x = self.block(x)

        # Fold frames into a temporal dimension:
        # [N, C, H, W] -> [N // num_segments, C, num_segments, H, W].
        n, c, h, w = x.size()
        x = x.view(n // self.num_segments, self.num_segments, c, h,
                   w).transpose(1, 2).contiguous()
        x = self.non_local_block(x)
        # Restore the original [N, C, H, W] layout.
        x = x.transpose(1, 2).contiguous().view(n, c, h, w)
        return x


class TemporalShift(nn.Module):
    """Temporal shift module.

    This module is proposed in
    `TSM: Temporal Shift Module for Efficient Video Understanding
    <https://arxiv.org/abs/1811.08383>`_

    Args:
        net (nn.Module): Module to make temporal shift.
        num_segments (int): Number of frame segments. Defaults to 3.
        shift_div (int): Number of divisions for shift. Defaults to 8.
    """

    def __init__(self, net, num_segments=3, shift_div=8):
        super().__init__()
        self.net = net
        self.num_segments = num_segments
        self.shift_div = shift_div

    def forward(self, x):
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): The input data.

        Returns:
            torch.Tensor: The output of the module.
        """
        x = self.shift(x, self.num_segments, shift_div=self.shift_div)
        return self.net(x)

    @staticmethod
    def shift(x, num_segments, shift_div=3):
        """Perform temporal shift operation on the feature.

        Args:
            x (torch.Tensor): The input feature to be shifted.
            num_segments (int): Number of frame segments.
            shift_div (int): Number of divisions for shift. Defaults to 3.

        Returns:
            torch.Tensor: The shifted feature.
        """
        # [N, C, H, W]
        n, c, h, w = x.size()

        # [N // num_segments, num_segments, C, H*W]
        x = x.view(-1, num_segments, c, h * w)

        # get shift fold: the first `fold` channels are shifted towards the
        # past, the next `fold` towards the future, the rest stay in place
        fold = c // shift_div

        # split c channel into three parts:
        # left_split, mid_split, right_split
        left_split = x[:, :, :fold, :]
        mid_split = x[:, :, fold:2 * fold, :]
        right_split = x[:, :, 2 * fold:, :]

        # shift left on num_segments channel in `left_split`
        zeros = left_split - left_split
        blank = zeros[:, :1, :, :]
        left_split = left_split[:, 1:, :, :]
        left_split = torch.cat((left_split, blank), 1)

        # shift right on num_segments channel in `mid_split`
        zeros = mid_split - mid_split
        blank = zeros[:, :1, :, :]
        mid_split = mid_split[:, :-1, :, :]
        mid_split = torch.cat((blank, mid_split), 1)

        # right_split: no shift

        # concatenate and restore the original [N, C, H, W] shape
        out = torch.cat((left_split, mid_split, right_split), 2)
        return out.view(n, c, h, w)
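
# A minimal sketch of what ``TemporalShift.shift`` does, on a toy input with
# 3 segments and 8 channels (so ``fold = 8 // 8 = 1``): channel 0 of every
# frame is shifted one step towards the past, channel 1 one step towards the
# future, and channels 2-7 stay in place. Illustrative only, with assumed
# shapes; zero-padding fills the boundary frames.
#
#     >>> x = torch.arange(3 * 8).float().view(3, 8, 1, 1)  # [T, C, 1, 1]
#     >>> out = TemporalShift.shift(x, num_segments=3, shift_div=8)
#     >>> torch.equal(out[0, 0], x[1, 0])  # frame 0 gets channel 0 of frame 1
#     True
#     >>> torch.equal(out[1, 1], x[0, 1])  # frame 1 gets channel 1 of frame 0
#     True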


@MODELS.register_module()
class ResNetTSM(ResNet):
    """ResNet backbone for TSM.

    Args:
        depth (int): Depth of ResNet, e.g. 18, 34, 50, 101 or 152.
        num_segments (int): Number of frame segments. Defaults to 8.
        is_shift (bool): Whether to make temporal shift in ResNet layers.
            Defaults to True.
        non_local (Sequence[int]): Determine whether to apply non-local
            module in the corresponding block of each stage.
            Defaults to (0, 0, 0, 0).
        non_local_cfg (dict): Config for non-local module.
            Defaults to ``dict()``.
        shift_div (int): Number of divisions for shift. Defaults to 8.
        shift_place (str): Places in resnet layers for shift, which is
            chosen from ['block', 'blockres'].
            If set to 'block', it will apply temporal shift to all child
            blocks in each resnet layer.
            If set to 'blockres', it will apply temporal shift to each
            `conv1` layer of all child blocks in each resnet layer.
            Defaults to 'blockres'.
        temporal_pool (bool): Whether to add temporal pooling.
            Defaults to False.
        pretrained2d (bool): Whether to load pretrained 2D model.
            Defaults to True.
        **kwargs (keyword arguments, optional): Arguments for ResNet.
    """
    def __init__(self,
                 depth,
                 num_segments=8,
                 is_shift=True,
                 non_local=(0, 0, 0, 0),
                 non_local_cfg=dict(),
                 shift_div=8,
                 shift_place='blockres',
                 temporal_pool=False,
                 pretrained2d=True,
                 **kwargs):
        super().__init__(depth, **kwargs)
        self.num_segments = num_segments
        self.is_shift = is_shift
        self.shift_div = shift_div
        self.shift_place = shift_place
        self.temporal_pool = temporal_pool
        self.non_local = non_local
        # Broadcast `non_local` to a per-block on/off tuple for every stage.
        self.non_local_stages = _ntuple(self.num_stages)(non_local)
        self.non_local_cfg = non_local_cfg
        self.pretrained2d = pretrained2d
        self.init_structure()

    def init_structure(self):
        """Initialize the structure for TSM."""
        if self.is_shift:
            self.make_temporal_shift()
        if len(self.non_local_cfg) != 0:
            self.make_non_local()
        if self.temporal_pool:
            self.make_temporal_pool()

    def make_temporal_shift(self):
        """Make temporal shift for some layers."""
        if self.temporal_pool:
            # Temporal pooling halves the clip length after layer1, e.g.
            # 8 segments -> [8, 4, 4, 4] for the four stages.
            num_segment_list = [
                self.num_segments, self.num_segments // 2,
                self.num_segments // 2, self.num_segments // 2
            ]
        else:
            num_segment_list = [self.num_segments] * 4
        if num_segment_list[-1] <= 0:
            raise ValueError('num_segment_list[-1] must be positive')

        if self.shift_place == 'block':

            def make_block_temporal(stage, num_segments):
                """Make temporal shift on some blocks.

                Args:
                    stage (nn.Module): Model layers to be shifted.
                    num_segments (int): Number of frame segments.

                Returns:
                    nn.Module: The shifted blocks.
                """
                blocks = list(stage.children())
                for i, b in enumerate(blocks):
                    blocks[i] = TemporalShift(
                        b, num_segments=num_segments, shift_div=self.shift_div)
                return nn.Sequential(*blocks)

            self.layer1 = make_block_temporal(self.layer1, num_segment_list[0])
            self.layer2 = make_block_temporal(self.layer2, num_segment_list[1])
            self.layer3 = make_block_temporal(self.layer3, num_segment_list[2])
            self.layer4 = make_block_temporal(self.layer4, num_segment_list[3])

        elif 'blockres' in self.shift_place:
            n_round = 1
            if len(list(self.layer3.children())) >= 23:
                # For deep networks such as ResNet101, shift only every
                # second block to save computation.
                n_round = 2

            def make_block_temporal(stage, num_segments):
                """Make temporal shift on some blocks.

                Args:
                    stage (nn.Module): Model layers to be shifted.
                    num_segments (int): Number of frame segments.

                Returns:
                    nn.Module: The shifted blocks.
                """
                blocks = list(stage.children())
                for i, b in enumerate(blocks):
                    if i % n_round == 0:
                        blocks[i].conv1.conv = TemporalShift(
                            b.conv1.conv,
                            num_segments=num_segments,
                            shift_div=self.shift_div)
                return nn.Sequential(*blocks)

            self.layer1 = make_block_temporal(self.layer1, num_segment_list[0])
            self.layer2 = make_block_temporal(self.layer2, num_segment_list[1])
            self.layer3 = make_block_temporal(self.layer3, num_segment_list[2])
            self.layer4 = make_block_temporal(self.layer4, num_segment_list[3])

        else:
            raise NotImplementedError

    def make_temporal_pool(self):
        """Make temporal pooling between layer1 and layer2, using a 3D max
        pooling layer."""

        class TemporalPool(nn.Module):
            """Temporal pool module.

            Wrap layer2 in ResNet50 with a 3D max pooling layer.

            Args:
                net (nn.Module): Module to make temporal pool.
                num_segments (int): Number of frame segments.
            """

            def __init__(self, net, num_segments):
                super().__init__()
                self.net = net
                self.num_segments = num_segments
                # Stride 2 along the temporal axis halves the segment number.
                self.max_pool3d = nn.MaxPool3d(
                    kernel_size=(3, 1, 1), stride=(2, 1, 1), padding=(1, 0, 0))

            def forward(self, x):
                """Defines the computation performed at every call."""
                # [N, C, H, W] -> [N // num_segments, C, num_segments, H, W]
                n, c, h, w = x.size()
                x = x.view(n // self.num_segments, self.num_segments, c, h,
                           w).transpose(1, 2)
                x = self.max_pool3d(x)
                # Fold the halved temporal dimension back into the batch:
                # [N // 2, C, H, W].
                x = x.transpose(1, 2).contiguous().view(n // 2, c, h, w)
                return self.net(x)

        self.layer2 = TemporalPool(self.layer2, self.num_segments)
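
    # For example (assuming ``num_segments=8``): a batch of N clips reaches
    # layer2 as ``8 * N`` frames; after the stride-2 max pooling above, only
    # ``4 * N`` frames remain, which is why the wrapper reshapes to ``n // 2``.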
    def make_non_local(self):
        """Wrap resnet layer into non local wrapper."""
        for i in range(self.num_stages):
            non_local_stage = self.non_local_stages[i]
            if sum(non_local_stage) == 0:
                continue
            layer_name = f'layer{i + 1}'
            res_layer = getattr(self, layer_name)

            for idx, non_local in enumerate(non_local_stage):
                if non_local:
                    res_layer[idx] = NL3DWrapper(res_layer[idx],
                                                 self.num_segments,
                                                 self.non_local_cfg)

    def _get_wrap_prefix(self):
        """Return the name prefixes inserted by the TemporalShift and
        NL3DWrapper wrappers, which must be stripped when matching keys of
        the original 2D checkpoint."""
        return ['.net', '.block']

    def load_original_weights(self, logger):
        """Load weights from the original checkpoint, which requires
        converting keys."""
        state_dict_torchvision = _load_checkpoint(
            self.pretrained, map_location='cpu')
        if 'state_dict' in state_dict_torchvision:
            state_dict_torchvision = state_dict_torchvision['state_dict']

        # Map original layer names to their names after wrapping, so that
        # converted keys can be redirected to the wrapped modules.
        wrapped_layers_map = dict()
        for name, module in self.named_modules():
            # Strip wrapper prefixes to recover the original layer name.
            ori_name = name
            for wrap_prefix in self._get_wrap_prefix():
                if wrap_prefix in ori_name:
                    ori_name = ori_name.replace(wrap_prefix, '')
                    wrapped_layers_map[ori_name] = name

            # Convert torchvision-style keys to ConvModule-style keys.
            if isinstance(module, ConvModule):
                if 'downsample' in ori_name:
                    # layer{X}.{Y}.downsample.conv -> layer{X}.{Y}.downsample.0
                    tv_conv_name = ori_name + '.0'
                    # layer{X}.{Y}.downsample.bn -> layer{X}.{Y}.downsample.1
                    tv_bn_name = ori_name + '.1'
                else:
                    # layer{X}.{Y}.conv{n}.conv -> layer{X}.{Y}.conv{n}
                    tv_conv_name = ori_name
                    # layer{X}.{Y}.conv{n}.bn -> layer{X}.{Y}.bn{n}
                    tv_bn_name = ori_name.replace('conv', 'bn')

                for conv_param in ['.weight', '.bias']:
                    if tv_conv_name + conv_param in state_dict_torchvision:
                        state_dict_torchvision[ori_name+'.conv'+conv_param] = \
                            state_dict_torchvision.pop(tv_conv_name+conv_param)

                for bn_param in [
                        '.weight', '.bias', '.running_mean', '.running_var'
                ]:
                    if tv_bn_name + bn_param in state_dict_torchvision:
                        state_dict_torchvision[ori_name+'.bn'+bn_param] = \
                            state_dict_torchvision.pop(tv_bn_name+bn_param)

        # Redirect converted keys to the wrapped module names.
        for param_name in list(state_dict_torchvision.keys()):
            layer_name = '.'.join(param_name.split('.')[:-1])
            if layer_name in wrapped_layers_map:
                wrapped_name = param_name.replace(
                    layer_name, wrapped_layers_map[layer_name])
                state_dict_torchvision[
                    wrapped_name] = state_dict_torchvision.pop(param_name)

        msg = self.load_state_dict(state_dict_torchvision, strict=False)
        logger.info(msg)
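
    # An illustrative trace of the conversion above (assuming the default
    # 'blockres' shift, where each block's first conv is wrapped in
    # ``TemporalShift``): the torchvision key ``layer1.0.conv1.weight`` is
    # first renamed to ``layer1.0.conv1.conv.weight`` to match ConvModule,
    # then redirected to ``layer1.0.conv1.conv.net.weight``, since the
    # wrapped conv now lives under ``TemporalShift.net``.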
    def init_weights(self):
        """Initialize the parameters either from an existing checkpoint or
        from scratch."""
        if self.pretrained2d:
            logger = MMLogger.get_current_instance()
            self.load_original_weights(logger)
        else:
            if self.pretrained:
                self.init_cfg = dict(
                    type='Pretrained', checkpoint=self.pretrained)
            super().init_weights()
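

# A minimal usage sketch (illustrative only, not part of this module): build
# the backbone directly and run a clip of 8 frame segments through it;
# ``pretrained2d=False`` skips the 2D checkpoint key conversion.
#
#     >>> import torch
#     >>> from mmaction.models import ResNetTSM
#     >>> backbone = ResNetTSM(depth=50, num_segments=8, pretrained2d=False)
#     >>> backbone.init_weights()
#     >>> clip = torch.randn(8, 3, 224, 224)  # [N * num_segments, C, H, W]
#     >>> backbone(clip).shape
#     torch.Size([8, 2048, 7, 7])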