Upload open source code of MTFL model

28e129b verified 4 months ago

12 kB

	""" Reference source: https://github.com/tianyu0207/RTFM"""

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torch.nn.init as torch_init
	# Use torch.FloatTensor as the default type for tensors created without an explicit type.
	# NOTE(review): torch.set_default_tensor_type appears to be deprecated in recent PyTorch
	# releases in favour of torch.set_default_dtype -- confirm against the pinned version.
	torch.set_default_tensor_type('torch.FloatTensor')


	def weight_init(m):
	    """Initialise a single layer in place; meant to be passed to ``nn.Module.apply``.

	    Layers whose class name contains ``Conv`` or ``Linear`` get Xavier-uniform
	    weights and, when a bias exists, a zero bias. Every other layer is left
	    untouched.

	    Args:
	        m (nn.Module): the layer to initialise.
	    """
	    layer_kind = m.__class__.__name__
	    wants_init = 'Conv' in layer_kind or 'Linear' in layer_kind
	    if not wants_init:
	        return

	    torch_init.xavier_uniform_(m.weight)

	    # Layers built with bias=False expose ``bias`` as None.
	    layer_bias = m.bias
	    if layer_bias is None:
	        return
	    layer_bias.data.fill_(0)


	class CVA(nn.Module):
	    """Cross-View Attention (CVA) module.

	    One feature stream queries another through multi-head attention, and the
	    layer-normalised query is added back as a residual.
	    """

	    def __init__(self, input_dim=1024):
	        """
	        Args:
	            input_dim (int): Dimension of the input features. It is used as the
	                attention embedding size, so it must be divisible by the 4 heads.
	        """
	        super(CVA, self).__init__()
	        drop_out_rate = 0.1
	        num_heads = 4
	        # Create the attention weights on CUDA when it is available (as before),
	        # but fall back to the default device so the module can also be built
	        # on CPU-only hosts instead of raising at construction time.
	        attention_device = 'cuda' if torch.cuda.is_available() else None
	        self.cross_attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads,
	                                                     dropout=drop_out_rate, device=attention_device)

	    def forward(self, feature1, feature2):
	        """
	        Args:
	            feature1 (torch.Tensor): one path features (used as query). Shape: B x T x C.
	            feature2 (torch.Tensor): another path features (used as key and value). Shape: B x T x C.

	        Returns:
	            out1 (torch.Tensor): Processed features after cross-attention.
	                Shape: T x B x C -- the sequence-first permutation applied below
	                is not undone before returning.
	        """
	        # Normalise over the channel dimension before attending.
	        feature1 = F.layer_norm(feature1, [feature1.size(-1)])
	        feature2 = F.layer_norm(feature2, [feature2.size(-1)])
	        # nn.MultiheadAttention is used in its default sequence-first layout.
	        feature1 = feature1.permute(1, 0, 2)  # T B C
	        feature2 = feature2.permute(1, 0, 2)  # T B C

	        out1, _ = self.cross_attention(query=feature1, key=feature2, value=feature2)  # T B C
	        out1 = out1 + feature1  # residual connection on the normalised query

	        return out1  # T B C


	class Aggregate(nn.Module):
	    """Aggregate network of MTFF.

	    Combines local temporal correlation learning (three dilated 1-D
	    convolutions, one per input stream), global temporal correlation learning
	    (1x1 convolution followed by self-attention over the concatenated streams)
	    and a final fusion convolution with a residual connection.
	    """

	    def __init__(self, input_dim):
	        """
	        Args:
	            input_dim (int): input features dim.
	        """
	        super(Aggregate, self).__init__()
	        num_heads = 4
	        hidden_dim = 512  # channel width of each of the four branches fused by conv_5
	        self.input_dim = input_dim

	        # Local branches: same kernel size, growing dilation (padding keeps T unchanged).
	        self.conv_1 = self._dilated_branch(input_dim, hidden_dim, dilation=1)
	        self.conv_2 = self._dilated_branch(input_dim, hidden_dim, dilation=2)
	        self.conv_3 = self._dilated_branch(input_dim, hidden_dim, dilation=4)

	        # Global branch: squeeze the three concatenated streams before self-attention.
	        self.conv_4 = nn.Sequential(
	            nn.Conv1d(in_channels=input_dim * 3, out_channels=hidden_dim, kernel_size=1,
	                      stride=1, padding=0, bias=False),
	            nn.LeakyReLU(negative_slope=5e-2),
	        )
	        # Fusion: 3 local branches + 1 global branch = 4 * hidden_dim input channels.
	        self.conv_5 = nn.Sequential(
	            nn.Conv1d(in_channels=hidden_dim * 4, out_channels=input_dim, kernel_size=3,
	                      stride=1, padding=1, bias=False),
	            nn.LeakyReLU(negative_slope=5e-2),
	            nn.BatchNorm1d(input_dim),
	        )
	        # Create the attention weights on CUDA when it is available (as before),
	        # but fall back to the default device so the module can also be built
	        # on CPU-only hosts instead of raising at construction time.
	        attention_device = 'cuda' if torch.cuda.is_available() else None
	        self.self_attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads,
	                                                    dropout=0.1, device=attention_device)

	    @staticmethod
	    def _dilated_branch(input_dim, hidden_dim, dilation):
	        """Build one Conv1d -> LeakyReLU -> BatchNorm1d branch with the given dilation."""
	        return nn.Sequential(
	            nn.Conv1d(in_channels=input_dim, out_channels=hidden_dim, kernel_size=3,
	                      stride=1, dilation=dilation, padding=dilation),
	            nn.LeakyReLU(negative_slope=5e-2),
	            nn.BatchNorm1d(hidden_dim),
	        )

	    def forward(self, input1, input2, input3):
	        """
	        Args:
	            input1 (torch.Tensor): long-frame-length features. Shape: T x B x C.
	            input2 (torch.Tensor): medium-frame-length features. Shape: T x B x C.
	            input3 (torch.Tensor): short-frame-length features. Shape: T x B x C.

	        Returns:
	            torch.Tensor: Processed and fused output features. Shape: B x T x C.
	        """
	        # Conv1d expects channels before time.
	        x1 = input1.permute(1, 2, 0)  # B C T
	        x2 = input2.permute(1, 2, 0)
	        x3 = input3.permute(1, 2, 0)
	        tensor_list = [x1, x2, x3]

	        # The residual is the element-wise mean of the three input streams.
	        residual = torch.mean(torch.stack(tensor_list), dim=0)

	        # Local temporal correlations, one dilation per stream.
	        out1 = self.conv_1(x1)  # B 512 T
	        out2 = self.conv_2(x2)
	        out3 = self.conv_3(x3)
	        x = torch.cat([out1, out2, out3], dim=1)  # B 1536 T

	        # Global temporal correlations via self-attention over all streams.
	        feature = torch.cat((x1, x2, x3), dim=1)  # B 3C T
	        out = self.conv_4(feature)  # B 512 T
	        out = out.permute(2, 0, 1)  # T B 512
	        out = F.layer_norm(out, normalized_shape=[out.size(-1)])
	        out, _ = self.self_attention(out, out, out)  # T B 512
	        out = out.permute(1, 2, 0)  # B 512 T

	        out = torch.cat((x, out), dim=1)  # B 2048 T
	        out = self.conv_5(out)  # fuse all the features together -> B C T
	        out = out + residual
	        out = out.permute(0, 2, 1)  # B T C

	        return out


	class Encoder(nn.Module):
	    """Multi-Temporal Feature Fusion (MTFF) module.

	    Each of the three feature streams attends to one of the others through its
	    own cross-view attention block; the three attended streams are then merged
	    by the aggregate network.
	    """

	    def __init__(self, input_dim=1024, seg_num=32):
	        """
	        Args:
	            input_dim (int): Dimension of the input features.
	            seg_num (int): Number of snippets in a video.
	        """
	        super().__init__()
	        self.drop_out_rate = 0.1
	        self.input_dim = input_dim
	        self.min_temporal_dim = seg_num

	        # One cross-view attention block per (query stream, context stream) pair.
	        self.CVA1 = CVA(input_dim=input_dim)
	        self.CVA2 = CVA(input_dim=input_dim)
	        self.CVA3 = CVA(input_dim=input_dim)

	        self.aggregate = Aggregate(input_dim=input_dim)

	    def forward(self, feature1, feature2, feature3):
	        """
	        Args:
	            feature1 (torch.Tensor): long-frame-length features. Shape: B x T x C.
	                (Batch size x The number of snippets x Input dimensions)
	            feature2 (torch.Tensor): medium-frame-length features. Shape: B x T x C.
	            feature3 (torch.Tensor): short-frame-length features. Shape: B x T x C.

	        Returns:
	            torch.Tensor: Fused and processed output features. Shape: B x T x C.
	        """
	        attention_blocks = (self.CVA1, self.CVA2, self.CVA3)
	        query_streams = (feature1, feature2, feature3)
	        # Cyclic pairing: long attends to medium, medium to short, short to long.
	        context_streams = (feature2, feature3, feature1)

	        attended = [
	            block(query, context)
	            for block, query, context in zip(attention_blocks, query_streams, context_streams)
	        ]

	        return self.aggregate(*attended)  # B T C


	class Model(nn.Module):
	    """Multi-Temporal Feature Learning (MTFL) recognition model.

	    Fuses three feature streams with the MTFF encoder, scores every snippet
	    with a three-layer classifier and, for the normal and abnormal halves of
	    the batch, selects the snippets with the largest feature magnitude.
	    """

	    def __init__(self, feature_dim, batch_size, seg_num=32, num_classes=18):
	        """
	        Args:
	            feature_dim (int): Dimension of the input features.
	            batch_size (int): Batch size, i.e. the number of normal videos that
	                lead each batch; the remaining videos are treated as abnormal.
	            seg_num (int): Number of snippets in a video.
	            num_classes (int): Number of output classes of the classifier head.
	        """
	        super(Model, self).__init__()
	        self.batch_size = batch_size
	        self.num_segments = seg_num
	        self.num_classes = num_classes
	        self.k_abn = self.num_segments // 10  # 3 snippets for the default seg_num=32
	        self.k_nor = self.num_segments // 10

	        self.Encoder = Encoder(input_dim=feature_dim, seg_num=seg_num)

	        # Fully connected layers for classification
	        self.fc1 = nn.Linear(feature_dim, 512)
	        self.fc2 = nn.Linear(512, 128)
	        self.fc3 = nn.Linear(128, num_classes)

	        self.drop_out = nn.Dropout(0.2)
	        self.relu = nn.LeakyReLU(negative_slope=5e-2)
	        self.sigmoid = nn.Sigmoid()
	        self.apply(weight_init)

	    def _select_top_k(self, magnitudes, features, scores, k, ncrops):
	        """Select the k snippets with the largest feature magnitude in every video.

	        Args:
	            magnitudes (torch.Tensor): per-snippet feature magnitudes. Shape: B x T.
	            features (torch.Tensor): snippet features. Shape: (B * ncrops) x T x F.
	            scores (torch.Tensor): snippet scores. Shape: B x T x num_classes.
	            k (int): number of snippets to keep per video.
	            ncrops (int): number of spatial crops per video.

	        Returns:
	            tuple: (mean score of the selected snippets, B x num_classes;
	                    selected features, (ncrops * B) x k x F).
	        """
	        # Dropout on a tensor of ones randomly hides snippets from the ranking
	        # while training; in eval mode the mask is all ones.
	        keep_mask = self.drop_out(torch.ones_like(magnitudes))
	        top_idx = torch.topk(magnitudes * keep_mask, k, dim=1)[1]  # B x k

	        n_snippets, feat_dim = features.shape[1], features.shape[2]
	        idx_feat = top_idx.unsqueeze(2).expand([-1, -1, feat_dim])
	        per_crop = features.view(-1, ncrops, n_snippets, feat_dim).permute(1, 0, 2, 3)  # N x B x T x F
	        selected_features = torch.cat([torch.gather(crop, 1, idx_feat) for crop in per_crop])

	        # Scores are averaged over the same snippets that won the magnitude ranking.
	        idx_score = top_idx.unsqueeze(2).expand([-1, -1, scores.shape[2]])
	        mean_score = torch.mean(torch.gather(scores, 1, idx_score), dim=1)
	        return mean_score, selected_features

	    def forward(self, input1, input2, input3):
	        """
	        Args:
	            input1 (torch.Tensor): long-frame-length features. Shape: B x T x feature_dim.
	            input2 (torch.Tensor): medium-frame-length features. Shape: B x T x feature_dim.
	            input3 (torch.Tensor): short-frame-length features. Shape: B x T x feature_dim.

	        Returns:
	            score_abnormal (torch.Tensor): The mean scores for top-k abnormal instances.
	            score_normal (torch.Tensor): The mean scores for top-k normal instances.
	            feat_select_abn (torch.Tensor): Selected abnormal features.
	            feat_select_normal (torch.Tensor): Selected normal features.
	            scores (torch.Tensor): All computed scores. Shape: B x T x num_classes.
	        """
	        k_abn = self.k_abn
	        k_nor = self.k_nor
	        ncrops = 1  # Reserving the parameter for spatial cropping, which is not used and defaults to 1

	        # Multi-Temporal Feature Fusion
	        out = self.Encoder(input1, input2, input3)
	        bs, t, f = out.size()
	        features = self.drop_out(out)  # B T D

	        # classification layers
	        scores = self.relu(self.fc1(features))
	        scores = self.drop_out(scores)
	        scores = self.relu(self.fc2(scores))
	        scores = self.drop_out(scores)
	        scores = self.sigmoid(self.fc3(scores))
	        scores = scores.view(bs, t, -1)  # B T num_classes

	        # Compute feature magnitudes
	        feat_magnitudes = torch.norm(features, p=2, dim=2)
	        feat_magnitudes = feat_magnitudes.view(bs, ncrops, -1).mean(1)

	        # The first `batch_size` videos are the normal half, the rest the abnormal half.
	        normal_features = features[0:self.batch_size]
	        normal_scores = scores[0:self.batch_size]
	        nfea_magnitudes = feat_magnitudes[0:self.batch_size]

	        abnormal_features = features[self.batch_size:]
	        abnormal_scores = scores[self.batch_size:]
	        afea_magnitudes = feat_magnitudes[self.batch_size:]

	        # Inference mode: there is no separate abnormal half, so reuse the normal one.
	        # The `== 1` test is the original single-video check; the empty-half test
	        # additionally covers evaluation batches of 2..batch_size videos, which
	        # previously failed with a shape mismatch.
	        # NOTE(review): with batch_size == 1 a two-video training batch also takes
	        # this branch, so its abnormal video is ignored -- confirm this is intended.
	        if nfea_magnitudes.shape[0] == 1 or afea_magnitudes.shape[0] == 0:
	            afea_magnitudes = nfea_magnitudes
	            abnormal_scores = normal_scores
	            abnormal_features = normal_features

	        # Abnormal half first, then normal half: keeps the dropout call order stable.
	        score_abnormal, feat_select_abn = self._select_top_k(
	            afea_magnitudes, abnormal_features, abnormal_scores, k_abn, ncrops)
	        # Top-magnitude snippets of normal videos act as hard negatives.
	        score_normal, feat_select_normal = self._select_top_k(
	            nfea_magnitudes, normal_features, normal_scores, k_nor, ncrops)

	        return score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores