Upload open source code of MTFL model
MTFL: Multi-Timescale Feature Learning for Weakly-supervised Anomaly Detection in Surveillance Videos
https://arxiv.org/abs/2410.05900
- .gitattributes +1 -0
- .gitignore +1 -0
- README.md +238 -3
- detection/dataset.py +117 -0
- detection/model.py +296 -0
- detection/option.py +56 -0
- detection/test.py +168 -0
- detection/train.py +188 -0
- figures/Intro.png +3 -0
- recognition/dataset.py +140 -0
- recognition/model.py +295 -0
- recognition/option.py +56 -0
- recognition/test.py +120 -0
- recognition/train.py +171 -0
- requirements.txt +10 -0
- utils/feature_extractor.py +284 -0
- utils/functional_video.py +102 -0
- utils/swin_config/_base_/default_runtime.py +13 -0
- utils/swin_config/_base_/models/audioonly_r50.py +18 -0
- utils/swin_config/_base_/models/bmn_400x100.py +12 -0
- utils/swin_config/_base_/models/bsn_pem.py +13 -0
- utils/swin_config/_base_/models/bsn_tem.py +8 -0
- utils/swin_config/_base_/models/c3d_sports1m_pretrained.py +23 -0
- utils/swin_config/_base_/models/csn_ig65m_pretrained.py +23 -0
- utils/swin_config/_base_/models/i3d_r50.py +27 -0
- utils/swin_config/_base_/models/r2plus1d_r34.py +28 -0
- utils/swin_config/_base_/models/slowfast_r50.py +39 -0
- utils/swin_config/_base_/models/slowonly_r50.py +22 -0
- utils/swin_config/_base_/models/swin/swin_base.py +6 -0
- utils/swin_config/_base_/models/swin/swin_large.py +6 -0
- utils/swin_config/_base_/models/swin/swin_small.py +3 -0
- utils/swin_config/_base_/models/swin/swin_tiny.py +24 -0
- utils/swin_config/_base_/models/swin/swin_tiny_backup.py +24 -0
- utils/swin_config/_base_/models/tanet_r50.py +20 -0
- utils/swin_config/_base_/models/tin_r50.py +21 -0
- utils/swin_config/_base_/models/tpn_slowonly_r50.py +40 -0
- utils/swin_config/_base_/models/tpn_tsm_r50.py +36 -0
- utils/swin_config/_base_/models/trn_r50.py +22 -0
- utils/swin_config/_base_/models/tsm_mobilenet_v2.py +22 -0
- utils/swin_config/_base_/models/tsm_r50.py +21 -0
- utils/swin_config/_base_/models/tsn_r50.py +19 -0
- utils/swin_config/_base_/models/tsn_r50_audio.py +13 -0
- utils/swin_config/_base_/models/x3d.py +14 -0
- utils/swin_config/_base_/schedules/adam_20e.py +7 -0
- utils/swin_config/_base_/schedules/sgd_100e.py +10 -0
- utils/swin_config/_base_/schedules/sgd_150e_warmup.py +13 -0
- utils/swin_config/_base_/schedules/sgd_50e.py +10 -0
- utils/swin_config/_base_/schedules/sgd_tsm_100e.py +12 -0
- utils/swin_config/_base_/schedules/sgd_tsm_50e.py +12 -0
- utils/swin_config/_base_/schedules/sgd_tsm_mobilenet_v2_100e.py +12 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+figures/Intro.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
+/test_videos/
README.md
CHANGED
@@ -1,3 +1,238 @@
# MTFL

This repo is the official PyTorch implementation of our paper:

> [**MTFL: Multi-Timescale Feature Learning for Weakly-supervised Anomaly Detection in Surveillance Videos**](https://arxiv.org/abs/2410.05900)
>
<!--Author list-->

## Introduction


The detection of anomalous events is relevant for public safety and requires a combination of fine-grained motion information and long-time action recognition. We therefore propose a Multi-Timescale Feature Learning (MTFL) method to enhance the representation of anomaly features. We employ short, medium, and long temporal tubelets to extract spatio-temporal video features using a Video Swin Transformer. Experimental results demonstrate that MTFL outperforms state-of-the-art methods on the UCF-Crime dataset, achieving an anomaly detection performance of 89.78% AUC. Moreover, it achieves 95.32% AUC on ShanghaiTech and 84.57% AP on XD-Violence, complementary to several SotA results. Building upon MTFL, we also propose an anomaly recognition network that employs partial features for classification, achieving a leading accuracy on UCF-Crime and outperforming the existing recognition literature. Furthermore, we introduce an extended dataset for UCF-Crime, namely the Video Anomaly Detection Dataset (VADD), comprising 2,591 videos in 18 classes with extensive coverage of realistic anomalies.

## Models and Dataset
### [Video Anomaly Detection Dataset (VADD)](https://form.jotform.com/240714220958354)

VADD includes 2,591 videos with a frame rate of 30 fps and a resolution of 320×240 pixels, split into 2,202 train and 389 test videos. The subfolders in VADD are named according to video categories, totaling 18 subfolders. Train-set annotations only include a class label, while test-set annotations contain a video class label, the number of frames in the video, and the starting and ending frame positions of the abnormal events in the video.
```
# Training annotation
[Subfolder/video name] [video label]
# Test annotation
[Subfolder/video name] [video label] [total frames] [start_frame1] [end_frame1] [start_frame2]...
```
* Taking a training video containing littering as an example, it is annotated as below:
```
Littering/CarSafe015.mp4 Littering
```
* Taking a test video containing dangerous throwing behavior as an example, its annotations indicate that the video has a total of 636 frames and contains two instances of dangerous throwing behavior: the first occurs between frames 145 and 186, and the second between frames 289 and 340.
```
DangerousThrowing/BicyclistDangerous039.mp4 DangerousThrowing 636 145 186 289 340
```
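As an illustration (not part of the repo), a minimal Python sketch that splits such a test-annotation line into its fields:
```python
# Illustrative only: parse one VADD test-annotation line.
line = "DangerousThrowing/BicyclistDangerous039.mp4 DangerousThrowing 636 145 186 289 340"
items = line.split()
video, label, num_frames = items[0], items[1], int(items[2])
# The remaining numbers pair up as (start_frame, end_frame) couples.
couples = list(zip(map(int, items[3::2]), map(int, items[4::2])))
print(video, label, num_frames, couples)
# -> DangerousThrowing/BicyclistDangerous039.mp4 DangerousThrowing 636 [(145, 186), (289, 340)]
```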

Additionally, to train and test MTFL on benchmark datasets, we converted the annotation files of ShanghaiTech, XD-Violence, and UCF-Crime to match the format of the VADD annotation files.

All train and test annotation files for AnomalyDetection and AnomalyRecognition are provided in the ["Annotation" folder](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/annotation?csf=1&web=1&e=UYxR0H).

### [MTFL checkpoints for anomaly detection](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/MTFL-checkpoints?csf=1&web=1&e=hJhPgh)

| Detection Checkpoint | Feature | UCF | Shanghai<br/>Tech | XD-Violence | VADD |
|------------------|---------------|-------|--------------|-------------|---|
| MTFL_VST_Kinetics400 | VST-RGB | 87.61 | 95.32 | 84.57 | - |
| MTFL_VST_VADD | VST<sub>Aug</sub>_RGB | 89.79 | 95.70 | 79.40 | 88.42 |

There are several MTFL checkpoints for anomaly detection, using different feature extractors and datasets, where:
* xxx_VST_Kinetics400 = features extracted using VST pretrained on Kinetics400,
* xxx_VST_VADD = features extracted using VST pretrained on VADD with data augmentation,
* MTFL-yyy-VST-Kinetics400 = MTFL models trained with VST_RGB features,
* MTFL-yyy-VST-VADD = MTFL models trained with VST<sub>Aug</sub>_RGB features,

with xxx = Shanghai, VADD, or XD, and yyy = SH, VADD-UCF, or XD.

The two feature extractors used in our detection models and the resulting features of the benchmark datasets are provided below:
* [Video Swin Transformer pretrained on Kinetics-400](https://tuenl-my.sharepoint.com/:u:/r/personal/e_akdag_tue_nl/Documents/MTFL/feature-extractors/AnomalyDetection/swin_base_patch244_window877_kinetics400_22k.pth?csf=1&web=1&e=8spheA)
* [Video Swin Transformer pretrained on VADD](https://tuenl-my.sharepoint.com/:u:/r/personal/e_akdag_tue_nl/Documents/MTFL/feature-extractors/AnomalyDetection/VST_swin_base_patch244_window877_VADD.pth?csf=1&web=1&e=AzfewH)
* [VST_RGB features of UCF-Crime, XD-Violence, ShanghaiTech, and VADD](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/features/AnomalyDetection?csf=1&web=1&e=CT8WZ3)
* [VST<sub>Aug</sub>_RGB features of UCF-Crime, XD-Violence, ShanghaiTech, and VADD](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/features/AnomalyDetection?csf=1&web=1&e=CT8WZ3)

The Video Swin Transformer model pretrained on Kinetics400 and the training method for the Video Swin Transformer are derived from the [Video-Swin-Transformer repository](https://github.com/SwinTransformer/Video-Swin-Transformer).


### [MTFL checkpoints for anomaly recognition](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/MTFL-checkpoints/AnomalyRecognition?csf=1&web=1&e=NOkpNn)

| Recognition Checkpoint | UCF Acc(%) | VADD Acc(%) |
|-------------------------------|------------|------------|
| MTFL_VADDsplit1_best_UCF | 39.88 | - |
| MTFL_VADDsplit1_best_VADD | - | 45.87 |
| MTFL_VADDsplit2_best_UCF | 47.02 | - |
| MTFL_VADDsplit2_best_VADD | - | 49.31 |
| MTFL_VADDsplit3_best_UCF | 49.40 | - |
| MTFL_VADDsplit3_best_VADD | - | 53.88 |
| MTFL_VADDsplit4_best_UCF_VADD | 45.83 | 52.29 |
| 4-fold average | 45.53 | 50.34 |

Following the experimental setup of 4-fold cross-validation from [Sultani et al.](https://arxiv.org/abs/1801.04264), there are seven recognition checkpoints, obtained by separately saving the checkpoints that performed best on UCF and on VADD while training on the different VADD splits, as shown in the table above. For example, MTFL_VADDsplit1_best_UCF is the MTFL recognition model trained on VADD split 1 with the best recognition performance on the UCF-Crime split 1 test-set. All models use a VST trained on the corresponding VADD split for feature extraction.

* [The feature extractors used for anomaly recognition](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/feature-extractors/AnomalyRecognition?csf=1&web=1&e=ToseKM)
* [The generated features for anomaly recognition](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/features/AnomalyRecognition?csf=1&web=1&e=4nbEUm)


## Environment setup
```
pip install -r requirements.txt
```
## Folder Structure
```flow
demo/
│
├── detection/                # MTFL detection
│   └── ...
├── recognition/              # MTFL recognition
│   └── ...
├── utils/
│   ├── swin_config/          # VST config for loading the feature extractor
│   │   └── ...
│   ├── feature_extractor.py
│   ├── ...
│   └── video_preprocessing/  # scripts for annotation and unifying video format
│       └── ...
├── test_videos/              # put your test videos here
├── Annotation/               # put your annotations here
├── features/                 # feature path
│   ├── L8
│   ├── L32
│   └── L64
├── results/
│   ├── AUC                   # detection AUC
│   ├── scores                # detection scores
│   └── rec_results           # recognition labels
└── README.md
```


## Feature Extraction
Both the recognition and detection models require multi-timescale features, extracted with tubelets of 8, 32, and 64 frames. To extract features, place your videos in the 'test_videos' directory and then run the following command:
```
python utils/feature_extractor.py --clip_length [8/32/64]
```
In the default settings, test videos should be stored in the 'test_videos' directory, and the extracted features are organized within the 'features' folder following the same directory structure as 'test_videos'. For example, the features of video 'test_videos/A/B.mp4' extracted with a tubelet length of 8 are saved as 'features/L8/A/B.txt'.

You can modify the parameters inside the "VST Feature Extractor Parser" as needed. For example, you can change the input video path, the save path of the features, and the pretrained feature extractor by specifying the model path:
```
python utils/feature_extractor.py --clip_length [8/32/64] --dataset_path [your video path] --save_dir [your feature path] --pretrained_3d [model path]
```
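To extract all three timescales in one go, a small sketch (illustrative only; it assumes the default paths, and the flag name follows the commands above):
```python
import subprocess

# Run the extractor once per tubelet length; output lands in features/L8, L32, L64.
for clip_length in (8, 32, 64):
    subprocess.run(["python", "utils/feature_extractor.py",
                    "--clip_length", str(clip_length)], check=True)
```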
Note: if you use the VST pretrained on Kinetics400, you need to change <num_classes> to 400 in line 21 of 'utils/swin_config/_base_/models/swin/swin_tiny.py' to match the model head. For the VST pretrained on VADD, <num_classes> is 18. These settings follow the guidelines provided by the [Video-Swin-Transformer repository](https://github.com/SwinTransformer/Video-Swin-Transformer).



## Anomaly Detection
### Inference
To test a detection checkpoint on your test videos, run:
```
python detection/test.py --test_anno [your_anno_file.txt] --detection_model [checkpoint path]
```

In the default settings:

* Test videos should be stored in the 'test_videos' directory.
* The corresponding annotation file needs to be placed in the 'annotation' folder. The annotation format can be found under Video Preprocessing -> Annotation.
* The multi-timescale features of the videos should be stored in the 'features' directory; see Feature Extraction.

The detection AUC and the scores for each video are generated within the 'results' folder. The directory structure of the generated results, for both 'results/AUC' and 'results/scores', mirrors the structure of the corresponding test videos in the 'test_videos' directory. For example, the scores of video 'test_videos/A/B.mp4' are saved as 'results/scores/A/B.png'.

If you want to change the paths to the input and output data or any running configs, feel free to change the args in 'detection/option.py'.

### Train
To train a detection model, run:
```
python detection/train.py --train_anno [your_train_anno_file.txt] --test_anno [your_test_anno_file.txt]
--lf_dir [path to long frame length features] --mf_dir [path to medium frame length features] --sf_dir
[path to short frame length features] --save_models [path for saving checkpoints] --output_dir [path for saving checkpoint AUC]
```

Other training parameters can be found in 'detection/option.py'.

## Anomaly Recognition
### Inference
To test a recognition checkpoint on your test videos, run:
```
python recognition/test.py --test_anno [your_anno_file.txt] --recognition_model [checkpoint path]
```

The default settings are the same as for detection, and the modifiable parameters are in 'recognition/option.py'. The recognition results for all inputs are saved to 'results/rec_results/output_pred.txt'.

### Train
To train a recognition model, run:
```
python recognition/train.py --train_anno [your_train_anno_file.txt] --test_anno [your_test_anno_file.txt]
--lf_dir [path to long frame length features] --mf_dir [path to medium frame length features] --sf_dir
[path to short frame length features] --save_models [path for saving checkpoints] --output_dir [path for saving checkpoint AUC]
```

Note: following the experimental setup of 4-fold cross-validation from [Sultani et al.](https://arxiv.org/abs/1801.04264), there are four pairs of training and testing annotation files corresponding to the four splits of each dataset, provided in the "annotation" folder accessible through the VADD link above. Make sure the training and testing files belong to the same split; otherwise, data leakage will occur. Other training parameters can be found in 'recognition/option.py'.

## Acknowledgement

Partial code is used from
[Video-Swin-Transformer](https://github.com/SwinTransformer/Video-Swin-Transformer)
and [RTFM](https://github.com/tianyu0207/RTFM).
<!--## Citation

If you find this repo useful for your research, please consider citing our paper:-->
detection/dataset.py
ADDED
@@ -0,0 +1,117 @@
import torch.utils.data as data
import os
import torch
torch.set_default_tensor_type('torch.FloatTensor')


def read_features(feature_path):
    """
    Read features from a text file and convert them into a torch tensor.

    Args:
        feature_path (str): Path to the text file containing features.

    Returns:
        features (torch.Tensor): A tensor containing the features. Shape is T x C.
    """
    with open(feature_path, 'r') as file:
        lines = file.readlines()
    features = []
    for line in lines:
        feature = [float(value) for value in line.strip().split()]
        features.append(feature)
    features = torch.tensor(features).float()  # T x C
    return features


class Dataset(data.Dataset):
    def __init__(self, args, is_normal=True, transform=None, test_mode=False):
        """
        Custom dataset class for loading features and labels.

        Args:
            args: Argument object containing paths and options.
            is_normal (bool): Whether the dataset represents normal samples.
            transform: Data transformation to be applied.
            test_mode (bool): Whether the dataset is for testing.

        Attributes:
            is_normal (bool): Whether the dataset represents normal samples.
            transform: Data transformation to be applied.
            test_mode (bool): Whether the dataset is for testing.
            list (list): List of feature paths and label information.
        """
        self.is_normal = is_normal
        self.transform = transform
        self.test_mode = test_mode

        if self.test_mode:
            annotation_path = args.test_anno
        else:
            annotation_path = args.train_anno

        self.list = self._get_features_list(args.lf_dir, args.mf_dir, args.sf_dir, annotation_path)

    def __getitem__(self, index):
        label = self.get_label()
        if self.test_mode:
            lf_path, mf_path, sf_path, num_frames, start_end_couples, file = self.list[index]
            l_features = read_features(lf_path)
            m_features = read_features(mf_path)
            s_features = read_features(sf_path)
            return l_features, m_features, s_features, label, start_end_couples, num_frames, file
        else:
            lf_path, mf_path, sf_path = self.list[index]
            l_features = read_features(lf_path)
            m_features = read_features(mf_path)
            s_features = read_features(sf_path)
            return l_features, m_features, s_features, label

    def get_label(self):
        if self.is_normal:
            label = torch.tensor(0.0)
        else:
            label = torch.tensor(1.0)

        return label

    def __len__(self):
        return len(self.list)

    def _get_features_list(self, lf_dir, mf_dir, sf_dir, annotation_path):
        """
        Generate a list of feature paths and label information from annotations.

        Args:
            lf_dir (str): Path to long-frame-length features directory.
            mf_dir (str): Path to medium-frame-length features directory.
            sf_dir (str): Path to short-frame-length features directory.
            annotation_path (str): Path to annotation file.

        Returns:
            list: A list of tuples containing feature paths and label information.
        """
        assert os.path.exists(lf_dir)
        assert os.path.exists(mf_dir)
        assert os.path.exists(sf_dir)
        features_list = []
        with open(annotation_path) as f:
            lines = f.read().splitlines(keepends=False)
            for line in lines:
                items = line.split()
                # file = items[0].split(".")[0]  # variant used for XD-Violence
                file, ext = os.path.splitext(items[0])
                file = file.replace("/", os.sep)
                lf_path = os.path.join(lf_dir, file + '.txt')
                mf_path = os.path.join(mf_dir, file + '.txt')
                sf_path = os.path.join(sf_dir, file + '.txt')
                cls_name = items[1]
                if self.test_mode:
                    start_end_couples = [int(x) for x in items[3:]]
                    num_frames = int(items[2])
                    features_list.append((lf_path, mf_path, sf_path, num_frames, start_end_couples, file))
                elif ("Normal" == cls_name) == self.is_normal:
                    # For training, keep only normal videos (or only abnormal ones)
                    features_list.append((lf_path, mf_path, sf_path))

        return features_list
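A minimal sketch (illustrative, not part of the repo) of the feature-file format that `read_features` expects: one whitespace-separated C-dimensional vector per line, T lines per video.
```python
import os
import tempfile

from dataset import read_features  # assumes this is run from within detection/

tmp = os.path.join(tempfile.mkdtemp(), 'demo.txt')
with open(tmp, 'w') as f:
    for _ in range(32):                           # T = 32 snippets
        f.write(' '.join(['0.0'] * 1024) + '\n')  # C = 1024 VST dims

print(read_features(tmp).shape)  # torch.Size([32, 1024])
```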
detection/model.py
ADDED
@@ -0,0 +1,296 @@
""" Reference source: https://github.com/tianyu0207/RTFM """

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as torch_init
torch.set_default_tensor_type('torch.FloatTensor')


def weight_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1 or classname.find('Linear') != -1:
        torch_init.xavier_uniform_(m.weight)
        if m.bias is not None:
            m.bias.data.fill_(0)


class CVA(nn.Module):
    def __init__(self, input_dim=1024):
        """
        Cross-View Attention (CVA) module.

        Args:
            input_dim (int): Dimension of the input features.
        """
        super(CVA, self).__init__()
        drop_out_rate = 0.1
        num_heads = 4
        self.cross_attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, dropout=drop_out_rate,
                                                     device='cuda')

    def forward(self, feature1, feature2):
        """
        Args:
            feature1 (torch.Tensor): one path's features. Shape: B x T x C.
            feature2 (torch.Tensor): another path's features. Shape: B x T x C.

        Returns:
            out1 (torch.Tensor): Processed features after cross-attention. Shape: T x B x C.
        """

        feature1 = F.layer_norm(feature1, [feature1.size(-1)])
        feature2 = F.layer_norm(feature2, [feature2.size(-1)])
        feature1 = feature1.permute(1, 0, 2)  # T B C
        feature2 = feature2.permute(1, 0, 2)

        out1, _ = self.cross_attention(query=feature1, key=feature2, value=feature2)  # T B C (for test: 32 1 1024)
        out1 = out1 + feature1  # residual connection

        return out1  # T B C


class Aggregate(nn.Module):
    def __init__(self, input_dim):
        """
        An aggregate network including local temporal correlation learning, global temporal correlation learning,
        and feature fusion in MTFF.

        Args:
            input_dim (int): Input feature dimension.
        """
        super(Aggregate, self).__init__()
        bn = nn.BatchNorm1d
        num_heads = 4
        self.input_dim = input_dim
        self.conv_1 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=1, padding=1),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_2 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=2, padding=2),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_3 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=4, padding=4),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_4 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim*3, out_channels=512, kernel_size=1,
                      stride=1, padding=0, bias=False),
            nn.LeakyReLU(negative_slope=5e-2),
        )
        self.conv_5 = nn.Sequential(
            nn.Conv1d(in_channels=2048, out_channels=input_dim, kernel_size=3,
                      stride=1, padding=1, bias=False),
            nn.LeakyReLU(negative_slope=5e-2),
            nn.BatchNorm1d(input_dim),
        )
        self.self_attention = nn.MultiheadAttention(embed_dim=512, num_heads=num_heads,
                                                    dropout=0.1, device='cuda')

    def forward(self, input1, input2, input3):
        """
        Args:
            input1 (torch.Tensor): long-frame-length features. Shape: T x B x C.
            input2 (torch.Tensor): medium-frame-length features. Shape: T x B x C.
            input3 (torch.Tensor): short-frame-length features. Shape: T x B x C.

        Returns:
            torch.Tensor: Processed and fused output features. Shape: B x T x C.
        """
        x1 = input1.permute(1, 2, 0)  # B C T
        x2 = input2.permute(1, 2, 0)
        x3 = input3.permute(1, 2, 0)
        tensor_list = [x1, x2, x3]

        residual = torch.mean(torch.stack(tensor_list), dim=0)

        # Local temporal correlations via dilated 1D convolutions
        out1 = self.conv_1(x1)  # B C/2 T
        out2 = self.conv_2(x2)
        out3 = self.conv_3(x3)
        x = torch.cat([out1, out2, out3], dim=1)  # B 3C/2 T

        # Global temporal correlations via self-attention
        feature = torch.cat((x1, x2, x3), dim=1)
        out = self.conv_4(feature)
        out = out.permute(2, 0, 1)  # T B C/2
        out = F.layer_norm(out, normalized_shape=[out.size(-1)])
        out, _ = self.self_attention(out, out, out)  # T B C/2
        out = out.permute(1, 2, 0)  # B C/2 T
        out = torch.cat((x, out), dim=1)  # B 2C T
        out = self.conv_5(out)  # fuse all the features together
        out = out + residual
        out = out.permute(0, 2, 1)

        return out


class Encoder(nn.Module):
    def __init__(self, input_dim=1024, seg_num=32):
        """
        Multi-Temporal Feature Fusion (MTFF) module.

        Args:
            input_dim (int): Dimension of the input features.
            seg_num (int): Number of snippets in a video.
        """
        super(Encoder, self).__init__()
        self.drop_out_rate = 0.1
        self.input_dim = input_dim
        self.min_temporal_dim = seg_num
        self.CVA1 = CVA(input_dim=input_dim)
        self.CVA2 = CVA(input_dim=input_dim)
        self.CVA3 = CVA(input_dim=input_dim)

        self.aggregate = Aggregate(input_dim=input_dim)

    def forward(self, feature1, feature2, feature3):
        """
        Args:
            feature1 (torch.Tensor): long-frame-length features. Shape: B x T x C
                                     (batch size x number of snippets x input dimension).
            feature2 (torch.Tensor): medium-frame-length features. Shape: B x T x C.
            feature3 (torch.Tensor): short-frame-length features. Shape: B x T x C.

        Returns:
            torch.Tensor: Fused and processed output features. Shape: B x T x C.
        """

        # Pairwise cross-view attention across the three timescales
        att1 = self.CVA1(feature1, feature2)
        att2 = self.CVA2(feature2, feature3)
        att3 = self.CVA3(feature3, feature1)

        out1 = self.aggregate(att1, att2, att3)  # B T C

        return out1


class Model(nn.Module):
    def __init__(self, feature_dim, batch_size, seg_num=32):
        """
        Multi-Timescale Feature Learning (MTFL) model.

        Args:
            feature_dim (int): Dimension of the input features.
            batch_size (int): Batch size.
            seg_num (int): Number of snippets in a video.
        """
        super(Model, self).__init__()
        self.batch_size = batch_size
        self.num_segments = seg_num
        self.k_abn = self.num_segments // 10  # select 3 snippets
        self.k_nor = self.num_segments // 10

        self.Encoder = Encoder(input_dim=feature_dim, seg_num=seg_num)

        # Fully connected layers for scoring
        self.fc1 = nn.Linear(feature_dim, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 1)

        self.drop_out = nn.Dropout(0.2)
        self.relu = nn.LeakyReLU(negative_slope=5e-2)
        self.sigmoid = nn.Sigmoid()
        self.apply(weight_init)

    def forward(self, input1, input2, input3):
        """
        Args:
            input1 (torch.Tensor): long-frame-length features. Shape: B x T x feature_dim.
            input2 (torch.Tensor): medium-frame-length features. Shape: B x T x feature_dim.
            input3 (torch.Tensor): short-frame-length features. Shape: B x T x feature_dim.

        Returns:
            score_abnormal (torch.Tensor): The mean scores of the top-3 abnormal instances.
            score_normal (torch.Tensor): The mean scores of the top-3 normal instances.
            feat_select_abn (torch.Tensor): Selected abnormal features.
            feat_select_normal (torch.Tensor): Selected normal features.
            scores (torch.Tensor): All computed scores. Shape: B x T x 1.
        """
        k_abn = self.k_abn
        k_nor = self.k_nor
        ncrops = 1  # reserved parameter for spatial cropping, unused here and defaulting to 1

        # Multi-Temporal Feature Fusion
        out = self.Encoder(input1, input2, input3)
        bs, t, f = out.size()
        features = self.drop_out(out)  # B T D

        # Scoring layers
        scores = self.relu(self.fc1(features))
        scores = self.drop_out(scores)
        scores = self.relu(self.fc2(scores))
        scores = self.drop_out(scores)
        scores = self.sigmoid(self.fc3(scores))
        scores = scores.view(bs, ncrops, -1).mean(1)
        scores = scores.unsqueeze(dim=2)

        # Split normal and abnormal instances
        normal_features = features[0:self.batch_size]
        normal_scores = scores[0:self.batch_size]
        abnormal_features = features[self.batch_size:]
        abnormal_scores = scores[self.batch_size:]

        # Compute feature magnitudes
        feat_magnitudes = torch.norm(features, p=2, dim=2)
        feat_magnitudes = feat_magnitudes.view(bs, ncrops, -1).mean(1)
        nfea_magnitudes = feat_magnitudes[0:self.batch_size]  # normal feature magnitudes
        afea_magnitudes = feat_magnitudes[self.batch_size:]  # abnormal feature magnitudes
        n_size = nfea_magnitudes.shape[0]

        # Inference mode for batch size 1
        if nfea_magnitudes.shape[0] == 1:
            afea_magnitudes = nfea_magnitudes
            abnormal_scores = normal_scores
            abnormal_features = normal_features

        select_idx = torch.ones_like(nfea_magnitudes)
        select_idx = self.drop_out(select_idx)

        ####### process abnormal videos -> select top-3 feature magnitudes #######
        afea_magnitudes_drop = afea_magnitudes * select_idx
        idx_abn = torch.topk(afea_magnitudes_drop, k_abn, dim=1)[1]
        idx_abn_feat = idx_abn.unsqueeze(2).expand([-1, -1, abnormal_features.shape[2]])

        abnormal_features = abnormal_features.view(n_size, ncrops, t, f)  # B x N x T x F
        abnormal_features = abnormal_features.permute(1, 0, 2, 3)  # N x B x T x F

        total_select_abn_feature = torch.zeros(0, device=input1.device)
        for abnormal_feature in abnormal_features:
            feat_select_abn = torch.gather(abnormal_feature, 1, idx_abn_feat)  # top-3 feature magnitudes in abnormal bag
            total_select_abn_feature = torch.cat((total_select_abn_feature, feat_select_abn))

        idx_abn_score = idx_abn.unsqueeze(2).expand([-1, -1, abnormal_scores.shape[2]])
        # top-3 scores in the abnormal bag based on the top-3 magnitudes
        score_abnormal = torch.mean(torch.gather(abnormal_scores, 1, idx_abn_score), dim=1)


        ####### process normal videos -> select top-3 feature magnitudes #######

        select_idx_normal = torch.ones_like(nfea_magnitudes)
        select_idx_normal = self.drop_out(select_idx_normal)
        nfea_magnitudes_drop = nfea_magnitudes * select_idx_normal
        idx_normal = torch.topk(nfea_magnitudes_drop, k_nor, dim=1)[1]
        idx_normal_feat = idx_normal.unsqueeze(2).expand([-1, -1, normal_features.shape[2]])

        normal_features = normal_features.view(n_size, ncrops, t, f)
        normal_features = normal_features.permute(1, 0, 2, 3)  # 1 B T D

        total_select_nor_feature = torch.zeros(0, device=input1.device)
        for nor_fea in normal_features:
            feat_select_normal = torch.gather(nor_fea, 1, idx_normal_feat)  # top-3 feature magnitudes in normal bag (hard negatives)
            total_select_nor_feature = torch.cat((total_select_nor_feature, feat_select_normal))

        idx_normal_score = idx_normal.unsqueeze(2).expand([-1, -1, normal_scores.shape[2]])
        score_normal = torch.mean(torch.gather(normal_scores, 1, idx_normal_score), dim=1)  # top-3 scores in normal bag

        feat_select_abn = total_select_abn_feature
        feat_select_normal = total_select_nor_feature

        return score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores
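A minimal shape-check sketch (illustrative only; it assumes a CUDA device, since CVA and Aggregate hardcode device='cuda', and that it is run from within detection/):
```python
import torch
from model import Model

model = Model(feature_dim=1024, batch_size=1, seg_num=32).cuda()
b, t, c = 2, 32, 1024  # e.g. one normal + one abnormal video, 32 snippets, VST features
lf, mf, sf = (torch.randn(b, t, c, device='cuda') for _ in range(3))
score_abn, score_nor, feat_abn, feat_nor, scores = model(lf, mf, sf)
print(scores.shape)  # torch.Size([2, 32, 1]): one score per snippet per video
```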
detection/option.py
ADDED
@@ -0,0 +1,56 @@
import argparse

############ Test args ########################
test_parser = argparse.ArgumentParser(description='MTFL_detection_test')
# input paths
test_parser.add_argument('--lf_dir', type=str, default='features/L64', help='long frame length feature path')
test_parser.add_argument('--mf_dir', type=str, default='features/L32', help='medium frame length feature path')
test_parser.add_argument('--sf_dir', type=str, default='features/L8', help='short frame length feature path')
test_parser.add_argument('--test_anno', default='annotation/Anomaly_videos.txt', help='test annotation file')
test_parser.add_argument('--detection_model', default='/media/DataDrive/yiling/Test/models/MTFL/MTFL-vst-VAD.pkl',
                         help='model path')
# output path
test_parser.add_argument('--output_dir', default='results',
                         help='the path to store the generated scores and AUC results')
# feature size, depending on which feature extractor is used
test_parser.add_argument('--feature_size', type=int, default=1024, help='feature dim (default: VST feature)')
test_parser.add_argument('--seg_num', type=int, default=32, help='the number of snippets')
# running cfg
test_parser.add_argument('--gpu', default="0", type=str, choices=["0", "1"], help='gpu id')
test_parser.add_argument('--workers', default=8, help='number of workers in dataloader')


############ Train args ########################
train_parser = argparse.ArgumentParser(description='MTFL_detection_train')
# input paths
train_parser.add_argument('--lf_dir', type=str, default='/media/DataDrive/yiling/features/VST_VAD_MT/L64R1',
                          help='long feature path')
train_parser.add_argument('--mf_dir', type=str, default='/media/DataDrive/yiling/features/VST_VAD_MT/L32R1',
                          help='medium feature path')
train_parser.add_argument('--sf_dir', type=str, default='/media/DataDrive/yiling/features/VST_VAD_MT/L8R1',
                          help='short feature path')
train_parser.add_argument('--train_anno', default='/media/DataDrive/yiling/annotation/VAD_train_annotation.txt',
                          help='the annotation file for training')
train_parser.add_argument('--test_anno', default='/media/DataDrive/yiling/annotation/UCF_test_annotation_with_frames.txt',
                          help='the annotation file for test')
# output path and saving info
train_parser.add_argument('--model-name', default='MTFL', help='name to save model')
train_parser.add_argument('--save_models', default='/media/DataDrive/yiling/models/demo/detection',
                          help='the path for saving models')
train_parser.add_argument('--output_dir', default='/media/DataDrive/yiling/results/demo/detection',
                          help='the path to store AUC results')
# training cfg and parameters
train_parser.add_argument('--gpu', default="0", type=str, choices=["0", "1"], help='gpu id')
train_parser.add_argument('--feature_size', type=int, default=1024, help='feature dim (default: VST feature)')
train_parser.add_argument('--seg_num', type=int, default=32, help='the number of snippets')
train_parser.add_argument('--lr', type=float, default=1e-4, help='learning rate')
train_parser.add_argument('--batch-size', type=int, default=64, help='batch size')
train_parser.add_argument('--workers', type=int, default=8, help='number of workers in dataloader')
train_parser.add_argument('--max-epoch', type=int, default=2000, help='maximum number of training iterations (default: 2000)')
train_parser.add_argument('--metric', type=str, choices=["AP", "AUC"], default="AUC", help='the used metric')
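For reference, a small sketch of overriding the test defaults programmatically rather than on the command line (the checkpoint path is hypothetical):
```python
from option import test_parser  # assumes running from within detection/

args = test_parser.parse_args([
    '--test_anno', 'annotation/Anomaly_videos.txt',
    '--detection_model', 'checkpoints/my_mtfl_checkpoint.pkl',  # hypothetical path
])
print(args.feature_size, args.seg_num)  # 1024 32
```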
detection/test.py
ADDED
@@ -0,0 +1,168 @@
import torch
from sklearn.metrics import auc, roc_curve, average_precision_score
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import option

from torch.utils.data import DataLoader
from dataset import Dataset
from model import Model
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)


def get_gt(start_end_couples, num_frames, device):
    """
    Generate a ground-truth tensor representing events in a time sequence based on given start and end pairs.

    Args:
        start_end_couples (list): A list containing pairs of start and end frames.
                                  If None or all '-1', no events are present.
        num_frames (int): Total number of frames in the time sequence.
        device: Device where the tensor should be placed.

    Returns:
        gt: A tensor of shape (num_frames,) representing whether each frame belongs to an anomalous event.
            '1' means anomalous, and '0' means normal.
    """
    gt = torch.zeros(num_frames).to(device)
    if start_end_couples is not None and num_frames is not None:
        for i in range(0, len(start_end_couples) - 1, 2):
            if start_end_couples[i].item() != -1 and start_end_couples[i + 1].item() != -1:
                couple = start_end_couples[i:i + 2]
                gt[couple[0].item():couple[1].item()] = 1.0

    return gt


def save_scores(pred, start_end_couples, save_path):
    """
    Save a plot containing the anomaly scores and the annotated anomalous regions.

    Args:
        pred (list): List of anomaly scores.
        start_end_couples (Tensor): Pairs of start and end frames indicating anomalous regions.
        save_path (str): Path to save the generated plot; its basename is used as the legend label.
    """

    plt.figure()
    file_name = os.path.basename(save_path).split(".")[0]
    plt.plot(pred, label=file_name, color='blue')

    # Plot anomalous regions
    for i in range(0, len(start_end_couples) - 1, 2):
        if start_end_couples[i].item() != -1 and start_end_couples[i + 1].item() != -1:
            plt.axvspan(start_end_couples[i].item(), start_end_couples[i + 1].item(), color='red', alpha=0.3)

    plt.ylim(0, 1)
    plt.xlabel('Frames', fontdict={'size': 16})
    plt.ylabel('Anomaly Score', fontdict={'size': 16})
    plt.yticks(size=14)
    plt.xticks(size=14)

    plt.legend(prop={'size': 16})
    # plt.show()
    plt.savefig(save_path)
    plt.close()


def test(dataloader, model, device, gen_scores=False, save_dir=None):
    """
    Test the model's performance on the given dataloader.

    Args:
        dataloader (DataLoader): DataLoader for test data.
        model: The model to be tested.
        device: Device to perform testing on.
        gen_scores (bool): Whether to generate and save anomaly score plots.
        save_dir (str): Directory to save generated plots.

    Returns:
        single_video_AUC (dict): A dictionary containing AUC values for each video.
        overall_auc (float): Overall AUC value.
        ap (float): Average precision.
    """
    single_video_AUC = {"video": [], "AUC": []}

    with torch.no_grad():
        model.to(device).eval()
        pred = torch.zeros(0, device=device)
        gt = torch.zeros(0, device=device)

        for input1, input2, input3, label, start_end_couples, num_frames, file in tqdm(dataloader):
            input1 = input1.to(device)
            input2 = input2.to(device)
            input3 = input3.to(device)
            score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores = model(input1, input2, input3)
            sig = torch.squeeze(scores, dim=(0, 2))  # T snippet scores
            # Upsample snippet scores to frame scores: repeat each score for
            # num_frames // T frames, then pad the tail with the last score.
            segment = num_frames.item() // sig.size()[0]
            sig = sig.repeat_interleave(segment)  # Frames
            if len(sig) < num_frames.item():
                last_ele = sig[-1]
                sig = torch.cat((sig, last_ele.repeat(num_frames.item() - len(sig))))  # 1 x Frames

            pred = torch.cat((pred, sig))
            cur_gt = get_gt(start_end_couples, num_frames, device)
            gt = torch.cat((gt, cur_gt))

            sig = sig.cpu().detach().numpy()
            cur_gt = cur_gt.cpu().detach().numpy()
            fpr, tpr, threshold = roc_curve(cur_gt, sig)
            video_auc = auc(fpr, tpr)
            single_video_AUC["video"].append(file)
            single_video_AUC["AUC"].append(video_auc)

            if gen_scores:
                save_path = os.path.join(save_dir, file[0] + '.png')
                os.makedirs(os.path.dirname(save_path), exist_ok=True)
                save_scores(sig, start_end_couples, save_path)

        pred = pred.cpu().detach().numpy()
        gt = gt.cpu().detach().numpy()
        ap = average_precision_score(gt, pred)
        fpr, tpr, threshold = roc_curve(gt, pred)
        overall_auc = auc(fpr, tpr)
        print('\n' + 'Overall auc : ' + str(overall_auc) + ', Average Precision : ' + str(ap) + '\n')

    return single_video_AUC, overall_auc, ap


def main():
    args = option.test_parser.parse_args()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    AUC_path = os.path.join(args.output_dir, 'AUC')
    scores_path = os.path.join(args.output_dir, 'scores')

    test_loader = DataLoader(Dataset(args, test_mode=True),
                             batch_size=1, shuffle=False,
                             num_workers=args.workers, pin_memory=True)
    model = Model(feature_dim=args.feature_size, batch_size=1, seg_num=args.seg_num)
    model.load_state_dict(torch.load(args.detection_model))

    single_video_AUC, overall_auc, ap = test(dataloader=test_loader,
                                             model=model,
                                             device=device,
                                             gen_scores=True,
                                             save_dir=scores_path)

    # save AUC results
    video_sub_dir = os.path.basename(os.path.dirname(single_video_AUC["video"][0][0]))
    file_path = os.path.join(AUC_path, video_sub_dir, 'results.txt')
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "w") as f:
        for video, single_auc in zip(single_video_AUC["video"], single_video_AUC["AUC"]):
            f.write(f"Video: {video}, AUC: {single_auc}\n")
        f.write("Overall AUC: {}, Average Precision: {}\n".format(overall_auc, ap))


if __name__ == '__main__':
    main()
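A standalone numeric sketch (illustrative only) of the snippet-to-frame upsampling performed inside `test()`: each of the T snippet scores is repeated num_frames // T times, and the tail is padded with the last score:
```python
import torch

num_frames = 250
sig = torch.rand(32)                    # one score per snippet (T = 32)
segment = num_frames // sig.size(0)     # 250 // 32 = 7 frames per snippet
sig = sig.repeat_interleave(segment)    # length 224
if len(sig) < num_frames:
    sig = torch.cat((sig, sig[-1].repeat(num_frames - len(sig))))  # pad to 250
assert len(sig) == num_frames
```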
detection/train.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
import torch.optim as optim
import os
from torch.nn import MSELoss
from torch.utils.data import DataLoader
from model import Model
from dataset import Dataset
from test import test
import option
from tqdm import tqdm
torch.set_default_tensor_type('torch.FloatTensor')


def sparsity(arr, lamda2):
    loss = torch.mean(torch.norm(arr, dim=0))
    return lamda2 * loss


def smooth(arr, lamda1):
    arr2 = torch.zeros_like(arr)
    arr2[:-1] = arr[1:]
    arr2[-1] = arr[-1]

    loss = torch.sum((arr2 - arr) ** 2)

    return lamda1 * loss


class SigmoidMAELoss(torch.nn.Module):
    def __init__(self):
        super(SigmoidMAELoss, self).__init__()
        from torch.nn import Sigmoid
        self.__sigmoid__ = Sigmoid()
        self.__l1_loss__ = MSELoss()  # note: despite the class name, plain MSE is applied and the sigmoid is unused

    def forward(self, pred, target):
        return self.__l1_loss__(pred, target)


class RTFM_loss(torch.nn.Module):
    def __init__(self, alpha, margin):
        super(RTFM_loss, self).__init__()
        self.alpha = alpha
        self.margin = margin
        self.sigmoid = torch.nn.Sigmoid()
        self.mae_criterion = SigmoidMAELoss()
        self.criterion = torch.nn.BCELoss()

    def forward(self, score_normal, score_abnormal, nlabel, alabel, feat_n, feat_a):
        label = torch.cat((nlabel, alabel), 0)
        score = torch.cat((score_normal, score_abnormal), 0)
        score = score.squeeze()

        label = label.cuda()

        loss_cls = self.criterion(score, label)  # BCE loss in the score space

        loss_abn = torch.abs(self.margin - torch.norm(torch.mean(feat_a, dim=1), p=2, dim=1))
        loss_nor = torch.norm(torch.mean(feat_n, dim=1), p=2, dim=1)
        loss_rtfm = torch.mean((loss_abn + loss_nor) ** 2)

        loss_total = loss_cls + self.alpha * loss_rtfm

        return loss_total


def train(nloader, aloader, model, batch_size, seg_num, optimizer, device):
    with torch.set_grad_enabled(True):
        model.train()

        ninput1, ninput2, ninput3, nlabel = next(nloader)
        ainput1, ainput2, ainput3, alabel = next(aloader)

        input1 = torch.cat((ninput1, ainput1), 0).to(device)
        input2 = torch.cat((ninput2, ainput2), 0).to(device)
        input3 = torch.cat((ninput3, ainput3), 0).to(device)
        score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores = model(input1, input2, input3)

        scores = scores.view(batch_size * seg_num * 2, -1)  # (B x 32 x 2, 1)

        scores = scores.squeeze()
        abn_scores = scores[batch_size * seg_num:]

        nlabel = nlabel[0:batch_size]
        alabel = alabel[0:batch_size]

        loss_criterion = RTFM_loss(0.0001, 100)
        loss_sparse = sparsity(abn_scores, 8e-3)
        loss_smooth = smooth(abn_scores, 8e-4)

        loss_RTFM = loss_criterion(score_normal, score_abnormal, nlabel, alabel, feat_select_normal, feat_select_abn)
        cost = loss_RTFM + loss_smooth + loss_sparse

        optimizer.zero_grad()
        cost.backward()
        optimizer.step()


def main():
    args = option.train_parser.parse_args()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_nloader = DataLoader(Dataset(args, test_mode=False, is_normal=True),
                               batch_size=args.batch_size, shuffle=True,
                               num_workers=args.workers, pin_memory=True, drop_last=True)
    train_aloader = DataLoader(Dataset(args, test_mode=False, is_normal=False),
                               batch_size=args.batch_size, shuffle=True,
                               num_workers=args.workers, pin_memory=True, drop_last=True)
    test_loader = DataLoader(Dataset(args, test_mode=True),
                             batch_size=1, shuffle=False,
                             num_workers=args.workers, pin_memory=True)

    if not os.path.exists(args.save_models):
        os.makedirs(args.save_models)

    feature_size = args.feature_size
    model = Model(feature_size, args.batch_size, args.seg_num)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.005)
    test_info = {"epoch": [], "AUC": [], "AP": []}
    best_result = -1
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)
    _, overall_auc, ap = test(dataloader=test_loader,
                              model=model,
                              device=device,
                              gen_scores=False,
                              save_dir=None)

    for step in tqdm(range(1, args.max_epoch + 1), total=args.max_epoch, dynamic_ncols=True):
        if (step - 1) % len(train_nloader) == 0:
            loadern_iter = iter(train_nloader)

        if (step - 1) % len(train_aloader) == 0:
            loadera_iter = iter(train_aloader)

        train(nloader=loadern_iter,
              aloader=loadera_iter,
              model=model,
              batch_size=args.batch_size,
              seg_num=args.seg_num,
              optimizer=optimizer,
              device=device)

        if step % 5 == 0 and step > 200:
            _, overall_auc, ap = test(dataloader=test_loader,
                                      model=model,
                                      device=device,
                                      gen_scores=False,
                                      save_dir=None)

            test_info["epoch"].append(step)
            test_info["AUC"].append(overall_auc)
            test_info["AP"].append(ap)

            # if test_info["AUC"][-1] > best_result:
            #     best_result = test_info["AUC"][-1]
            #     torch.save(model.state_dict(), os.path.join(args.save_models, args.model_name + '-{}.pkl'.format(step)))
            #     file_path = os.path.join(output_dir, '{}-step-AUC.txt'.format(step))
            #     with open(file_path, "w") as fo:
            #         for key in test_info:
            #             fo.write("{}: {}\n".format(key, test_info[key][-1]))

            metric = args.metric
            if test_info[metric][-1] > best_result:
                best_result = test_info[metric][-1]
                torch.save(model.state_dict(), os.path.join(args.save_models, args.model_name + '-{}.pkl'.format(step)))
                file_path = os.path.join(output_dir, '{}-step-result.txt'.format(step))
                with open(file_path, "w") as fo:
                    for key in test_info:
                        fo.write("{}: {}\n".format(key, test_info[key][-1]))


if __name__ == '__main__':
    main()
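
As a quick sanity check on the two regularizers above, the following standalone sketch (not part of the upload) applies the same smoothness and sparsity terms to a toy score vector with one abrupt spike:

import torch

scores = torch.tensor([0.05, 0.04, 0.90, 0.06, 0.05])  # toy snippet scores with one spike

# temporal smoothness: penalizes frame-to-frame jumps in the score curve
shifted = torch.zeros_like(scores)
shifted[:-1] = scores[1:]
shifted[-1] = scores[-1]
print(8e-4 * torch.sum((shifted - scores) ** 2))   # ~0.0012: dominated by the spike

# sparsity: penalizes the overall magnitude of the abnormal scores
print(8e-3 * torch.mean(torch.norm(scores, dim=0)))  # ~0.0072
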
figures/Intro.png
ADDED
Git LFS Details
recognition/dataset.py
ADDED
@@ -0,0 +1,140 @@
import torch.utils.data as data
import os
import torch
torch.set_default_tensor_type('torch.FloatTensor')

class_to_int = {
    'Normal': 0,
    'Abuse': 1,
    'Arrest': 2,
    'Arson': 3,
    'Assault': 4,
    'Burglary': 5,
    'Explosion': 6,
    'Fighting': 7,
    'Robbery': 8,
    'Shooting': 9,
    'Shoplifting': 10,
    'Stealing': 11,
    'Vandalism': 12,
    'RoadAccidents_EMVvsEMV': 13,
    'RoadAccidents_EMVvsVRU': 14,
    'RoadAccidents_VRUvsVRU': 15,
    'DangerousThrowing': 16,
    'Littering': 17
}


def read_features(feature_path):
    """
    Read features from a text file and convert them into a torch tensor.

    Args:
        feature_path (str): Path to the text file containing features.

    Returns:
        features (torch.Tensor): A tensor containing the features. Shape is T x C.
    """
    with open(feature_path, 'r') as file:
        lines = file.readlines()
    features = []
    for line in lines:
        feature = [float(value) for value in line.strip().split()]
        features.append(feature)
    features = torch.tensor(features).float()  # T x C
    return features


class Dataset(data.Dataset):
    def __init__(self, args, is_normal=True, transform=None, test_mode=False):
        """
        Custom dataset class for loading features and labels.

        Args:
            args: Argument object containing paths and options.
            is_normal (bool): Whether the dataset represents normal samples.
            transform: Data transformation to be applied.
            test_mode (bool): Whether the dataset is for testing.

        Attributes:
            is_normal (bool): Whether the dataset represents normal samples.
            transform: Data transformation to be applied.
            test_mode (bool): Whether the dataset is for testing.
            list (list): List of feature paths and label information.
        """
        self.is_normal = is_normal
        self.transform = transform
        self.test_mode = test_mode

        if self.test_mode:
            annotation_path = args.test_anno
        else:
            annotation_path = args.train_anno

        self.list = self._get_features_list(args.lf_dir, args.mf_dir, args.sf_dir, annotation_path)

    def __getitem__(self, index):
        if self.test_mode:
            lf_path, mf_path, sf_path, label, file = self.list[index]
            l_features = read_features(lf_path)
            m_features = read_features(mf_path)
            s_features = read_features(sf_path)
            label = torch.tensor(label)
            return s_features, m_features, l_features, label, file
        else:
            lf_path, mf_path, sf_path, label = self.list[index]
            l_features = read_features(lf_path)
            m_features = read_features(mf_path)
            s_features = read_features(sf_path)
            label = torch.tensor(label)
            return s_features, m_features, l_features, label

    def __len__(self):
        return len(self.list)

    def _get_features_list(self, lf_dir, mf_dir, sf_dir, annotation_path):
        """
        Construct a feature list from the given directories and annotation file.

        Args:
            lf_dir (str): Directory containing long-frame-length feature files.
            mf_dir (str): Directory containing medium-frame-length feature files.
            sf_dir (str): Directory containing short-frame-length feature files.
            annotation_path (str): Path to a text file containing annotation information.

        Returns:
            list: A list of tuples, each containing (lf_path, mf_path, sf_path, cls) or (lf_path, mf_path, sf_path, cls, file).

        Raises:
            AssertionError: If the input directories do not exist.

        Note:
            - If test_mode is True, each tuple contains (lf_path, mf_path, sf_path, cls, file), where file is the file name.
            - If test_mode is False, each tuple contains (lf_path, mf_path, sf_path, cls), and selection is based on whether the sample is normal (is_normal).
        """
        assert os.path.exists(lf_dir)
        assert os.path.exists(mf_dir)
        assert os.path.exists(sf_dir)
        features_list = []
        with open(annotation_path) as f:
            lines = f.read().splitlines(keepends=False)
            for line in lines:
                items = line.split()
                file = items[0].split(".")[0]
                file = file.replace("/", os.sep)
                lf_path = os.path.join(lf_dir, file + '.txt')
                mf_path = os.path.join(mf_dir, file + '.txt')
                sf_path = os.path.join(sf_dir, file + '.txt')
                unsupported_class = 18
                if not items[1].isdigit():
                    cls = class_to_int.get(items[1], unsupported_class)
                else:
                    cls = int(items[1])
                if self.test_mode:
                    features_list.append((lf_path, mf_path, sf_path, cls, file))
                elif (cls == class_to_int['Normal']) == self.is_normal:
                    features_list.append((lf_path, mf_path, sf_path, cls))

        return features_list
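
The feature files consumed by read_features are plain whitespace-separated text, one C-dimensional row per snippet. A minimal round-trip sketch (run from inside recognition/; the file name is hypothetical):

import torch
from dataset import read_features

dummy = torch.rand(32, 1024)                     # 32 snippets x 1024-dim features
with open('demo_video.txt', 'w') as fp:          # hypothetical file name
    for row in dummy:
        fp.write(' '.join(str(v.item()) for v in row) + '\n')

feats = read_features('demo_video.txt')
print(feats.shape)                               # torch.Size([32, 1024]), i.e. T x C
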
recognition/model.py
ADDED
@@ -0,0 +1,295 @@
""" Reference source: https://github.com/tianyu0207/RTFM """

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as torch_init
torch.set_default_tensor_type('torch.FloatTensor')


def weight_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1 or classname.find('Linear') != -1:
        torch_init.xavier_uniform_(m.weight)
        if m.bias is not None:
            m.bias.data.fill_(0)


class CVA(nn.Module):
    def __init__(self, input_dim=1024):
        """
        Cross-View Attention (CVA) module.

        Args:
            input_dim (int): Dimension of the input features.
        """
        super(CVA, self).__init__()
        drop_out_rate = 0.1
        num_heads = 4
        self.cross_attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, dropout=drop_out_rate,
                                                     device='cuda')

    def forward(self, feature1, feature2):
        """
        Args:
            feature1 (torch.Tensor): one path's features. Shape: B x T x C.
            feature2 (torch.Tensor): another path's features. Shape: B x T x C.

        Returns:
            out1 (torch.Tensor): Processed features after cross-attention. Shape: T x B x C.
        """
        feature1 = F.layer_norm(feature1, [feature1.size(-1)])
        feature2 = F.layer_norm(feature2, [feature2.size(-1)])
        feature1 = feature1.permute(1, 0, 2)  # T B C
        feature2 = feature2.permute(1, 0, 2)

        out1, _ = self.cross_attention(query=feature1, key=feature2, value=feature2)  # T B C (for test: 32 1 1024)
        out1 = out1 + feature1  # residual connection

        return out1  # T B C


class Aggregate(nn.Module):
    def __init__(self, input_dim):
        """
        An aggregate network including local temporal correlation learning, global temporal correlation learning,
        and feature fusion in MTFF.

        Args:
            input_dim (int): input feature dim.
        """
        super(Aggregate, self).__init__()
        bn = nn.BatchNorm1d
        num_heads = 4
        self.input_dim = input_dim
        self.conv_1 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=1, padding=1),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_2 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=2, padding=2),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_3 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=4, padding=4),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_4 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim * 3, out_channels=512, kernel_size=1,
                      stride=1, padding=0, bias=False),
            nn.LeakyReLU(negative_slope=5e-2),
        )
        self.conv_5 = nn.Sequential(
            nn.Conv1d(in_channels=2048, out_channels=input_dim, kernel_size=3,
                      stride=1, padding=1, bias=False),
            nn.LeakyReLU(negative_slope=5e-2),
            nn.BatchNorm1d(input_dim),
        )
        self.self_attention = nn.MultiheadAttention(embed_dim=512, num_heads=num_heads,
                                                    dropout=0.1, device='cuda')

    def forward(self, input1, input2, input3):
        """
        Args:
            input1 (torch.Tensor): long-frame-length features. Shape: T x B x C.
            input2 (torch.Tensor): medium-frame-length features. Shape: T x B x C.
            input3 (torch.Tensor): short-frame-length features. Shape: T x B x C.

        Returns:
            torch.Tensor: Processed and fused output features. Shape: B x T x C.
        """
        x1 = input1.permute(1, 2, 0)  # B C T
        x2 = input2.permute(1, 2, 0)
        x3 = input3.permute(1, 2, 0)
        tensor_list = [x1, x2, x3]

        residual = torch.mean(torch.stack(tensor_list), dim=0)

        out1 = self.conv_1(x1)  # B C/2 T
        out2 = self.conv_2(x2)
        out3 = self.conv_3(x3)
        x = torch.cat([out1, out2, out3], dim=1)  # B 3C/2 T

        feature = torch.cat((x1, x2, x3), dim=1)
        out = self.conv_4(feature)
        out = out.permute(2, 0, 1)  # T B C/2
        out = F.layer_norm(out, normalized_shape=[out.size(-1)])
        out, _ = self.self_attention(out, out, out)  # T B C/2
        out = out.permute(1, 2, 0)  # B C/2 T
        out = torch.cat((x, out), dim=1)  # B 2C T
        out = self.conv_5(out)  # fuse all the features together
        out = out + residual
        out = out.permute(0, 2, 1)

        return out


class Encoder(nn.Module):
    def __init__(self, input_dim=1024, seg_num=32):
        """
        Multi-Temporal Feature Fusion (MTFF) module.

        Args:
            input_dim (int): Dimension of the input features.
            seg_num (int): Number of snippets in a video.
        """
        super(Encoder, self).__init__()
        self.drop_out_rate = 0.1
        self.input_dim = input_dim
        self.min_temporal_dim = seg_num
        self.CVA1 = CVA(input_dim=input_dim)
        self.CVA2 = CVA(input_dim=input_dim)
        self.CVA3 = CVA(input_dim=input_dim)

        self.aggregate = Aggregate(input_dim=input_dim)

    def forward(self, feature1, feature2, feature3):
        """
        Args:
            feature1 (torch.Tensor): long-frame-length features. Shape: B x T x C.
                (batch size x number of snippets x input dimensions)
            feature2 (torch.Tensor): medium-frame-length features. Shape: B x T x C.
            feature3 (torch.Tensor): short-frame-length features. Shape: B x T x C.

        Returns:
            torch.Tensor: Fused and processed output features. Shape: B x T x C.
        """
        att1 = self.CVA1(feature1, feature2)
        att2 = self.CVA2(feature2, feature3)
        att3 = self.CVA3(feature3, feature1)

        out1 = self.aggregate(att1, att2, att3)  # B T C

        return out1


class Model(nn.Module):
    def __init__(self, feature_dim, batch_size, seg_num=32):
        """
        Multi-Timescale Feature Learning (MTFL) recognition model.

        Args:
            feature_dim (int): Dimension of the input features.
            batch_size (int): Batch size.
            seg_num (int): Number of snippets in a video.
        """
        super(Model, self).__init__()
        self.batch_size = batch_size
        self.num_segments = seg_num
        self.k_abn = self.num_segments // 10  # select 3 snippets
        self.k_nor = self.num_segments // 10

        self.Encoder = Encoder(input_dim=feature_dim, seg_num=seg_num)

        # Fully connected layers for classification
        self.fc1 = nn.Linear(feature_dim, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 18)  # class amount = 18

        self.drop_out = nn.Dropout(0.2)
        self.relu = nn.LeakyReLU(negative_slope=5e-2)
        self.sigmoid = nn.Sigmoid()
        self.apply(weight_init)

    def forward(self, input1, input2, input3):
        """
        Args:
            input1 (torch.Tensor): long-frame-length features. Shape: B x T x feature_dim.
            input2 (torch.Tensor): medium-frame-length features. Shape: B x T x feature_dim.
            input3 (torch.Tensor): short-frame-length features. Shape: B x T x feature_dim.

        Returns:
            score_abnormal (torch.Tensor): The mean scores of the top-3 abnormal instances.
            score_normal (torch.Tensor): The mean scores of the top-3 normal instances.
            feat_select_abn (torch.Tensor): Selected abnormal features.
            feat_select_normal (torch.Tensor): Selected normal features.
            scores (torch.Tensor): All computed scores. Shape: B x T x number of classes (18).
        """
        k_abn = self.k_abn
        k_nor = self.k_nor
        ncrops = 1  # reserved parameter for spatial cropping; not used and defaults to 1

        # Multi-Temporal Feature Fusion
        out = self.Encoder(input1, input2, input3)
        bs, t, f = out.size()
        features = self.drop_out(out)  # B T D

        # classification layers
        scores = self.relu(self.fc1(features))
        scores = self.drop_out(scores)
        scores = self.relu(self.fc2(scores))
        scores = self.drop_out(scores)
        scores = self.sigmoid(self.fc3(scores))
        scores = scores.view(bs, t, -1)  # B T 18

        # B * t * f
        normal_features = features[0:self.batch_size]
        normal_scores = scores[0:self.batch_size]

        abnormal_features = features[self.batch_size:]
        abnormal_scores = scores[self.batch_size:]

        # Compute feature magnitudes
        feat_magnitudes = torch.norm(features, p=2, dim=2)
        feat_magnitudes = feat_magnitudes.view(bs, ncrops, -1).mean(1)
        nfea_magnitudes = feat_magnitudes[0:self.batch_size]  # normal feature magnitudes
        afea_magnitudes = feat_magnitudes[self.batch_size:]  # abnormal feature magnitudes
        n_size = nfea_magnitudes.shape[0]

        # Inference mode for batch size 1
        if nfea_magnitudes.shape[0] == 1:
            afea_magnitudes = nfea_magnitudes
            abnormal_scores = normal_scores
            abnormal_features = normal_features

        select_idx = torch.ones_like(nfea_magnitudes)
        select_idx = self.drop_out(select_idx)

        ####### process abnormal videos -> select top-3 feature magnitudes #######
        afea_magnitudes_drop = afea_magnitudes * select_idx
        idx_abn = torch.topk(afea_magnitudes_drop, k_abn, dim=1)[1]
        idx_abn_feat = idx_abn.unsqueeze(2).expand([-1, -1, abnormal_features.shape[2]])

        abnormal_features = abnormal_features.view(n_size, ncrops, t, f)  # B x N x T x F
        abnormal_features = abnormal_features.permute(1, 0, 2, 3)  # N x B x T x F

        total_select_abn_feature = torch.zeros(0, device=input1.device)
        for abnormal_feature in abnormal_features:
            feat_select_abn = torch.gather(abnormal_feature, 1, idx_abn_feat)  # top-3 feature magnitudes in the abnormal bag
            total_select_abn_feature = torch.cat((total_select_abn_feature, feat_select_abn))

        idx_abn_score = idx_abn.unsqueeze(2).expand([-1, -1, abnormal_scores.shape[2]])
        # top-3 scores in the abnormal bag based on the top-3 magnitudes
        score_abnormal = torch.mean(torch.gather(abnormal_scores, 1, idx_abn_score), dim=1)

        ####### process normal videos -> select top-3 feature magnitudes #######
        select_idx_normal = torch.ones_like(nfea_magnitudes)
        select_idx_normal = self.drop_out(select_idx_normal)
        nfea_magnitudes_drop = nfea_magnitudes * select_idx_normal
        idx_normal = torch.topk(nfea_magnitudes_drop, k_nor, dim=1)[1]
        idx_normal_feat = idx_normal.unsqueeze(2).expand([-1, -1, normal_features.shape[2]])

        normal_features = normal_features.view(n_size, ncrops, t, f)
        normal_features = normal_features.permute(1, 0, 2, 3)  # 1 B T D

        total_select_nor_feature = torch.zeros(0, device=input1.device)
        for nor_fea in normal_features:
            feat_select_normal = torch.gather(nor_fea, 1, idx_normal_feat)  # top-3 feature magnitudes in the normal bag (hard negatives)
            total_select_nor_feature = torch.cat((total_select_nor_feature, feat_select_normal))

        idx_normal_score = idx_normal.unsqueeze(2).expand([-1, -1, normal_scores.shape[2]])
        score_normal = torch.mean(torch.gather(normal_scores, 1, idx_normal_score), dim=1)  # top-3 scores in the normal bag

        feat_select_abn = total_select_abn_feature
        feat_select_normal = total_select_nor_feature

        return score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores
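
A minimal shape sanity check for the recognition model, sketched here under two assumptions: a CUDA device is available (the attention layers above are created with device='cuda'), and batch_size=2, i.e. 2 normal plus 2 abnormal videos per step:

import torch
from model import Model  # recognition/model.py

model = Model(feature_dim=1024, batch_size=2, seg_num=32).cuda()
x = torch.rand(4, 32, 1024, device='cuda')  # B x T x C, reused for all three timescales
score_abn, score_nor, feat_abn, feat_nor, scores = model(x, x, x)
print(scores.shape)     # torch.Size([4, 32, 18]): per-snippet class scores
print(score_abn.shape)  # torch.Size([2, 18]): mean over the top-3 snippets
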
recognition/option.py
ADDED
@@ -0,0 +1,56 @@
import argparse

############ Test args ########################
test_parser = argparse.ArgumentParser(description='MTFL_recognition_test')
# input paths
test_parser.add_argument('--lf_dir', type=str, default='features/L64', help='long frame length feature path')
test_parser.add_argument('--mf_dir', type=str, default='features/L32', help='medium frame length feature path')
test_parser.add_argument('--sf_dir', type=str, default='features/L8', help='short frame length feature path')
test_parser.add_argument('--test_anno', type=str, default='annotation/Anomaly_videos.txt', help='test annotation file')
test_parser.add_argument('--test_dataset', type=str, default='other', choices=['UCF', 'VAD', 'other'],
                         help='The test data. The test results are the recognized labels of all input videos. '
                              'For the UCF and VAD datasets, the overall accuracy is printed out')
test_parser.add_argument('--recognition_model', type=str,
                         default='/media/DataDrive/yiling/Test/models/MTFL_recog/split_1_best_VAD.pkl',
                         help='recognition checkpoint path; choose 1 of the 7 checkpoints trained on different splits')
# output path
test_parser.add_argument('--output_dir', type=str, default='results',
                         help='the path to store the recognition result')
# feature size, depending on which feature extractor is used
test_parser.add_argument('--feature_size', type=int, default=1024, help='feature dim (default: VST feature)')
test_parser.add_argument('--seg_num', type=int, default=32, help='the number of snippets')
# running cfg
test_parser.add_argument('--gpu', default="0", type=str, choices=["0", "1"], help='gpu')
test_parser.add_argument('--workers', type=int, default=8, help='number of workers in dataloader')


############ Train args ########################
train_parser = argparse.ArgumentParser(description='MTFL_recognition_train')
# input paths
train_parser.add_argument('--lf_dir', type=str, default='/media/DataDrive/yiling/features/recognition/split1_L64R1',
                          help='long feature path')
train_parser.add_argument('--mf_dir', type=str, default='/media/DataDrive/yiling/features/recognition/split1_L32R1',
                          help='medium feature path')
train_parser.add_argument('--sf_dir', type=str, default='/media/DataDrive/yiling/features/recognition/split1_L8R1',
                          help='short feature path')
train_parser.add_argument('--train_anno', default='/media/DataDrive/yiling/annotation/recognition/splits/VAD/VAD_train_001.txt',
                          help='the annotation file for training')
train_parser.add_argument('--test_anno', default='/media/DataDrive/yiling/annotation/recognition/splits/VAD/VAD_test_001.txt',
                          help='the annotation file for testing')
train_parser.add_argument('--test_dataset', type=str, default='UCF', choices=['UCF', 'VAD'],
                          help='the validation data')
# output paths and saving info
train_parser.add_argument('--model-name', default='MTFL_recognition', help='name under which to save the model')
train_parser.add_argument('--save_models', default='/media/DataDrive/yiling/models/demo/recognition',
                          help='the path for saving models')
train_parser.add_argument('--output_dir', default='/media/DataDrive/yiling/results/demo/recognition',
                          help='the path to store accuracy results')
# training cfg and parameters
train_parser.add_argument('--gpu', default="0", type=str, choices=["0", "1"], help='gpu id')
train_parser.add_argument('--feature_size', type=int, default=1024, help='feature dim (default: VST feature)')
train_parser.add_argument('--seg_num', type=int, default=32, help='the number of snippets')
train_parser.add_argument('--lr', type=float, default=0.0001, help='learning rate')
train_parser.add_argument('--batch-size', type=int, default=32, help='batch size')
train_parser.add_argument('--workers', type=int, default=8, help='number of workers in dataloader')
train_parser.add_argument('--max-epoch', type=int, default=2000, help='maximum number of training iterations')
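
As a usage sketch, the parsers can be driven with an explicit argument list instead of sys.argv (the checkpoint path below is a hypothetical placeholder):

import option

args = option.test_parser.parse_args([
    '--lf_dir', 'features/L64',
    '--mf_dir', 'features/L32',
    '--sf_dir', 'features/L8',
    '--recognition_model', 'models/split_1_best_VAD.pkl',  # hypothetical checkpoint path
])
print(args.feature_size, args.seg_num)  # 1024 32
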
recognition/test.py
ADDED
@@ -0,0 +1,120 @@
import torch
from tqdm import tqdm
import numpy as np
import os
import option
from torch.utils.data import DataLoader
from dataset import class_to_int, Dataset
from model import Model


def top_k_accuracy(scores, labels, topk=(1, 5)):
    """Calculate the top-k accuracy scores.

    Args:
        scores (list[np.ndarray]): Prediction scores for each class.
        labels (list[int]): Ground truth labels.
        topk (tuple[int]): K values for top_k_accuracy. Default: (1, 5).

    Returns:
        list[float]: Top-k accuracy score for each k.
    """
    res = []
    labels = np.array(labels)[:, np.newaxis]
    for k in topk:
        max_k_preds = np.argsort(scores, axis=1)[:, -k:][:, ::-1]
        match_array = np.logical_or.reduce(max_k_preds == labels, axis=1)
        topk_acc_score = match_array.sum() / match_array.shape[0]
        res.append(topk_acc_score)

    return res


def test(dataloader, model, device, test_dataset='UCF'):
    """
    Evaluate the model's performance on the test dataset and return the top-1 accuracy.

    Args:
        dataloader (DataLoader): DataLoader for the test dataset.
        model (nn.Module): The trained neural network model.
        device (torch.device): The device (CPU or GPU) on which to perform evaluation.
        test_dataset (str, optional): The name of the test dataset, either 'UCF' or 'VAD'. Default is 'UCF'.
            The overall accuracy is calculated only for 'VAD' and 'UCF' because it does not make sense when testing
            on only a few videos.

    Returns:
        float: The top-1 accuracy of the model on the test dataset.
        dict: A dictionary containing video file names and their corresponding predicted classes.
    """
    video_class = {"video": [], "class": []}
    with torch.no_grad():
        model.to(device).eval()
        outputs = torch.zeros(0, device=device)
        labels = torch.zeros(0, device=device)

        for input1, input2, input3, label, file in tqdm(dataloader):
            input1 = input1.to(device)
            input2 = input2.to(device)
            input3 = input3.to(device)
            label = label.to(device)
            score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores = model(input1, input2, input3)
            # concatenate for accuracy evaluation
            outputs = torch.cat((outputs, score_abnormal))
            labels = torch.cat((labels, label))
            # obtain the prediction result
            score_abnormal = score_abnormal.cpu().detach().numpy()
            pred = np.argmax(score_abnormal, axis=1)
            found_class = [key for key, value in class_to_int.items() if value == pred[0]]
            file_name = os.path.basename(file[0])
            video_class["video"].append(file_name)
            video_class["class"].append(found_class)

    outputs = outputs.cpu().detach().numpy()
    labels = labels.cpu().detach().numpy()
    res = [-1]

    if test_dataset == 'UCF':  # all road accidents in UCF are labelled as 13
        for row in outputs:
            max_value = max(row[13], row[14], row[15])
            row[13] = max_value
            row[14] = 0.0
            row[15] = 0.0

    # Accuracy makes sense only when the test classes are covered by VAD
    if test_dataset == 'UCF' or test_dataset == 'VAD':
        res = top_k_accuracy(outputs, labels)
        print('\n' + str(test_dataset) + ' top1 : ' + str(res[0]) + ' top5 : ' + str(res[1]) + '\n')

    return res[0], video_class


def main():
    args = option.test_parser.parse_args()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    out_path = os.path.join(args.output_dir, 'rec_results')

    test_loader = DataLoader(Dataset(args, test_mode=True),
                             batch_size=1, shuffle=False,
                             num_workers=args.workers, pin_memory=True)
    model = Model(feature_dim=args.feature_size, batch_size=1, seg_num=args.seg_num)
    model.load_state_dict(torch.load(args.recognition_model))

    _, video_class = test(dataloader=test_loader,
                          model=model,
                          device=device,
                          test_dataset=args.test_dataset)
    # save the recognition results
    video_sub_dir = os.path.basename(os.path.dirname(video_class["video"][0][0]))
    file_path = os.path.join(out_path, video_sub_dir, 'output_pred.txt')
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "w") as f:
        for video, cls in zip(video_class["video"], video_class["class"]):
            f.write(f"Video: {video}, class: {cls}\n")


if __name__ == '__main__':
    main()
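
A small worked example of top_k_accuracy (standalone apart from the import; run it from inside recognition/, and note the scores are made up):

import numpy as np
from test import top_k_accuracy  # recognition/test.py

scores = np.array([[0.1, 0.7, 0.2],   # label 1: correct at top-1
                   [0.5, 0.1, 0.4],   # label 2: missed at top-1, hit at top-2
                   [0.2, 0.3, 0.5]])  # label 2: correct at top-1
labels = [1, 2, 2]
top1, top2 = top_k_accuracy(scores, labels, topk=(1, 2))
print(top1, top2)  # 0.666... 1.0
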
recognition/train.py
ADDED
@@ -0,0 +1,171 @@
import torch
import torch.optim as optim
import os
from torch.nn import MSELoss
from torch.utils.data import DataLoader
from model import Model
from dataset import Dataset
from test import test
import option
from tqdm import tqdm
torch.set_default_tensor_type('torch.FloatTensor')


def sparsity(arr, lamda2):
    loss = torch.mean(torch.norm(arr, dim=0))
    return lamda2 * loss


def smooth(arr, lamda1):
    arr2 = torch.zeros_like(arr)
    arr2[:-1] = arr[1:]
    arr2[-1] = arr[-1]

    loss = torch.sum((arr2 - arr) ** 2)

    return lamda1 * loss


class SigmoidMAELoss(torch.nn.Module):
    def __init__(self):
        super(SigmoidMAELoss, self).__init__()
        from torch.nn import Sigmoid
        self.__sigmoid__ = Sigmoid()
        self.__l1_loss__ = MSELoss()  # note: despite the class name, plain MSE is applied and the sigmoid is unused

    def forward(self, pred, target):
        return self.__l1_loss__(pred, target)


class RTFM_loss(torch.nn.Module):
    def __init__(self, alpha, margin):
        super(RTFM_loss, self).__init__()
        self.alpha = alpha
        self.margin = margin
        self.sigmoid = torch.nn.Sigmoid()
        self.mae_criterion = SigmoidMAELoss()
        self.criterion = torch.nn.CrossEntropyLoss()  # multi-class

    def forward(self, score_normal, score_abnormal, nlabel, alabel, feat_n, feat_a):
        labels = torch.cat((nlabel, alabel), 0)
        scores = torch.cat((score_normal, score_abnormal), 0)

        labels = labels.cuda()

        loss_cls = self.criterion(scores, labels)  # CE loss in the score space

        loss_abn = torch.abs(self.margin - torch.norm(torch.mean(feat_a, dim=1), p=2, dim=1))
        loss_nor = torch.norm(torch.mean(feat_n, dim=1), p=2, dim=1)
        loss_rtfm = torch.mean((loss_abn + loss_nor) ** 2)

        loss_total = loss_cls + self.alpha * loss_rtfm

        return loss_total


def train(nloader, aloader, model, batch_size, seg_num, optimizer, device):
    with torch.set_grad_enabled(True):
        model.train()

        ninput1, ninput2, ninput3, nlabel = next(nloader)
        ainput1, ainput2, ainput3, alabel = next(aloader)

        input1 = torch.cat((ninput1, ainput1), 0).to(device)
        input2 = torch.cat((ninput2, ainput2), 0).to(device)
        input3 = torch.cat((ninput3, ainput3), 0).to(device)
        score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores = model(input1, input2, input3)

        scores = scores.view(batch_size * seg_num * 2, -1)  # (B x 32 x 2, 18)

        abn_scores, _ = torch.max(scores[batch_size * seg_num:], dim=1)  # per-snippet max class score of the abnormal half

        nlabel = nlabel[0:batch_size]
        alabel = alabel[0:batch_size]

        loss_criterion = RTFM_loss(0.0001, 100)
        loss_sparse = sparsity(abn_scores, 8e-3)
        loss_smooth = smooth(abn_scores, 8e-4)

        loss_RTFM = loss_criterion(score_normal, score_abnormal, nlabel, alabel, feat_select_normal, feat_select_abn)
        cost = loss_RTFM + loss_smooth + loss_sparse

        optimizer.zero_grad()
        cost.backward()
        optimizer.step()


def main():
    args = option.train_parser.parse_args()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_nloader = DataLoader(Dataset(args, test_mode=False, is_normal=True),
                               batch_size=args.batch_size, shuffle=True,
                               num_workers=args.workers, pin_memory=True, drop_last=True)
    train_aloader = DataLoader(Dataset(args, test_mode=False, is_normal=False),
                               batch_size=args.batch_size, shuffle=True,
                               num_workers=args.workers, pin_memory=True, drop_last=True)
    test_loader = DataLoader(Dataset(args, test_mode=True),
                             batch_size=1, shuffle=False,
                             num_workers=args.workers, pin_memory=True)

    if not os.path.exists(args.save_models):
        os.makedirs(args.save_models)

    feature_size = args.feature_size
    model = Model(feature_size, args.batch_size, args.seg_num)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.005)
    test_info = {"epoch": [], "TOP-1 ACC": []}
    best_ACC = -1
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)
    acc, _ = test(dataloader=test_loader,
                  model=model,
                  device=device,
                  test_dataset=args.test_dataset)

    for step in tqdm(range(1, args.max_epoch + 1), total=args.max_epoch, dynamic_ncols=True):
        if (step - 1) % len(train_nloader) == 0:
            loadern_iter = iter(train_nloader)

        if (step - 1) % len(train_aloader) == 0:
            loadera_iter = iter(train_aloader)

        train(nloader=loadern_iter,
              aloader=loadera_iter,
              model=model,
              batch_size=args.batch_size,
              seg_num=args.seg_num,
              optimizer=optimizer,
              device=device)

        if step % 5 == 0 and step > 5:
            acc, _ = test(dataloader=test_loader,
                          model=model,
                          device=device,
                          test_dataset=args.test_dataset)

            test_info["epoch"].append(step)
            test_info["TOP-1 ACC"].append(acc)

            if test_info["TOP-1 ACC"][-1] > best_ACC:
                best_ACC = test_info["TOP-1 ACC"][-1]
                torch.save(model.state_dict(), os.path.join(args.save_models, args.model_name + '-{}.pkl'.format(step)))
                file_path = os.path.join(output_dir, '{}-step-ACC.txt'.format(step))
                with open(file_path, "w") as fo:
                    for key in test_info:
                        fo.write("{}: {}\n".format(key, test_info[key][-1]))


if __name__ == '__main__':
    main()
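
The per-step loader handling above avoids StopIteration by rebuilding each iterator once a full pass over its DataLoader has been consumed; a standalone sketch of the same pattern:

loader = [10, 20, 30]          # stand-in for a DataLoader of length 3
for step in range(1, 8):
    if (step - 1) % len(loader) == 0:
        it = iter(loader)      # refresh the iterator after each full pass
    print(step, next(it))      # next() never raises StopIteration here
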
requirements.txt
ADDED
@@ -0,0 +1,10 @@
matplotlib==3.7.0
mmaction2.egg==info
mmcv==1.7.0
numpy==1.25.1
opencv_contrib_python==4.7.0.72
opencv_python==4.7.0.72
scikit_learn==1.2.2
torch==2.0.0+cu118
torchvision==0.15.1+cu118
tqdm==4.64.1
utils/feature_extractor.py
ADDED
@@ -0,0 +1,284 @@
| 1 |
+
"""Reference with Ivo's implementation"""
|
| 2 |
+
import argparse
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
from os import path, mkdir
|
| 6 |
+
import random
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch
|
| 10 |
+
import torch.backends.cudnn as cudnn
|
| 11 |
+
from video_loader import VideoIter
|
| 12 |
+
from utils import register_logger, get_torch_device
|
| 13 |
+
import transforms_video
|
| 14 |
+
from torch.utils.data import DataLoader
|
| 15 |
+
from torchvision.transforms import transforms
|
| 16 |
+
|
| 17 |
+
# Video Swin Transformer related repository
|
| 18 |
+
from mmcv import Config
|
| 19 |
+
from mmaction.models import build_model
|
| 20 |
+
from mmcv.runner import load_checkpoint
|
| 21 |
+
import warnings
|
| 22 |
+
|
| 23 |
+
warnings.filterwarnings("ignore", message="The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.")
|
| 24 |
+
warnings.filterwarnings('ignore', message='No handlers found: "aten::pad". Skipped.')
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_args():
|
| 28 |
+
parser = argparse.ArgumentParser(description="VST Feature Extractor Parser")
|
| 29 |
+
# I/O
|
| 30 |
+
parser.add_argument('--dataset_path', default='test_videos',
|
| 31 |
+
help="path to dataset")
|
| 32 |
+
parser.add_argument('--save_dir', type=str, default="features",
|
| 33 |
+
help="set output root for the features.")
|
| 34 |
+
# extraction params
|
| 35 |
+
parser.add_argument('--model_type', default='swinB',
|
| 36 |
+
type=str,
|
| 37 |
+
help="type of feature extractor")
|
| 38 |
+
parser.add_argument('--pretrained_3d',
|
| 39 |
+
default='/media/DataDrive/yiling/models/VST_finetune/hflip_speed_120_2d/best_top1_acc_epoch_15.pth',
|
| 40 |
+
type=str,
|
| 41 |
+
help="load default 3D pretrained feature extractor model.")
|
| 42 |
+
parser.add_argument('--clip_length', type=int, default=8,
|
| 43 |
+
help="define the length of each input sample.")
|
| 44 |
+
parser.add_argument('--frame_interval', type=int, default=1,
|
| 45 |
+
help="define the sampling interval between frames.")
|
| 46 |
+
parser.add_argument('--use_splits', type=bool, default=False,
|
| 47 |
+
help="use full anomalous data or splits, only applicable of Split Dataset of UCF-CRIME and VAD")
|
| 48 |
+
parser.add_argument('--batch_size', type=int, default=8, help="batch size")
|
| 49 |
+
# running cfg
|
| 50 |
+
parser.add_argument('--num_workers', type=int, default=0,
|
| 51 |
+
help="define the number of workers used for loading the videos")
|
| 52 |
+
parser.add_argument('--seed', type=int, default=None, help='random seed')
|
| 53 |
+
parser.add_argument('--log_every', type=int, default=10,
|
| 54 |
+
help="log the writing of clips every n steps.")
|
| 55 |
+
parser.add_argument('--log_file', type=str,
|
| 56 |
+
help="set logging file.")
|
| 57 |
+
parser.add_argument('--gpu', type=int, default=0, help="gpu id")
|
| 58 |
+
|
| 59 |
+
return parser.parse_args()
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def set_random_seed(seed=42):
|
| 63 |
+
random.seed(seed)
|
| 64 |
+
np.random.seed(seed)
|
| 65 |
+
torch.manual_seed(seed)
|
| 66 |
+
torch.cuda.manual_seed(seed)
|
| 67 |
+
torch.cuda.manual_seed_all(seed)
|
| 68 |
+
os.environ['PYTHONHASHSEED'] = str(seed)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def to_segments(data, num=32):
|
| 72 |
+
"""
|
| 73 |
+
These code is taken from:
|
| 74 |
+
https://github.com/rajanjitenpatel/C3D_feature_extraction/blob/b5894fa06d43aa62b3b64e85b07feb0853e7011a/extract_C3D_feature.py#L805
|
| 75 |
+
:param data: list of features of a certain video
|
| 76 |
+
:return: list of 32 segments
|
| 77 |
+
"""
|
| 78 |
+
data = np.array(data)
|
| 79 |
+
Segments_Features = []
|
| 80 |
+
thirty2_shots = np.round(np.linspace(0, len(data) - 1, num=num + 1)).astype(int)
|
| 81 |
+
for ss, ee in zip(thirty2_shots[:-1], thirty2_shots[1:]):
|
| 82 |
+
if ss == ee:
|
| 83 |
+
temp_vect = data[min(ss, data.shape[0] - 1), :]
|
| 84 |
+
else:
|
| 85 |
+
temp_vect = data[ss:ee, :].mean(axis=0)
|
| 86 |
+
|
| 87 |
+
temp_vect = temp_vect / np.linalg.norm(temp_vect)
|
| 88 |
+
if np.linalg.norm == 0:
|
| 89 |
+
logging.error("Feature norm is 0")
|
| 90 |
+
exit()
|
| 91 |
+
if len(temp_vect) != 0:
|
| 92 |
+
Segments_Features.append(temp_vect.tolist())
|
| 93 |
+
|
| 94 |
+
return Segments_Features
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class FeaturesWriter:
|
| 98 |
+
def __init__(self, num_videos, chunk_size=16):
|
| 99 |
+
"""
|
| 100 |
+
Initialize a FeaturesWriter instance.
|
| 101 |
+
|
| 102 |
+
Args:
|
| 103 |
+
num_videos (int): Total number of videos to process.
|
| 104 |
+
chunk_size (int, optional): Chunk size for writing features, and not used. Defaults to 16.
|
| 105 |
+
"""
|
| 106 |
+
self.path = None
|
| 107 |
+
self.dir = None
|
| 108 |
+
self.data = None
|
| 109 |
+
self.chunk_size = chunk_size
|
| 110 |
+
self.num_videos = num_videos
|
| 111 |
+
self.dump_count = 0
|
| 112 |
+
|
| 113 |
+
def _init_video(self, video_name, dir):
|
| 114 |
+
self.path = path.join(dir, f"{video_name}.txt")
|
| 115 |
+
self.dir = dir
|
| 116 |
+
self.data = dict()
|
| 117 |
+
|
| 118 |
+
def has_video(self):
|
| 119 |
+
return self.data is not None
|
| 120 |
+
|
| 121 |
+
def dump(self):
|
| 122 |
+
logging.info(f'{self.dump_count} / {self.num_videos}: Dumping {self.path}')
|
| 123 |
+
self.dump_count += 1
|
| 124 |
+
if not path.exists(self.dir):
|
| 125 |
+
os.mkdir(self.dir)
|
| 126 |
+
features = to_segments([self.data[key] for key in sorted(self.data)])
|
| 127 |
+
with open(self.path, 'w') as fp:
|
| 128 |
+
for d in features:
|
| 129 |
+
d = [str(x) for x in d]
|
| 130 |
+
fp.write(' '.join(d) + '\n')
|
| 131 |
+
|
| 132 |
+
def _is_new_video(self, video_name, dir):
|
| 133 |
+
new_path = path.join(dir, f"{video_name}.txt")
|
| 134 |
+
if self.path != new_path and self.path is not None:
|
| 135 |
+
return True
|
| 136 |
+
|
| 137 |
+
return False
|
| 138 |
+
|
| 139 |
+
def store(self, feature, idx):
|
| 140 |
+
self.data[idx] = list(feature)
|
| 141 |
+
|
| 142 |
+
def write(self, feature, video_name, idx, dir):
|
| 143 |
+
if not self.has_video():
|
| 144 |
+
self._init_video(video_name, dir)
|
| 145 |
+
|
| 146 |
+
if self._is_new_video(video_name, dir):
|
| 147 |
+
self.dump()
|
| 148 |
+
self._init_video(video_name, dir)
|
| 149 |
+
|
| 150 |
+
self.store(feature, idx)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def get_features_loader(dataset_path, clip_length, frame_interval, batch_size, num_workers, save_dir, use_splits):
    """
    Get the data loader for extracting video features.

    Args:
        dataset_path (str): Path to the videos.
        clip_length (int): Number of frames in each input clip.
        frame_interval (int): Sampling interval between frames.
        batch_size (int): Batch size.
        num_workers (int): Number of workers used for loading videos.
        save_dir (str): Directory in which features are saved.
        use_splits (bool): Whether to use the full anomalous videos or their splits.

    Returns:
        data_loader (VideoIter): Video data loader.
        data_iter (DataLoader): Torch data loader for video feature extraction.
    """
    mean = [0.400, 0.388, 0.372]  # VAD mean and std in RGB
    std = [0.247, 0.245, 0.243]
    size = 224
    resize = size, size
    crop = size

    res = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.ResizeVideo(resize),
        transforms_video.CenterCropVideo(crop),
        transforms_video.NormalizeVideo(mean=mean, std=std)
    ])

    # Collect the videos whose features were already extracted so they can be
    # skipped. Initialized up front so VideoIter always receives a list, even
    # when save_dir does not exist yet.
    proc_v = []
    if os.path.exists(save_dir):
        for root, dirs, files in os.walk(save_dir):
            for file in files:
                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, save_dir)
                proc_v.append(relative_path)
        proc_v = [v.split(".")[0] for v in proc_v]
        if len(proc_v) > 0:
            logging.info(
                f"[Data] Already {len(proc_v)} files have been processed"
            )

    data_loader = VideoIter(
        dataset_path=dataset_path,
        proc_video=proc_v,
        clip_length=clip_length,
        frame_stride=frame_interval,
        video_transform=res,
        use_splits=use_splits,
        return_label=False,
    )

    data_iter = torch.utils.data.DataLoader(
        data_loader,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
    )

    return data_loader, data_iter


def load_VST(checkpoint, device):
    """Load the pretrained Video Swin Transformer (VST)."""
    config = 'utils/swin_config/recognition/swin/swin_base_patch244_window877_kinetics400_22k_VAD.py'
    cfg = Config.fromfile(config)
    model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.get('test_cfg'))
    load_checkpoint(model, checkpoint, map_location='cpu')

    return model.to(device)


def main():
    args = get_args()

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    torch.cuda.set_device(args.gpu)
    device = get_torch_device()
    register_logger(log_file=args.log_file)

    if args.seed is not None:
        set_random_seed(args.seed)

    cudnn.benchmark = True

    feature_path = os.path.join(args.save_dir, 'L' + str(args.clip_length))

    if not path.exists(feature_path):
        mkdir(feature_path)

    data_loader, data_iter = get_features_loader(args.dataset_path,
                                                 args.clip_length,
                                                 args.frame_interval,
                                                 args.batch_size,
                                                 args.num_workers,
                                                 feature_path,
                                                 args.use_splits)
    if data_loader.video_count == 0:
        return

    model = load_VST(args.pretrained_3d, device)

    features_writer = FeaturesWriter(num_videos=data_loader.video_count)
    loop_i = 0
    # Perform feature extraction on the dataset, one batch of clips at a time
    with torch.no_grad():
        for data, clip_idxs, dirs, vid_names in data_iter:
            outputs = model.extract_feat(data.to(device))
            outputs = outputs.mean(dim=[2, 3, 4])
            outputs = outputs.detach().cpu().numpy()

            for i, (dir, vid_name, clip_idx) in enumerate(zip(dirs, vid_names, clip_idxs)):
                if loop_i == 0:
                    logging.info(
                        f"Video {features_writer.dump_count} / {features_writer.num_videos} : writing clip {clip_idx} of video {vid_name}")

                loop_i += 1
                loop_i %= args.log_every

                dir = path.join(feature_path, dir)
                features_writer.write(feature=outputs[i],
                                      video_name=vid_name,
                                      idx=clip_idx,
                                      dir=dir)
    # Dump the remaining features to files
    features_writer.dump()


if __name__ == "__main__":
    main()
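For reference, the pooling step in `main` reduces the 5-D feature map returned by `model.extract_feat` to one vector per clip. A minimal sketch of that reduction (the shapes and the 1024-channel width of the Swin-B backbone are illustrative assumptions, not repo code):

import torch

# Stand-in for model.extract_feat(data): a (N, C, T', H', W') feature map.
backbone_out = torch.randn(8, 1024, 16, 7, 7)
clip_features = backbone_out.mean(dim=[2, 3, 4])  # spatio-temporal average pool
print(clip_features.shape)  # torch.Size([8, 1024]) -- one feature vector per clip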
utils/functional_video.py
ADDED
@@ -0,0 +1,102 @@
import torch


def _is_tensor_video_clip(clip):
    if not torch.is_tensor(clip):
        raise TypeError("clip should be Tensor. Got %s" % type(clip))

    if not clip.ndimension() == 4:
        raise ValueError("clip should be 4D. Got %dD" % clip.dim())

    return True


def crop(clip, i, j, h, w):
    """
    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
    """
    assert len(clip.size()) == 4, "clip should be a 4D tensor"
    return clip[..., i:i + h, j:j + w]


def resize(clip, target_size, interpolation_mode):
    assert len(target_size) == 2, "target size should be tuple (height, width)"
    return torch.nn.functional.interpolate(
        clip, size=target_size, mode=interpolation_mode, align_corners=False
    )


def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
    """
    Do spatial cropping and resizing to the video clip.
    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
        i (int): i in (i, j), i.e. coordinates of the upper-left corner.
        j (int): j in (i, j), i.e. coordinates of the upper-left corner.
        h (int): Height of the cropped region.
        w (int): Width of the cropped region.
        size (tuple(int, int)): Height and width of the resized clip.
    Returns:
        clip (torch.tensor): Resized and cropped clip. Size is (C, T, H, W)
    """
    assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor"
    clip = crop(clip, i, j, h, w)
    clip = resize(clip, size, interpolation_mode)
    return clip


def center_crop(clip, crop_size):
    assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor"
    h, w = clip.size(-2), clip.size(-1)
    th, tw = crop_size
    assert h >= th and w >= tw, "height and width must be no smaller than crop_size"

    i = int(round((h - th) / 2.0))
    j = int(round((w - tw) / 2.0))
    return crop(clip, i, j, th, tw)


def to_tensor(clip):
    """
    Convert tensor data type from uint8 to float, divide the values by 255.0 and
    permute the dimensions of the clip tensor.
    Args:
        clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C)
    Return:
        clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W)
    """
    _is_tensor_video_clip(clip)
    if not clip.dtype == torch.uint8:
        raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
    return clip.float().permute(3, 0, 1, 2) / 255.0


def normalize(clip, mean, std, inplace=False):
    """
    Args:
        clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W)
        mean (tuple): pixel RGB mean. Size is (3)
        std (tuple): pixel standard deviation. Size is (3)
    Returns:
        normalized clip (torch.tensor): Size is (C, T, H, W)
    """
    assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor"
    if not inplace:
        clip = clip.clone()
    mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
    std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
    clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
    return clip


def hflip(clip):
    """
    Args:
        clip (torch.tensor): Video clip to be flipped. Size is (C, T, H, W)
    Returns:
        flipped clip (torch.tensor): Size is (C, T, H, W)
    """
    assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor"
    return clip.flip(-1)
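These functional ops are presumably what the `transforms_video` classes used in `get_features_loader` wrap, mirroring torchvision's functional/transform split. A minimal usage sketch (the import path and the dummy clip are assumptions for illustration; the mean/std values come from feature_extractor.py above):

import torch
from utils import functional_video as F

raw = torch.randint(0, 256, (16, 240, 320, 3), dtype=torch.uint8)  # (T, H, W, C) frames
clip = F.to_tensor(raw)                        # float (C, T, H, W), values in [0, 1]
clip = F.resize(clip, (224, 224), "bilinear")  # resize the spatial dims
clip = F.center_crop(clip, (224, 224))
clip = F.normalize(clip, mean=[0.400, 0.388, 0.372], std=[0.247, 0.245, 0.243])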
utils/swin_config/_base_/default_runtime.py
ADDED
@@ -0,0 +1,13 @@
checkpoint_config = dict(interval=1)
log_config = dict(
    interval=20,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook'),
    ])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
utils/swin_config/_base_/models/audioonly_r50.py
ADDED
@@ -0,0 +1,18 @@
# model settings
model = dict(
    type='AudioRecognizer',
    backbone=dict(
        type='ResNetAudio',
        depth=50,
        pretrained=None,
        in_channels=1,
        norm_eval=False),
    cls_head=dict(
        type='AudioTSNHead',
        num_classes=400,
        in_channels=1024,
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/bmn_400x100.py
ADDED
@@ -0,0 +1,12 @@
# model settings
model = dict(
    type='BMN',
    temporal_dim=100,
    boundary_ratio=0.5,
    num_samples=32,
    num_samples_per_bin=3,
    feat_dim=400,
    soft_nms_alpha=0.4,
    soft_nms_low_threshold=0.5,
    soft_nms_high_threshold=0.9,
    post_process_top_k=100)
utils/swin_config/_base_/models/bsn_pem.py
ADDED
@@ -0,0 +1,13 @@
# model settings
model = dict(
    type='PEM',
    pem_feat_dim=32,
    pem_hidden_dim=256,
    pem_u_ratio_m=1,
    pem_u_ratio_l=2,
    pem_high_temporal_iou_threshold=0.6,
    pem_low_temporal_iou_threshold=0.2,
    soft_nms_alpha=0.75,
    soft_nms_low_threshold=0.65,
    soft_nms_high_threshold=0.9,
    post_process_top_k=100)
utils/swin_config/_base_/models/bsn_tem.py
ADDED
@@ -0,0 +1,8 @@
# model settings
model = dict(
    type='TEM',
    temporal_dim=100,
    boundary_ratio=0.1,
    tem_feat_dim=400,
    tem_hidden_dim=512,
    tem_match_threshold=0.5)
utils/swin_config/_base_/models/c3d_sports1m_pretrained.py
ADDED
@@ -0,0 +1,23 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='C3D',
        pretrained=  # noqa: E251
        'https://download.openmmlab.com/mmaction/recognition/c3d/c3d_sports1m_pretrain_20201016-dcc47ddc.pth',  # noqa: E501
        style='pytorch',
        conv_cfg=dict(type='Conv3d'),
        norm_cfg=None,
        act_cfg=dict(type='ReLU'),
        dropout_ratio=0.5,
        init_std=0.005),
    cls_head=dict(
        type='I3DHead',
        num_classes=101,
        in_channels=4096,
        spatial_type=None,
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='score'))
utils/swin_config/_base_/models/csn_ig65m_pretrained.py
ADDED
@@ -0,0 +1,23 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet3dCSN',
        pretrained2d=False,
        pretrained=  # noqa: E251
        'https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth',  # noqa: E501
        depth=152,
        with_pool2=False,
        bottleneck_mode='ir',
        norm_eval=False,
        zero_init_residual=False),
    cls_head=dict(
        type='I3DHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/i3d_r50.py
ADDED
@@ -0,0 +1,27 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet3d',
        pretrained2d=True,
        pretrained='torchvision://resnet50',
        depth=50,
        conv1_kernel=(5, 7, 7),
        conv1_stride_t=2,
        pool1_stride_t=2,
        conv_cfg=dict(type='Conv3d'),
        norm_eval=False,
        inflate=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
        zero_init_residual=False),
    cls_head=dict(
        type='I3DHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))

# This setting refers to https://github.com/open-mmlab/mmaction/blob/master/mmaction/models/tenons/backbones/resnet_i3d.py#L329-L332  # noqa: E501
utils/swin_config/_base_/models/r2plus1d_r34.py
ADDED
@@ -0,0 +1,28 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet2Plus1d',
        depth=34,
        pretrained=None,
        pretrained2d=False,
        norm_eval=False,
        conv_cfg=dict(type='Conv2plus1d'),
        norm_cfg=dict(type='SyncBN', requires_grad=True, eps=1e-3),
        conv1_kernel=(3, 7, 7),
        conv1_stride_t=1,
        pool1_stride_t=1,
        inflate=(1, 1, 1, 1),
        spatial_strides=(1, 2, 2, 2),
        temporal_strides=(1, 2, 2, 2),
        zero_init_residual=False),
    cls_head=dict(
        type='I3DHead',
        num_classes=400,
        in_channels=512,
        spatial_type='avg',
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/slowfast_r50.py
ADDED
@@ -0,0 +1,39 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet3dSlowFast',
        pretrained=None,
        resample_rate=8,  # tau
        speed_ratio=8,  # alpha
        channel_ratio=8,  # beta_inv
        slow_pathway=dict(
            type='resnet3d',
            depth=50,
            pretrained=None,
            lateral=True,
            conv1_kernel=(1, 7, 7),
            dilations=(1, 1, 1, 1),
            conv1_stride_t=1,
            pool1_stride_t=1,
            inflate=(0, 0, 1, 1),
            norm_eval=False),
        fast_pathway=dict(
            type='resnet3d',
            depth=50,
            pretrained=None,
            lateral=False,
            base_channels=8,
            conv1_kernel=(5, 7, 7),
            conv1_stride_t=1,
            pool1_stride_t=1,
            norm_eval=False)),
    cls_head=dict(
        type='SlowFastHead',
        in_channels=2304,  # 2048 + 256
        num_classes=400,
        spatial_type='avg',
        dropout_ratio=0.5),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/slowonly_r50.py
ADDED
@@ -0,0 +1,22 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet3dSlowOnly',
        depth=50,
        pretrained='torchvision://resnet50',
        lateral=False,
        conv1_kernel=(1, 7, 7),
        conv1_stride_t=1,
        pool1_stride_t=1,
        inflate=(0, 0, 1, 1),
        norm_eval=False),
    cls_head=dict(
        type='I3DHead',
        in_channels=2048,
        num_classes=400,
        spatial_type='avg',
        dropout_ratio=0.5),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/swin/swin_base.py
ADDED
@@ -0,0 +1,6 @@
# model settings
_base_ = "swin_tiny.py"
model = dict(backbone=dict(depths=[2, 2, 18, 2],
                           embed_dim=128,
                           num_heads=[4, 8, 16, 32]),
             cls_head=dict(in_channels=1024))
utils/swin_config/_base_/models/swin/swin_large.py
ADDED
@@ -0,0 +1,6 @@
# model settings
_base_ = "swin_tiny.py"
model = dict(backbone=dict(depths=[2, 2, 18, 2],
                           embed_dim=192,
                           num_heads=[6, 12, 24, 48]),
             cls_head=dict(in_channels=1536))
utils/swin_config/_base_/models/swin/swin_small.py
ADDED
@@ -0,0 +1,3 @@
# model settings
_base_ = "swin_tiny.py"
model = dict(backbone=dict(depths=[2, 2, 18, 2]))
utils/swin_config/_base_/models/swin/swin_tiny.py
ADDED
@@ -0,0 +1,24 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='SwinTransformer3D',
        patch_size=(4, 4, 4),
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=(8, 7, 7),
        mlp_ratio=4.,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True),
    cls_head=dict(
        type='I3DHead',
        in_channels=768,
        num_classes=18,
        spatial_type='avg',
        dropout_ratio=0.5),
    test_cfg=dict(average_clips='prob'))
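The swin_base/small/large variants above override only a few backbone and head fields of this file through mmcv's `_base_` inheritance; everything else is merged in from swin_tiny.py. A quick sketch of how the merge resolves (assuming the mmcv version pinned by the repo and that the path is run from the repo root):

from mmcv import Config

cfg = Config.fromfile('utils/swin_config/_base_/models/swin/swin_base.py')
print(cfg.model.backbone.embed_dim)   # 128, overridden by swin_base.py
print(cfg.model.backbone.patch_size)  # (4, 4, 4), inherited from swin_tiny.py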
utils/swin_config/_base_/models/swin/swin_tiny_backup.py
ADDED
@@ -0,0 +1,24 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='SwinTransformer3D',
        patch_size=(4, 4, 4),
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=(8, 7, 7),
        mlp_ratio=4.,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True),
    cls_head=dict(
        type='I3DHead',
        in_channels=768,
        num_classes=400,
        spatial_type='avg',
        dropout_ratio=0.5),
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tanet_r50.py
ADDED
@@ -0,0 +1,20 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='TANet',
        pretrained='torchvision://resnet50',
        depth=50,
        num_segments=8,
        tam_cfg=dict()),
    cls_head=dict(
        type='TSMHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.001),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tin_r50.py
ADDED
@@ -0,0 +1,21 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNetTIN',
        pretrained='torchvision://resnet50',
        depth=50,
        norm_eval=False,
        shift_div=4),
    cls_head=dict(
        type='TSMHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.001,
        is_shift=False),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips=None))
utils/swin_config/_base_/models/tpn_slowonly_r50.py
ADDED
@@ -0,0 +1,40 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet3dSlowOnly',
        depth=50,
        pretrained='torchvision://resnet50',
        lateral=False,
        out_indices=(2, 3),
        conv1_kernel=(1, 7, 7),
        conv1_stride_t=1,
        pool1_stride_t=1,
        inflate=(0, 0, 1, 1),
        norm_eval=False),
    neck=dict(
        type='TPN',
        in_channels=(1024, 2048),
        out_channels=1024,
        spatial_modulation_cfg=dict(
            in_channels=(1024, 2048), out_channels=2048),
        temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
        upsample_cfg=dict(scale_factor=(1, 1, 1)),
        downsample_cfg=dict(downsample_scale=(1, 1, 1)),
        level_fusion_cfg=dict(
            in_channels=(1024, 1024),
            mid_channels=(1024, 1024),
            out_channels=2048,
            downsample_scales=((1, 1, 1), (1, 1, 1))),
        aux_head_cfg=dict(out_channels=400, loss_weight=0.5)),
    cls_head=dict(
        type='TPNHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tpn_tsm_r50.py
ADDED
@@ -0,0 +1,36 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNetTSM',
        pretrained='torchvision://resnet50',
        depth=50,
        out_indices=(2, 3),
        norm_eval=False,
        shift_div=8),
    neck=dict(
        type='TPN',
        in_channels=(1024, 2048),
        out_channels=1024,
        spatial_modulation_cfg=dict(
            in_channels=(1024, 2048), out_channels=2048),
        temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
        upsample_cfg=dict(scale_factor=(1, 1, 1)),
        downsample_cfg=dict(downsample_scale=(1, 1, 1)),
        level_fusion_cfg=dict(
            in_channels=(1024, 1024),
            mid_channels=(1024, 1024),
            out_channels=2048,
            downsample_scales=((1, 1, 1), (1, 1, 1))),
        aux_head_cfg=dict(out_channels=174, loss_weight=0.5)),
    cls_head=dict(
        type='TPNHead',
        num_classes=174,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob', fcn_test=True))
utils/swin_config/_base_/models/trn_r50.py
ADDED
@@ -0,0 +1,22 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNet',
        pretrained='torchvision://resnet50',
        depth=50,
        norm_eval=False,
        partial_bn=True),
    cls_head=dict(
        type='TRNHead',
        num_classes=400,
        in_channels=2048,
        num_segments=8,
        spatial_type='avg',
        relation_type='TRNMultiScale',
        hidden_dim=256,
        dropout_ratio=0.8,
        init_std=0.001),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tsm_mobilenet_v2.py
ADDED
@@ -0,0 +1,22 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='MobileNetV2TSM',
        shift_div=8,
        num_segments=8,
        is_shift=True,
        pretrained='mmcls://mobilenet_v2'),
    cls_head=dict(
        type='TSMHead',
        num_segments=8,
        num_classes=400,
        in_channels=1280,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.001,
        is_shift=True),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tsm_r50.py
ADDED
@@ -0,0 +1,21 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNetTSM',
        pretrained='torchvision://resnet50',
        depth=50,
        norm_eval=False,
        shift_div=8),
    cls_head=dict(
        type='TSMHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.001,
        is_shift=True),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tsn_r50.py
ADDED
@@ -0,0 +1,19 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNet',
        pretrained='torchvision://resnet50',
        depth=50,
        norm_eval=False),
    cls_head=dict(
        type='TSNHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.4,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips=None))
utils/swin_config/_base_/models/tsn_r50_audio.py
ADDED
@@ -0,0 +1,13 @@
# model settings
model = dict(
    type='AudioRecognizer',
    backbone=dict(type='ResNet', depth=50, in_channels=1, norm_eval=False),
    cls_head=dict(
        type='AudioTSNHead',
        num_classes=400,
        in_channels=2048,
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/x3d.py
ADDED
@@ -0,0 +1,14 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(type='X3D', gamma_w=1, gamma_b=2.25, gamma_d=2.2),
    cls_head=dict(
        type='X3DHead',
        in_channels=432,
        num_classes=400,
        spatial_type='avg',
        dropout_ratio=0.5,
        fc1_bias=False),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/schedules/adam_20e.py
ADDED
@@ -0,0 +1,7 @@
# optimizer
optimizer = dict(
    type='Adam', lr=0.01, weight_decay=0.00001)  # this lr is used for 1 GPU
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=10)
total_epochs = 20
utils/swin_config/_base_/schedules/sgd_100e.py
ADDED
@@ -0,0 +1,10 @@
# optimizer
optimizer = dict(
    type='SGD',
    lr=0.01,  # this lr is used for 8 gpus
    momentum=0.9,
    weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[40, 80])
total_epochs = 100
utils/swin_config/_base_/schedules/sgd_150e_warmup.py
ADDED
@@ -0,0 +1,13 @@
# optimizer
optimizer = dict(
    type='SGD', lr=0.01, momentum=0.9,
    weight_decay=0.0001)  # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    step=[90, 130],
    warmup='linear',
    warmup_by_epoch=True,
    warmup_iters=10)
total_epochs = 150
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# optimizer
|
| 2 |
+
optimizer = dict(
|
| 3 |
+
type='SGD',
|
| 4 |
+
lr=0.01, # this lr is used for 8 gpus
|
| 5 |
+
momentum=0.9,
|
| 6 |
+
weight_decay=0.0001)
|
| 7 |
+
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
|
| 8 |
+
# learning policy
|
| 9 |
+
lr_config = dict(policy='step', step=[20, 40])
|
| 10 |
+
total_epochs = 50
|
utils/swin_config/_base_/schedules/sgd_tsm_100e.py
ADDED
@@ -0,0 +1,12 @@
# optimizer
optimizer = dict(
    type='SGD',
    constructor='TSMOptimizerConstructor',
    paramwise_cfg=dict(fc_lr5=True),
    lr=0.02,  # this lr is used for 8 gpus
    momentum=0.9,
    weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[40, 80])
total_epochs = 100
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# optimizer
|
| 2 |
+
optimizer = dict(
|
| 3 |
+
type='SGD',
|
| 4 |
+
constructor='TSMOptimizerConstructor',
|
| 5 |
+
paramwise_cfg=dict(fc_lr5=True),
|
| 6 |
+
lr=0.01, # this lr is used for 8 gpus
|
| 7 |
+
momentum=0.9,
|
| 8 |
+
weight_decay=0.0001)
|
| 9 |
+
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
|
| 10 |
+
# learning policy
|
| 11 |
+
lr_config = dict(policy='step', step=[20, 40])
|
| 12 |
+
total_epochs = 50
|
utils/swin_config/_base_/schedules/sgd_tsm_mobilenet_v2_100e.py
ADDED
@@ -0,0 +1,12 @@
# optimizer
optimizer = dict(
    type='SGD',
    constructor='TSMOptimizerConstructor',
    paramwise_cfg=dict(fc_lr5=True),
    lr=0.01,  # this lr is used for 8 gpus
    momentum=0.9,
    weight_decay=0.00002)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[40, 80])
total_epochs = 100
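In all of these schedule files, `policy='step'` multiplies the learning rate by a decay factor at each listed milestone epoch; the factor of 0.1 is an assumption here, taken from mmcv's default when no gamma is set. A rough sketch (not repo code) of the schedule the config above produces:

import math

def lr_at_epoch(epoch, base_lr=0.01, steps=(40, 80), gamma=0.1):
    # Step policy: divide the lr by 10 at each milestone epoch passed so far.
    return base_lr * gamma ** sum(epoch >= s for s in steps)

assert lr_at_epoch(10) == 0.01              # before the first milestone
assert math.isclose(lr_at_epoch(50), 1e-3)  # after epoch 40
assert math.isclose(lr_at_epoch(90), 1e-4)  # after epoch 80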