erkutt committed on
Commit 28e129b · verified · 1 parent: e6a6dfb

Upload open source code of MTFL model


MTFL: Multi-Timescale Feature Learning for Weakly-supervised Anomaly Detection in Surveillance Videos
https://arxiv.org/abs/2410.05900

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +1 -0
  3. README.md +238 -3
  4. detection/dataset.py +117 -0
  5. detection/model.py +296 -0
  6. detection/option.py +56 -0
  7. detection/test.py +168 -0
  8. detection/train.py +188 -0
  9. figures/Intro.png +3 -0
  10. recognition/dataset.py +140 -0
  11. recognition/model.py +295 -0
  12. recognition/option.py +56 -0
  13. recognition/test.py +120 -0
  14. recognition/train.py +171 -0
  15. requirements.txt +10 -0
  16. utils/feature_extractor.py +284 -0
  17. utils/functional_video.py +102 -0
  18. utils/swin_config/_base_/default_runtime.py +13 -0
  19. utils/swin_config/_base_/models/audioonly_r50.py +18 -0
  20. utils/swin_config/_base_/models/bmn_400x100.py +12 -0
  21. utils/swin_config/_base_/models/bsn_pem.py +13 -0
  22. utils/swin_config/_base_/models/bsn_tem.py +8 -0
  23. utils/swin_config/_base_/models/c3d_sports1m_pretrained.py +23 -0
  24. utils/swin_config/_base_/models/csn_ig65m_pretrained.py +23 -0
  25. utils/swin_config/_base_/models/i3d_r50.py +27 -0
  26. utils/swin_config/_base_/models/r2plus1d_r34.py +28 -0
  27. utils/swin_config/_base_/models/slowfast_r50.py +39 -0
  28. utils/swin_config/_base_/models/slowonly_r50.py +22 -0
  29. utils/swin_config/_base_/models/swin/swin_base.py +6 -0
  30. utils/swin_config/_base_/models/swin/swin_large.py +6 -0
  31. utils/swin_config/_base_/models/swin/swin_small.py +3 -0
  32. utils/swin_config/_base_/models/swin/swin_tiny.py +24 -0
  33. utils/swin_config/_base_/models/swin/swin_tiny_backup.py +24 -0
  34. utils/swin_config/_base_/models/tanet_r50.py +20 -0
  35. utils/swin_config/_base_/models/tin_r50.py +21 -0
  36. utils/swin_config/_base_/models/tpn_slowonly_r50.py +40 -0
  37. utils/swin_config/_base_/models/tpn_tsm_r50.py +36 -0
  38. utils/swin_config/_base_/models/trn_r50.py +22 -0
  39. utils/swin_config/_base_/models/tsm_mobilenet_v2.py +22 -0
  40. utils/swin_config/_base_/models/tsm_r50.py +21 -0
  41. utils/swin_config/_base_/models/tsn_r50.py +19 -0
  42. utils/swin_config/_base_/models/tsn_r50_audio.py +13 -0
  43. utils/swin_config/_base_/models/x3d.py +14 -0
  44. utils/swin_config/_base_/schedules/adam_20e.py +7 -0
  45. utils/swin_config/_base_/schedules/sgd_100e.py +10 -0
  46. utils/swin_config/_base_/schedules/sgd_150e_warmup.py +13 -0
  47. utils/swin_config/_base_/schedules/sgd_50e.py +10 -0
  48. utils/swin_config/_base_/schedules/sgd_tsm_100e.py +12 -0
  49. utils/swin_config/_base_/schedules/sgd_tsm_50e.py +12 -0
  50. utils/swin_config/_base_/schedules/sgd_tsm_mobilenet_v2_100e.py +12 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+figures/Intro.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+/test_videos/
README.md CHANGED
@@ -1,3 +1,238 @@
- ---
- license: cc-by-4.0
- ---

# MTFL

This repo is the official PyTorch implementation of our paper:

> [**MTFL: Multi-Timescale Feature Learning for Weakly-supervised Anomaly Detection in Surveillance Videos**](https://arxiv.org/abs/2410.05900)
>
<!--Author list-->

## Introduction
![intro](figures/Intro.png)

Detection of anomalous events is relevant for public safety and requires a combination of fine-grained motion information and long-time action recognition. Therefore, we propose a Multi-Timescale Feature Learning (MTFL) method to enhance the representation of anomaly features. We employ short, medium, and long temporal tubelets to extract spatio-temporal video features using the Video Swin Transformer. Experimental results demonstrate that MTFL outperforms state-of-the-art methods on the UCF-Crime dataset, achieving an anomaly detection performance of 89.78% AUC. Moreover, it achieves 95.32% AUC on ShanghaiTech and 84.57% AP on XD-Violence, complementing several SotA results. Building upon MTFL, we also propose an anomaly recognition network that employs partial features for classification, achieving a leading accuracy on UCF-Crime and outperforming the existing recognition literature. Furthermore, we introduce an extended dataset for UCF-Crime, namely the Video Anomaly Detection Dataset (VADD), involving 2,591 videos in 18 classes with extensive coverage of realistic anomalies.

## Models and Dataset
### [Video Anomaly Detection Dataset (VADD)](https://form.jotform.com/240714220958354)

VADD includes 2,591 videos with a frame rate of 30 fps and a resolution of 320×240 pixels, split into 2,202 train and 389 test videos. The subfolders in VADD are named according to video categories, totaling 18 subfolders. Train-set annotations only include a class label, while test-set annotations contain a video class label, the number of frames in the video, and the starting and ending frame positions of the abnormal events in the video.
```
# Training annotation
[Subfolder/video name] [video label]
# Test annotation
[Subfolder/video name] [video label] [total frames] [start_frame1] [end_frame1] [start_frame2]...
```
* Taking a training video containing littering as an example, it is annotated as below:
```
Littering/CarSafe015.mp4 Littering
```
* Taking a test video containing dangerous throwing behavior as an example, its annotation indicates that the video has a total of 636 frames and there are two instances of dangerous throwing behavior. The first instance occurs between frames 145 and 186, while the second occurs between frames 289 and 340.
```
DangerousThrowing/BicyclistDangerous039.mp4 DangerousThrowing 636 145 186 289 340
```
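For clarity, the following is a minimal sketch (not part of the repository; the helper name is made up) of how such a test-annotation line can be turned into a frame-level ground-truth mask. The repository's own parsing lives in 'detection/dataset.py' and 'detection/test.py', where -1/-1 pairs mark the absence of an event.
```
import numpy as np

def annotation_to_mask(line: str) -> np.ndarray:
    """Parse a VADD-style test annotation line into a per-frame 0/1 anomaly mask."""
    items = line.split()
    num_frames = int(items[2])            # third field: total frames
    bounds = [int(x) for x in items[3:]]  # remaining fields: start/end pairs
    mask = np.zeros(num_frames, dtype=np.float32)
    for start, end in zip(bounds[0::2], bounds[1::2]):
        if start != -1 and end != -1:     # -1/-1 pairs mean "no event"
            mask[start:end] = 1.0
    return mask

mask = annotation_to_mask(
    "DangerousThrowing/BicyclistDangerous039.mp4 DangerousThrowing 636 145 186 289 340")
print(int(mask.sum()))  # 92 anomalous frames: (186-145) + (340-289)
```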

Additionally, to train and test our MTFL with benchmark datasets, we converted the annotation files of other datasets, including ShanghaiTech, XD-Violence, and UCF-Crime, to match the format of the VADD annotation files.

All train and test annotation files for AnomalyDetection and AnomalyRecognition are provided in the ["Annotation" folder](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/annotation?csf=1&web=1&e=UYxR0H).

### [MTFL checkpoints for anomaly detection](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/MTFL-checkpoints?csf=1&web=1&e=hJhPgh)

| Detection Checkpoint | Feature | UCF | Shanghai<br/>Tech | XD-Violence | VADD |
|----------------------|---------|-----|-------------------|-------------|------|
| MTFL_VST_Kinetics400 | VST_RGB | 87.61 | 95.32 | 84.57 | - |
| MTFL_VST_VADD | VST<sub>Aug</sub>_RGB | 89.79 | 95.70 | 79.40 | 88.42 |

There are several MTFL checkpoints for anomaly detection, using different feature extractors and datasets, where:
* xxx_VST_Kinetics400 = features extracted using VST pretrained on Kinetics400,
* xxx_VST_VADD = features extracted using VST pretrained on VADD with data augmentation,
* MTFL-yyy-VST-Kinetics400 = MTFL models trained with VST_RGB features,
* MTFL-yyy-VST-VADD = MTFL models trained with VST<sub>Aug</sub>_RGB features,

with xxx = Shanghai, VADD, and XD, and yyy = SH, VADD-UCF, and XD.

The two feature extractors used in our detection models and the resulting features of the benchmark datasets are provided below:
* [Video Swin Transformer pretrained on Kinetics-400](https://tuenl-my.sharepoint.com/:u:/r/personal/e_akdag_tue_nl/Documents/MTFL/feature-extractors/AnomalyDetection/swin_base_patch244_window877_kinetics400_22k.pth?csf=1&web=1&e=8spheA)
* [Video Swin Transformer pretrained on VADD](https://tuenl-my.sharepoint.com/:u:/r/personal/e_akdag_tue_nl/Documents/MTFL/feature-extractors/AnomalyDetection/VST_swin_base_patch244_window877_VADD.pth?csf=1&web=1&e=AzfewH)
* [VST_RGB features of UCF-Crime, XD-Violence, ShanghaiTech, and VADD](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/features/AnomalyDetection?csf=1&web=1&e=CT8WZ3)
* [VST<sub>Aug</sub>_RGB features of UCF-Crime, XD-Violence, ShanghaiTech, and VADD](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/features/AnomalyDetection?csf=1&web=1&e=CT8WZ3)

The Video Swin Transformer model pretrained on Kinetics400 and the training method for the Video Swin Transformer are derived from the [Video-Swin-Transformer repository](https://github.com/SwinTransformer/Video-Swin-Transformer).

### [MTFL checkpoints for anomaly recognition](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/MTFL-checkpoints/AnomalyRecognition?csf=1&web=1&e=NOkpNn)

| Recognition Checkpoint | UCF Acc(%) | VADD Acc(%) |
|-------------------------------|------------|------------|
| MTFL_VADDsplit1_best_UCF | 39.88 | - |
| MTFL_VADDsplit1_best_VADD | - | 45.87 |
| MTFL_VADDsplit2_best_UCF | 47.02 | - |
| MTFL_VADDsplit2_best_VADD | - | 49.31 |
| MTFL_VADDsplit3_best_UCF | 49.40 | - |
| MTFL_VADDsplit3_best_VADD | - | 53.88 |
| MTFL_VADDsplit4_best_UCF_VADD | 45.83 | 52.29 |
| 4-fold average | 45.53 | 50.34 |

Following the experimental setup of 4-fold cross-validation from [Sultani et al.](https://arxiv.org/abs/1801.04264), there are seven recognition checkpoints, obtained by separately saving the checkpoint that performed best on UCF and on VADD while training on the different VADD splits, as shown in the table above. For example, MTFL_VADDsplit1_best_UCF is the MTFL recognition model trained on VADD split 1 with the best recognition performance on the UCF-Crime split 1 test set. All models use a VST trained on the corresponding VADD split for feature extraction.

* [Feature extractors used for anomaly recognition](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/feature-extractors/AnomalyRecognition?csf=1&web=1&e=ToseKM)
* [Generated features for anomaly recognition](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/features/AnomalyRecognition?csf=1&web=1&e=4nbEUm)

## Environment setup
```
pip install -r requirements.txt
```
## Folder Structure
```
demo/
├── detection/               # MTFL detection
│   └── ...
├── recognition/             # MTFL recognition
│   └── ...
├── utils/
│   ├── swin_config/         # VST config for loading the feature extractor
│   │   └── ...
│   ├── feature_extractor.py
│   ├── ...
│   └── video_preprocessing/ # scripts for annotation and unifying video format
│       └── ...
├── test_videos/             # put your test videos here
├── Annotation/              # put your annotations here
├── features/                # feature path
│   ├── L8
│   ├── L32
│   └── L64
├── results/
│   ├── AUC                  # detection AUC
│   ├── scores               # detection scores
│   └── rec_results          # recognition labels
└── README.md
```

## Feature Extraction
Both the recognition and detection models require multi-timescale features extracted with tubelets of 8, 32, and 64 frames. To extract features, place your videos in the 'test_videos' directory and run:
```
python utils/feature_extractor.py --clip_length [8/32/64]
```
In the default settings, test videos should be stored in the 'test_videos' directory, and the extracted features are organized within the 'features' folder following the same directory structure as 'test_videos'. For example, the features of video 'test_videos/A/B.mp4' extracted with a clip length of 8 are saved as 'features/L8/A/B.txt'.

You can modify the parameters inside the "VST Feature Extractor Parser" as needed. For example, you can change the input video path, the save path of the features, and the pretrained feature extractor by specifying the model path:
```
python utils/feature_extractor.py --clip_length [8/32/64] --dataset_path [your video path] --save_dir [your feature path] --pretrained_3d [model path]
```
Note: if you use the VST pretrained on Kinetics400, you need to change `num_classes` to 400 in line 21 of 'utils/swin_config/_base_/models/swin/swin_tiny.py' to match the model size. For the VST pretrained on VADD, `num_classes` is 18. These settings follow the guidelines provided by the [Video-Swin-Transformer repository](https://github.com/SwinTransformer/Video-Swin-Transformer).
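Since all three timescales are needed, the extractor has to be run once per clip length. A small driver sketch, illustrative only, that shells out to the command above with its documented flags (adjust the paths to your setup):
```
import subprocess

# Run the extractor once per tubelet length so that
# features/L8, features/L32 and features/L64 are all populated.
for clip_length in (8, 32, 64):
    subprocess.run(
        ["python", "utils/feature_extractor.py",
         "--clip_length", str(clip_length),
         "--dataset_path", "test_videos",
         "--save_dir", "features"],
        check=True,
    )
```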
## Anomaly Detection
### Inference
To test a detection checkpoint on your test videos, run:
```
python detection/test.py --test_anno [your_anno_file.txt] --detection_model [checkpoint path]
```

In the default settings:

* Test videos should be stored in the 'test_videos' directory.
* The corresponding annotation file needs to be placed in the 'annotation' folder. The annotation format can be found under Video Preprocessing -> Annotation.
* Multi-temporal-scale features of the videos should be stored in the 'features' directory. See Feature Extraction.

The detection AUC and the scores for each video are generated within the 'results' folder. The directory structure of the generated results, for both 'results/AUC' and 'results/scores', mirrors the structure of the corresponding test videos in the 'test_videos' directory. For example, the score plot of video 'test_videos/A/B.mp4' is saved as 'results/scores/A/B.png'.

If you want to change the paths to the input and output data or any running configs, feel free to change the args in 'detection/option.py'.

### Train
To train a detection model, run:
```
python detection/train.py --train_anno [your_train_anno_file.txt] --test_anno [your_test_anno_file.txt]
--lf_dir [path to long frame length features] --mf_dir [path to medium frame length features] --sf_dir
[path to short frame length features] --save_models [path for saving checkpoints] --output_dir [path for saving checkpoint AUC]
```

Other training parameters can be found in 'detection/option.py'.

## Anomaly Recognition
### Inference
To test a recognition checkpoint on your test videos, run:
```
python recognition/test.py --test_anno [your_anno_file.txt] --recognition_model [checkpoint path]
```

The default settings are the same as for detection, and the modifiable parameters are in 'recognition/option.py'. The recognition results of all inputs are saved as 'results/rec_results/output_pred.txt'.

### Train
To train a recognition model, run:
```
python recognition/train.py --train_anno [your_train_anno_file.txt] --test_anno [your_test_anno_file.txt]
--lf_dir [path to long frame length features] --mf_dir [path to medium frame length features] --sf_dir
[path to short frame length features] --save_models [path for saving checkpoints] --output_dir [path for saving checkpoint AUC]
```

Note: following the experimental setup of 4-fold cross-validation from [Sultani et al.](https://arxiv.org/abs/1801.04264), there are four pairs of training and testing annotation files corresponding to the four splits for each dataset, which are provided in the "annotation" folder accessible through the VADD link above. Make sure the training and testing files belong to the same split; otherwise, data leakage occurs. Other training parameters can be found in 'recognition/option.py'.

## Acknowledgement

Parts of the code are adapted from
[Video-Swin-Transformer](https://github.com/SwinTransformer/Video-Swin-Transformer)
and [RTFM](https://github.com/tianyu0207/RTFM).
<!--## Citation

If you find this repo useful for your research, please consider citing our paper:-->
detection/dataset.py ADDED
@@ -0,0 +1,117 @@
import torch.utils.data as data
import os
import torch
torch.set_default_tensor_type('torch.FloatTensor')


def read_features(feature_path):
    """
    Read features from a text file and convert them into a torch tensor.

    Args:
        feature_path (str): Path to the text file containing features.

    Returns:
        features (torch.Tensor): A tensor containing the features. Shape is T x C.
    """
    with open(feature_path, 'r') as file:
        lines = file.readlines()
    features = []
    for line in lines:
        feature = [float(value) for value in line.strip().split()]
        features.append(feature)
    features = torch.tensor(features).float()  # T x C
    return features


class Dataset(data.Dataset):
    def __init__(self, args, is_normal=True, transform=None, test_mode=False):
        """
        Custom dataset class for loading features and labels.

        Args:
            args: Argument object containing paths and options.
            is_normal (bool): Whether the dataset represents normal samples.
            transform: Data transformation to be applied.
            test_mode (bool): Whether the dataset is for testing.

        Attributes:
            is_normal (bool): Whether the dataset represents normal samples.
            transform: Data transformation to be applied.
            test_mode (bool): Whether the dataset is for testing.
            list (list): List of feature paths and labels information.
        """
        self.is_normal = is_normal
        self.transform = transform
        self.test_mode = test_mode

        if self.test_mode:
            annotation_path = args.test_anno
        else:
            annotation_path = args.train_anno

        self.list = self._get_features_list(args.lf_dir, args.mf_dir, args.sf_dir, annotation_path)

    def __getitem__(self, index):
        label = self.get_label()
        if self.test_mode:
            lf_path, mf_path, sf_path, num_frames, start_end_couples, file = self.list[index]
            l_features = read_features(lf_path)
            m_features = read_features(mf_path)
            s_features = read_features(sf_path)
            return l_features, m_features, s_features, label, start_end_couples, num_frames, file
        else:
            lf_path, mf_path, sf_path = self.list[index]
            l_features = read_features(lf_path)
            m_features = read_features(mf_path)
            s_features = read_features(sf_path)
            return l_features, m_features, s_features, label

    def get_label(self):
        if self.is_normal:
            label = torch.tensor(0.0)
        else:
            label = torch.tensor(1.0)

        return label

    def __len__(self):
        return len(self.list)

    def _get_features_list(self, lf_dir, mf_dir, sf_dir, annotation_path):
        """
        Generate a list of features and labels information from annotations.

        Args:
            lf_dir (str): Path to long-frame-length features directory.
            mf_dir (str): Path to medium-frame-length features directory.
            sf_dir (str): Path to short-frame-length features directory.
            annotation_path (str): Path to annotation file.

        Returns:
            list: A list of tuples containing features and labels information.
        """
        assert os.path.exists(lf_dir)
        assert os.path.exists(mf_dir)
        assert os.path.exists(sf_dir)
        features_list = []
        with open(annotation_path) as f:
            lines = f.read().splitlines(keepends=False)
            for line in lines:
                items = line.split()
                # file = items[0].split(".")[0] for XD
                file, ext = os.path.splitext(items[0])
                file = file.replace("/", os.sep)
                lf_path = os.path.join(lf_dir, file + '.txt')
                mf_path = os.path.join(mf_dir, file + '.txt')
                sf_path = os.path.join(sf_dir, file + '.txt')
                cls_name = items[1]
                if self.test_mode:
                    start_end_couples = [int(x) for x in items[3:]]
                    num_frames = int(items[2])
                    features_list.append((lf_path, mf_path, sf_path, num_frames, start_end_couples, file))
                elif ("Normal" == cls_name) == self.is_normal:
                    features_list.append((lf_path, mf_path, sf_path))

        return features_list
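read_features() above expects a plain-text T x C matrix with one whitespace-separated feature vector per line. An illustrative way to write a compatible file (the array contents and path are made up):
```
import numpy as np

# Save a T x C snippet-feature matrix in the plain-text layout
# that read_features() parses (one feature vector per line).
features = np.random.rand(32, 1024).astype(np.float32)  # 32 snippets, 1024-dim VST features
np.savetxt("features/L8/A/B.txt", features, fmt="%.6f")
```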
detection/model.py ADDED
@@ -0,0 +1,296 @@
""" Reference source: https://github.com/tianyu0207/RTFM"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as torch_init
torch.set_default_tensor_type('torch.FloatTensor')


def weight_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1 or classname.find('Linear') != -1:
        torch_init.xavier_uniform_(m.weight)
        if m.bias is not None:
            m.bias.data.fill_(0)


class CVA(nn.Module):
    def __init__(self, input_dim=1024):
        """
        Cross-View Attention (CVA) module.

        Args:
            input_dim (int): Dimension of the input features.
        """
        super(CVA, self).__init__()
        drop_out_rate = 0.1
        num_heads = 4
        self.cross_attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, dropout=drop_out_rate,
                                                     device='cuda')

    def forward(self, feature1, feature2):
        """
        Args:
            feature1 (torch.Tensor): one path features. Shape: B x T x C.
            feature2 (torch.Tensor): another path features. Shape: B x T x C.

        Returns:
            out1 (torch.Tensor): Processed features after cross-attention. Shape: B x T x C.
        """

        feature1 = F.layer_norm(feature1, [feature1.size(-1)])
        feature2 = F.layer_norm(feature2, [feature2.size(-1)])
        feature1 = feature1.permute(1, 0, 2)  # T B C
        feature2 = feature2.permute(1, 0, 2)

        out1, _ = self.cross_attention(query=feature1, key=feature2, value=feature2)  # T B C (For test: 32 1 1024)
        out1 = out1 + feature1  # residual connection

        return out1  # B T C


class Aggregate(nn.Module):
    def __init__(self, input_dim):
        """
        An aggregate network including local temporal correlation learning, global temporal correlation learning,
        and feature fusion in MTFF.

        Args:
            input_dim (int): input features dim.
        """
        super(Aggregate, self).__init__()
        bn = nn.BatchNorm1d
        num_heads = 4
        self.input_dim = input_dim
        self.conv_1 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=1, padding=1),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_2 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=2, padding=2),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_3 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=4, padding=4),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_4 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim * 3, out_channels=512, kernel_size=1,
                      stride=1, padding=0, bias=False),
            nn.LeakyReLU(negative_slope=5e-2),
        )
        self.conv_5 = nn.Sequential(
            nn.Conv1d(in_channels=2048, out_channels=input_dim, kernel_size=3,
                      stride=1, padding=1, bias=False),
            nn.LeakyReLU(negative_slope=5e-2),
            nn.BatchNorm1d(input_dim),
        )
        self.self_attention = nn.MultiheadAttention(embed_dim=512, num_heads=num_heads,
                                                    dropout=0.1, device='cuda')

    def forward(self, input1, input2, input3):
        """
        Args:
            input1 (torch.Tensor): long-frame-length features. Shape: T x B x C.
            input2 (torch.Tensor): medium-frame-length features. Shape: T x B x C.
            input3 (torch.Tensor): short-frame-length features. Shape: T x B x C.

        Returns:
            torch.Tensor: Processed and fused output features. Shape: B x T x C.
        """
        x1 = input1.permute(1, 2, 0)  # B C T
        x2 = input2.permute(1, 2, 0)
        x3 = input3.permute(1, 2, 0)
        tensor_list = [x1, x2, x3]

        residual = torch.mean(torch.stack(tensor_list), dim=0)

        out1 = self.conv_1(x1)  # B C/2 T
        out2 = self.conv_2(x2)
        out3 = self.conv_3(x3)
        x = torch.cat([out1, out2, out3], dim=1)  # B 3C/2 T

        feature = torch.cat((x1, x2, x3), dim=1)
        out = self.conv_4(feature)
        out = out.permute(2, 0, 1)  # T B C/2
        out = F.layer_norm(out, normalized_shape=[out.size(-1)])
        out, _ = self.self_attention(out, out, out)  # T B C/2
        out = out.permute(1, 2, 0)  # B C/2 T
        out = torch.cat((x, out), dim=1)  # B 2C T
        out = self.conv_5(out)  # fuse all the features together
        out = out + residual
        out = out.permute(0, 2, 1)

        return out


class Encoder(nn.Module):
    def __init__(self, input_dim=1024, seg_num=32):
        """
        Multi-Temporal Feature Fusion (MTFF) module.

        Args:
            input_dim (int): Dimension of the input features.
            seg_num (int): Number of snippets in a video.
        """
        super(Encoder, self).__init__()
        self.drop_out_rate = 0.1
        self.input_dim = input_dim
        self.min_temporal_dim = seg_num
        self.CVA1 = CVA(input_dim=input_dim)
        self.CVA2 = CVA(input_dim=input_dim)
        self.CVA3 = CVA(input_dim=input_dim)

        self.aggregate = Aggregate(input_dim=input_dim)

    def forward(self, feature1, feature2, feature3):
        """
        Args:
            feature1 (torch.Tensor): long-frame-length features. Shape: B x T x C.
                (Batch size x The number of snippets x Input dimensions)
            feature2 (torch.Tensor): medium-frame-length features. Shape: B x T x C.
            feature3 (torch.Tensor): short-frame-length features. Shape: B x T x C.

        Returns:
            torch.Tensor: Fused and processed output features. Shape: B x T x C.
        """

        att1 = self.CVA1(feature1, feature2)
        att2 = self.CVA2(feature2, feature3)
        att3 = self.CVA3(feature3, feature1)

        out1 = self.aggregate(att1, att2, att3)  # B T C

        return out1


class Model(nn.Module):
    def __init__(self, feature_dim, batch_size, seg_num=32):
        """
        Multi-Timescale Feature Learning (MTFL) model.

        Args:
            feature_dim (int): Dimension of the input features.
            batch_size (int): Batch size.
            seg_num (int): Number of snippets in a video.
        """
        super(Model, self).__init__()
        self.batch_size = batch_size
        self.num_segments = seg_num
        self.k_abn = self.num_segments // 10  # select 3 snippets
        self.k_nor = self.num_segments // 10

        self.Encoder = Encoder(input_dim=feature_dim, seg_num=seg_num)

        # Fully connected layers for scoring
        self.fc1 = nn.Linear(feature_dim, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 1)

        self.drop_out = nn.Dropout(0.2)
        self.relu = nn.LeakyReLU(negative_slope=5e-2)
        self.sigmoid = nn.Sigmoid()
        self.apply(weight_init)

    def forward(self, input1, input2, input3):
        """
        Args:
            input1 (torch.Tensor): long-frame-length features. Shape: B x T x feature_dim.
            input2 (torch.Tensor): medium-frame-length features. Shape: B x T x feature_dim.
            input3 (torch.Tensor): short-frame-length features. Shape: B x T x feature_dim.

        Returns:
            score_abnormal (torch.Tensor): The mean scores for top-3 abnormal instances.
            score_normal (torch.Tensor): The mean scores for top-3 normal instances.
            feat_select_abn (torch.Tensor): Selected abnormal features.
            feat_select_normal (torch.Tensor): Selected normal features.
            scores (torch.Tensor): All computed scores. Shape: B x T x 1
        """
        k_abn = self.k_abn
        k_nor = self.k_nor
        ncrops = 1  # Reserving the parameter for spatial cropping, which is not used and defaults to 1

        # Multi-Temporal Feature Fusion
        out = self.Encoder(input1, input2, input3)
        bs, t, f = out.size()
        features = self.drop_out(out)  # B T D

        # Scoring layers
        scores = self.relu(self.fc1(features))
        scores = self.drop_out(scores)
        scores = self.relu(self.fc2(scores))
        scores = self.drop_out(scores)
        scores = self.sigmoid(self.fc3(scores))
        scores = scores.view(bs, ncrops, -1).mean(1)
        scores = scores.unsqueeze(dim=2)

        # Split normal and abnormal instances
        normal_features = features[0:self.batch_size]
        normal_scores = scores[0:self.batch_size]
        abnormal_features = features[self.batch_size:]
        abnormal_scores = scores[self.batch_size:]

        # Compute feature magnitudes
        feat_magnitudes = torch.norm(features, p=2, dim=2)
        feat_magnitudes = feat_magnitudes.view(bs, ncrops, -1).mean(1)
        nfea_magnitudes = feat_magnitudes[0:self.batch_size]  # normal feature magnitudes
        afea_magnitudes = feat_magnitudes[self.batch_size:]  # abnormal feature magnitudes
        n_size = nfea_magnitudes.shape[0]

        # Inference mode for batch size 1
        if nfea_magnitudes.shape[0] == 1:
            afea_magnitudes = nfea_magnitudes
            abnormal_scores = normal_scores
            abnormal_features = normal_features

        select_idx = torch.ones_like(nfea_magnitudes)
        select_idx = self.drop_out(select_idx)

        ####### process abnormal videos -> select top3 feature magnitude #######
        afea_magnitudes_drop = afea_magnitudes * select_idx
        idx_abn = torch.topk(afea_magnitudes_drop, k_abn, dim=1)[1]
        idx_abn_feat = idx_abn.unsqueeze(2).expand([-1, -1, abnormal_features.shape[2]])

        abnormal_features = abnormal_features.view(n_size, ncrops, t, f)  # B X N X T X F
        abnormal_features = abnormal_features.permute(1, 0, 2, 3)  # N X B X T X F

        total_select_abn_feature = torch.zeros(0, device=input1.device)
        for abnormal_feature in abnormal_features:
            feat_select_abn = torch.gather(abnormal_feature, 1, idx_abn_feat)  # top 3 features magnitude in abnormal bag
            total_select_abn_feature = torch.cat((total_select_abn_feature, feat_select_abn))

        idx_abn_score = idx_abn.unsqueeze(2).expand([-1, -1, abnormal_scores.shape[2]])
        # top 3 scores in abnormal bag based on the top-3 magnitude
        score_abnormal = torch.mean(torch.gather(abnormal_scores, 1, idx_abn_score), dim=1)

        ####### process normal videos -> select top3 feature magnitude #######
        select_idx_normal = torch.ones_like(nfea_magnitudes)
        select_idx_normal = self.drop_out(select_idx_normal)
        nfea_magnitudes_drop = nfea_magnitudes * select_idx_normal
        idx_normal = torch.topk(nfea_magnitudes_drop, k_nor, dim=1)[1]
        idx_normal_feat = idx_normal.unsqueeze(2).expand([-1, -1, normal_features.shape[2]])

        normal_features = normal_features.view(n_size, ncrops, t, f)
        normal_features = normal_features.permute(1, 0, 2, 3)  # 1 B T D

        total_select_nor_feature = torch.zeros(0, device=input1.device)
        for nor_fea in normal_features:
            feat_select_normal = torch.gather(nor_fea, 1, idx_normal_feat)  # top 3 features magnitude in normal bag (hard negative)
            total_select_nor_feature = torch.cat((total_select_nor_feature, feat_select_normal))

        idx_normal_score = idx_normal.unsqueeze(2).expand([-1, -1, normal_scores.shape[2]])
        score_normal = torch.mean(torch.gather(normal_scores, 1, idx_normal_score), dim=1)  # top 3 scores in normal bag

        feat_select_abn = total_select_abn_feature
        feat_select_normal = total_select_nor_feature

        return score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores
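For orientation, a minimal forward-pass sketch with random inputs. This is illustrative only; it assumes a CUDA device (the attention layers above are constructed with device='cuda') and that the detection/ folder is on the Python path, as in detection/test.py.
```
import torch
from model import Model  # detection/model.py

batch_size, seg_num, feat_dim = 2, 32, 1024  # toy sizes; feat_dim matches the VST features
model = Model(feature_dim=feat_dim, batch_size=batch_size, seg_num=seg_num).cuda()

# Training-style input: the first `batch_size` videos are normal, the rest abnormal.
long_f, mid_f, short_f = (torch.randn(2 * batch_size, seg_num, feat_dim, device='cuda')
                          for _ in range(3))
score_abn, score_nor, feat_abn, feat_nor, scores = model(long_f, mid_f, short_f)
print(scores.shape)     # torch.Size([4, 32, 1]) -> one anomaly score per snippet
print(score_abn.shape)  # torch.Size([2, 1])     -> mean of the top-3 snippet scores per abnormal video
```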
detection/option.py ADDED
@@ -0,0 +1,56 @@
import argparse

############ Test args ########################
test_parser = argparse.ArgumentParser(description='MTFL_detection_test')
# input path
test_parser.add_argument('--lf_dir', type=str, default='features/L64', help='long frame length feature path')
test_parser.add_argument('--mf_dir', type=str, default='features/L32', help='media frame length feature path')
test_parser.add_argument('--sf_dir', type=str, default='features/L8', help='short frame length feature path')
test_parser.add_argument('--test_anno', default='annotation/Anomaly_videos.txt', help='test annotation file')
test_parser.add_argument('--detection_model', default='/media/DataDrive/yiling/Test/models/MTFL/MTFL-vst-VAD.pkl',
                         help='model path')
# output path
test_parser.add_argument('--output_dir', default='results',
                         help='The path to store the generated scores and AUC results')
# feature size depending on which feature extractor used
test_parser.add_argument('--feature_size', type=int, default=1024, help='feature dim (default: VST feature)')
test_parser.add_argument('--seg_num', type=int, default=32, help='the number of snippets')
# running cfg
test_parser.add_argument('--gpu', default="0", type=str, choices=["0", "1"], help='gpu')
test_parser.add_argument('--workers', default=8, help='number of workers in dataloader')


############ Train args ########################
train_parser = argparse.ArgumentParser(description='MTFL_detection_train')
# input path
train_parser.add_argument('--lf_dir', type=str, default='/media/DataDrive/yiling/features/VST_VAD_MT/L64R1',
                          help='long feature path')
train_parser.add_argument('--mf_dir', type=str, default='/media/DataDrive/yiling/features/VST_VAD_MT/L32R1',
                          help='media feature path')
train_parser.add_argument('--sf_dir', type=str, default='/media/DataDrive/yiling/features/VST_VAD_MT/L8R1',
                          help='short feature path')
train_parser.add_argument('--train_anno', default='/media/DataDrive/yiling/annotation/VAD_train_annotation.txt',
                          help='the annotation file for training')
train_parser.add_argument('--test_anno', default='/media/DataDrive/yiling/annotation/UCF_test_annotation_with_frames.txt',
                          help='the annotation file for test')
# output path and saving info
train_parser.add_argument('--model-name', default='MTFL', help='name to save model')
train_parser.add_argument('--save_models', default='/media/DataDrive/yiling/models/demo/detection',
                          help='the path for saving models')
train_parser.add_argument('--output_dir', default='/media/DataDrive/yiling/results/demo/detection',
                          help='The path to store AUC results')
# training cfg and paras
train_parser.add_argument('--gpu', default="0", type=str, choices=["0", "1"], help='gpu id')
train_parser.add_argument('--feature_size', type=int, default=1024, help='feature dim (default: VST feature)')
train_parser.add_argument('--seg_num', type=int, default=32, help='the number of snippets')
train_parser.add_argument('--lr', type=float, default='0.0001', help='learning rates for steps(list form)')
train_parser.add_argument('--batch-size', type=int, default=64, help='batch size')
train_parser.add_argument('--workers', type=int, default=8, help='number of workers in dataloader')
train_parser.add_argument('--max-epoch', type=int, default=2000, help='maximum iteration to train (default: 100)')
train_parser.add_argument('--metric', type=str, choices=["AP", "AUC"], default="AUC", help='the used metric')
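detection/test.py and detection/train.py read their configuration from these parsers, so the machine-specific default paths above are meant to be overridden on the command line or edited. A quick, illustrative way to inspect the effective defaults (run from inside detection/, as the scripts do):
```
import option

args = option.test_parser.parse_args([])      # empty argv -> pure defaults
print(args.lf_dir, args.mf_dir, args.sf_dir)  # features/L64 features/L32 features/L8
```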
detection/test.py ADDED
@@ -0,0 +1,168 @@
import torch
from sklearn.metrics import auc, roc_curve, average_precision_score
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import option

from torch.utils.data import DataLoader
from dataset import Dataset
from model import Model
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)


def get_gt(start_end_couples, num_frames, device):
    """
    Generate a ground truth tensor representing events in a time sequence based on given start and end pairs.

    Args:
        start_end_couples (list): A list containing pairs of start and end frames.
            If None or all '-1', no events are present.
        num_frames (int): Total number of frames in the time sequence.
        device: Device where the tensor should be placed.

    Returns:
        gt: A tensor of shape (num_frames,) representing whether each frame belongs to an anomalous event.
            '1' means anomalous, and '0' means normal.
    """
    gt = torch.zeros(num_frames).to(device)
    if start_end_couples is not None and num_frames is not None:
        for i in range(0, len(start_end_couples) - 1, 2):
            if start_end_couples[i].item() != -1 and start_end_couples[i + 1].item() != -1:
                couple = start_end_couples[i:i + 2]
                gt[couple[0].item():couple[1].item()] = 1.0

    return gt


def save_scores(pred, start_end_couples, save_path):
    """
    Save plots containing anomaly scores and annotated regions.

    Args:
        pred (list): List of anomaly scores.
        start_end_couples (Tensor): Pairs of start and end frames indicating anomalous regions.
        save_path (str): Path to save the generated plot.
    """

    plt.figure()
    file_name = os.path.basename(save_path).split(".")[0]
    plt.plot(pred, label=file_name, color='blue')

    # Plot anomalous regions
    for i in range(0, len(start_end_couples) - 1, 2):
        if start_end_couples[i].item() != -1 and start_end_couples[i + 1].item() != -1:
            plt.axvspan(start_end_couples[i].item(), start_end_couples[i + 1].item(), color='red', alpha=0.3)

    plt.ylim(0, 1)
    plt.xlabel('Frames', fontdict={'size': 16})
    plt.ylabel('Anomaly Score', fontdict={'size': 16})
    plt.yticks(size=14)
    plt.xticks(size=14)

    plt.legend(prop={'size': 16})
    # plt.show()
    plt.savefig(save_path)
    plt.close()


def test(dataloader, model, device, gen_scores=False, save_dir=None):
    """
    Test the model's performance on the given dataloader.

    Args:
        dataloader (DataLoader): DataLoader for test data.
        model: The model to be tested.
        device: Device to perform testing on.
        gen_scores (bool): Whether to generate and save anomaly scores plot.
        save_dir (str): Directory to save generated plots.

    Returns:
        single_video_AUC (dict): A dictionary containing AUC values for each video.
        overall_auc (float): Overall AUC value.
        ap (float): average precision
    """
    single_video_AUC = {"video": [], "AUC": []}

    with torch.no_grad():
        model.to(device).eval()
        pred = torch.zeros(0, device=device)
        gt = torch.zeros(0, device=device)

        for input1, input2, input3, label, start_end_couples, num_frames, file in tqdm(dataloader):
            input1 = input1.to(device)
            input2 = input2.to(device)
            input3 = input3.to(device)
            score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores = model(input1, input2, input3)
            sig = torch.squeeze(scores, dim=(0, 2))  # T scores
            segment = num_frames.item() // sig.size()[0]
            sig = sig.repeat_interleave(segment)  # Frames
            if len(sig) < num_frames.item():
                last_ele = sig[-1]
                sig = torch.cat((sig, last_ele.repeat(num_frames.item() - len(sig))))  # 1 x Frames

            pred = torch.cat((pred, sig))
            cur_gt = get_gt(start_end_couples, num_frames, device)
            gt = torch.cat((gt, cur_gt))

            sig = sig.cpu().detach().numpy()
            cur_gt = cur_gt.cpu().detach().numpy()
            fpr, tpr, threshold = roc_curve(cur_gt, sig)
            video_auc = auc(fpr, tpr)
            single_video_AUC["video"].append(file)
            single_video_AUC["AUC"].append(video_auc)

            if gen_scores:
                save_path = os.path.join(save_dir, file[0] + '.png')
                os.makedirs(os.path.dirname(save_path), exist_ok=True)
                save_scores(sig, start_end_couples, save_path)

        pred = pred.cpu().detach().numpy()
        gt = gt.cpu().detach().numpy()
        ap = average_precision_score(gt, pred)
        fpr, tpr, threshold = roc_curve(gt, pred)
        overall_auc = auc(fpr, tpr)
        print('\n' + 'Overall auc : ' + str(overall_auc) + ', Average Precision : ' + str(ap) + '\n')

    return single_video_AUC, overall_auc, ap


def main():
    args = option.test_parser.parse_args()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    AUC_path = os.path.join(args.output_dir, 'AUC')
    scores_path = os.path.join(args.output_dir, 'scores')

    test_loader = DataLoader(Dataset(args, test_mode=True),
                             batch_size=1, shuffle=False,
                             num_workers=args.workers, pin_memory=True)
    model = Model(feature_dim=args.feature_size, batch_size=1, seg_num=args.seg_num)
    model.load_state_dict(torch.load(args.detection_model))

    single_video_AUC, overall_auc, ap = test(dataloader=test_loader,
                                             model=model,
                                             device=device,
                                             gen_scores=True,
                                             save_dir=scores_path)

    # save AUC results
    video_sub_dir = os.path.basename(os.path.dirname(single_video_AUC["video"][0][0]))
    file_path = os.path.join(AUC_path, video_sub_dir, 'results.txt')
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "w") as f:
        for video, single_auc in zip(single_video_AUC["video"], single_video_AUC["AUC"]):
            f.write(f"Video: {video}, AUC: {single_auc}\n")
        f.write("Overall AUC: {}, Average Precision: {}\n".format(overall_auc, ap))


if __name__ == '__main__':
    main()
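The frame-level evaluation in test() above relies on stretching the per-snippet scores back to video length with repeat_interleave and padding the tail with the last score. A toy illustration of that step (numbers are made up):
```
import torch

num_frames, seg_num = 100, 32
snippet_scores = torch.rand(seg_num)                  # 32 snippet-level anomaly scores
segment = num_frames // seg_num                       # 3 frames per snippet here
frame_scores = snippet_scores.repeat_interleave(segment)
if len(frame_scores) < num_frames:                    # pad the tail with the last score
    pad = snippet_scores[-1].repeat(num_frames - len(frame_scores))
    frame_scores = torch.cat((frame_scores, pad))
print(frame_scores.shape)  # torch.Size([100])
```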
detection/train.py ADDED
@@ -0,0 +1,188 @@
import torch
import torch.optim as optim
import os
from torch.nn import MSELoss
from torch.utils.data import DataLoader
from model import Model
from dataset import Dataset
from test import test
import option
from tqdm import tqdm
torch.set_default_tensor_type('torch.FloatTensor')


def sparsity(arr, lamda2):
    loss = torch.mean(torch.norm(arr, dim=0))
    return lamda2 * loss


def smooth(arr, lamda1):
    arr2 = torch.zeros_like(arr)
    arr2[:-1] = arr[1:]
    arr2[-1] = arr[-1]

    loss = torch.sum((arr2 - arr) ** 2)

    return lamda1 * loss


class SigmoidMAELoss(torch.nn.Module):
    def __init__(self):
        super(SigmoidMAELoss, self).__init__()
        from torch.nn import Sigmoid
        self.__sigmoid__ = Sigmoid()
        self.__l1_loss__ = MSELoss()

    def forward(self, pred, target):
        return self.__l1_loss__(pred, target)


class RTFM_loss(torch.nn.Module):
    def __init__(self, alpha, margin):
        super(RTFM_loss, self).__init__()
        self.alpha = alpha
        self.margin = margin
        self.sigmoid = torch.nn.Sigmoid()
        self.mae_criterion = SigmoidMAELoss()
        self.criterion = torch.nn.BCELoss()

    def forward(self, score_normal, score_abnormal, nlabel, alabel, feat_n, feat_a):
        label = torch.cat((nlabel, alabel), 0)
        score_abnormal = score_abnormal
        score_normal = score_normal

        score = torch.cat((score_normal, score_abnormal), 0)
        score = score.squeeze()

        label = label.cuda()

        loss_cls = self.criterion(score, label)  # BCE loss in the score space

        loss_abn = torch.abs(self.margin - torch.norm(torch.mean(feat_a, dim=1), p=2, dim=1))

        loss_nor = torch.norm(torch.mean(feat_n, dim=1), p=2, dim=1)

        loss_rtfm = torch.mean((loss_abn + loss_nor) ** 2)

        loss_total = loss_cls + self.alpha * loss_rtfm

        return loss_total


def train(nloader, aloader, model, batch_size, seg_num, optimizer, device):
    with torch.set_grad_enabled(True):
        model.train()

        ninput1, ninput2, ninput3, nlabel = next(nloader)
        ainput1, ainput2, ainput3, alabel = next(aloader)

        input1 = torch.cat((ninput1, ainput1), 0).to(device)
        input2 = torch.cat((ninput2, ainput2), 0).to(device)
        input3 = torch.cat((ninput3, ainput3), 0).to(device)
        score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores = model(input1, input2, input3)

        scores = scores.view(batch_size * seg_num * 2, -1)  # B x 32 x 2, 1

        scores = scores.squeeze()
        abn_scores = scores[batch_size * seg_num:]

        nlabel = nlabel[0:batch_size]
        alabel = alabel[0:batch_size]

        loss_criterion = RTFM_loss(0.0001, 100)
        loss_sparse = sparsity(abn_scores, 8e-3)
        loss_smooth = smooth(abn_scores, 8e-4)

        loss_RTFM = loss_criterion(score_normal, score_abnormal, nlabel, alabel, feat_select_normal, feat_select_abn)
        cost = loss_RTFM + loss_smooth + loss_sparse

        optimizer.zero_grad()
        cost.backward()
        optimizer.step()


def main():
    args = option.train_parser.parse_args()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_nloader = DataLoader(Dataset(args, test_mode=False, is_normal=True),
                               batch_size=args.batch_size, shuffle=True,
                               num_workers=args.workers, pin_memory=True, drop_last=True)
    train_aloader = DataLoader(Dataset(args, test_mode=False, is_normal=False),
                               batch_size=args.batch_size, shuffle=True,
                               num_workers=args.workers, pin_memory=True, drop_last=True)
    test_loader = DataLoader(Dataset(args, test_mode=True),
                             batch_size=1, shuffle=False,
                             num_workers=args.workers, pin_memory=True)

    if not os.path.exists(args.save_models):
        os.makedirs(args.save_models)

    feature_size = args.feature_size
    model = Model(feature_size, args.batch_size, args.seg_num)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.005)
    test_info = {"epoch": [], "AUC": [], "AP": []}
    best_result = -1
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)
    _, overall_auc, ap = test(dataloader=test_loader,
                              model=model,
                              device=device,
                              gen_scores=False,
                              save_dir=None)

    for step in tqdm(range(1, args.max_epoch + 1), total=args.max_epoch, dynamic_ncols=True):
        if (step - 1) % len(train_nloader) == 0:
            loadern_iter = iter(train_nloader)

        if (step - 1) % len(train_aloader) == 0:
            loadera_iter = iter(train_aloader)

        train(nloader=loadern_iter,
              aloader=loadera_iter,
              model=model,
              batch_size=args.batch_size,
              seg_num=args.seg_num,
              optimizer=optimizer,
              device=device)

        if step % 5 == 0 and step > 200:
            _, overall_auc, ap = test(dataloader=test_loader,
                                      model=model,
                                      device=device,
                                      gen_scores=False,
                                      save_dir=None)

            test_info["epoch"].append(step)
            test_info["AUC"].append(overall_auc)
            test_info["AP"].append(ap)

            # if test_info["AUC"][-1] > best_result:
            #     best_result = test_info["AUC"][-1]
            #     torch.save(model.state_dict(), os.path.join(args.save_models, args.model_name + '-{}.pkl'.format(step)))
            #     file_path = os.path.join(output_dir, '{}-step-AUC.txt'.format(step))
            #     with open(file_path, "w") as fo:
            #         for key in test_info:
            #             fo.write("{}: {}\n".format(key, test_info[key][-1]))

            metric = args.metric
            if test_info[metric][-1] > best_result:
                best_result = test_info[metric][-1]
                torch.save(model.state_dict(), os.path.join(args.save_models, args.model_name + '-{}.pkl'.format(step)))
                file_path = os.path.join(output_dir, '{}-step-result.txt'.format(step))
                with open(file_path, "w") as fo:
                    for key in test_info:
                        fo.write("{}: {}\n".format(key, test_info[key][-1]))


if __name__ == '__main__':
    main()
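To see what the two regularizers measure, here is a toy evaluation that re-derives the same quantities as smooth() and sparsity() above for a single made-up score vector:
```
import torch

# Made-up snippet scores for one abnormal video.
scores = torch.tensor([0.10, 0.20, 0.90, 0.85, 0.10])

# smooth(): squared differences between neighbouring snippet scores (temporal smoothness).
smooth_term = 8e-4 * torch.sum((scores[1:] - scores[:-1]) ** 2)

# sparsity(): L2 norm of the score vector (only a few snippets should score high).
sparse_term = 8e-3 * torch.norm(scores)

print(smooth_term.item(), sparse_term.item())
```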
figures/Intro.png ADDED

Git LFS Details

  • SHA256: c87d0010487ecd66d1a99a020213feec75117b99e626e713aa652a6b7d2eabc1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.95 MB
recognition/dataset.py ADDED
@@ -0,0 +1,140 @@
import torch.utils.data as data
import os
import torch
torch.set_default_tensor_type('torch.FloatTensor')

class_to_int = {
    'Normal': 0,
    'Abuse': 1,
    'Arrest': 2,
    'Arson': 3,
    'Assault': 4,
    'Burglary': 5,
    'Explosion': 6,
    'Fighting': 7,
    'Robbery': 8,
    'Shooting': 9,
    'Shoplifting': 10,
    'Stealing': 11,
    'Vandalism': 12,
    'RoadAccidents_EMVvsEMV': 13,
    'RoadAccidents_EMVvsVRU': 14,
    'RoadAccidents_VRUvsVRU': 15,
    'DangerousThrowing': 16,
    'Littering': 17
}


def read_features(feature_path):
    """
    Read features from a text file and convert them into a torch tensor.

    Args:
        feature_path (str): Path to the text file containing features.

    Returns:
        features (torch.Tensor): A tensor containing the features. Shape is T x C.
    """
    with open(feature_path, 'r') as file:
        lines = file.readlines()
    features = []
    for line in lines:
        feature = [float(value) for value in line.strip().split()]
        features.append(feature)
    features = torch.tensor(features).float()  # T x C
    return features


class Dataset(data.Dataset):
    def __init__(self, args, is_normal=True, transform=None, test_mode=False):
        """
        Custom dataset class for loading features and labels.

        Args:
            args: Argument object containing paths and options.
            is_normal (bool): Whether the dataset represents normal samples.
            transform: Data transformation to be applied.
            test_mode (bool): Whether the dataset is for testing.

        Attributes:
            is_normal (bool): Whether the dataset represents normal samples.
            transform: Data transformation to be applied.
            test_mode (bool): Whether the dataset is for testing.
            list (list): List of feature paths and labels information.
        """
        self.is_normal = is_normal
        self.transform = transform
        self.test_mode = test_mode

        if self.test_mode:
            annotation_path = args.test_anno
        else:
            annotation_path = args.train_anno

        self.list = self._get_features_list(args.lf_dir, args.mf_dir, args.sf_dir, annotation_path)

    def __getitem__(self, index):
        if self.test_mode:
            lf_path, mf_path, sf_path, label, file = self.list[index]
            l_features = read_features(lf_path)
            m_features = read_features(mf_path)
            s_features = read_features(sf_path)
            label = torch.tensor(label)
            return s_features, m_features, l_features, label, file
        else:
            lf_path, mf_path, sf_path, label = self.list[index]
            l_features = read_features(lf_path)
            m_features = read_features(mf_path)
            s_features = read_features(sf_path)
            label = torch.tensor(label)
            return s_features, m_features, l_features, label

    def __len__(self):
        return len(self.list)

    def _get_features_list(self, lf_dir, mf_dir, sf_dir, annotation_path):
        """
        Construct a feature list from the given directories and annotation file.

        Args:
            lf_dir (str): Directory path containing long-frame-length feature files.
            mf_dir (str): Directory path containing medium-frame-length feature files.
            sf_dir (str): Directory path containing short-frame-length feature files.
            annotation_path (str): Path to a text file containing annotation information.

        Returns:
            list: A list of tuples, each containing (lf_path, mf_path, sf_path, cls) or (lf_path, mf_path, sf_path, cls, file).

        Raises:
            AssertionError: If the input directories do not exist.

        Note:
            - If test_mode is True, each tuple contains (lf_path, mf_path, sf_path, cls, file), where file is the file name.
            - If test_mode is False, each tuple contains (lf_path, mf_path, sf_path, cls), and selection is based on whether it is normal (is_normal).
        """
        assert os.path.exists(lf_dir)
        assert os.path.exists(mf_dir)
        assert os.path.exists(sf_dir)
        features_list = []
        with open(annotation_path) as f:
            lines = f.read().splitlines(keepends=False)
            for line in lines:
                items = line.split()
                file = items[0].split(".")[0]
                file = file.replace("/", os.sep)
                lf_path = os.path.join(lf_dir, file + '.txt')
                mf_path = os.path.join(mf_dir, file + '.txt')
                sf_path = os.path.join(sf_dir, file + '.txt')
                unsupported_class = 18
                if not items[1].isdigit():
                    cls = class_to_int.get(items[1], unsupported_class)
                else:
                    cls = int(items[1])
                if self.test_mode:
                    features_list.append((lf_path, mf_path, sf_path, cls, file))
                elif (cls == class_to_int['Normal']) == self.is_normal:
                    features_list.append((lf_path, mf_path, sf_path, cls))

        return features_list
recognition/model.py ADDED
@@ -0,0 +1,295 @@
1
+ """ Reference source: https://github.com/tianyu0207/RTFM"""
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import torch.nn.init as torch_init
7
+ torch.set_default_tensor_type('torch.FloatTensor')
8
+
9
+
10
+ def weight_init(m):
11
+ classname = m.__class__.__name__
12
+ if classname.find('Conv') != -1 or classname.find('Linear') != -1:
13
+ torch_init.xavier_uniform_(m.weight)
14
+ if m.bias is not None:
15
+ m.bias.data.fill_(0)
16
+
17
+
18
+ class CVA(nn.Module):
19
+ def __init__(self, input_dim=1024):
20
+ """
21
+ Cross-View Attention (CVA) module.
22
+
23
+ Args:
24
+ input_dim (int): Dimension of the input features.
25
+ """
26
+ super(CVA, self).__init__()
27
+ drop_out_rate = 0.1
28
+ num_heads = 4
29
+ self.cross_attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, dropout=drop_out_rate,
30
+ device='cuda')
31
+
32
+ def forward(self, feature1, feature2):
33
+ """
34
+ Args:
35
+ feature1 (torch.Tensor): one path features. Shape: B x T x C.
36
+ feature2 (torch.Tensor): another path features. Shape: B x T x C.
37
+
38
+ Returns:
39
+ out1 (torch.Tensor): Processed features after cross-attention. Shape: T x B x C.
40
+ """
41
+
42
+ feature1 = F.layer_norm(feature1, [feature1.size(-1)])
43
+ feature2 = F.layer_norm(feature2, [feature2.size(-1)])
44
+ feature1 = feature1.permute(1, 0, 2) # T B C
45
+ feature2 = feature2.permute(1, 0, 2)
46
+
47
+ out1, _ = self.cross_attention(query=feature1, key=feature2, value=feature2) # T B C (For test:32 1 1024)
48
+ out1 = out1 + feature1 # residual connection
49
+
50
+ return out1 # T B C
51
+
52
+
53
+ class Aggregate(nn.Module):
54
+ def __init__(self, input_dim):
55
+ """
56
+ An aggregate network including local temporal correlation learning, global temporal correlation learning,
57
+ and feature fusion in MTFF.
58
+
59
+ Args:
60
+ input_dim (int): input features dim.
61
+ """
62
+ super(Aggregate, self).__init__()
63
+ bn = nn.BatchNorm1d
64
+ num_heads = 4
65
+ self.input_dim = input_dim
66
+ self.conv_1 = nn.Sequential(
67
+ nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
68
+ stride=1,dilation=1, padding=1),
69
+ nn.LeakyReLU(negative_slope=5e-2),
70
+ bn(512)
71
+ )
72
+ self.conv_2 = nn.Sequential(
73
+ nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
74
+ stride=1, dilation=2, padding=2),
75
+ nn.LeakyReLU(negative_slope=5e-2),
76
+ bn(512)
77
+ )
78
+ self.conv_3 = nn.Sequential(
79
+ nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
80
+ stride=1, dilation=4, padding=4),
81
+ nn.LeakyReLU(negative_slope=5e-2),
82
+ bn(512)
83
+ )
84
+ self.conv_4 = nn.Sequential(
85
+ nn.Conv1d(in_channels=input_dim*3, out_channels=512, kernel_size=1,
86
+ stride=1, padding=0, bias = False),
87
+ nn.LeakyReLU(negative_slope=5e-2),
88
+ )
89
+ self.conv_5 = nn.Sequential(
90
+ nn.Conv1d(in_channels=2048, out_channels=input_dim, kernel_size=3,
91
+ stride=1, padding=1, bias=False),
92
+ nn.LeakyReLU(negative_slope=5e-2),
93
+ nn.BatchNorm1d(input_dim),
94
+ )
95
+ self.self_attention = nn.MultiheadAttention(embed_dim=512, num_heads=num_heads,
96
+ dropout=0.1, device='cuda')
97
+
98
+ def forward(self, input1, input2, input3):
99
+ """
100
+ Args:
101
+ input1 (torch.Tensor): long-frame-length features. Shape: T x B x C.
102
+ input2 (torch.Tensor): medium-frame-length features. Shape: T x B x C.
103
+ input3 (torch.Tensor): short-frame-length features. Shape: T x B x C.
104
+
105
+ Returns:
106
+ torch.Tensor: Processed and fused output features. Shape: B x T x C.
107
+ """
108
+ x1 = input1.permute(1, 2, 0) # B C T
109
+ x2 = input2.permute(1, 2, 0)
110
+ x3 = input3.permute(1, 2, 0)
111
+ tensor_list = [x1, x2, x3]
112
+
113
+ residual = torch.mean(torch.stack(tensor_list), dim=0)
114
+
115
+ out1 = self.conv_1(x1) # B C/2 T
116
+ out2 = self.conv_2(x2)
117
+ out3 = self.conv_3(x3)
118
+ x = torch.cat([out1, out2, out3], dim=1) # B 3C/2 T
119
+
120
+ feature = torch.cat((x1, x2, x3), dim=1)
121
+ out = self.conv_4(feature)
122
+ out = out.permute(2, 0, 1) # T B C/2
123
+ out = F.layer_norm(out, normalized_shape=[out.size(-1)])
124
+ out, _ = self.self_attention(out, out, out) # T B C/2
125
+ out = out.permute(1, 2, 0) # B C/2 T
126
+ out = torch.cat((x, out), dim=1) # B 2C T
127
+ out = self.conv_5(out) # fuse all the features together
128
+ out = out + residual
129
+ out = out.permute(0, 2, 1)
130
+
131
+ return out
132
+
133
+
134
+ class Encoder(nn.Module):
135
+ def __init__(self, input_dim=1024, seg_num=32):
136
+ """
137
+ Multi-Temporal Feature Fusion (MTFF) module.
138
+
139
+ Args:
140
+ input_dim (int): Dimension of the input features.
141
+ seg_num (int): Number of snippets in a video.
142
+ """
143
+ super(Encoder, self).__init__()
144
+ self.drop_out_rate = 0.1
145
+ self.input_dim = input_dim
146
+ self.min_temporal_dim = seg_num
147
+ self.CVA1 = CVA(input_dim=input_dim)
148
+ self.CVA2 = CVA(input_dim=input_dim)
149
+ self.CVA3 = CVA(input_dim=input_dim)
150
+
151
+ self.aggregate = Aggregate(input_dim=input_dim)
152
+
153
+ def forward(self, feature1, feature2, feature3):
154
+ """
155
+ Args:
156
+ feature1 (torch.Tensor): long-frame-length features. Shape: B x T x C.
157
+ (Batch size X The number of snippets x Input dimensions)
158
+ feature2 (torch.Tensor): medium-frame-length features. Shape: B x T x C.
159
+ feature3 (torch.Tensor): short-frame-length features. Shape: B x T x C.
160
+
161
+ Returns:
162
+ torch.Tensor: Fused and processed output features. Shape: B x T x C.
163
+ """
164
+
165
+ att1 = self.CVA1(feature1, feature2)
166
+ att2 = self.CVA2(feature2, feature3)
167
+ att3 = self.CVA3(feature3, feature1)
168
+
169
+ out1 = self.aggregate(att1, att2, att3) # B T C
170
+
171
+ return out1
172
+
173
+
174
+ class Model(nn.Module):
175
+ def __init__(self, feature_dim, batch_size, seg_num=32):
176
+ """
177
+ Multi-Timescale Feature Learning (MTFL) recognition model.
178
+
179
+ Args:
180
+ feature_dim (int): Dimension of the input features.
181
+ batch_size (int): Batch size.
182
+ seg_num (int): Number of snippets in a video.
183
+ """
184
+ super(Model, self).__init__()
185
+ self.batch_size = batch_size
186
+ self.num_segments = seg_num
187
+ self.k_abn = self.num_segments // 10 # select 3 snippets
188
+ self.k_nor = self.num_segments // 10
189
+
190
+ self.Encoder = Encoder(input_dim=feature_dim, seg_num=seg_num)
191
+
192
+ # Fully connected layers for classification
193
+ self.fc1 = nn.Linear(feature_dim, 512)
194
+ self.fc2 = nn.Linear(512, 128)
195
+ self.fc3 = nn.Linear(128, 18) # class amount = 18
196
+
197
+ self.drop_out = nn.Dropout(0.2)
198
+ self.relu = nn.LeakyReLU(negative_slope=5e-2)
199
+ self.sigmoid = nn.Sigmoid()
200
+ self.apply(weight_init)
201
+
202
+ def forward(self, input1, input2, input3):
203
+ """
204
+ Args:
205
+ input1 (torch.Tensor): long-frame-length features. Shape: B x T x feature_dim.
206
+ input2 (torch.Tensor): medium-frame-length features. Shape: B x T x feature_dim.
207
+ input3 (torch.Tensor): short-frame-length features. Shape: B x T x feature_dim.
208
+
209
+ Returns:
210
+ score_abnormal (torch.Tensor): The mean scores for top-3 abnormal instances.
211
+ score_normal (torch.Tensor): The mean scores for top-3 normal instances.
212
+ feat_select_abn (torch.Tensor): Selected abnormal features.
213
+ feat_select_normal (torch.Tensor): Selected normal features.
214
+ scores (torch.Tensor): All computed scores. Shape: B x T x the number of classes (18)
215
+ """
216
+ k_abn = self.k_abn
217
+ k_nor = self.k_nor
218
+ ncrops = 1 # Reserving the parameter for spatial cropping, which is not used and defaults to 1
219
+
220
+ # Multi-Temporal Feature Fusion
221
+ out = self.Encoder(input1, input2, input3)
222
+ bs, t, f = out.size()
223
+ features = self.drop_out(out) # B T D
224
+
225
+ # classification layers
226
+ scores = self.relu(self.fc1(features))
227
+ scores = self.drop_out(scores)
228
+ scores = self.relu(self.fc2(scores))
229
+ scores = self.drop_out(scores)
230
+ scores = self.sigmoid(self.fc3(scores))
231
+ scores = scores.view(bs, t, -1) # B T 18
232
+ # B * t * f
233
+ normal_features = features[0:self.batch_size]
234
+ normal_scores = scores[0:self.batch_size]
235
+
236
+ abnormal_features = features[self.batch_size:]
237
+ abnormal_scores = scores[self.batch_size:]
238
+
239
+ # Compute feature magnitudes
240
+ feat_magnitudes = torch.norm(features, p=2, dim=2)
241
+ feat_magnitudes = feat_magnitudes.view(bs, ncrops, -1).mean(1)
242
+ nfea_magnitudes = feat_magnitudes[0:self.batch_size] # normal feature magnitudes
243
+ afea_magnitudes = feat_magnitudes[self.batch_size:] # abnormal feature magnitudes
244
+ n_size = nfea_magnitudes.shape[0]
245
+
246
+ # Inference mode for batch size 1
247
+ if nfea_magnitudes.shape[0] == 1:
248
+ afea_magnitudes = nfea_magnitudes
249
+ abnormal_scores = normal_scores
250
+ abnormal_features = normal_features
251
+
252
+ select_idx = torch.ones_like(nfea_magnitudes)
253
+ select_idx = self.drop_out(select_idx)
254
+
255
+ ####### process abnormal videos -> select top3 feature magnitude #######
256
+ afea_magnitudes_drop = afea_magnitudes * select_idx
257
+ idx_abn = torch.topk(afea_magnitudes_drop, k_abn, dim=1)[1]
258
+ idx_abn_feat = idx_abn.unsqueeze(2).expand([-1, -1, abnormal_features.shape[2]])
259
+
260
+ abnormal_features = abnormal_features.view(n_size, ncrops, t, f) # B X N X T X F
261
+ abnormal_features = abnormal_features.permute(1, 0, 2, 3) # N X B X T X F
262
+
263
+ total_select_abn_feature = torch.zeros(0, device=input1.device)
264
+ for abnormal_feature in abnormal_features:
265
+ feat_select_abn = torch.gather(abnormal_feature, 1, idx_abn_feat) # top 3 features magnitude in abnormal bag
266
+ total_select_abn_feature = torch.cat((total_select_abn_feature, feat_select_abn))
267
+
268
+ idx_abn_score = idx_abn.unsqueeze(2).expand([-1, -1, abnormal_scores.shape[2]])
269
+ # top 3 scores in abnormal bag based on the top-3 magnitude
270
+ score_abnormal = torch.mean(torch.gather(abnormal_scores, 1, idx_abn_score), dim=1)
271
+
272
+
273
+ ####### process normal videos -> select top3 feature magnitude #######
274
+
275
+ select_idx_normal = torch.ones_like(nfea_magnitudes)
276
+ select_idx_normal = self.drop_out(select_idx_normal)
277
+ nfea_magnitudes_drop = nfea_magnitudes * select_idx_normal
278
+ idx_normal = torch.topk(nfea_magnitudes_drop, k_nor, dim=1)[1]
279
+ idx_normal_feat = idx_normal.unsqueeze(2).expand([-1, -1, normal_features.shape[2]])
280
+
281
+ normal_features = normal_features.view(n_size, ncrops, t, f)
282
+ normal_features = normal_features.permute(1, 0, 2, 3) # 1 B T D
283
+
284
+ total_select_nor_feature = torch.zeros(0, device=input1.device)
285
+ for nor_fea in normal_features:
286
+ feat_select_normal = torch.gather(nor_fea, 1, idx_normal_feat) # top 3 features magnitude in normal bag (hard negative)
287
+ total_select_nor_feature = torch.cat((total_select_nor_feature, feat_select_normal))
288
+
289
+ idx_normal_score = idx_normal.unsqueeze(2).expand([-1, -1, normal_scores.shape[2]])
290
+ score_normal = torch.mean(torch.gather(normal_scores, 1, idx_normal_score), dim=1) # top 3 scores in normal bag
291
+
292
+ feat_select_abn = total_select_abn_feature
293
+ feat_select_normal = total_select_nor_feature
294
+
295
+ return score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores
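A shape sketch of the recognition model above, assuming a CUDA device is available (the attention modules are constructed with `device='cuda'`) and the default sizes from recognition/option.py:

```python
import torch
from model import Model

batch_size, seg_num, feat_dim = 4, 32, 1024
model = Model(feature_dim=feat_dim, batch_size=batch_size, seg_num=seg_num).cuda()

# Training-style batch: the first batch_size videos are normal, the rest abnormal,
# so every timescale tensor holds 2 * batch_size videos of seg_num snippets each.
x_long = torch.randn(2 * batch_size, seg_num, feat_dim, device='cuda')
x_medium = torch.randn(2 * batch_size, seg_num, feat_dim, device='cuda')
x_short = torch.randn(2 * batch_size, seg_num, feat_dim, device='cuda')

score_abn, score_nor, feat_abn, feat_nor, scores = model(x_long, x_medium, x_short)
print(scores.shape)     # torch.Size([8, 32, 18]) -> per-snippet class scores
print(score_abn.shape)  # torch.Size([4, 18])     -> mean of the top-3 snippet scores per abnormal video
```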
recognition/option.py ADDED
@@ -0,0 +1,56 @@
1
+ import argparse
2
+
3
+ ############ Test args ########################
4
+ test_parser = argparse.ArgumentParser(description='MTFL_recognition_test')
5
+ # input path
6
+ test_parser.add_argument('--lf_dir', type=str, default='features/L64', help='long frame length feature path')
7
+ test_parser.add_argument('--mf_dir', type=str, default='features/L32', help='medium frame length feature path')
8
+ test_parser.add_argument('--sf_dir', type=str, default='features/L8', help='short frame length feature path')
9
+ test_parser.add_argument('--test_anno', type=str, default='annotation/Anomaly_videos.txt', help='test annotation file')
10
+ test_parser.add_argument('--test_dataset', type=str, default='other', choices=['UCF', 'VAD', 'other'],
11
+ help='The test data. The test results are the recognized labels of all input videos. '
12
+ 'For UCF and VAD datasets, the overall accuracy would be printed out')
13
+ test_parser.add_argument('--recognition_model', type=str,
14
+ default='/media/DataDrive/yiling/Test/models/MTFL_recog/split_1_best_VAD.pkl',
15
+ help='recognition checkpoint path, choose 1 from 7 checkpoints trained on different splits')
16
+ # output path
17
+ test_parser.add_argument('--output_dir', type=str, default='results',
18
+ help='The path to store the recognition result')
19
+ # feature size depending on which feature extractor used
20
+ test_parser.add_argument('--feature_size', type=int, default=1024, help='feature dim (default: VST feature)')
21
+ test_parser.add_argument('--seg_num', type=int, default=32, help='the number of snippets')
22
+ # running cfg
23
+ test_parser.add_argument('--gpu', default="0", type=str, choices=["0", "1"], help='gpu')
24
+ test_parser.add_argument('--workers', type=int, default=8, help='number of workers in dataloader')
25
+
26
+
27
+ ############ Train args ########################
28
+ train_parser = argparse.ArgumentParser(description='MTFL_recognition_train')
29
+ # input path
30
+ train_parser.add_argument('--lf_dir', type=str, default='/media/DataDrive/yiling/features/recognition/split1_L64R1',
31
+ help='long feature path')
32
+ train_parser.add_argument('--mf_dir', type=str, default='/media/DataDrive/yiling/features/recognition/split1_L32R1',
33
+ help='media feature path')
34
+ train_parser.add_argument('--sf_dir', type=str, default='/media/DataDrive/yiling/features/recognition/split1_L8R1',
35
+ help='short feature path')
36
+ train_parser.add_argument('--train_anno', default='/media/DataDrive/yiling/annotation/recognition/splits/VAD/VAD_train_001.txt',
37
+ help='the annotation file for training')
38
+ train_parser.add_argument('--test_anno', default='/media/DataDrive/yiling/annotation/recognition/splits/VAD/VAD_test_001.txt',
39
+ help='the annotation file for test')
40
+ train_parser.add_argument('--test_dataset', type=str, default='UCF', choices=['UCF', 'VAD'],
41
+ help='The validation data')
42
+ # output path and saving info
43
+ train_parser.add_argument('--model-name', default='MTFL_recognition', help='name to save model')
44
+ train_parser.add_argument('--save_models', default='/media/DataDrive/yiling/models/demo/recognition',
45
+ help='the path for saving models')
46
+ train_parser.add_argument('--output_dir', default='/media/DataDrive/yiling/results/demo/recognition',
47
+ help='The path to store AUC results')
48
+ # training cfg and paras
49
+ train_parser.add_argument('--gpu', default="0", type=str, choices=["0", "1"], help='gpu id')
50
+ train_parser.add_argument('--feature_size', type=int, default=1024, help='feature dim (default: VST feature)')
51
+ train_parser.add_argument('--seg_num', type=int, default=32, help='the number of snippets')
52
+ train_parser.add_argument('--lr', type=float, default=0.0001, help='learning rate')
53
+ train_parser.add_argument('--batch-size', type=int, default=32, help='batch size')
54
+ train_parser.add_argument('--workers', type=int, default=8, help='number of workers in dataloader')
55
+ train_parser.add_argument('--max-epoch', type=int, default=2000, help='maximum number of training iterations (default: 2000)')
56
+
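The default paths above are machine specific; when reusing the code, the train parser can also be driven programmatically with your own locations, for example (all paths below are placeholders, not part of the repository):

```python
import option

train_args = option.train_parser.parse_args([
    '--lf_dir', 'features/recognition/split1_L64R1',
    '--mf_dir', 'features/recognition/split1_L32R1',
    '--sf_dir', 'features/recognition/split1_L8R1',
    '--train_anno', 'annotation/VAD_train_001.txt',
    '--test_anno', 'annotation/VAD_test_001.txt',
    '--save_models', 'models/recognition',
    '--output_dir', 'results/recognition',
])
print(train_args.lr, train_args.batch_size, train_args.max_epoch)  # 0.0001 32 2000
```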
recognition/test.py ADDED
@@ -0,0 +1,120 @@
1
+ import torch
2
+ from tqdm import tqdm
3
+ import numpy as np
4
+ import os
5
+ import option
6
+ from torch.utils.data import DataLoader
7
+ from dataset import class_to_int, Dataset
8
+ from model import Model
9
+
10
+
11
+ def top_k_accuracy(scores, labels, topk=(1, 5)):
12
+ """Calculate top k accuracy score.
13
+
14
+ Args:
15
+ scores (list[np.ndarray]): Prediction scores for each class.
16
+ labels (list[int]): Ground truth labels.
17
+ topk (tuple[int]): K values for top_k_accuracy. Default: (1, 5).
18
+
19
+ Returns:
20
+ list[float]: Top k accuracy score for each k.
21
+ """
22
+ res = []
23
+ labels = np.array(labels)[:, np.newaxis]
24
+ for k in topk:
25
+ max_k_preds = np.argsort(scores, axis=1)[:, -k:][:, ::-1]
26
+ match_array = np.logical_or.reduce(max_k_preds == labels, axis=1)
27
+ topk_acc_score = match_array.sum() / match_array.shape[0]
28
+ res.append(topk_acc_score)
29
+
30
+ return res
31
+
32
+
33
+ def test(dataloader, model, device, test_dataset='UCF'):
34
+ """
35
+ Evaluate the model's performance on the test dataset and return the top-1 accuracy.
36
+
37
+ Args:
38
+ dataloader (DataLoader): DataLoader for the test dataset.
39
+ model (nn.Module): The trained neural network model.
40
+ device (torch.device): The device (CPU or GPU) on which to perform evaluation.
41
+ test_dataset (str, optional): The name of the test dataset, either 'UCF' or 'VAD'. Default is 'UCF'.
42
+ The overall accuracy is calculated only for 'VAD' and 'UCF', since accuracy is not meaningful when testing
43
+ on only a few videos.
44
+
45
+ Returns:
46
+ float: The top-1 accuracy of the model on the test dataset.
47
+ dict: A dictionary containing video filenames and their corresponding predicted classes.
48
+
49
+ """
50
+ video_class = {"video": [], "class": []}
51
+ with torch.no_grad():
52
+ model.to(device).eval()
53
+ outputs = torch.zeros(0, device=device)
54
+ labels = torch.zeros(0, device=device)
55
+
56
+ for input1, input2, input3, label, file in tqdm(dataloader):
57
+ input1 = input1.to(device)
58
+ input2 = input2.to(device)
59
+ input3 = input3.to(device)
60
+ label = label.to(device)
61
+ score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores = model(input1, input2, input3)
62
+ # cat for acc evaluation
63
+ outputs = torch.cat((outputs, score_abnormal))
64
+ labels = torch.cat((labels, label))
65
+ # obtain the prediction result
66
+ score_abnormal = score_abnormal.cpu().detach().numpy()
67
+ pred = np.argmax(score_abnormal, axis=1)
68
+ found_class = [key for key, value in class_to_int.items() if value == pred[0]]
69
+ file_name = os.path.basename(file[0])
70
+ video_class["video"].append(file_name)
71
+ video_class["class"].append(found_class)
72
+
73
+ outputs = outputs.cpu().detach().numpy()
74
+ labels = labels.cpu().detach().numpy()
75
+ res = [-1]
76
+
77
+ if test_dataset == 'UCF': # all road accidents in UCF are labelled as 13
78
+ for row in outputs:
79
+ max_value = max(row[13], row[14], row[15])
80
+ row[13] = max_value
81
+ row[14] = 0.0
82
+ row[15] = 0.0
83
+
84
+ # Accuracy makes sense only when the test classes are involved in VAD
85
+ if test_dataset == 'UCF' or test_dataset == 'VAD':
86
+ res = top_k_accuracy(outputs, labels)
87
+ print('\n' + str(test_dataset) + ' top1 : ' + str(res[0]) + ' top5 : ' + str(res[1]) + '\n')
88
+
89
+ return res[0], video_class
90
+
91
+
92
+ def main():
93
+ args = option.test_parser.parse_args()
94
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
95
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
96
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
97
+
98
+ out_path = os.path.join(args.output_dir, 'rec_results')
99
+
100
+ test_loader = DataLoader(Dataset(args, test_mode=True),
101
+ batch_size=1, shuffle=False,
102
+ num_workers=args.workers, pin_memory=True)
103
+ model = Model(feature_dim=args.feature_size, batch_size=1, seg_num=args.seg_num)
104
+ model.load_state_dict(torch.load(args.recognition_model))
105
+
106
+ _, video_class = test(dataloader=test_loader,
107
+ model=model,
108
+ device=device,
109
+ test_dataset=args.test_dataset)
110
+ # save recognition results
111
+ video_sub_dir = os.path.basename(os.path.dirname(video_class["video"][0][0]))
112
+ file_path = os.path.join(out_path, video_sub_dir, 'output_pred.txt')
113
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
114
+ with open(file_path, "w") as f:
115
+ for video, cls in zip(video_class["video"], video_class["class"]):
116
+ f.write(f"Video: {video}, class: {cls}\n")
117
+
118
+
119
+ if __name__ == '__main__':
120
+ main()
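A quick sanity check of `top_k_accuracy` on toy scores (3 samples, 4 classes; values are illustrative only):

```python
import numpy as np
from test import top_k_accuracy  # the function defined above (recognition/test.py)

scores = np.array([[0.1, 0.6, 0.2, 0.1],   # predicted class 1
                   [0.3, 0.2, 0.4, 0.1],   # predicted class 2
                   [0.5, 0.1, 0.3, 0.1]])  # predicted class 0
labels = [1, 0, 0]
top1, top3 = top_k_accuracy(scores, labels, topk=(1, 3))
print(top1, top3)  # top1 = 2/3 (samples 0 and 2 correct), top3 = 1.0 (every label is inside the top 3)
```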
recognition/train.py ADDED
@@ -0,0 +1,171 @@
1
+ import torch
2
+ import torch.optim as optim
3
+ import os
4
+ from torch.nn import MSELoss
5
+ from torch.utils.data import DataLoader
6
+ from model import Model
7
+ from dataset import Dataset
8
+ from test import test
9
+ import option
10
+ from tqdm import tqdm
11
+ torch.set_default_tensor_type('torch.FloatTensor')
12
+
13
+
14
+ def sparsity(arr, lamda2):
15
+ loss = torch.mean(torch.norm(arr, dim=0))
16
+ return lamda2*loss
17
+
18
+
19
+ def smooth(arr, lamda1):
20
+ arr2 = torch.zeros_like(arr)
21
+ arr2[:-1] = arr[1:]
22
+ arr2[-1] = arr[-1]
23
+
24
+ loss = torch.sum((arr2-arr)**2)
25
+
26
+ return lamda1*loss
27
+
28
+
29
+ class SigmoidMAELoss(torch.nn.Module):
30
+ def __init__(self):
31
+ super(SigmoidMAELoss, self).__init__()
32
+ from torch.nn import Sigmoid
33
+ self.__sigmoid__ = Sigmoid()
34
+ self.__l1_loss__ = MSELoss()
35
+
36
+ def forward(self, pred, target):
37
+ return self.__l1_loss__(pred, target)
38
+
39
+
40
+ class RTFM_loss(torch.nn.Module):
41
+ def __init__(self, alpha, margin):
42
+ super(RTFM_loss, self).__init__()
43
+ self.alpha = alpha
44
+ self.margin = margin
45
+ self.sigmoid = torch.nn.Sigmoid()
46
+ self.mae_criterion = SigmoidMAELoss()
47
+ self.criterion = torch.nn.CrossEntropyLoss() # multi class
48
+
49
+ def forward(self, score_normal, score_abnormal, nlabel, alabel, feat_n, feat_a):
50
+ labels = torch.cat((nlabel, alabel), 0)
51
+ scores = torch.cat((score_normal, score_abnormal), 0)
52
+
53
+ labels = labels.cuda()
54
+
55
+ loss_cls = self.criterion(scores, labels) # CE loss in the score space
56
+
57
+ loss_abn = torch.abs(self.margin - torch.norm(torch.mean(feat_a, dim=1), p=2, dim=1))
58
+
59
+ loss_nor = torch.norm(torch.mean(feat_n, dim=1), p=2, dim=1)
60
+
61
+ loss_rtfm = torch.mean((loss_abn + loss_nor) ** 2)
62
+
63
+ loss_total = loss_cls + self.alpha * loss_rtfm
64
+
65
+ return loss_total
66
+
67
+
68
+ def train(nloader, aloader, model, batch_size, seg_num, optimizer, device):
69
+ with torch.set_grad_enabled(True):
70
+ model.train()
71
+
72
+ ninput1, ninput2, ninput3, nlabel = next(nloader)
73
+ ainput1, ainput2, ainput3, alabel = next(aloader)
74
+
75
+ input1 = torch.cat((ninput1, ainput1), 0).to(device)
76
+ input2 = torch.cat((ninput2, ainput2), 0).to(device)
77
+ input3 = torch.cat((ninput3, ainput3), 0).to(device)
78
+ score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores = model(input1, input2, input3)
79
+
80
+ scores = scores.view(batch_size * seg_num * 2, -1) # BX32X2, 18
81
+
82
+ abn_scores, indice = torch.max(scores[batch_size*seg_num:], dim=1)
83
+
84
+ nlabel = nlabel[0:batch_size]
85
+ alabel = alabel[0:batch_size]
86
+
87
+ loss_criterion = RTFM_loss(0.0001, 100)
88
+ loss_sparse = sparsity(abn_scores, 8e-3)
89
+ loss_smooth = smooth(abn_scores, 8e-4)
90
+
91
+ loss_RTFM = loss_criterion(score_normal, score_abnormal, nlabel, alabel, feat_select_normal, feat_select_abn)
92
+ cost = loss_RTFM + loss_smooth + loss_sparse
93
+
94
+ optimizer.zero_grad()
95
+ cost.backward()
96
+ optimizer.step()
97
+
98
+
99
+ def main():
100
+ args = option.train_parser.parse_args()
101
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
102
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
103
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
104
+
105
+ train_nloader = DataLoader(Dataset(args, test_mode=False, is_normal=True),
106
+ batch_size=args.batch_size, shuffle=True,
107
+ num_workers=args.workers, pin_memory=True, drop_last=True)
108
+ train_aloader = DataLoader(Dataset(args, test_mode=False, is_normal=False),
109
+ batch_size=args.batch_size, shuffle=True,
110
+ num_workers=args.workers, pin_memory=True, drop_last=True)
111
+ test_loader = DataLoader(Dataset(args, test_mode=True),
112
+ batch_size=1, shuffle=False,
113
+ num_workers=args.workers, pin_memory=True)
114
+
115
+ if not os.path.exists(args.save_models):
116
+ os.makedirs(args.save_models)
117
+
118
+ feature_size = args.feature_size
119
+ model = Model(feature_size, args.batch_size, args.seg_num)
120
+ optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.005)
121
+ test_info = {"epoch": [], "TOP-1 ACC": []}
122
+ best_ACC = -1
123
+ output_dir = args.output_dir
124
+ os.makedirs(output_dir, exist_ok=True)
125
+ acc, _ = test(dataloader=test_loader,
126
+ model=model,
127
+ device=device,
128
+ test_dataset=args.test_dataset)
129
+
130
+ for step in tqdm(range(1, args.max_epoch + 1), total=args.max_epoch, dynamic_ncols=True):
131
+ if (step - 1) % len(train_nloader) == 0:
132
+ loadern_iter = iter(train_nloader)
133
+
134
+ if (step - 1) % len(train_aloader) == 0:
135
+ loadera_iter = iter(train_aloader)
136
+
137
+ train(nloader=loadern_iter,
138
+ aloader=loadera_iter,
139
+ model=model,
140
+ batch_size=args.batch_size,
141
+ seg_num=args.seg_num,
142
+ optimizer=optimizer,
143
+ device=device)
144
+
145
+ if step % 5 == 0 and step > 5:
146
+ acc, _ = test(dataloader=test_loader,
147
+ model=model,
148
+ device=device,
149
+ test_dataset=args.test_dataset)
150
+
151
+ test_info["epoch"].append(step)
152
+ test_info["TOP-1 ACC"].append(acc)
153
+
154
+ if test_info["TOP-1 ACC"][-1] > best_ACC:
155
+ best_ACC = test_info["TOP-1 ACC"][-1]
156
+ torch.save(model.state_dict(), os.path.join(args.save_models, args.model_name + '-{}.pkl'.format(step)))
157
+ file_path = os.path.join(output_dir, '{}-step-ACC.txt'.format(step))
158
+ with open(file_path, "w") as fo:
159
+ for key in test_info:
160
+ fo.write("{}: {}\n".format(key, test_info[key][-1]))
161
+
162
+
163
+ if __name__ == '__main__':
164
+ main()
165
+
166
+
167
+
168
+
169
+
170
+
171
+
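For intuition, the two regularizers applied to the abnormal snippet scores behave as follows on a toy 1-D score sequence (values are illustrative only):

```python
import torch
from train import smooth, sparsity  # the helpers defined above (recognition/train.py)

scores = torch.tensor([0.1, 0.1, 0.9, 0.9, 0.1])  # per-snippet anomaly scores
print(smooth(scores, 8e-4))    # 8e-4 * (0.8**2 + 0.8**2) = 1.024e-3, penalising the two temporal jumps
print(sparsity(scores, 8e-3))  # 8e-3 * ||scores||_2, pushing most snippet scores towards zero
```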
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ matplotlib==3.7.0
2
+ mmaction2  # install from the Video Swin Transformer repository
3
+ mmcv==1.7.0
4
+ numpy==1.25.1
5
+ opencv_contrib_python==4.7.0.72
6
+ opencv_python==4.7.0.72
7
+ scikit_learn==1.2.2
8
+ torch==2.0.0+cu118
9
+ torchvision==0.15.1+cu118
10
+ tqdm==4.64.1
utils/feature_extractor.py ADDED
@@ -0,0 +1,284 @@
1
+ """Reference with Ivo's implementation"""
2
+ import argparse
3
+ import logging
4
+ import os
5
+ from os import path, mkdir
6
+ import random
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.backends.cudnn as cudnn
11
+ from video_loader import VideoIter
12
+ from utils import register_logger, get_torch_device
13
+ import transforms_video
14
+ from torch.utils.data import DataLoader
15
+ from torchvision.transforms import transforms
16
+
17
+ # Video Swin Transformer related repository
18
+ from mmcv import Config
19
+ from mmaction.models import build_model
20
+ from mmcv.runner import load_checkpoint
21
+ import warnings
22
+
23
+ warnings.filterwarnings("ignore", message="The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.")
24
+ warnings.filterwarnings('ignore', message='No handlers found: "aten::pad". Skipped.')
25
+
26
+
27
+ def get_args():
28
+ parser = argparse.ArgumentParser(description="VST Feature Extractor Parser")
29
+ # I/O
30
+ parser.add_argument('--dataset_path', default='test_videos',
31
+ help="path to dataset")
32
+ parser.add_argument('--save_dir', type=str, default="features",
33
+ help="set output root for the features.")
34
+ # extraction params
35
+ parser.add_argument('--model_type', default='swinB',
36
+ type=str,
37
+ help="type of feature extractor")
38
+ parser.add_argument('--pretrained_3d',
39
+ default='/media/DataDrive/yiling/models/VST_finetune/hflip_speed_120_2d/best_top1_acc_epoch_15.pth',
40
+ type=str,
41
+ help="load default 3D pretrained feature extractor model.")
42
+ parser.add_argument('--clip_length', type=int, default=8,
43
+ help="define the length of each input sample.")
44
+ parser.add_argument('--frame_interval', type=int, default=1,
45
+ help="define the sampling interval between frames.")
46
+ parser.add_argument('--use_splits', type=bool, default=False,
47
+ help="use full anomalous data or splits, only applicable of Split Dataset of UCF-CRIME and VAD")
48
+ parser.add_argument('--batch_size', type=int, default=8, help="batch size")
49
+ # running cfg
50
+ parser.add_argument('--num_workers', type=int, default=0,
51
+ help="define the number of workers used for loading the videos")
52
+ parser.add_argument('--seed', type=int, default=None, help='random seed')
53
+ parser.add_argument('--log_every', type=int, default=10,
54
+ help="log the writing of clips every n steps.")
55
+ parser.add_argument('--log_file', type=str,
56
+ help="set logging file.")
57
+ parser.add_argument('--gpu', type=int, default=0, help="gpu id")
58
+
59
+ return parser.parse_args()
60
+
61
+
62
+ def set_random_seed(seed=42):
63
+ random.seed(seed)
64
+ np.random.seed(seed)
65
+ torch.manual_seed(seed)
66
+ torch.cuda.manual_seed(seed)
67
+ torch.cuda.manual_seed_all(seed)
68
+ os.environ['PYTHONHASHSEED'] = str(seed)
69
+
70
+
71
+ def to_segments(data, num=32):
72
+ """
73
+ This code is taken from:
74
+ https://github.com/rajanjitenpatel/C3D_feature_extraction/blob/b5894fa06d43aa62b3b64e85b07feb0853e7011a/extract_C3D_feature.py#L805
75
+ :param data: list of features of a certain video
76
+ :return: list of 32 segments
77
+ """
78
+ data = np.array(data)
79
+ Segments_Features = []
80
+ thirty2_shots = np.round(np.linspace(0, len(data) - 1, num=num + 1)).astype(int)
81
+ for ss, ee in zip(thirty2_shots[:-1], thirty2_shots[1:]):
82
+ if ss == ee:
83
+ temp_vect = data[min(ss, data.shape[0] - 1), :]
84
+ else:
85
+ temp_vect = data[ss:ee, :].mean(axis=0)
86
+
87
+ if np.linalg.norm(temp_vect) == 0:
88
+ logging.error("Feature norm is 0")
89
+ exit()
90
+ temp_vect = temp_vect / np.linalg.norm(temp_vect)
91
+ if len(temp_vect) != 0:
92
+ Segments_Features.append(temp_vect.tolist())
93
+
94
+ return Segments_Features
95
+
96
+
97
+ class FeaturesWriter:
98
+ def __init__(self, num_videos, chunk_size=16):
99
+ """
100
+ Initialize a FeaturesWriter instance.
101
+
102
+ Args:
103
+ num_videos (int): Total number of videos to process.
104
+ chunk_size (int, optional): Chunk size for writing features (currently unused). Defaults to 16.
105
+ """
106
+ self.path = None
107
+ self.dir = None
108
+ self.data = None
109
+ self.chunk_size = chunk_size
110
+ self.num_videos = num_videos
111
+ self.dump_count = 0
112
+
113
+ def _init_video(self, video_name, dir):
114
+ self.path = path.join(dir, f"{video_name}.txt")
115
+ self.dir = dir
116
+ self.data = dict()
117
+
118
+ def has_video(self):
119
+ return self.data is not None
120
+
121
+ def dump(self):
122
+ logging.info(f'{self.dump_count} / {self.num_videos}: Dumping {self.path}')
123
+ self.dump_count += 1
124
+ if not path.exists(self.dir):
125
+ os.makedirs(self.dir, exist_ok=True)
126
+ features = to_segments([self.data[key] for key in sorted(self.data)])
127
+ with open(self.path, 'w') as fp:
128
+ for d in features:
129
+ d = [str(x) for x in d]
130
+ fp.write(' '.join(d) + '\n')
131
+
132
+ def _is_new_video(self, video_name, dir):
133
+ new_path = path.join(dir, f"{video_name}.txt")
134
+ if self.path != new_path and self.path is not None:
135
+ return True
136
+
137
+ return False
138
+
139
+ def store(self, feature, idx):
140
+ self.data[idx] = list(feature)
141
+
142
+ def write(self, feature, video_name, idx, dir):
143
+ if not self.has_video():
144
+ self._init_video(video_name, dir)
145
+
146
+ if self._is_new_video(video_name, dir):
147
+ self.dump()
148
+ self._init_video(video_name, dir)
149
+
150
+ self.store(feature, idx)
151
+
152
+
153
+ def get_features_loader(dataset_path, clip_length, frame_interval, batch_size, num_workers, save_dir, use_splits):
154
+ """
155
+ Get the data loader for extracting video features.
156
+
157
+ Args:
158
+ dataset_path (str): Path to the videos.
159
+ clip_length (int): Length of each input sample.
160
+ frame_interval (int): Sampling interval between frames.
161
+ batch_size (int): Batch size.
162
+ num_workers (int): Number of workers used for loading videos.
163
+ save_dir (str): Directory to save features.
164
+ use_splits (bool): Whether to use full anomalous data or splits.
165
+
166
+ Returns:
167
+ data_loader (VideoIter): Video data loader.
168
+ data_iter (DataLoader): Torch data loader for video features extraction.
169
+ """
170
+ mean = [0.400, 0.388, 0.372] # VAD mean and std in RGB
171
+ std = [0.247, 0.245, 0.243]
172
+ size = 224
173
+ resize = size, size
174
+ crop = size
175
+
176
+ res = transforms.Compose([
177
+ transforms_video.ToTensorVideo(),
178
+ transforms_video.ResizeVideo(resize),
179
+ transforms_video.CenterCropVideo(crop),
180
+ transforms_video.NormalizeVideo(mean=mean, std=std)
181
+ ])
182
+
183
+ if os.path.exists(save_dir):
184
+ proc_v = []
185
+ for root, dirs, files in os.walk(save_dir):
186
+ for file in files:
187
+ file_path = os.path.join(root, file)
188
+ relative_path = os.path.relpath(file_path, save_dir)
189
+ proc_v.append(relative_path)
190
+ proc_v = [v.split(".")[0] for v in proc_v]
191
+ if len(proc_v) > 0:
192
+ logging.info(
193
+ f"[Data] Already {len(proc_v)} files have been processed"
194
+ )
195
+
196
+ data_loader = VideoIter(
197
+ dataset_path=dataset_path,
198
+ proc_video=proc_v,
199
+ clip_length=clip_length,
200
+ frame_stride=frame_interval,
201
+ video_transform=res,
202
+ use_splits=use_splits,
203
+ return_label=False,
204
+ )
205
+
206
+ data_iter = torch.utils.data.DataLoader(
207
+ data_loader,
208
+ batch_size=batch_size,
209
+ shuffle=False,
210
+ num_workers=num_workers,
211
+ pin_memory=True,
212
+ )
213
+
214
+ return data_loader, data_iter
215
+
216
+
217
+ def load_VST(checkpoint, device):
218
+ """load pretrained VST"""
219
+ config = 'utils/swin_config/recognition/swin/swin_base_patch244_window877_kinetics400_22k_VAD.py'
220
+ cfg = Config.fromfile(config)
221
+ model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.get('test_cfg'))
222
+ load_checkpoint(model, checkpoint, map_location='cpu')
223
+
224
+ return model.to(device)
225
+
226
+
227
+ def main():
228
+ args = get_args()
229
+
230
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
231
+ torch.cuda.set_device(args.gpu)
232
+ device = get_torch_device()
233
+ register_logger(log_file=args.log_file)
234
+
235
+ if args.seed is not None:
236
+ set_random_seed(args.seed)
237
+
238
+ cudnn.benchmark = True
239
+
240
+ feature_path = os.path.join(args.save_dir, 'L'+str(args.clip_length))
241
+
242
+ if not path.exists(feature_path):
243
+ os.makedirs(feature_path, exist_ok=True)
244
+
245
+ data_loader, data_iter = get_features_loader(args.dataset_path,
246
+ args.clip_length,
247
+ args.frame_interval,
248
+ args.batch_size,
249
+ args.num_workers,
250
+ feature_path,
251
+ args.use_splits, )
252
+ if data_loader.video_count == 0:
253
+ return
254
+
255
+ model = load_VST(args.pretrained_3d, device)
256
+
257
+ features_writer = FeaturesWriter(num_videos=data_loader.video_count)
258
+ loop_i = 0
259
+ # Perform feature extraction on the dataset
260
+ with torch.no_grad():
261
+ for data, clip_idxs, dirs, vid_names in data_iter: # 1 batch
262
+ outputs = model.extract_feat(data.to(device))
263
+ outputs = outputs.mean(dim=[2, 3, 4])
264
+ outputs = outputs.detach().cpu().numpy()
265
+
266
+ for i, (dir, vid_name, clip_idx) in enumerate(zip(dirs, vid_names, clip_idxs)):
267
+ if loop_i == 0:
268
+ logging.info(
269
+ f"Video {features_writer.dump_count} / {features_writer.num_videos} : Writing clip {clip_idx} of video {vid_name}")
270
+
271
+ loop_i += 1
272
+ loop_i %= args.log_every
273
+
274
+ dir = path.join(feature_path, dir)
275
+ features_writer.write(feature=outputs[i],
276
+ video_name=vid_name,
277
+ idx=clip_idx,
278
+ dir=dir, )
279
+ # Dump the remaining features to files
280
+ features_writer.dump()
281
+
282
+
283
+ if __name__ == "__main__":
284
+ main()
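The features are written under `<save_dir>/L<clip_length>`, so running the extractor with `--clip_length` 8, 32, and 64 produces the `features/L8`, `features/L32`, and `features/L64` directories that the recognition defaults expect. A small sketch of the snippet pooling done by `to_segments` (random features, purely illustrative):

```python
import numpy as np
from feature_extractor import to_segments  # importing this file assumes its mmcv/mmaction dependencies are installed

clip_features = np.random.rand(100, 1024).tolist()   # e.g. one 1024-D VST feature per clip
segments = to_segments(clip_features, num=32)
print(len(segments), len(segments[0]))                # 32 1024
print(np.linalg.norm(segments[0]))                    # ~1.0: each segment is L2-normalised
```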
utils/functional_video.py ADDED
@@ -0,0 +1,102 @@
1
+ import torch
2
+
3
+
4
+ def _is_tensor_video_clip(clip):
5
+ if not torch.is_tensor(clip):
6
+ raise TypeError("clip should be Tesnor. Got %s" % type(clip))
7
+
8
+ if not clip.ndimension() == 4:
9
+ raise ValueError("clip should be 4D. Got %dD" % clip.dim())
10
+
11
+ return True
12
+
13
+
14
+ def crop(clip, i, j, h, w):
15
+ """
16
+ Args:
17
+ clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
18
+ """
19
+ assert len(clip.size()) == 4, "clip should be a 4D tensor"
20
+ return clip[..., i:i + h, j:j + w]
21
+
22
+
23
+ def resize(clip, target_size, interpolation_mode):
24
+ assert len(target_size) == 2, "target size should be tuple (height, width)"
25
+ # print(target_size)
26
+ return torch.nn.functional.interpolate(
27
+ clip, size=target_size, mode=interpolation_mode, align_corners=False
28
+ )
29
+
30
+
31
+ def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
32
+ """
33
+ Do spatial cropping and resizing to the video clip
34
+ Args:
35
+ clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
36
+ i (int): i in (i,j) i.e coordinates of the upper left corner.
37
+ j (int): j in (i,j) i.e coordinates of the upper left corner.
38
+ h (int): Height of the cropped region.
39
+ w (int): Width of the cropped region.
40
+ size (tuple(int, int)): height and width of resized clip
41
+ Returns:
42
+ clip (torch.tensor): Resized and cropped clip. Size is (C, T, H, W)
43
+ """
44
+ assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor"
45
+ clip = crop(clip, i, j, h, w)
46
+ clip = resize(clip, size, interpolation_mode)
47
+ return clip
48
+
49
+
50
+ def center_crop(clip, crop_size):
51
+ assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor"
52
+ h, w = clip.size(-2), clip.size(-1)
53
+ th, tw = crop_size
54
+ assert h >= th and w >= tw, "height and width must be no smaller than crop_size"
55
+
56
+ i = int(round((h - th) / 2.0))
57
+ j = int(round((w - tw) / 2.0))
58
+ return crop(clip, i, j, th, tw)
59
+
60
+
61
+ def to_tensor(clip):
62
+ """
63
+ Convert tensor data type from uint8 to float, divide value by 255.0 and
64
+ permute the dimensions of the clip tensor
65
+ Args:
66
+ clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C)
67
+ Return:
68
+ clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W)
69
+ """
70
+ _is_tensor_video_clip(clip)
71
+ if not clip.dtype == torch.uint8:
72
+ raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
73
+ return clip.float().permute(3, 0, 1, 2) / 255.0
74
+
75
+
76
+ def normalize(clip, mean, std, inplace=False):
77
+ """
78
+ Args:
79
+ clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W)
80
+ mean (tuple): pixel RGB mean. Size is (3)
81
+ std (tuple): pixel standard deviation. Size is (3)
82
+ Returns:
83
+ normalized clip (torch.tensor): Size is (C, T, H, W)
84
+ """
85
+ assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor"
86
+ if not inplace:
87
+ clip = clip.clone()
88
+ mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
89
+ std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
90
+ clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
91
+ return clip
92
+
93
+
94
+ def hflip(clip):
95
+ """
96
+ Args:
97
+ clip (torch.tensor): Video clip to be flipped. Size is (C, T, H, W)
98
+ Returns:
99
+ flipped clip (torch.tensor): Size is (C, T, H, W)
100
+ """
101
+ assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor"
102
+ return clip.flip((-1))
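These helpers implement the preprocessing used by the feature extractor; a minimal sketch on a dummy uint8 clip (the mean/std values are the VAD statistics used in utils/feature_extractor.py):

```python
import torch
from functional_video import to_tensor, resize, center_crop, normalize

raw = torch.randint(0, 256, (16, 360, 640, 3), dtype=torch.uint8)  # T, H, W, C
clip = to_tensor(raw)                                # C, T, H, W, scaled to [0, 1]
clip = resize(clip, (224, 224), "bilinear")          # resize every frame to 224 x 224
clip = center_crop(clip, (224, 224))
clip = normalize(clip, mean=[0.400, 0.388, 0.372], std=[0.247, 0.245, 0.243])
print(clip.shape)                                    # torch.Size([3, 16, 224, 224])
```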
utils/swin_config/_base_/default_runtime.py ADDED
@@ -0,0 +1,13 @@
1
+ checkpoint_config = dict(interval=1)
2
+ log_config = dict(
3
+ interval=20,
4
+ hooks=[
5
+ dict(type='TextLoggerHook'),
6
+ # dict(type='TensorboardLoggerHook'),
7
+ ])
8
+ # runtime settings
9
+ dist_params = dict(backend='nccl')
10
+ log_level = 'INFO'
11
+ load_from = None
12
+ resume_from = None
13
+ workflow = [('train', 1)]
utils/swin_config/_base_/models/audioonly_r50.py ADDED
@@ -0,0 +1,18 @@
1
+ # model settings
2
+ model = dict(
3
+ type='AudioRecognizer',
4
+ backbone=dict(
5
+ type='ResNetAudio',
6
+ depth=50,
7
+ pretrained=None,
8
+ in_channels=1,
9
+ norm_eval=False),
10
+ cls_head=dict(
11
+ type='AudioTSNHead',
12
+ num_classes=400,
13
+ in_channels=1024,
14
+ dropout_ratio=0.5,
15
+ init_std=0.01),
16
+ # model training and testing settings
17
+ train_cfg=None,
18
+ test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/bmn_400x100.py ADDED
@@ -0,0 +1,12 @@
1
+ # model settings
2
+ model = dict(
3
+ type='BMN',
4
+ temporal_dim=100,
5
+ boundary_ratio=0.5,
6
+ num_samples=32,
7
+ num_samples_per_bin=3,
8
+ feat_dim=400,
9
+ soft_nms_alpha=0.4,
10
+ soft_nms_low_threshold=0.5,
11
+ soft_nms_high_threshold=0.9,
12
+ post_process_top_k=100)
utils/swin_config/_base_/models/bsn_pem.py ADDED
@@ -0,0 +1,13 @@
1
+ # model settings
2
+ model = dict(
3
+ type='PEM',
4
+ pem_feat_dim=32,
5
+ pem_hidden_dim=256,
6
+ pem_u_ratio_m=1,
7
+ pem_u_ratio_l=2,
8
+ pem_high_temporal_iou_threshold=0.6,
9
+ pem_low_temporal_iou_threshold=0.2,
10
+ soft_nms_alpha=0.75,
11
+ soft_nms_low_threshold=0.65,
12
+ soft_nms_high_threshold=0.9,
13
+ post_process_top_k=100)
utils/swin_config/_base_/models/bsn_tem.py ADDED
@@ -0,0 +1,8 @@
1
+ # model settings
2
+ model = dict(
3
+ type='TEM',
4
+ temporal_dim=100,
5
+ boundary_ratio=0.1,
6
+ tem_feat_dim=400,
7
+ tem_hidden_dim=512,
8
+ tem_match_threshold=0.5)
utils/swin_config/_base_/models/c3d_sports1m_pretrained.py ADDED
@@ -0,0 +1,23 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer3D',
4
+ backbone=dict(
5
+ type='C3D',
6
+ pretrained= # noqa: E251
7
+ 'https://download.openmmlab.com/mmaction/recognition/c3d/c3d_sports1m_pretrain_20201016-dcc47ddc.pth', # noqa: E501
8
+ style='pytorch',
9
+ conv_cfg=dict(type='Conv3d'),
10
+ norm_cfg=None,
11
+ act_cfg=dict(type='ReLU'),
12
+ dropout_ratio=0.5,
13
+ init_std=0.005),
14
+ cls_head=dict(
15
+ type='I3DHead',
16
+ num_classes=101,
17
+ in_channels=4096,
18
+ spatial_type=None,
19
+ dropout_ratio=0.5,
20
+ init_std=0.01),
21
+ # model training and testing settings
22
+ train_cfg=None,
23
+ test_cfg=dict(average_clips='score'))
utils/swin_config/_base_/models/csn_ig65m_pretrained.py ADDED
@@ -0,0 +1,23 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer3D',
4
+ backbone=dict(
5
+ type='ResNet3dCSN',
6
+ pretrained2d=False,
7
+ pretrained= # noqa: E251
8
+ 'https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth', # noqa: E501
9
+ depth=152,
10
+ with_pool2=False,
11
+ bottleneck_mode='ir',
12
+ norm_eval=False,
13
+ zero_init_residual=False),
14
+ cls_head=dict(
15
+ type='I3DHead',
16
+ num_classes=400,
17
+ in_channels=2048,
18
+ spatial_type='avg',
19
+ dropout_ratio=0.5,
20
+ init_std=0.01),
21
+ # model training and testing settings
22
+ train_cfg=None,
23
+ test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/i3d_r50.py ADDED
@@ -0,0 +1,27 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer3D',
4
+ backbone=dict(
5
+ type='ResNet3d',
6
+ pretrained2d=True,
7
+ pretrained='torchvision://resnet50',
8
+ depth=50,
9
+ conv1_kernel=(5, 7, 7),
10
+ conv1_stride_t=2,
11
+ pool1_stride_t=2,
12
+ conv_cfg=dict(type='Conv3d'),
13
+ norm_eval=False,
14
+ inflate=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
15
+ zero_init_residual=False),
16
+ cls_head=dict(
17
+ type='I3DHead',
18
+ num_classes=400,
19
+ in_channels=2048,
20
+ spatial_type='avg',
21
+ dropout_ratio=0.5,
22
+ init_std=0.01),
23
+ # model training and testing settings
24
+ train_cfg=None,
25
+ test_cfg=dict(average_clips='prob'))
26
+
27
+ # This setting refers to https://github.com/open-mmlab/mmaction/blob/master/mmaction/models/tenons/backbones/resnet_i3d.py#L329-L332 # noqa: E501
utils/swin_config/_base_/models/r2plus1d_r34.py ADDED
@@ -0,0 +1,28 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer3D',
4
+ backbone=dict(
5
+ type='ResNet2Plus1d',
6
+ depth=34,
7
+ pretrained=None,
8
+ pretrained2d=False,
9
+ norm_eval=False,
10
+ conv_cfg=dict(type='Conv2plus1d'),
11
+ norm_cfg=dict(type='SyncBN', requires_grad=True, eps=1e-3),
12
+ conv1_kernel=(3, 7, 7),
13
+ conv1_stride_t=1,
14
+ pool1_stride_t=1,
15
+ inflate=(1, 1, 1, 1),
16
+ spatial_strides=(1, 2, 2, 2),
17
+ temporal_strides=(1, 2, 2, 2),
18
+ zero_init_residual=False),
19
+ cls_head=dict(
20
+ type='I3DHead',
21
+ num_classes=400,
22
+ in_channels=512,
23
+ spatial_type='avg',
24
+ dropout_ratio=0.5,
25
+ init_std=0.01),
26
+ # model training and testing settings
27
+ train_cfg=None,
28
+ test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/slowfast_r50.py ADDED
@@ -0,0 +1,39 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer3D',
4
+ backbone=dict(
5
+ type='ResNet3dSlowFast',
6
+ pretrained=None,
7
+ resample_rate=8, # tau
8
+ speed_ratio=8, # alpha
9
+ channel_ratio=8, # beta_inv
10
+ slow_pathway=dict(
11
+ type='resnet3d',
12
+ depth=50,
13
+ pretrained=None,
14
+ lateral=True,
15
+ conv1_kernel=(1, 7, 7),
16
+ dilations=(1, 1, 1, 1),
17
+ conv1_stride_t=1,
18
+ pool1_stride_t=1,
19
+ inflate=(0, 0, 1, 1),
20
+ norm_eval=False),
21
+ fast_pathway=dict(
22
+ type='resnet3d',
23
+ depth=50,
24
+ pretrained=None,
25
+ lateral=False,
26
+ base_channels=8,
27
+ conv1_kernel=(5, 7, 7),
28
+ conv1_stride_t=1,
29
+ pool1_stride_t=1,
30
+ norm_eval=False)),
31
+ cls_head=dict(
32
+ type='SlowFastHead',
33
+ in_channels=2304, # 2048+256
34
+ num_classes=400,
35
+ spatial_type='avg',
36
+ dropout_ratio=0.5),
37
+ # model training and testing settings
38
+ train_cfg=None,
39
+ test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/slowonly_r50.py ADDED
@@ -0,0 +1,22 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer3D',
4
+ backbone=dict(
5
+ type='ResNet3dSlowOnly',
6
+ depth=50,
7
+ pretrained='torchvision://resnet50',
8
+ lateral=False,
9
+ conv1_kernel=(1, 7, 7),
10
+ conv1_stride_t=1,
11
+ pool1_stride_t=1,
12
+ inflate=(0, 0, 1, 1),
13
+ norm_eval=False),
14
+ cls_head=dict(
15
+ type='I3DHead',
16
+ in_channels=2048,
17
+ num_classes=400,
18
+ spatial_type='avg',
19
+ dropout_ratio=0.5),
20
+ # model training and testing settings
21
+ train_cfg=None,
22
+ test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/swin/swin_base.py ADDED
@@ -0,0 +1,6 @@
1
+ # model settings
2
+ _base_ = "swin_tiny.py"
3
+ model = dict(backbone=dict(depths=[2, 2, 18, 2],
4
+ embed_dim=128,
5
+ num_heads=[4, 8, 16, 32]),
6
+ cls_head=dict(in_channels=1024))
utils/swin_config/_base_/models/swin/swin_large.py ADDED
@@ -0,0 +1,6 @@
1
+ # model settings
2
+ _base_ = "swin_tiny.py"
3
+ model = dict(backbone=dict(depths=[2, 2, 18, 2],
4
+ embed_dim=192,
5
+ num_heads=[6, 12, 24, 48]),
6
+ cls_head=dict(in_channels=1536))
utils/swin_config/_base_/models/swin/swin_small.py ADDED
@@ -0,0 +1,3 @@
1
+ # model settings
2
+ _base_ = "swin_tiny.py"
3
+ model = dict(backbone=dict(depths=[2, 2, 18, 2]))
utils/swin_config/_base_/models/swin/swin_tiny.py ADDED
@@ -0,0 +1,24 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer3D',
4
+ backbone=dict(
5
+ type='SwinTransformer3D',
6
+ patch_size=(4,4,4),
7
+ embed_dim=96,
8
+ depths=[2, 2, 6, 2],
9
+ num_heads=[3, 6, 12, 24],
10
+ window_size=(8,7,7),
11
+ mlp_ratio=4.,
12
+ qkv_bias=True,
13
+ qk_scale=None,
14
+ drop_rate=0.,
15
+ attn_drop_rate=0.,
16
+ drop_path_rate=0.2,
17
+ patch_norm=True),
18
+ cls_head=dict(
19
+ type='I3DHead',
20
+ in_channels=768,
21
+ num_classes=18,
22
+ spatial_type='avg',
23
+ dropout_ratio=0.5),
24
+ test_cfg = dict(average_clips='prob'))
utils/swin_config/_base_/models/swin/swin_tiny_backup.py ADDED
@@ -0,0 +1,24 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer3D',
4
+ backbone=dict(
5
+ type='SwinTransformer3D',
6
+ patch_size=(4,4,4),
7
+ embed_dim=96,
8
+ depths=[2, 2, 6, 2],
9
+ num_heads=[3, 6, 12, 24],
10
+ window_size=(8,7,7),
11
+ mlp_ratio=4.,
12
+ qkv_bias=True,
13
+ qk_scale=None,
14
+ drop_rate=0.,
15
+ attn_drop_rate=0.,
16
+ drop_path_rate=0.2,
17
+ patch_norm=True),
18
+ cls_head=dict(
19
+ type='I3DHead',
20
+ in_channels=768,
21
+ num_classes=400,
22
+ spatial_type='avg',
23
+ dropout_ratio=0.5),
24
+ test_cfg = dict(average_clips='prob'))
utils/swin_config/_base_/models/tanet_r50.py ADDED
@@ -0,0 +1,20 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer2D',
4
+ backbone=dict(
5
+ type='TANet',
6
+ pretrained='torchvision://resnet50',
7
+ depth=50,
8
+ num_segments=8,
9
+ tam_cfg=dict()),
10
+ cls_head=dict(
11
+ type='TSMHead',
12
+ num_classes=400,
13
+ in_channels=2048,
14
+ spatial_type='avg',
15
+ consensus=dict(type='AvgConsensus', dim=1),
16
+ dropout_ratio=0.5,
17
+ init_std=0.001),
18
+ # model training and testing settings
19
+ train_cfg=None,
20
+ test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tin_r50.py ADDED
@@ -0,0 +1,21 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer2D',
4
+ backbone=dict(
5
+ type='ResNetTIN',
6
+ pretrained='torchvision://resnet50',
7
+ depth=50,
8
+ norm_eval=False,
9
+ shift_div=4),
10
+ cls_head=dict(
11
+ type='TSMHead',
12
+ num_classes=400,
13
+ in_channels=2048,
14
+ spatial_type='avg',
15
+ consensus=dict(type='AvgConsensus', dim=1),
16
+ dropout_ratio=0.5,
17
+ init_std=0.001,
18
+ is_shift=False),
19
+ # model training and testing settings
20
+ train_cfg=None,
21
+ test_cfg=dict(average_clips=None))
utils/swin_config/_base_/models/tpn_slowonly_r50.py ADDED
@@ -0,0 +1,40 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer3D',
4
+ backbone=dict(
5
+ type='ResNet3dSlowOnly',
6
+ depth=50,
7
+ pretrained='torchvision://resnet50',
8
+ lateral=False,
9
+ out_indices=(2, 3),
10
+ conv1_kernel=(1, 7, 7),
11
+ conv1_stride_t=1,
12
+ pool1_stride_t=1,
13
+ inflate=(0, 0, 1, 1),
14
+ norm_eval=False),
15
+ neck=dict(
16
+ type='TPN',
17
+ in_channels=(1024, 2048),
18
+ out_channels=1024,
19
+ spatial_modulation_cfg=dict(
20
+ in_channels=(1024, 2048), out_channels=2048),
21
+ temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
22
+ upsample_cfg=dict(scale_factor=(1, 1, 1)),
23
+ downsample_cfg=dict(downsample_scale=(1, 1, 1)),
24
+ level_fusion_cfg=dict(
25
+ in_channels=(1024, 1024),
26
+ mid_channels=(1024, 1024),
27
+ out_channels=2048,
28
+ downsample_scales=((1, 1, 1), (1, 1, 1))),
29
+ aux_head_cfg=dict(out_channels=400, loss_weight=0.5)),
30
+ cls_head=dict(
31
+ type='TPNHead',
32
+ num_classes=400,
33
+ in_channels=2048,
34
+ spatial_type='avg',
35
+ consensus=dict(type='AvgConsensus', dim=1),
36
+ dropout_ratio=0.5,
37
+ init_std=0.01),
38
+ # model training and testing settings
39
+ train_cfg=None,
40
+ test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tpn_tsm_r50.py ADDED
@@ -0,0 +1,36 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer2D',
4
+ backbone=dict(
5
+ type='ResNetTSM',
6
+ pretrained='torchvision://resnet50',
7
+ depth=50,
8
+ out_indices=(2, 3),
9
+ norm_eval=False,
10
+ shift_div=8),
11
+ neck=dict(
12
+ type='TPN',
13
+ in_channels=(1024, 2048),
14
+ out_channels=1024,
15
+ spatial_modulation_cfg=dict(
16
+ in_channels=(1024, 2048), out_channels=2048),
17
+ temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
18
+ upsample_cfg=dict(scale_factor=(1, 1, 1)),
19
+ downsample_cfg=dict(downsample_scale=(1, 1, 1)),
20
+ level_fusion_cfg=dict(
21
+ in_channels=(1024, 1024),
22
+ mid_channels=(1024, 1024),
23
+ out_channels=2048,
24
+ downsample_scales=((1, 1, 1), (1, 1, 1))),
25
+ aux_head_cfg=dict(out_channels=174, loss_weight=0.5)),
26
+ cls_head=dict(
27
+ type='TPNHead',
28
+ num_classes=174,
29
+ in_channels=2048,
30
+ spatial_type='avg',
31
+ consensus=dict(type='AvgConsensus', dim=1),
32
+ dropout_ratio=0.5,
33
+ init_std=0.01),
34
+ # model training and testing settings
35
+ train_cfg=None,
36
+ test_cfg=dict(average_clips='prob', fcn_test=True))
utils/swin_config/_base_/models/trn_r50.py ADDED
@@ -0,0 +1,22 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer2D',
4
+ backbone=dict(
5
+ type='ResNet',
6
+ pretrained='torchvision://resnet50',
7
+ depth=50,
8
+ norm_eval=False,
9
+ partial_bn=True),
10
+ cls_head=dict(
11
+ type='TRNHead',
12
+ num_classes=400,
13
+ in_channels=2048,
14
+ num_segments=8,
15
+ spatial_type='avg',
16
+ relation_type='TRNMultiScale',
17
+ hidden_dim=256,
18
+ dropout_ratio=0.8,
19
+ init_std=0.001),
20
+ # model training and testing settings
21
+ train_cfg=None,
22
+ test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tsm_mobilenet_v2.py ADDED
@@ -0,0 +1,22 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer2D',
4
+ backbone=dict(
5
+ type='MobileNetV2TSM',
6
+ shift_div=8,
7
+ num_segments=8,
8
+ is_shift=True,
9
+ pretrained='mmcls://mobilenet_v2'),
10
+ cls_head=dict(
11
+ type='TSMHead',
12
+ num_segments=8,
13
+ num_classes=400,
14
+ in_channels=1280,
15
+ spatial_type='avg',
16
+ consensus=dict(type='AvgConsensus', dim=1),
17
+ dropout_ratio=0.5,
18
+ init_std=0.001,
19
+ is_shift=True),
20
+ # model training and testing settings
21
+ train_cfg=None,
22
+ test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tsm_r50.py ADDED
@@ -0,0 +1,21 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer2D',
4
+ backbone=dict(
5
+ type='ResNetTSM',
6
+ pretrained='torchvision://resnet50',
7
+ depth=50,
8
+ norm_eval=False,
9
+ shift_div=8),
10
+ cls_head=dict(
11
+ type='TSMHead',
12
+ num_classes=400,
13
+ in_channels=2048,
14
+ spatial_type='avg',
15
+ consensus=dict(type='AvgConsensus', dim=1),
16
+ dropout_ratio=0.5,
17
+ init_std=0.001,
18
+ is_shift=True),
19
+ # model training and testing settings
20
+ train_cfg=None,
21
+ test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tsn_r50.py ADDED
@@ -0,0 +1,19 @@
1
+ # model settings
2
+ model = dict(
3
+ type='Recognizer2D',
4
+ backbone=dict(
5
+ type='ResNet',
6
+ pretrained='torchvision://resnet50',
7
+ depth=50,
8
+ norm_eval=False),
9
+ cls_head=dict(
10
+ type='TSNHead',
11
+ num_classes=400,
12
+ in_channels=2048,
13
+ spatial_type='avg',
14
+ consensus=dict(type='AvgConsensus', dim=1),
15
+ dropout_ratio=0.4,
16
+ init_std=0.01),
17
+ # model training and testing settings
18
+ train_cfg=None,
19
+ test_cfg=dict(average_clips=None))
utils/swin_config/_base_/models/tsn_r50_audio.py ADDED
@@ -0,0 +1,13 @@
+ # model settings
+ model = dict(
+     type='AudioRecognizer',
+     backbone=dict(type='ResNet', depth=50, in_channels=1, norm_eval=False),
+     cls_head=dict(
+         type='AudioTSNHead',
+         num_classes=400,
+         in_channels=2048,
+         dropout_ratio=0.5,
+         init_std=0.01),
+     # model training and testing settings
+     train_cfg=None,
+     test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/x3d.py ADDED
@@ -0,0 +1,14 @@
+ # model settings
+ model = dict(
+     type='Recognizer3D',
+     backbone=dict(type='X3D', gamma_w=1, gamma_b=2.25, gamma_d=2.2),
+     cls_head=dict(
+         type='X3DHead',
+         in_channels=432,
+         num_classes=400,
+         spatial_type='avg',
+         dropout_ratio=0.5,
+         fc1_bias=False),
+     # model training and testing settings
+     train_cfg=None,
+     test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/schedules/adam_20e.py ADDED
@@ -0,0 +1,7 @@
+ # optimizer
+ optimizer = dict(
+     type='Adam', lr=0.01, weight_decay=0.00001)  # this lr is used for 1 gpu
+ optimizer_config = dict(grad_clip=None)
+ # learning policy
+ lr_config = dict(policy='step', step=10)
+ total_epochs = 20
utils/swin_config/_base_/schedules/sgd_100e.py ADDED
@@ -0,0 +1,10 @@
+ # optimizer
+ optimizer = dict(
+     type='SGD',
+     lr=0.01,  # this lr is used for 8 gpus
+     momentum=0.9,
+     weight_decay=0.0001)
+ optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+ # learning policy
+ lr_config = dict(policy='step', step=[40, 80])
+ total_epochs = 100
utils/swin_config/_base_/schedules/sgd_150e_warmup.py ADDED
@@ -0,0 +1,13 @@
+ # optimizer
+ optimizer = dict(
+     type='SGD', lr=0.01, momentum=0.9,
+     weight_decay=0.0001)  # this lr is used for 8 gpus
+ optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+ # learning policy
+ lr_config = dict(
+     policy='step',
+     step=[90, 130],
+     warmup='linear',
+     warmup_by_epoch=True,
+     warmup_iters=10)
+ total_epochs = 150
utils/swin_config/_base_/schedules/sgd_50e.py ADDED
@@ -0,0 +1,10 @@
+ # optimizer
+ optimizer = dict(
+     type='SGD',
+     lr=0.01,  # this lr is used for 8 gpus
+     momentum=0.9,
+     weight_decay=0.0001)
+ optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+ # learning policy
+ lr_config = dict(policy='step', step=[20, 40])
+ total_epochs = 50
utils/swin_config/_base_/schedules/sgd_tsm_100e.py ADDED
@@ -0,0 +1,12 @@
+ # optimizer
+ optimizer = dict(
+     type='SGD',
+     constructor='TSMOptimizerConstructor',
+     paramwise_cfg=dict(fc_lr5=True),
+     lr=0.02,  # this lr is used for 8 gpus
+     momentum=0.9,
+     weight_decay=0.0001)
+ optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
+ # learning policy
+ lr_config = dict(policy='step', step=[40, 80])
+ total_epochs = 100
utils/swin_config/_base_/schedules/sgd_tsm_50e.py ADDED
@@ -0,0 +1,12 @@
+ # optimizer
+ optimizer = dict(
+     type='SGD',
+     constructor='TSMOptimizerConstructor',
+     paramwise_cfg=dict(fc_lr5=True),
+     lr=0.01,  # this lr is used for 8 gpus
+     momentum=0.9,
+     weight_decay=0.0001)
+ optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
+ # learning policy
+ lr_config = dict(policy='step', step=[20, 40])
+ total_epochs = 50
utils/swin_config/_base_/schedules/sgd_tsm_mobilenet_v2_100e.py ADDED
@@ -0,0 +1,12 @@
+ # optimizer
+ optimizer = dict(
+     type='SGD',
+     constructor='TSMOptimizerConstructor',
+     paramwise_cfg=dict(fc_lr5=True),
+     lr=0.01,  # this lr is used for 8 gpus
+     momentum=0.9,
+     weight_decay=0.00002)
+ optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
+ # learning policy
+ lr_config = dict(policy='step', step=[40, 80])
+ total_epochs = 100