Upload open source code of MTFL model
MTFL: Multi-Timescale Feature Learning for Weakly-supervised Anomaly Detection in Surveillance Videos
https://arxiv.org/abs/2410.05900
- .gitattributes +1 -0
- .gitignore +1 -0
- README.md +238 -3
- detection/dataset.py +117 -0
- detection/model.py +296 -0
- detection/option.py +56 -0
- detection/test.py +168 -0
- detection/train.py +188 -0
- figures/Intro.png +3 -0
- recognition/dataset.py +140 -0
- recognition/model.py +295 -0
- recognition/option.py +56 -0
- recognition/test.py +120 -0
- recognition/train.py +171 -0
- requirements.txt +10 -0
- utils/feature_extractor.py +284 -0
- utils/functional_video.py +102 -0
- utils/swin_config/_base_/default_runtime.py +13 -0
- utils/swin_config/_base_/models/audioonly_r50.py +18 -0
- utils/swin_config/_base_/models/bmn_400x100.py +12 -0
- utils/swin_config/_base_/models/bsn_pem.py +13 -0
- utils/swin_config/_base_/models/bsn_tem.py +8 -0
- utils/swin_config/_base_/models/c3d_sports1m_pretrained.py +23 -0
- utils/swin_config/_base_/models/csn_ig65m_pretrained.py +23 -0
- utils/swin_config/_base_/models/i3d_r50.py +27 -0
- utils/swin_config/_base_/models/r2plus1d_r34.py +28 -0
- utils/swin_config/_base_/models/slowfast_r50.py +39 -0
- utils/swin_config/_base_/models/slowonly_r50.py +22 -0
- utils/swin_config/_base_/models/swin/swin_base.py +6 -0
- utils/swin_config/_base_/models/swin/swin_large.py +6 -0
- utils/swin_config/_base_/models/swin/swin_small.py +3 -0
- utils/swin_config/_base_/models/swin/swin_tiny.py +24 -0
- utils/swin_config/_base_/models/swin/swin_tiny_backup.py +24 -0
- utils/swin_config/_base_/models/tanet_r50.py +20 -0
- utils/swin_config/_base_/models/tin_r50.py +21 -0
- utils/swin_config/_base_/models/tpn_slowonly_r50.py +40 -0
- utils/swin_config/_base_/models/tpn_tsm_r50.py +36 -0
- utils/swin_config/_base_/models/trn_r50.py +22 -0
- utils/swin_config/_base_/models/tsm_mobilenet_v2.py +22 -0
- utils/swin_config/_base_/models/tsm_r50.py +21 -0
- utils/swin_config/_base_/models/tsn_r50.py +19 -0
- utils/swin_config/_base_/models/tsn_r50_audio.py +13 -0
- utils/swin_config/_base_/models/x3d.py +14 -0
- utils/swin_config/_base_/schedules/adam_20e.py +7 -0
- utils/swin_config/_base_/schedules/sgd_100e.py +10 -0
- utils/swin_config/_base_/schedules/sgd_150e_warmup.py +13 -0
- utils/swin_config/_base_/schedules/sgd_50e.py +10 -0
- utils/swin_config/_base_/schedules/sgd_tsm_100e.py +12 -0
- utils/swin_config/_base_/schedules/sgd_tsm_50e.py +12 -0
- utils/swin_config/_base_/schedules/sgd_tsm_mobilenet_v2_100e.py +12 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+figures/Intro.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
+/test_videos/
README.md
CHANGED
@@ -1,3 +1,238 @@
# MTFL

This repo is the official PyTorch implementation of our paper:

> [**MTFL: Multi-Timescale Feature Learning for Weakly-supervised Anomaly Detection in Surveillance Videos**](https://arxiv.org/abs/2410.05900)
>
<!--Author list-->

## Introduction


The detection of anomalous events is relevant for public safety and requires a combination of fine-grained motion information and long-time action recognition. We therefore propose a Multi-Timescale Feature Learning (MTFL) method to enhance the representation of anomaly features. We employ short, medium, and long temporal tubelets to extract spatio-temporal video features using a Video Swin Transformer. Experimental results demonstrate that MTFL outperforms state-of-the-art methods on the UCF-Crime dataset, achieving an anomaly detection performance of 89.78% AUC. Moreover, it achieves 95.32% AUC on ShanghaiTech and 84.57% AP on XD-Violence, complementary to several SotA results. Building upon MTFL, we also propose an anomaly recognition network that employs partial features for classification, achieving a leading accuracy on UCF-Crime and outperforming the existing recognition literature. Furthermore, we introduce an extended dataset for UCF-Crime, namely the Video Anomaly Detection Dataset (VADD), comprising 2,591 videos in 18 classes with extensive coverage of realistic anomalies.

## Models and Dataset
### [Video Anomaly Detection Dataset (VADD)](https://form.jotform.com/240714220958354)

VADD includes 2,591 videos with a frame rate of 30 fps and a resolution of 320×240 pixels, split into 2,202 train and 389 test videos. The subfolders in VADD are named according to video categories, totaling 18 subfolders. Train-set annotations only include a class label, while test-set annotations contain a video class label, the number of frames in the video, and the starting and ending frame positions of the abnormal events in the video.
```
# Training annotation
[Subfolder/video name] [video label]
# Test annotation
[Subfolder/video name] [video label] [total frames] [start_frame1] [end_frame1] [start_frame2]...
```
* Taking a training video containing littering as an example, it is annotated as below:
```
Littering/CarSafe015.mp4 Littering
```
* Taking a test video containing dangerous throwing behavior as an example, its annotations indicate that the video has a total of 636 frames and contains two instances of dangerous throwing behavior: the first occurs between frames 145 and 186, and the second between frames 289 and 340.
```
DangerousThrowing/BicyclistDangerous039.mp4 DangerousThrowing 636 145 186 289 340
```
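As an illustration (not part of the repo), a minimal Python sketch that splits such a test-annotation line into its fields:
```python
# Illustrative only: parse one VADD test-annotation line.
line = "DangerousThrowing/BicyclistDangerous039.mp4 DangerousThrowing 636 145 186 289 340"
items = line.split()
video, label, num_frames = items[0], items[1], int(items[2])
# The remaining numbers pair up as (start_frame, end_frame) couples.
couples = list(zip(map(int, items[3::2]), map(int, items[4::2])))
print(video, label, num_frames, couples)
# -> DangerousThrowing/BicyclistDangerous039.mp4 DangerousThrowing 636 [(145, 186), (289, 340)]
```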

Additionally, to train and test MTFL on benchmark datasets, we converted the annotation files of ShanghaiTech, XD-Violence, and UCF-Crime to match the format of the VADD annotation files.

All train and test annotation files for AnomalyDetection and AnomalyRecognition are provided in the ["Annotation" folder](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/annotation?csf=1&web=1&e=UYxR0H).

### [MTFL checkpoints for anomaly detection](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/MTFL-checkpoints?csf=1&web=1&e=hJhPgh)

| Detection Checkpoint | Feature | UCF | Shanghai<br/>Tech | XD-Violence | VADD |
|------------------|---------------|-------|--------------|-------------|---|
| MTFL_VST_Kinetics400 | VST-RGB | 87.61 | 95.32 | 84.57 | - |
| MTFL_VST_VADD | VST<sub>Aug</sub>_RGB | 89.79 | 95.70 | 79.40 | 88.42 |

There are several MTFL checkpoints for anomaly detection, using different feature extractors and datasets, where:
* xxx_VST_Kinetics400 = features extracted using VST pretrained on Kinetics400,
* xxx_VST_VADD = features extracted using VST pretrained on VADD with data augmentation,
* MTFL-yyy-VST-Kinetics400 = MTFL models trained with VST_RGB features,
* MTFL-yyy-VST-VADD = MTFL models trained with VST<sub>Aug</sub>_RGB features,

with xxx = Shanghai, VADD, or XD, and yyy = SH, VADD-UCF, or XD.

The two feature extractors used in our detection models and the resulting features of the benchmark datasets are provided below:
* [Video Swin Transformer pretrained on Kinetics-400](https://tuenl-my.sharepoint.com/:u:/r/personal/e_akdag_tue_nl/Documents/MTFL/feature-extractors/AnomalyDetection/swin_base_patch244_window877_kinetics400_22k.pth?csf=1&web=1&e=8spheA)
* [Video Swin Transformer pretrained on VADD](https://tuenl-my.sharepoint.com/:u:/r/personal/e_akdag_tue_nl/Documents/MTFL/feature-extractors/AnomalyDetection/VST_swin_base_patch244_window877_VADD.pth?csf=1&web=1&e=AzfewH)
* [VST_RGB features of UCF-Crime, XD-Violence, ShanghaiTech, and VADD](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/features/AnomalyDetection?csf=1&web=1&e=CT8WZ3)
* [VST<sub>Aug</sub>_RGB features of UCF-Crime, XD-Violence, ShanghaiTech, and VADD](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/features/AnomalyDetection?csf=1&web=1&e=CT8WZ3)

The Video Swin Transformer model pretrained on Kinetics400 and the training method for the Video Swin Transformer are derived from the [Video-Swin-Transformer repository](https://github.com/SwinTransformer/Video-Swin-Transformer).


### [MTFL checkpoints for anomaly recognition](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/MTFL-checkpoints/AnomalyRecognition?csf=1&web=1&e=NOkpNn)

| Recognition Checkpoint | UCF Acc(%) | VADD Acc(%) |
|-------------------------------|------------|------------|
| MTFL_VADDsplit1_best_UCF | 39.88 | - |
| MTFL_VADDsplit1_best_VADD | - | 45.87 |
| MTFL_VADDsplit2_best_UCF | 47.02 | - |
| MTFL_VADDsplit2_best_VADD | - | 49.31 |
| MTFL_VADDsplit3_best_UCF | 49.40 | - |
| MTFL_VADDsplit3_best_VADD | - | 53.88 |
| MTFL_VADDsplit4_best_UCF_VADD | 45.83 | 52.29 |
| 4-fold average | 45.53 | 50.34 |

Following the experimental setup of 4-fold cross-validation from [Sultani et al.](https://arxiv.org/abs/1801.04264), there are seven recognition checkpoints, obtained by separately saving the checkpoints that performed best on UCF and on VADD while training on the different VADD splits, as shown in the table above. For example, MTFL_VADDsplit1_best_UCF is the MTFL recognition model trained on VADD split 1 with the best recognition performance on the UCF-Crime split 1 test-set. All models use a VST trained on the corresponding VADD split for feature extraction.

* [The feature extractors used for anomaly recognition](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/feature-extractors/AnomalyRecognition?csf=1&web=1&e=ToseKM)
* [The generated features for anomaly recognition](https://tuenl-my.sharepoint.com/:f:/r/personal/e_akdag_tue_nl/Documents/MTFL/features/AnomalyRecognition?csf=1&web=1&e=4nbEUm)


## Environment setup
```
pip install -r requirements.txt
```
## Folder Structure
```flow
demo/
│
├── detection/                # MTFL detection
│   └── ...
├── recognition/              # MTFL recognition
│   └── ...
├── utils/
│   ├── swin_config/          # VST config for loading the feature extractor
│   │   └── ...
│   ├── feature_extractor.py
│   ├── ...
│   └── video_preprocessing/  # scripts for annotation and unifying video format
│       └── ...
├── test_videos/              # put your test videos here
├── Annotation/               # put your annotations here
├── features/                 # feature path
│   ├── L8
│   ├── L32
│   └── L64
├── results/
│   ├── AUC                   # detection AUC
│   ├── scores                # detection scores
│   └── rec_results           # recognition labels
└── README.md
```


## Feature Extraction
Both the recognition and detection models require multi-timescale features, extracted with tubelets of 8, 32, and 64 frames. To extract features, place your videos in the 'test_videos' directory and then run the following command:
```
python utils/feature_extractor.py --clip_length [8/32/64]
```
In the default settings, test videos should be stored in the 'test_videos' directory, and the extracted features are organized within the 'features' folder following the same directory structure as 'test_videos'. For example, the features of video 'test_videos/A/B.mp4' extracted with a tubelet length of 8 are saved as 'features/L8/A/B.txt'.

You can modify the parameters inside the "VST Feature Extractor Parser" as needed. For example, you can change the input video path, the save path of the features, and the pretrained feature extractor by specifying the model path:
```
python utils/feature_extractor.py --clip_length [8/32/64] --dataset_path [your video path] --save_dir [your feature path] --pretrained_3d [model path]
```
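To extract all three timescales in one go, a small sketch (illustrative only; it assumes the default paths, and the flag name follows the commands above):
```python
import subprocess

# Run the extractor once per tubelet length; output lands in features/L8, L32, L64.
for clip_length in (8, 32, 64):
    subprocess.run(["python", "utils/feature_extractor.py",
                    "--clip_length", str(clip_length)], check=True)
```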
Note: if you use the VST pretrained on Kinetics400, you need to change <num_classes> to 400 in line 21 of 'utils/swin_config/_base_/models/swin/swin_tiny.py' to match the model head. For the VST pretrained on VADD, <num_classes> is 18. These settings follow the guidelines provided by the [Video-Swin-Transformer repository](https://github.com/SwinTransformer/Video-Swin-Transformer).



## Anomaly Detection
### Inference
To test a detection checkpoint on your test videos, run:
```
python detection/test.py --test_anno [your_anno_file.txt] --detection_model [checkpoint path]
```

In the default settings:

* Test videos should be stored in the 'test_videos' directory.
* The corresponding annotation file needs to be placed in the 'annotation' folder. The annotation format can be found under Video Preprocessing -> Annotation.
* The multi-timescale features of the videos should be stored in the 'features' directory; see Feature Extraction.

The detection AUC and the scores for each video are generated within the 'results' folder. The directory structure of the generated results, for both 'results/AUC' and 'results/scores', mirrors the structure of the corresponding test videos in the 'test_videos' directory. For example, the scores of video 'test_videos/A/B.mp4' are saved as 'results/scores/A/B.png'.

If you want to change the paths to the input and output data or any running configs, feel free to change the args in 'detection/option.py'.

### Train
To train a detection model, run:
```
python detection/train.py --train_anno [your_train_anno_file.txt] --test_anno [your_test_anno_file.txt]
--lf_dir [path to long frame length features] --mf_dir [path to medium frame length features] --sf_dir
[path to short frame length features] --save_models [path for saving checkpoints] --output_dir [path for saving checkpoint AUC]
```

Other training parameters can be found in 'detection/option.py'.

## Anomaly Recognition
### Inference
To test a recognition checkpoint on your test videos, run:
```
python recognition/test.py --test_anno [your_anno_file.txt] --recognition_model [checkpoint path]
```

The default settings are the same as for detection, and the modifiable parameters are in 'recognition/option.py'. The recognition results for all inputs are saved to 'results/rec_results/output_pred.txt'.

### Train
To train a recognition model, run:
```
python recognition/train.py --train_anno [your_train_anno_file.txt] --test_anno [your_test_anno_file.txt]
--lf_dir [path to long frame length features] --mf_dir [path to medium frame length features] --sf_dir
[path to short frame length features] --save_models [path for saving checkpoints] --output_dir [path for saving checkpoint AUC]
```

Note: following the experimental setup of 4-fold cross-validation from [Sultani et al.](https://arxiv.org/abs/1801.04264), there are four pairs of training and testing annotation files corresponding to the four splits of each dataset, provided in the "annotation" folder accessible through the VADD link above. Make sure the training and testing files belong to the same split; otherwise, data leakage will occur. Other training parameters can be found in 'recognition/option.py'.

## Acknowledgement

Partial code is used from
[Video-Swin-Transformer](https://github.com/SwinTransformer/Video-Swin-Transformer)
and [RTFM](https://github.com/tianyu0207/RTFM).
<!--## Citation

If you find this repo useful for your research, please consider citing our paper:-->
detection/dataset.py
ADDED
@@ -0,0 +1,117 @@
import torch.utils.data as data
import os
import torch
torch.set_default_tensor_type('torch.FloatTensor')


def read_features(feature_path):
    """
    Read features from a text file and convert them into a torch tensor.

    Args:
        feature_path (str): Path to the text file containing features.

    Returns:
        features (torch.Tensor): A tensor containing the features. Shape is T x C.
    """
    with open(feature_path, 'r') as file:
        lines = file.readlines()
    features = []
    for line in lines:
        feature = [float(value) for value in line.strip().split()]
        features.append(feature)
    features = torch.tensor(features).float()  # T x C
    return features


class Dataset(data.Dataset):
    def __init__(self, args, is_normal=True, transform=None, test_mode=False):
        """
        Custom dataset class for loading features and labels.

        Args:
            args: Argument object containing paths and options.
            is_normal (bool): Whether the dataset represents normal samples.
            transform: Data transformation to be applied.
            test_mode (bool): Whether the dataset is for testing.

        Attributes:
            is_normal (bool): Whether the dataset represents normal samples.
            transform: Data transformation to be applied.
            test_mode (bool): Whether the dataset is for testing.
            list (list): List of feature paths and label information.
        """
        self.is_normal = is_normal
        self.transform = transform
        self.test_mode = test_mode

        if self.test_mode:
            annotation_path = args.test_anno
        else:
            annotation_path = args.train_anno

        self.list = self._get_features_list(args.lf_dir, args.mf_dir, args.sf_dir, annotation_path)

    def __getitem__(self, index):
        label = self.get_label()
        if self.test_mode:
            lf_path, mf_path, sf_path, num_frames, start_end_couples, file = self.list[index]
            l_features = read_features(lf_path)
            m_features = read_features(mf_path)
            s_features = read_features(sf_path)
            return l_features, m_features, s_features, label, start_end_couples, num_frames, file
        else:
            lf_path, mf_path, sf_path = self.list[index]
            l_features = read_features(lf_path)
            m_features = read_features(mf_path)
            s_features = read_features(sf_path)
            return l_features, m_features, s_features, label

    def get_label(self):
        if self.is_normal:
            label = torch.tensor(0.0)
        else:
            label = torch.tensor(1.0)

        return label

    def __len__(self):
        return len(self.list)

    def _get_features_list(self, lf_dir, mf_dir, sf_dir, annotation_path):
        """
        Generate a list of feature paths and label information from annotations.

        Args:
            lf_dir (str): Path to long-frame-length features directory.
            mf_dir (str): Path to medium-frame-length features directory.
            sf_dir (str): Path to short-frame-length features directory.
            annotation_path (str): Path to annotation file.

        Returns:
            list: A list of tuples containing feature paths and label information.
        """
        assert os.path.exists(lf_dir)
        assert os.path.exists(mf_dir)
        assert os.path.exists(sf_dir)
        features_list = []
        with open(annotation_path) as f:
            lines = f.read().splitlines(keepends=False)
            for line in lines:
                items = line.split()
                # file = items[0].split(".")[0]  # variant used for XD-Violence
                file, ext = os.path.splitext(items[0])
                file = file.replace("/", os.sep)
                lf_path = os.path.join(lf_dir, file + '.txt')
                mf_path = os.path.join(mf_dir, file + '.txt')
                sf_path = os.path.join(sf_dir, file + '.txt')
                cls_name = items[1]
                if self.test_mode:
                    start_end_couples = [int(x) for x in items[3:]]
                    num_frames = int(items[2])
                    features_list.append((lf_path, mf_path, sf_path, num_frames, start_end_couples, file))
                elif ("Normal" == cls_name) == self.is_normal:
                    # For training, keep only normal videos (or only abnormal ones)
                    features_list.append((lf_path, mf_path, sf_path))

        return features_list
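A minimal sketch (illustrative, not part of the repo) of the feature-file format that `read_features` expects: one whitespace-separated C-dimensional vector per line, T lines per video.
```python
import os
import tempfile

from dataset import read_features  # assumes this is run from within detection/

tmp = os.path.join(tempfile.mkdtemp(), 'demo.txt')
with open(tmp, 'w') as f:
    for _ in range(32):                           # T = 32 snippets
        f.write(' '.join(['0.0'] * 1024) + '\n')  # C = 1024 VST dims

print(read_features(tmp).shape)  # torch.Size([32, 1024])
```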
detection/model.py
ADDED
@@ -0,0 +1,296 @@
""" Reference source: https://github.com/tianyu0207/RTFM """

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as torch_init
torch.set_default_tensor_type('torch.FloatTensor')


def weight_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1 or classname.find('Linear') != -1:
        torch_init.xavier_uniform_(m.weight)
        if m.bias is not None:
            m.bias.data.fill_(0)


class CVA(nn.Module):
    def __init__(self, input_dim=1024):
        """
        Cross-View Attention (CVA) module.

        Args:
            input_dim (int): Dimension of the input features.
        """
        super(CVA, self).__init__()
        drop_out_rate = 0.1
        num_heads = 4
        self.cross_attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, dropout=drop_out_rate,
                                                     device='cuda')

    def forward(self, feature1, feature2):
        """
        Args:
            feature1 (torch.Tensor): one path's features. Shape: B x T x C.
            feature2 (torch.Tensor): another path's features. Shape: B x T x C.

        Returns:
            out1 (torch.Tensor): Processed features after cross-attention. Shape: T x B x C.
        """

        feature1 = F.layer_norm(feature1, [feature1.size(-1)])
        feature2 = F.layer_norm(feature2, [feature2.size(-1)])
        feature1 = feature1.permute(1, 0, 2)  # T B C
        feature2 = feature2.permute(1, 0, 2)

        out1, _ = self.cross_attention(query=feature1, key=feature2, value=feature2)  # T B C (for test: 32 1 1024)
        out1 = out1 + feature1  # residual connection

        return out1  # T B C


class Aggregate(nn.Module):
    def __init__(self, input_dim):
        """
        An aggregate network including local temporal correlation learning, global temporal correlation learning,
        and feature fusion in MTFF.

        Args:
            input_dim (int): Input feature dimension.
        """
        super(Aggregate, self).__init__()
        bn = nn.BatchNorm1d
        num_heads = 4
        self.input_dim = input_dim
        self.conv_1 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=1, padding=1),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_2 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=2, padding=2),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_3 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=4, padding=4),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_4 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim*3, out_channels=512, kernel_size=1,
                      stride=1, padding=0, bias=False),
            nn.LeakyReLU(negative_slope=5e-2),
        )
        self.conv_5 = nn.Sequential(
            nn.Conv1d(in_channels=2048, out_channels=input_dim, kernel_size=3,
                      stride=1, padding=1, bias=False),
            nn.LeakyReLU(negative_slope=5e-2),
            nn.BatchNorm1d(input_dim),
        )
        self.self_attention = nn.MultiheadAttention(embed_dim=512, num_heads=num_heads,
                                                    dropout=0.1, device='cuda')

    def forward(self, input1, input2, input3):
        """
        Args:
            input1 (torch.Tensor): long-frame-length features. Shape: T x B x C.
            input2 (torch.Tensor): medium-frame-length features. Shape: T x B x C.
            input3 (torch.Tensor): short-frame-length features. Shape: T x B x C.

        Returns:
            torch.Tensor: Processed and fused output features. Shape: B x T x C.
        """
        x1 = input1.permute(1, 2, 0)  # B C T
        x2 = input2.permute(1, 2, 0)
        x3 = input3.permute(1, 2, 0)
        tensor_list = [x1, x2, x3]

        residual = torch.mean(torch.stack(tensor_list), dim=0)

        # Local temporal correlations via dilated 1D convolutions
        out1 = self.conv_1(x1)  # B C/2 T
        out2 = self.conv_2(x2)
        out3 = self.conv_3(x3)
        x = torch.cat([out1, out2, out3], dim=1)  # B 3C/2 T

        # Global temporal correlations via self-attention
        feature = torch.cat((x1, x2, x3), dim=1)
        out = self.conv_4(feature)
        out = out.permute(2, 0, 1)  # T B C/2
        out = F.layer_norm(out, normalized_shape=[out.size(-1)])
        out, _ = self.self_attention(out, out, out)  # T B C/2
        out = out.permute(1, 2, 0)  # B C/2 T
        out = torch.cat((x, out), dim=1)  # B 2C T
        out = self.conv_5(out)  # fuse all the features together
        out = out + residual
        out = out.permute(0, 2, 1)

        return out


class Encoder(nn.Module):
    def __init__(self, input_dim=1024, seg_num=32):
        """
        Multi-Temporal Feature Fusion (MTFF) module.

        Args:
            input_dim (int): Dimension of the input features.
            seg_num (int): Number of snippets in a video.
        """
        super(Encoder, self).__init__()
        self.drop_out_rate = 0.1
        self.input_dim = input_dim
        self.min_temporal_dim = seg_num
        self.CVA1 = CVA(input_dim=input_dim)
        self.CVA2 = CVA(input_dim=input_dim)
        self.CVA3 = CVA(input_dim=input_dim)

        self.aggregate = Aggregate(input_dim=input_dim)

    def forward(self, feature1, feature2, feature3):
        """
        Args:
            feature1 (torch.Tensor): long-frame-length features. Shape: B x T x C
                                     (batch size x number of snippets x input dimension).
            feature2 (torch.Tensor): medium-frame-length features. Shape: B x T x C.
            feature3 (torch.Tensor): short-frame-length features. Shape: B x T x C.

        Returns:
            torch.Tensor: Fused and processed output features. Shape: B x T x C.
        """

        # Pairwise cross-view attention across the three timescales
        att1 = self.CVA1(feature1, feature2)
        att2 = self.CVA2(feature2, feature3)
        att3 = self.CVA3(feature3, feature1)

        out1 = self.aggregate(att1, att2, att3)  # B T C

        return out1


class Model(nn.Module):
    def __init__(self, feature_dim, batch_size, seg_num=32):
        """
        Multi-Timescale Feature Learning (MTFL) model.

        Args:
            feature_dim (int): Dimension of the input features.
            batch_size (int): Batch size.
            seg_num (int): Number of snippets in a video.
        """
        super(Model, self).__init__()
        self.batch_size = batch_size
        self.num_segments = seg_num
        self.k_abn = self.num_segments // 10  # select 3 snippets
        self.k_nor = self.num_segments // 10

        self.Encoder = Encoder(input_dim=feature_dim, seg_num=seg_num)

        # Fully connected layers for scoring
        self.fc1 = nn.Linear(feature_dim, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 1)

        self.drop_out = nn.Dropout(0.2)
        self.relu = nn.LeakyReLU(negative_slope=5e-2)
        self.sigmoid = nn.Sigmoid()
        self.apply(weight_init)

    def forward(self, input1, input2, input3):
        """
        Args:
            input1 (torch.Tensor): long-frame-length features. Shape: B x T x feature_dim.
            input2 (torch.Tensor): medium-frame-length features. Shape: B x T x feature_dim.
            input3 (torch.Tensor): short-frame-length features. Shape: B x T x feature_dim.

        Returns:
            score_abnormal (torch.Tensor): The mean scores of the top-3 abnormal instances.
            score_normal (torch.Tensor): The mean scores of the top-3 normal instances.
            feat_select_abn (torch.Tensor): Selected abnormal features.
            feat_select_normal (torch.Tensor): Selected normal features.
            scores (torch.Tensor): All computed scores. Shape: B x T x 1.
        """
        k_abn = self.k_abn
        k_nor = self.k_nor
        ncrops = 1  # reserved parameter for spatial cropping, unused here and defaulting to 1

        # Multi-Temporal Feature Fusion
        out = self.Encoder(input1, input2, input3)
        bs, t, f = out.size()
        features = self.drop_out(out)  # B T D

        # Scoring layers
        scores = self.relu(self.fc1(features))
        scores = self.drop_out(scores)
        scores = self.relu(self.fc2(scores))
        scores = self.drop_out(scores)
        scores = self.sigmoid(self.fc3(scores))
        scores = scores.view(bs, ncrops, -1).mean(1)
        scores = scores.unsqueeze(dim=2)

        # Split normal and abnormal instances
        normal_features = features[0:self.batch_size]
        normal_scores = scores[0:self.batch_size]
        abnormal_features = features[self.batch_size:]
        abnormal_scores = scores[self.batch_size:]

        # Compute feature magnitudes
        feat_magnitudes = torch.norm(features, p=2, dim=2)
        feat_magnitudes = feat_magnitudes.view(bs, ncrops, -1).mean(1)
        nfea_magnitudes = feat_magnitudes[0:self.batch_size]  # normal feature magnitudes
        afea_magnitudes = feat_magnitudes[self.batch_size:]  # abnormal feature magnitudes
        n_size = nfea_magnitudes.shape[0]

        # Inference mode for batch size 1
        if nfea_magnitudes.shape[0] == 1:
            afea_magnitudes = nfea_magnitudes
            abnormal_scores = normal_scores
            abnormal_features = normal_features

        select_idx = torch.ones_like(nfea_magnitudes)
        select_idx = self.drop_out(select_idx)

        ####### process abnormal videos -> select top-3 feature magnitudes #######
        afea_magnitudes_drop = afea_magnitudes * select_idx
        idx_abn = torch.topk(afea_magnitudes_drop, k_abn, dim=1)[1]
        idx_abn_feat = idx_abn.unsqueeze(2).expand([-1, -1, abnormal_features.shape[2]])

        abnormal_features = abnormal_features.view(n_size, ncrops, t, f)  # B x N x T x F
        abnormal_features = abnormal_features.permute(1, 0, 2, 3)  # N x B x T x F

        total_select_abn_feature = torch.zeros(0, device=input1.device)
        for abnormal_feature in abnormal_features:
            feat_select_abn = torch.gather(abnormal_feature, 1, idx_abn_feat)  # top-3 feature magnitudes in abnormal bag
            total_select_abn_feature = torch.cat((total_select_abn_feature, feat_select_abn))

        idx_abn_score = idx_abn.unsqueeze(2).expand([-1, -1, abnormal_scores.shape[2]])
        # top-3 scores in the abnormal bag based on the top-3 magnitudes
        score_abnormal = torch.mean(torch.gather(abnormal_scores, 1, idx_abn_score), dim=1)


        ####### process normal videos -> select top-3 feature magnitudes #######

        select_idx_normal = torch.ones_like(nfea_magnitudes)
        select_idx_normal = self.drop_out(select_idx_normal)
        nfea_magnitudes_drop = nfea_magnitudes * select_idx_normal
        idx_normal = torch.topk(nfea_magnitudes_drop, k_nor, dim=1)[1]
        idx_normal_feat = idx_normal.unsqueeze(2).expand([-1, -1, normal_features.shape[2]])

        normal_features = normal_features.view(n_size, ncrops, t, f)
        normal_features = normal_features.permute(1, 0, 2, 3)  # 1 B T D

        total_select_nor_feature = torch.zeros(0, device=input1.device)
        for nor_fea in normal_features:
            feat_select_normal = torch.gather(nor_fea, 1, idx_normal_feat)  # top-3 feature magnitudes in normal bag (hard negatives)
            total_select_nor_feature = torch.cat((total_select_nor_feature, feat_select_normal))

        idx_normal_score = idx_normal.unsqueeze(2).expand([-1, -1, normal_scores.shape[2]])
        score_normal = torch.mean(torch.gather(normal_scores, 1, idx_normal_score), dim=1)  # top-3 scores in normal bag

        feat_select_abn = total_select_abn_feature
        feat_select_normal = total_select_nor_feature

        return score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores
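A minimal shape-check sketch (illustrative only; it assumes a CUDA device, since CVA and Aggregate hardcode device='cuda', and that it is run from within detection/):
```python
import torch
from model import Model

model = Model(feature_dim=1024, batch_size=1, seg_num=32).cuda()
b, t, c = 2, 32, 1024  # e.g. one normal + one abnormal video, 32 snippets, VST features
lf, mf, sf = (torch.randn(b, t, c, device='cuda') for _ in range(3))
score_abn, score_nor, feat_abn, feat_nor, scores = model(lf, mf, sf)
print(scores.shape)  # torch.Size([2, 32, 1]): one score per snippet per video
```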
detection/option.py
ADDED
@@ -0,0 +1,56 @@
import argparse

############ Test args ########################
test_parser = argparse.ArgumentParser(description='MTFL_detection_test')
# input paths
test_parser.add_argument('--lf_dir', type=str, default='features/L64', help='long frame length feature path')
test_parser.add_argument('--mf_dir', type=str, default='features/L32', help='medium frame length feature path')
test_parser.add_argument('--sf_dir', type=str, default='features/L8', help='short frame length feature path')
test_parser.add_argument('--test_anno', default='annotation/Anomaly_videos.txt', help='test annotation file')
test_parser.add_argument('--detection_model', default='/media/DataDrive/yiling/Test/models/MTFL/MTFL-vst-VAD.pkl',
                         help='model path')
# output path
test_parser.add_argument('--output_dir', default='results',
                         help='the path to store the generated scores and AUC results')
# feature size, depending on which feature extractor is used
test_parser.add_argument('--feature_size', type=int, default=1024, help='feature dim (default: VST feature)')
test_parser.add_argument('--seg_num', type=int, default=32, help='the number of snippets')
# running cfg
test_parser.add_argument('--gpu', default="0", type=str, choices=["0", "1"], help='gpu id')
test_parser.add_argument('--workers', default=8, help='number of workers in dataloader')


############ Train args ########################
train_parser = argparse.ArgumentParser(description='MTFL_detection_train')
# input paths
train_parser.add_argument('--lf_dir', type=str, default='/media/DataDrive/yiling/features/VST_VAD_MT/L64R1',
                          help='long feature path')
train_parser.add_argument('--mf_dir', type=str, default='/media/DataDrive/yiling/features/VST_VAD_MT/L32R1',
                          help='medium feature path')
train_parser.add_argument('--sf_dir', type=str, default='/media/DataDrive/yiling/features/VST_VAD_MT/L8R1',
                          help='short feature path')
train_parser.add_argument('--train_anno', default='/media/DataDrive/yiling/annotation/VAD_train_annotation.txt',
                          help='the annotation file for training')
train_parser.add_argument('--test_anno', default='/media/DataDrive/yiling/annotation/UCF_test_annotation_with_frames.txt',
                          help='the annotation file for test')
# output path and saving info
train_parser.add_argument('--model-name', default='MTFL', help='name to save model')
train_parser.add_argument('--save_models', default='/media/DataDrive/yiling/models/demo/detection',
                          help='the path for saving models')
train_parser.add_argument('--output_dir', default='/media/DataDrive/yiling/results/demo/detection',
                          help='the path to store AUC results')
# training cfg and parameters
train_parser.add_argument('--gpu', default="0", type=str, choices=["0", "1"], help='gpu id')
train_parser.add_argument('--feature_size', type=int, default=1024, help='feature dim (default: VST feature)')
train_parser.add_argument('--seg_num', type=int, default=32, help='the number of snippets')
train_parser.add_argument('--lr', type=float, default=1e-4, help='learning rate')
train_parser.add_argument('--batch-size', type=int, default=64, help='batch size')
train_parser.add_argument('--workers', type=int, default=8, help='number of workers in dataloader')
train_parser.add_argument('--max-epoch', type=int, default=2000, help='maximum number of training iterations (default: 2000)')
train_parser.add_argument('--metric', type=str, choices=["AP", "AUC"], default="AUC", help='the used metric')
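For reference, a small sketch of overriding the test defaults programmatically rather than on the command line (the checkpoint path is hypothetical):
```python
from option import test_parser  # assumes running from within detection/

args = test_parser.parse_args([
    '--test_anno', 'annotation/Anomaly_videos.txt',
    '--detection_model', 'checkpoints/my_mtfl_checkpoint.pkl',  # hypothetical path
])
print(args.feature_size, args.seg_num)  # 1024 32
```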
detection/test.py
ADDED
@@ -0,0 +1,168 @@
import torch
from sklearn.metrics import auc, roc_curve, average_precision_score
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import option

from torch.utils.data import DataLoader
from dataset import Dataset
from model import Model
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)


def get_gt(start_end_couples, num_frames, device):
    """
    Generate a ground-truth tensor representing events in a time sequence based on given start and end pairs.

    Args:
        start_end_couples (list): A list containing pairs of start and end frames.
                                  If None or all '-1', no events are present.
        num_frames (int): Total number of frames in the time sequence.
        device: Device where the tensor should be placed.

    Returns:
        gt: A tensor of shape (num_frames,) representing whether each frame belongs to an anomalous event.
            '1' means anomalous, and '0' means normal.
    """
    gt = torch.zeros(num_frames).to(device)
    if start_end_couples is not None and num_frames is not None:
        for i in range(0, len(start_end_couples) - 1, 2):
            if start_end_couples[i].item() != -1 and start_end_couples[i + 1].item() != -1:
                couple = start_end_couples[i:i + 2]
                gt[couple[0].item():couple[1].item()] = 1.0

    return gt


def save_scores(pred, start_end_couples, save_path):
    """
    Save a plot containing the anomaly scores and the annotated anomalous regions.

    Args:
        pred (list): List of anomaly scores.
        start_end_couples (Tensor): Pairs of start and end frames indicating anomalous regions.
        save_path (str): Path to save the generated plot; its basename is used as the legend label.
    """

    plt.figure()
    file_name = os.path.basename(save_path).split(".")[0]
    plt.plot(pred, label=file_name, color='blue')

    # Plot anomalous regions
    for i in range(0, len(start_end_couples) - 1, 2):
        if start_end_couples[i].item() != -1 and start_end_couples[i + 1].item() != -1:
            plt.axvspan(start_end_couples[i].item(), start_end_couples[i + 1].item(), color='red', alpha=0.3)

    plt.ylim(0, 1)
    plt.xlabel('Frames', fontdict={'size': 16})
    plt.ylabel('Anomaly Score', fontdict={'size': 16})
    plt.yticks(size=14)
    plt.xticks(size=14)

    plt.legend(prop={'size': 16})
    # plt.show()
    plt.savefig(save_path)
    plt.close()


def test(dataloader, model, device, gen_scores=False, save_dir=None):
    """
    Test the model's performance on the given dataloader.

    Args:
        dataloader (DataLoader): DataLoader for test data.
        model: The model to be tested.
        device: Device to perform testing on.
        gen_scores (bool): Whether to generate and save anomaly score plots.
        save_dir (str): Directory to save generated plots.

    Returns:
        single_video_AUC (dict): A dictionary containing AUC values for each video.
        overall_auc (float): Overall AUC value.
        ap (float): Average precision.
    """
    single_video_AUC = {"video": [], "AUC": []}

    with torch.no_grad():
        model.to(device).eval()
        pred = torch.zeros(0, device=device)
        gt = torch.zeros(0, device=device)

        for input1, input2, input3, label, start_end_couples, num_frames, file in tqdm(dataloader):
            input1 = input1.to(device)
            input2 = input2.to(device)
            input3 = input3.to(device)
            score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores = model(input1, input2, input3)
            sig = torch.squeeze(scores, dim=(0, 2))  # T snippet scores
            # Upsample snippet scores to frame scores: repeat each score for
            # num_frames // T frames, then pad the tail with the last score.
            segment = num_frames.item() // sig.size()[0]
            sig = sig.repeat_interleave(segment)  # Frames
            if len(sig) < num_frames.item():
                last_ele = sig[-1]
                sig = torch.cat((sig, last_ele.repeat(num_frames.item() - len(sig))))  # 1 x Frames

            pred = torch.cat((pred, sig))
            cur_gt = get_gt(start_end_couples, num_frames, device)
            gt = torch.cat((gt, cur_gt))

            sig = sig.cpu().detach().numpy()
            cur_gt = cur_gt.cpu().detach().numpy()
            fpr, tpr, threshold = roc_curve(cur_gt, sig)
            video_auc = auc(fpr, tpr)
            single_video_AUC["video"].append(file)
            single_video_AUC["AUC"].append(video_auc)

            if gen_scores:
                save_path = os.path.join(save_dir, file[0] + '.png')
                os.makedirs(os.path.dirname(save_path), exist_ok=True)
                save_scores(sig, start_end_couples, save_path)

        pred = pred.cpu().detach().numpy()
        gt = gt.cpu().detach().numpy()
        ap = average_precision_score(gt, pred)
        fpr, tpr, threshold = roc_curve(gt, pred)
        overall_auc = auc(fpr, tpr)
        print('\n' + 'Overall auc : ' + str(overall_auc) + ', Average Precision : ' + str(ap) + '\n')

    return single_video_AUC, overall_auc, ap


def main():
    args = option.test_parser.parse_args()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    AUC_path = os.path.join(args.output_dir, 'AUC')
    scores_path = os.path.join(args.output_dir, 'scores')

    test_loader = DataLoader(Dataset(args, test_mode=True),
                             batch_size=1, shuffle=False,
                             num_workers=args.workers, pin_memory=True)
    model = Model(feature_dim=args.feature_size, batch_size=1, seg_num=args.seg_num)
    model.load_state_dict(torch.load(args.detection_model))

    single_video_AUC, overall_auc, ap = test(dataloader=test_loader,
                                             model=model,
                                             device=device,
                                             gen_scores=True,
                                             save_dir=scores_path)

    # save AUC results
    video_sub_dir = os.path.basename(os.path.dirname(single_video_AUC["video"][0][0]))
    file_path = os.path.join(AUC_path, video_sub_dir, 'results.txt')
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "w") as f:
        for video, single_auc in zip(single_video_AUC["video"], single_video_AUC["AUC"]):
            f.write(f"Video: {video}, AUC: {single_auc}\n")
        f.write("Overall AUC: {}, Average Precision: {}\n".format(overall_auc, ap))


if __name__ == '__main__':
    main()
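A standalone numeric sketch (illustrative only) of the snippet-to-frame upsampling performed inside `test()`: each of the T snippet scores is repeated num_frames // T times, and the tail is padded with the last score:
```python
import torch

num_frames = 250
sig = torch.rand(32)                    # one score per snippet (T = 32)
segment = num_frames // sig.size(0)     # 250 // 32 = 7 frames per snippet
sig = sig.repeat_interleave(segment)    # length 224
if len(sig) < num_frames:
    sig = torch.cat((sig, sig[-1].repeat(num_frames - len(sig))))  # pad to 250
assert len(sig) == num_frames
```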
detection/train.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
import torch.optim as optim
import os
from torch.nn import MSELoss
from torch.utils.data import DataLoader
from model import Model
from dataset import Dataset
from test import test
import option
from tqdm import tqdm
torch.set_default_tensor_type('torch.FloatTensor')


def sparsity(arr, lamda2):
    loss = torch.mean(torch.norm(arr, dim=0))
    return lamda2 * loss


def smooth(arr, lamda1):
    arr2 = torch.zeros_like(arr)
    arr2[:-1] = arr[1:]
    arr2[-1] = arr[-1]

    loss = torch.sum((arr2 - arr) ** 2)

    return lamda1 * loss


class SigmoidMAELoss(torch.nn.Module):
    def __init__(self):
        super(SigmoidMAELoss, self).__init__()
        from torch.nn import Sigmoid
        self.__sigmoid__ = Sigmoid()
        self.__l1_loss__ = MSELoss()  # note: despite the class name, plain MSE is applied and the sigmoid is unused

    def forward(self, pred, target):
        return self.__l1_loss__(pred, target)


class RTFM_loss(torch.nn.Module):
    def __init__(self, alpha, margin):
        super(RTFM_loss, self).__init__()
        self.alpha = alpha
        self.margin = margin
        self.sigmoid = torch.nn.Sigmoid()
        self.mae_criterion = SigmoidMAELoss()
        self.criterion = torch.nn.BCELoss()

    def forward(self, score_normal, score_abnormal, nlabel, alabel, feat_n, feat_a):
        label = torch.cat((nlabel, alabel), 0)
        score = torch.cat((score_normal, score_abnormal), 0)
        score = score.squeeze()

        label = label.cuda()

        loss_cls = self.criterion(score, label)  # BCE loss in the score space

        loss_abn = torch.abs(self.margin - torch.norm(torch.mean(feat_a, dim=1), p=2, dim=1))
        loss_nor = torch.norm(torch.mean(feat_n, dim=1), p=2, dim=1)
        loss_rtfm = torch.mean((loss_abn + loss_nor) ** 2)

        loss_total = loss_cls + self.alpha * loss_rtfm

        return loss_total


def train(nloader, aloader, model, batch_size, seg_num, optimizer, device):
    with torch.set_grad_enabled(True):
        model.train()

        ninput1, ninput2, ninput3, nlabel = next(nloader)
        ainput1, ainput2, ainput3, alabel = next(aloader)

        input1 = torch.cat((ninput1, ainput1), 0).to(device)
        input2 = torch.cat((ninput2, ainput2), 0).to(device)
        input3 = torch.cat((ninput3, ainput3), 0).to(device)
        score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores = model(input1, input2, input3)

        scores = scores.view(batch_size * seg_num * 2, -1)  # (B x 32 x 2, 1)

        scores = scores.squeeze()
        abn_scores = scores[batch_size * seg_num:]

        nlabel = nlabel[0:batch_size]
        alabel = alabel[0:batch_size]

        loss_criterion = RTFM_loss(0.0001, 100)
        loss_sparse = sparsity(abn_scores, 8e-3)
        loss_smooth = smooth(abn_scores, 8e-4)

        loss_RTFM = loss_criterion(score_normal, score_abnormal, nlabel, alabel, feat_select_normal, feat_select_abn)
        cost = loss_RTFM + loss_smooth + loss_sparse

        optimizer.zero_grad()
        cost.backward()
        optimizer.step()


def main():
    args = option.train_parser.parse_args()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_nloader = DataLoader(Dataset(args, test_mode=False, is_normal=True),
                               batch_size=args.batch_size, shuffle=True,
                               num_workers=args.workers, pin_memory=True, drop_last=True)
    train_aloader = DataLoader(Dataset(args, test_mode=False, is_normal=False),
                               batch_size=args.batch_size, shuffle=True,
                               num_workers=args.workers, pin_memory=True, drop_last=True)
    test_loader = DataLoader(Dataset(args, test_mode=True),
                             batch_size=1, shuffle=False,
                             num_workers=args.workers, pin_memory=True)

    if not os.path.exists(args.save_models):
        os.makedirs(args.save_models)

    feature_size = args.feature_size
    model = Model(feature_size, args.batch_size, args.seg_num)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.005)
    test_info = {"epoch": [], "AUC": [], "AP": []}
    best_result = -1
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)
    _, overall_auc, ap = test(dataloader=test_loader,
                              model=model,
                              device=device,
                              gen_scores=False,
                              save_dir=None)

    for step in tqdm(range(1, args.max_epoch + 1), total=args.max_epoch, dynamic_ncols=True):
        if (step - 1) % len(train_nloader) == 0:
            loadern_iter = iter(train_nloader)

        if (step - 1) % len(train_aloader) == 0:
            loadera_iter = iter(train_aloader)

        train(nloader=loadern_iter,
              aloader=loadera_iter,
              model=model,
              batch_size=args.batch_size,
              seg_num=args.seg_num,
              optimizer=optimizer,
              device=device)

        if step % 5 == 0 and step > 200:
            _, overall_auc, ap = test(dataloader=test_loader,
                                      model=model,
                                      device=device,
                                      gen_scores=False,
                                      save_dir=None)

            test_info["epoch"].append(step)
            test_info["AUC"].append(overall_auc)
            test_info["AP"].append(ap)

            # if test_info["AUC"][-1] > best_result:
            #     best_result = test_info["AUC"][-1]
            #     torch.save(model.state_dict(), os.path.join(args.save_models, args.model_name + '-{}.pkl'.format(step)))
            #     file_path = os.path.join(output_dir, '{}-step-AUC.txt'.format(step))
            #     with open(file_path, "w") as fo:
            #         for key in test_info:
            #             fo.write("{}: {}\n".format(key, test_info[key][-1]))

            metric = args.metric
            if test_info[metric][-1] > best_result:
                best_result = test_info[metric][-1]
                torch.save(model.state_dict(), os.path.join(args.save_models, args.model_name + '-{}.pkl'.format(step)))
                file_path = os.path.join(output_dir, '{}-step-result.txt'.format(step))
                with open(file_path, "w") as fo:
                    for key in test_info:
                        fo.write("{}: {}\n".format(key, test_info[key][-1]))


if __name__ == '__main__':
    main()
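
As a quick sanity check on the two regularizers above, the following standalone sketch (not part of the upload) applies the same smoothness and sparsity terms to a toy score vector with one abrupt spike:

import torch

scores = torch.tensor([0.05, 0.04, 0.90, 0.06, 0.05])  # toy snippet scores with one spike

# temporal smoothness: penalizes frame-to-frame jumps in the score curve
shifted = torch.zeros_like(scores)
shifted[:-1] = scores[1:]
shifted[-1] = scores[-1]
print(8e-4 * torch.sum((shifted - scores) ** 2))   # ~0.0012: dominated by the spike

# sparsity: penalizes the overall magnitude of the abnormal scores
print(8e-3 * torch.mean(torch.norm(scores, dim=0)))  # ~0.0072
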
figures/Intro.png
ADDED
Git LFS Details
recognition/dataset.py
ADDED
@@ -0,0 +1,140 @@
import torch.utils.data as data
import os
import torch
torch.set_default_tensor_type('torch.FloatTensor')

class_to_int = {
    'Normal': 0,
    'Abuse': 1,
    'Arrest': 2,
    'Arson': 3,
    'Assault': 4,
    'Burglary': 5,
    'Explosion': 6,
    'Fighting': 7,
    'Robbery': 8,
    'Shooting': 9,
    'Shoplifting': 10,
    'Stealing': 11,
    'Vandalism': 12,
    'RoadAccidents_EMVvsEMV': 13,
    'RoadAccidents_EMVvsVRU': 14,
    'RoadAccidents_VRUvsVRU': 15,
    'DangerousThrowing': 16,
    'Littering': 17
}


def read_features(feature_path):
    """
    Read features from a text file and convert them into a torch tensor.

    Args:
        feature_path (str): Path to the text file containing features.

    Returns:
        features (torch.Tensor): A tensor containing the features. Shape is T x C.
    """
    with open(feature_path, 'r') as file:
        lines = file.readlines()
    features = []
    for line in lines:
        feature = [float(value) for value in line.strip().split()]
        features.append(feature)
    features = torch.tensor(features).float()  # T x C
    return features


class Dataset(data.Dataset):
    def __init__(self, args, is_normal=True, transform=None, test_mode=False):
        """
        Custom dataset class for loading features and labels.

        Args:
            args: Argument object containing paths and options.
            is_normal (bool): Whether the dataset represents normal samples.
            transform: Data transformation to be applied.
            test_mode (bool): Whether the dataset is for testing.

        Attributes:
            is_normal (bool): Whether the dataset represents normal samples.
            transform: Data transformation to be applied.
            test_mode (bool): Whether the dataset is for testing.
            list (list): List of feature paths and label information.
        """
        self.is_normal = is_normal
        self.transform = transform
        self.test_mode = test_mode

        if self.test_mode:
            annotation_path = args.test_anno
        else:
            annotation_path = args.train_anno

        self.list = self._get_features_list(args.lf_dir, args.mf_dir, args.sf_dir, annotation_path)

    def __getitem__(self, index):
        if self.test_mode:
            lf_path, mf_path, sf_path, label, file = self.list[index]
            l_features = read_features(lf_path)
            m_features = read_features(mf_path)
            s_features = read_features(sf_path)
            label = torch.tensor(label)
            return s_features, m_features, l_features, label, file
        else:
            lf_path, mf_path, sf_path, label = self.list[index]
            l_features = read_features(lf_path)
            m_features = read_features(mf_path)
            s_features = read_features(sf_path)
            label = torch.tensor(label)
            return s_features, m_features, l_features, label

    def __len__(self):
        return len(self.list)

    def _get_features_list(self, lf_dir, mf_dir, sf_dir, annotation_path):
        """
        Construct a feature list from the given directories and annotation file.

        Args:
            lf_dir (str): Directory containing long-frame-length feature files.
            mf_dir (str): Directory containing medium-frame-length feature files.
            sf_dir (str): Directory containing short-frame-length feature files.
            annotation_path (str): Path to a text file containing annotation information.

        Returns:
            list: A list of tuples, each containing (lf_path, mf_path, sf_path, cls) or (lf_path, mf_path, sf_path, cls, file).

        Raises:
            AssertionError: If the input directories do not exist.

        Note:
            - If test_mode is True, each tuple contains (lf_path, mf_path, sf_path, cls, file), where file is the file name.
            - If test_mode is False, each tuple contains (lf_path, mf_path, sf_path, cls), and selection is based on whether the sample is normal (is_normal).
        """
        assert os.path.exists(lf_dir)
        assert os.path.exists(mf_dir)
        assert os.path.exists(sf_dir)
        features_list = []
        with open(annotation_path) as f:
            lines = f.read().splitlines(keepends=False)
            for line in lines:
                items = line.split()
                file = items[0].split(".")[0]
                file = file.replace("/", os.sep)
                lf_path = os.path.join(lf_dir, file + '.txt')
                mf_path = os.path.join(mf_dir, file + '.txt')
                sf_path = os.path.join(sf_dir, file + '.txt')
                unsupported_class = 18
                if not items[1].isdigit():
                    cls = class_to_int.get(items[1], unsupported_class)
                else:
                    cls = int(items[1])
                if self.test_mode:
                    features_list.append((lf_path, mf_path, sf_path, cls, file))
                elif (cls == class_to_int['Normal']) == self.is_normal:
                    features_list.append((lf_path, mf_path, sf_path, cls))

        return features_list
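
The feature files consumed by read_features are plain whitespace-separated text, one C-dimensional row per snippet. A minimal round-trip sketch (run from inside recognition/; the file name is hypothetical):

import torch
from dataset import read_features

dummy = torch.rand(32, 1024)                     # 32 snippets x 1024-dim features
with open('demo_video.txt', 'w') as fp:          # hypothetical file name
    for row in dummy:
        fp.write(' '.join(str(v.item()) for v in row) + '\n')

feats = read_features('demo_video.txt')
print(feats.shape)                               # torch.Size([32, 1024]), i.e. T x C
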
recognition/model.py
ADDED
@@ -0,0 +1,295 @@
""" Reference source: https://github.com/tianyu0207/RTFM """

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as torch_init
torch.set_default_tensor_type('torch.FloatTensor')


def weight_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1 or classname.find('Linear') != -1:
        torch_init.xavier_uniform_(m.weight)
        if m.bias is not None:
            m.bias.data.fill_(0)


class CVA(nn.Module):
    def __init__(self, input_dim=1024):
        """
        Cross-View Attention (CVA) module.

        Args:
            input_dim (int): Dimension of the input features.
        """
        super(CVA, self).__init__()
        drop_out_rate = 0.1
        num_heads = 4
        self.cross_attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, dropout=drop_out_rate,
                                                     device='cuda')

    def forward(self, feature1, feature2):
        """
        Args:
            feature1 (torch.Tensor): one path's features. Shape: B x T x C.
            feature2 (torch.Tensor): another path's features. Shape: B x T x C.

        Returns:
            out1 (torch.Tensor): Processed features after cross-attention. Shape: T x B x C.
        """
        feature1 = F.layer_norm(feature1, [feature1.size(-1)])
        feature2 = F.layer_norm(feature2, [feature2.size(-1)])
        feature1 = feature1.permute(1, 0, 2)  # T B C
        feature2 = feature2.permute(1, 0, 2)

        out1, _ = self.cross_attention(query=feature1, key=feature2, value=feature2)  # T B C (for test: 32 1 1024)
        out1 = out1 + feature1  # residual connection

        return out1  # T B C


class Aggregate(nn.Module):
    def __init__(self, input_dim):
        """
        An aggregate network including local temporal correlation learning, global temporal correlation learning,
        and feature fusion in MTFF.

        Args:
            input_dim (int): input feature dim.
        """
        super(Aggregate, self).__init__()
        bn = nn.BatchNorm1d
        num_heads = 4
        self.input_dim = input_dim
        self.conv_1 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=1, padding=1),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_2 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=2, padding=2),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_3 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=3,
                      stride=1, dilation=4, padding=4),
            nn.LeakyReLU(negative_slope=5e-2),
            bn(512)
        )
        self.conv_4 = nn.Sequential(
            nn.Conv1d(in_channels=input_dim * 3, out_channels=512, kernel_size=1,
                      stride=1, padding=0, bias=False),
            nn.LeakyReLU(negative_slope=5e-2),
        )
        self.conv_5 = nn.Sequential(
            nn.Conv1d(in_channels=2048, out_channels=input_dim, kernel_size=3,
                      stride=1, padding=1, bias=False),
            nn.LeakyReLU(negative_slope=5e-2),
            nn.BatchNorm1d(input_dim),
        )
        self.self_attention = nn.MultiheadAttention(embed_dim=512, num_heads=num_heads,
                                                    dropout=0.1, device='cuda')

    def forward(self, input1, input2, input3):
        """
        Args:
            input1 (torch.Tensor): long-frame-length features. Shape: T x B x C.
            input2 (torch.Tensor): medium-frame-length features. Shape: T x B x C.
            input3 (torch.Tensor): short-frame-length features. Shape: T x B x C.

        Returns:
            torch.Tensor: Processed and fused output features. Shape: B x T x C.
        """
        x1 = input1.permute(1, 2, 0)  # B C T
        x2 = input2.permute(1, 2, 0)
        x3 = input3.permute(1, 2, 0)
        tensor_list = [x1, x2, x3]

        residual = torch.mean(torch.stack(tensor_list), dim=0)

        out1 = self.conv_1(x1)  # B C/2 T
        out2 = self.conv_2(x2)
        out3 = self.conv_3(x3)
        x = torch.cat([out1, out2, out3], dim=1)  # B 3C/2 T

        feature = torch.cat((x1, x2, x3), dim=1)
        out = self.conv_4(feature)
        out = out.permute(2, 0, 1)  # T B C/2
        out = F.layer_norm(out, normalized_shape=[out.size(-1)])
        out, _ = self.self_attention(out, out, out)  # T B C/2
        out = out.permute(1, 2, 0)  # B C/2 T
        out = torch.cat((x, out), dim=1)  # B 2C T
        out = self.conv_5(out)  # fuse all the features together
        out = out + residual
        out = out.permute(0, 2, 1)

        return out


class Encoder(nn.Module):
    def __init__(self, input_dim=1024, seg_num=32):
        """
        Multi-Temporal Feature Fusion (MTFF) module.

        Args:
            input_dim (int): Dimension of the input features.
            seg_num (int): Number of snippets in a video.
        """
        super(Encoder, self).__init__()
        self.drop_out_rate = 0.1
        self.input_dim = input_dim
        self.min_temporal_dim = seg_num
        self.CVA1 = CVA(input_dim=input_dim)
        self.CVA2 = CVA(input_dim=input_dim)
        self.CVA3 = CVA(input_dim=input_dim)

        self.aggregate = Aggregate(input_dim=input_dim)

    def forward(self, feature1, feature2, feature3):
        """
        Args:
            feature1 (torch.Tensor): long-frame-length features. Shape: B x T x C.
                (batch size x number of snippets x input dimensions)
            feature2 (torch.Tensor): medium-frame-length features. Shape: B x T x C.
            feature3 (torch.Tensor): short-frame-length features. Shape: B x T x C.

        Returns:
            torch.Tensor: Fused and processed output features. Shape: B x T x C.
        """
        att1 = self.CVA1(feature1, feature2)
        att2 = self.CVA2(feature2, feature3)
        att3 = self.CVA3(feature3, feature1)

        out1 = self.aggregate(att1, att2, att3)  # B T C

        return out1


class Model(nn.Module):
    def __init__(self, feature_dim, batch_size, seg_num=32):
        """
        Multi-Timescale Feature Learning (MTFL) recognition model.

        Args:
            feature_dim (int): Dimension of the input features.
            batch_size (int): Batch size.
            seg_num (int): Number of snippets in a video.
        """
        super(Model, self).__init__()
        self.batch_size = batch_size
        self.num_segments = seg_num
        self.k_abn = self.num_segments // 10  # select 3 snippets
        self.k_nor = self.num_segments // 10

        self.Encoder = Encoder(input_dim=feature_dim, seg_num=seg_num)

        # Fully connected layers for classification
        self.fc1 = nn.Linear(feature_dim, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 18)  # class amount = 18

        self.drop_out = nn.Dropout(0.2)
        self.relu = nn.LeakyReLU(negative_slope=5e-2)
        self.sigmoid = nn.Sigmoid()
        self.apply(weight_init)

    def forward(self, input1, input2, input3):
        """
        Args:
            input1 (torch.Tensor): long-frame-length features. Shape: B x T x feature_dim.
            input2 (torch.Tensor): medium-frame-length features. Shape: B x T x feature_dim.
            input3 (torch.Tensor): short-frame-length features. Shape: B x T x feature_dim.

        Returns:
            score_abnormal (torch.Tensor): The mean scores of the top-3 abnormal instances.
            score_normal (torch.Tensor): The mean scores of the top-3 normal instances.
            feat_select_abn (torch.Tensor): Selected abnormal features.
            feat_select_normal (torch.Tensor): Selected normal features.
            scores (torch.Tensor): All computed scores. Shape: B x T x number of classes (18).
        """
        k_abn = self.k_abn
        k_nor = self.k_nor
        ncrops = 1  # reserved parameter for spatial cropping; not used and defaults to 1

        # Multi-Temporal Feature Fusion
        out = self.Encoder(input1, input2, input3)
        bs, t, f = out.size()
        features = self.drop_out(out)  # B T D

        # classification layers
        scores = self.relu(self.fc1(features))
        scores = self.drop_out(scores)
        scores = self.relu(self.fc2(scores))
        scores = self.drop_out(scores)
        scores = self.sigmoid(self.fc3(scores))
        scores = scores.view(bs, t, -1)  # B T 18

        # B * t * f
        normal_features = features[0:self.batch_size]
        normal_scores = scores[0:self.batch_size]

        abnormal_features = features[self.batch_size:]
        abnormal_scores = scores[self.batch_size:]

        # Compute feature magnitudes
        feat_magnitudes = torch.norm(features, p=2, dim=2)
        feat_magnitudes = feat_magnitudes.view(bs, ncrops, -1).mean(1)
        nfea_magnitudes = feat_magnitudes[0:self.batch_size]  # normal feature magnitudes
        afea_magnitudes = feat_magnitudes[self.batch_size:]  # abnormal feature magnitudes
        n_size = nfea_magnitudes.shape[0]

        # Inference mode for batch size 1
        if nfea_magnitudes.shape[0] == 1:
            afea_magnitudes = nfea_magnitudes
            abnormal_scores = normal_scores
            abnormal_features = normal_features

        select_idx = torch.ones_like(nfea_magnitudes)
        select_idx = self.drop_out(select_idx)

        ####### process abnormal videos -> select top-3 feature magnitudes #######
        afea_magnitudes_drop = afea_magnitudes * select_idx
        idx_abn = torch.topk(afea_magnitudes_drop, k_abn, dim=1)[1]
        idx_abn_feat = idx_abn.unsqueeze(2).expand([-1, -1, abnormal_features.shape[2]])

        abnormal_features = abnormal_features.view(n_size, ncrops, t, f)  # B x N x T x F
        abnormal_features = abnormal_features.permute(1, 0, 2, 3)  # N x B x T x F

        total_select_abn_feature = torch.zeros(0, device=input1.device)
        for abnormal_feature in abnormal_features:
            feat_select_abn = torch.gather(abnormal_feature, 1, idx_abn_feat)  # top-3 feature magnitudes in the abnormal bag
            total_select_abn_feature = torch.cat((total_select_abn_feature, feat_select_abn))

        idx_abn_score = idx_abn.unsqueeze(2).expand([-1, -1, abnormal_scores.shape[2]])
        # top-3 scores in the abnormal bag based on the top-3 magnitudes
        score_abnormal = torch.mean(torch.gather(abnormal_scores, 1, idx_abn_score), dim=1)

        ####### process normal videos -> select top-3 feature magnitudes #######
        select_idx_normal = torch.ones_like(nfea_magnitudes)
        select_idx_normal = self.drop_out(select_idx_normal)
        nfea_magnitudes_drop = nfea_magnitudes * select_idx_normal
        idx_normal = torch.topk(nfea_magnitudes_drop, k_nor, dim=1)[1]
        idx_normal_feat = idx_normal.unsqueeze(2).expand([-1, -1, normal_features.shape[2]])

        normal_features = normal_features.view(n_size, ncrops, t, f)
        normal_features = normal_features.permute(1, 0, 2, 3)  # 1 B T D

        total_select_nor_feature = torch.zeros(0, device=input1.device)
        for nor_fea in normal_features:
            feat_select_normal = torch.gather(nor_fea, 1, idx_normal_feat)  # top-3 feature magnitudes in the normal bag (hard negatives)
            total_select_nor_feature = torch.cat((total_select_nor_feature, feat_select_normal))

        idx_normal_score = idx_normal.unsqueeze(2).expand([-1, -1, normal_scores.shape[2]])
        score_normal = torch.mean(torch.gather(normal_scores, 1, idx_normal_score), dim=1)  # top-3 scores in the normal bag

        feat_select_abn = total_select_abn_feature
        feat_select_normal = total_select_nor_feature

        return score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores
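
A minimal shape sanity check for the recognition model, sketched here under two assumptions: a CUDA device is available (the attention layers above are created with device='cuda'), and batch_size=2, i.e. 2 normal plus 2 abnormal videos per step:

import torch
from model import Model  # recognition/model.py

model = Model(feature_dim=1024, batch_size=2, seg_num=32).cuda()
x = torch.rand(4, 32, 1024, device='cuda')  # B x T x C, reused for all three timescales
score_abn, score_nor, feat_abn, feat_nor, scores = model(x, x, x)
print(scores.shape)     # torch.Size([4, 32, 18]): per-snippet class scores
print(score_abn.shape)  # torch.Size([2, 18]): mean over the top-3 snippets
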
recognition/option.py
ADDED
@@ -0,0 +1,56 @@
import argparse

############ Test args ########################
test_parser = argparse.ArgumentParser(description='MTFL_recognition_test')
# input paths
test_parser.add_argument('--lf_dir', type=str, default='features/L64', help='long frame length feature path')
test_parser.add_argument('--mf_dir', type=str, default='features/L32', help='medium frame length feature path')
test_parser.add_argument('--sf_dir', type=str, default='features/L8', help='short frame length feature path')
test_parser.add_argument('--test_anno', type=str, default='annotation/Anomaly_videos.txt', help='test annotation file')
test_parser.add_argument('--test_dataset', type=str, default='other', choices=['UCF', 'VAD', 'other'],
                         help='The test data. The test results are the recognized labels of all input videos. '
                              'For the UCF and VAD datasets, the overall accuracy is printed out')
test_parser.add_argument('--recognition_model', type=str,
                         default='/media/DataDrive/yiling/Test/models/MTFL_recog/split_1_best_VAD.pkl',
                         help='recognition checkpoint path; choose 1 of the 7 checkpoints trained on different splits')
# output path
test_parser.add_argument('--output_dir', type=str, default='results',
                         help='the path to store the recognition result')
# feature size, depending on which feature extractor is used
test_parser.add_argument('--feature_size', type=int, default=1024, help='feature dim (default: VST feature)')
test_parser.add_argument('--seg_num', type=int, default=32, help='the number of snippets')
# running cfg
test_parser.add_argument('--gpu', default="0", type=str, choices=["0", "1"], help='gpu')
test_parser.add_argument('--workers', type=int, default=8, help='number of workers in dataloader')


############ Train args ########################
train_parser = argparse.ArgumentParser(description='MTFL_recognition_train')
# input paths
train_parser.add_argument('--lf_dir', type=str, default='/media/DataDrive/yiling/features/recognition/split1_L64R1',
                          help='long feature path')
train_parser.add_argument('--mf_dir', type=str, default='/media/DataDrive/yiling/features/recognition/split1_L32R1',
                          help='medium feature path')
train_parser.add_argument('--sf_dir', type=str, default='/media/DataDrive/yiling/features/recognition/split1_L8R1',
                          help='short feature path')
train_parser.add_argument('--train_anno', default='/media/DataDrive/yiling/annotation/recognition/splits/VAD/VAD_train_001.txt',
                          help='the annotation file for training')
train_parser.add_argument('--test_anno', default='/media/DataDrive/yiling/annotation/recognition/splits/VAD/VAD_test_001.txt',
                          help='the annotation file for testing')
train_parser.add_argument('--test_dataset', type=str, default='UCF', choices=['UCF', 'VAD'],
                          help='the validation data')
# output paths and saving info
train_parser.add_argument('--model-name', default='MTFL_recognition', help='name under which to save the model')
train_parser.add_argument('--save_models', default='/media/DataDrive/yiling/models/demo/recognition',
                          help='the path for saving models')
train_parser.add_argument('--output_dir', default='/media/DataDrive/yiling/results/demo/recognition',
                          help='the path to store accuracy results')
# training cfg and parameters
train_parser.add_argument('--gpu', default="0", type=str, choices=["0", "1"], help='gpu id')
train_parser.add_argument('--feature_size', type=int, default=1024, help='feature dim (default: VST feature)')
train_parser.add_argument('--seg_num', type=int, default=32, help='the number of snippets')
train_parser.add_argument('--lr', type=float, default=0.0001, help='learning rate')
train_parser.add_argument('--batch-size', type=int, default=32, help='batch size')
train_parser.add_argument('--workers', type=int, default=8, help='number of workers in dataloader')
train_parser.add_argument('--max-epoch', type=int, default=2000, help='maximum number of training iterations')
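
As a usage sketch, the parsers can be driven with an explicit argument list instead of sys.argv (the checkpoint path below is a hypothetical placeholder):

import option

args = option.test_parser.parse_args([
    '--lf_dir', 'features/L64',
    '--mf_dir', 'features/L32',
    '--sf_dir', 'features/L8',
    '--recognition_model', 'models/split_1_best_VAD.pkl',  # hypothetical checkpoint path
])
print(args.feature_size, args.seg_num)  # 1024 32
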
recognition/test.py
ADDED
@@ -0,0 +1,120 @@
import torch
from tqdm import tqdm
import numpy as np
import os
import option
from torch.utils.data import DataLoader
from dataset import class_to_int, Dataset
from model import Model


def top_k_accuracy(scores, labels, topk=(1, 5)):
    """Calculate the top-k accuracy scores.

    Args:
        scores (list[np.ndarray]): Prediction scores for each class.
        labels (list[int]): Ground truth labels.
        topk (tuple[int]): K values for top_k_accuracy. Default: (1, 5).

    Returns:
        list[float]: Top-k accuracy score for each k.
    """
    res = []
    labels = np.array(labels)[:, np.newaxis]
    for k in topk:
        max_k_preds = np.argsort(scores, axis=1)[:, -k:][:, ::-1]
        match_array = np.logical_or.reduce(max_k_preds == labels, axis=1)
        topk_acc_score = match_array.sum() / match_array.shape[0]
        res.append(topk_acc_score)

    return res


def test(dataloader, model, device, test_dataset='UCF'):
    """
    Evaluate the model's performance on the test dataset and return the top-1 accuracy.

    Args:
        dataloader (DataLoader): DataLoader for the test dataset.
        model (nn.Module): The trained neural network model.
        device (torch.device): The device (CPU or GPU) on which to perform evaluation.
        test_dataset (str, optional): The name of the test dataset, either 'UCF' or 'VAD'. Default is 'UCF'.
            The overall accuracy is calculated only for 'VAD' and 'UCF' because it does not make sense when testing
            on only a few videos.

    Returns:
        float: The top-1 accuracy of the model on the test dataset.
        dict: A dictionary containing video file names and their corresponding predicted classes.
    """
    video_class = {"video": [], "class": []}
    with torch.no_grad():
        model.to(device).eval()
        outputs = torch.zeros(0, device=device)
        labels = torch.zeros(0, device=device)

        for input1, input2, input3, label, file in tqdm(dataloader):
            input1 = input1.to(device)
            input2 = input2.to(device)
            input3 = input3.to(device)
            label = label.to(device)
            score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores = model(input1, input2, input3)
            # concatenate for accuracy evaluation
            outputs = torch.cat((outputs, score_abnormal))
            labels = torch.cat((labels, label))
            # obtain the prediction result
            score_abnormal = score_abnormal.cpu().detach().numpy()
            pred = np.argmax(score_abnormal, axis=1)
            found_class = [key for key, value in class_to_int.items() if value == pred[0]]
            file_name = os.path.basename(file[0])
            video_class["video"].append(file_name)
            video_class["class"].append(found_class)

    outputs = outputs.cpu().detach().numpy()
    labels = labels.cpu().detach().numpy()
    res = [-1]

    if test_dataset == 'UCF':  # all road accidents in UCF are labelled as 13
        for row in outputs:
            max_value = max(row[13], row[14], row[15])
            row[13] = max_value
            row[14] = 0.0
            row[15] = 0.0

    # Accuracy makes sense only when the test classes are covered by VAD
    if test_dataset == 'UCF' or test_dataset == 'VAD':
        res = top_k_accuracy(outputs, labels)
        print('\n' + str(test_dataset) + ' top1 : ' + str(res[0]) + ' top5 : ' + str(res[1]) + '\n')

    return res[0], video_class


def main():
    args = option.test_parser.parse_args()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    out_path = os.path.join(args.output_dir, 'rec_results')

    test_loader = DataLoader(Dataset(args, test_mode=True),
                             batch_size=1, shuffle=False,
                             num_workers=args.workers, pin_memory=True)
    model = Model(feature_dim=args.feature_size, batch_size=1, seg_num=args.seg_num)
    model.load_state_dict(torch.load(args.recognition_model))

    _, video_class = test(dataloader=test_loader,
                          model=model,
                          device=device,
                          test_dataset=args.test_dataset)
    # save the recognition results
    video_sub_dir = os.path.basename(os.path.dirname(video_class["video"][0][0]))
    file_path = os.path.join(out_path, video_sub_dir, 'output_pred.txt')
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "w") as f:
        for video, cls in zip(video_class["video"], video_class["class"]):
            f.write(f"Video: {video}, class: {cls}\n")


if __name__ == '__main__':
    main()
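
A small worked example of top_k_accuracy (standalone apart from the import; run it from inside recognition/, and note the scores are made up):

import numpy as np
from test import top_k_accuracy  # recognition/test.py

scores = np.array([[0.1, 0.7, 0.2],   # label 1: correct at top-1
                   [0.5, 0.1, 0.4],   # label 2: missed at top-1, hit at top-2
                   [0.2, 0.3, 0.5]])  # label 2: correct at top-1
labels = [1, 2, 2]
top1, top2 = top_k_accuracy(scores, labels, topk=(1, 2))
print(top1, top2)  # 0.666... 1.0
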
recognition/train.py
ADDED
@@ -0,0 +1,171 @@
import torch
import torch.optim as optim
import os
from torch.nn import MSELoss
from torch.utils.data import DataLoader
from model import Model
from dataset import Dataset
from test import test
import option
from tqdm import tqdm
torch.set_default_tensor_type('torch.FloatTensor')


def sparsity(arr, lamda2):
    loss = torch.mean(torch.norm(arr, dim=0))
    return lamda2 * loss


def smooth(arr, lamda1):
    arr2 = torch.zeros_like(arr)
    arr2[:-1] = arr[1:]
    arr2[-1] = arr[-1]

    loss = torch.sum((arr2 - arr) ** 2)

    return lamda1 * loss


class SigmoidMAELoss(torch.nn.Module):
    def __init__(self):
        super(SigmoidMAELoss, self).__init__()
        from torch.nn import Sigmoid
        self.__sigmoid__ = Sigmoid()
        self.__l1_loss__ = MSELoss()  # note: despite the class name, plain MSE is applied and the sigmoid is unused

    def forward(self, pred, target):
        return self.__l1_loss__(pred, target)


class RTFM_loss(torch.nn.Module):
    def __init__(self, alpha, margin):
        super(RTFM_loss, self).__init__()
        self.alpha = alpha
        self.margin = margin
        self.sigmoid = torch.nn.Sigmoid()
        self.mae_criterion = SigmoidMAELoss()
        self.criterion = torch.nn.CrossEntropyLoss()  # multi-class

    def forward(self, score_normal, score_abnormal, nlabel, alabel, feat_n, feat_a):
        labels = torch.cat((nlabel, alabel), 0)
        scores = torch.cat((score_normal, score_abnormal), 0)

        labels = labels.cuda()

        loss_cls = self.criterion(scores, labels)  # CE loss in the score space

        loss_abn = torch.abs(self.margin - torch.norm(torch.mean(feat_a, dim=1), p=2, dim=1))
        loss_nor = torch.norm(torch.mean(feat_n, dim=1), p=2, dim=1)
        loss_rtfm = torch.mean((loss_abn + loss_nor) ** 2)

        loss_total = loss_cls + self.alpha * loss_rtfm

        return loss_total


def train(nloader, aloader, model, batch_size, seg_num, optimizer, device):
    with torch.set_grad_enabled(True):
        model.train()

        ninput1, ninput2, ninput3, nlabel = next(nloader)
        ainput1, ainput2, ainput3, alabel = next(aloader)

        input1 = torch.cat((ninput1, ainput1), 0).to(device)
        input2 = torch.cat((ninput2, ainput2), 0).to(device)
        input3 = torch.cat((ninput3, ainput3), 0).to(device)
        score_abnormal, score_normal, feat_select_abn, feat_select_normal, scores = model(input1, input2, input3)

        scores = scores.view(batch_size * seg_num * 2, -1)  # (B x 32 x 2, 18)

        abn_scores, _ = torch.max(scores[batch_size * seg_num:], dim=1)  # per-snippet max class score of the abnormal half

        nlabel = nlabel[0:batch_size]
        alabel = alabel[0:batch_size]

        loss_criterion = RTFM_loss(0.0001, 100)
        loss_sparse = sparsity(abn_scores, 8e-3)
        loss_smooth = smooth(abn_scores, 8e-4)

        loss_RTFM = loss_criterion(score_normal, score_abnormal, nlabel, alabel, feat_select_normal, feat_select_abn)
        cost = loss_RTFM + loss_smooth + loss_sparse

        optimizer.zero_grad()
        cost.backward()
        optimizer.step()


def main():
    args = option.train_parser.parse_args()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_nloader = DataLoader(Dataset(args, test_mode=False, is_normal=True),
                               batch_size=args.batch_size, shuffle=True,
                               num_workers=args.workers, pin_memory=True, drop_last=True)
    train_aloader = DataLoader(Dataset(args, test_mode=False, is_normal=False),
                               batch_size=args.batch_size, shuffle=True,
                               num_workers=args.workers, pin_memory=True, drop_last=True)
    test_loader = DataLoader(Dataset(args, test_mode=True),
                             batch_size=1, shuffle=False,
                             num_workers=args.workers, pin_memory=True)

    if not os.path.exists(args.save_models):
        os.makedirs(args.save_models)

    feature_size = args.feature_size
    model = Model(feature_size, args.batch_size, args.seg_num)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.005)
    test_info = {"epoch": [], "TOP-1 ACC": []}
    best_ACC = -1
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)
    acc, _ = test(dataloader=test_loader,
                  model=model,
                  device=device,
                  test_dataset=args.test_dataset)

    for step in tqdm(range(1, args.max_epoch + 1), total=args.max_epoch, dynamic_ncols=True):
        if (step - 1) % len(train_nloader) == 0:
            loadern_iter = iter(train_nloader)

        if (step - 1) % len(train_aloader) == 0:
            loadera_iter = iter(train_aloader)

        train(nloader=loadern_iter,
              aloader=loadera_iter,
              model=model,
              batch_size=args.batch_size,
              seg_num=args.seg_num,
              optimizer=optimizer,
              device=device)

        if step % 5 == 0 and step > 5:
            acc, _ = test(dataloader=test_loader,
                          model=model,
                          device=device,
                          test_dataset=args.test_dataset)

            test_info["epoch"].append(step)
            test_info["TOP-1 ACC"].append(acc)

            if test_info["TOP-1 ACC"][-1] > best_ACC:
                best_ACC = test_info["TOP-1 ACC"][-1]
                torch.save(model.state_dict(), os.path.join(args.save_models, args.model_name + '-{}.pkl'.format(step)))
                file_path = os.path.join(output_dir, '{}-step-ACC.txt'.format(step))
                with open(file_path, "w") as fo:
                    for key in test_info:
                        fo.write("{}: {}\n".format(key, test_info[key][-1]))


if __name__ == '__main__':
    main()
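
The per-step loader handling above avoids StopIteration by rebuilding each iterator once a full pass over its DataLoader has been consumed; a standalone sketch of the same pattern:

loader = [10, 20, 30]          # stand-in for a DataLoader of length 3
for step in range(1, 8):
    if (step - 1) % len(loader) == 0:
        it = iter(loader)      # refresh the iterator after each full pass
    print(step, next(it))      # next() never raises StopIteration here
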
requirements.txt
ADDED
@@ -0,0 +1,10 @@
matplotlib==3.7.0
mmaction2.egg==info
mmcv==1.7.0
numpy==1.25.1
opencv_contrib_python==4.7.0.72
opencv_python==4.7.0.72
scikit_learn==1.2.2
torch==2.0.0+cu118
torchvision==0.15.1+cu118
tqdm==4.64.1
utils/feature_extractor.py
ADDED
@@ -0,0 +1,284 @@
| 1 |
+
"""Reference with Ivo's implementation"""
|
| 2 |
+
import argparse
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
from os import path, mkdir
|
| 6 |
+
import random
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch
|
| 10 |
+
import torch.backends.cudnn as cudnn
|
| 11 |
+
from video_loader import VideoIter
|
| 12 |
+
from utils import register_logger, get_torch_device
|
| 13 |
+
import transforms_video
|
| 14 |
+
from torch.utils.data import DataLoader
|
| 15 |
+
from torchvision.transforms import transforms
|
| 16 |
+
|
| 17 |
+
# Video Swin Transformer related repository
|
| 18 |
+
from mmcv import Config
|
| 19 |
+
from mmaction.models import build_model
|
| 20 |
+
from mmcv.runner import load_checkpoint
|
| 21 |
+
import warnings
|
| 22 |
+
|
| 23 |
+
warnings.filterwarnings("ignore", message="The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.")
|
| 24 |
+
warnings.filterwarnings('ignore', message='No handlers found: "aten::pad". Skipped.')
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_args():
|
| 28 |
+
parser = argparse.ArgumentParser(description="VST Feature Extractor Parser")
|
| 29 |
+
# I/O
|
| 30 |
+
parser.add_argument('--dataset_path', default='test_videos',
|
| 31 |
+
help="path to dataset")
|
| 32 |
+
parser.add_argument('--save_dir', type=str, default="features",
|
| 33 |
+
help="set output root for the features.")
|
| 34 |
+
# extraction params
|
| 35 |
+
parser.add_argument('--model_type', default='swinB',
|
| 36 |
+
type=str,
|
| 37 |
+
help="type of feature extractor")
|
| 38 |
+
parser.add_argument('--pretrained_3d',
|
| 39 |
+
default='/media/DataDrive/yiling/models/VST_finetune/hflip_speed_120_2d/best_top1_acc_epoch_15.pth',
|
| 40 |
+
type=str,
|
| 41 |
+
help="load default 3D pretrained feature extractor model.")
|
| 42 |
+
parser.add_argument('--clip_length', type=int, default=8,
|
| 43 |
+
help="define the length of each input sample.")
|
| 44 |
+
parser.add_argument('--frame_interval', type=int, default=1,
|
| 45 |
+
help="define the sampling interval between frames.")
|
| 46 |
+
parser.add_argument('--use_splits', type=bool, default=False,
|
| 47 |
+
help="use full anomalous data or splits, only applicable of Split Dataset of UCF-CRIME and VAD")
|
| 48 |
+
parser.add_argument('--batch_size', type=int, default=8, help="batch size")
|
| 49 |
+
# running cfg
|
| 50 |
+
parser.add_argument('--num_workers', type=int, default=0,
|
| 51 |
+
help="define the number of workers used for loading the videos")
|
| 52 |
+
parser.add_argument('--seed', type=int, default=None, help='random seed')
|
| 53 |
+
parser.add_argument('--log_every', type=int, default=10,
|
| 54 |
+
help="log the writing of clips every n steps.")
|
| 55 |
+
parser.add_argument('--log_file', type=str,
|
| 56 |
+
help="set logging file.")
|
| 57 |
+
parser.add_argument('--gpu', type=int, default=0, help="gpu id")
|
| 58 |
+
|
| 59 |
+
return parser.parse_args()
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def set_random_seed(seed=42):
|
| 63 |
+
random.seed(seed)
|
| 64 |
+
np.random.seed(seed)
|
| 65 |
+
torch.manual_seed(seed)
|
| 66 |
+
torch.cuda.manual_seed(seed)
|
| 67 |
+
torch.cuda.manual_seed_all(seed)
|
| 68 |
+
os.environ['PYTHONHASHSEED'] = str(seed)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def to_segments(data, num=32):
|
| 72 |
+
"""
|
| 73 |
+
These code is taken from:
|
| 74 |
+
https://github.com/rajanjitenpatel/C3D_feature_extraction/blob/b5894fa06d43aa62b3b64e85b07feb0853e7011a/extract_C3D_feature.py#L805
|
| 75 |
+
:param data: list of features of a certain video
|
| 76 |
+
:return: list of 32 segments
|
| 77 |
+
"""
|
| 78 |
+
data = np.array(data)
|
| 79 |
+
Segments_Features = []
|
| 80 |
+
thirty2_shots = np.round(np.linspace(0, len(data) - 1, num=num + 1)).astype(int)
|
| 81 |
+
for ss, ee in zip(thirty2_shots[:-1], thirty2_shots[1:]):
|
| 82 |
+
if ss == ee:
|
| 83 |
+
temp_vect = data[min(ss, data.shape[0] - 1), :]
|
| 84 |
+
else:
|
| 85 |
+
temp_vect = data[ss:ee, :].mean(axis=0)
|
| 86 |
+
|
| 87 |
+
temp_vect = temp_vect / np.linalg.norm(temp_vect)
|
| 88 |
+
if np.linalg.norm == 0:
|
| 89 |
+
logging.error("Feature norm is 0")
|
| 90 |
+
exit()
|
| 91 |
+
if len(temp_vect) != 0:
|
| 92 |
+
Segments_Features.append(temp_vect.tolist())
|
| 93 |
+
|
| 94 |
+
return Segments_Features
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class FeaturesWriter:
|
| 98 |
+
def __init__(self, num_videos, chunk_size=16):
|
| 99 |
+
"""
|
| 100 |
+
Initialize a FeaturesWriter instance.
|
| 101 |
+
|
| 102 |
+
Args:
|
| 103 |
+
num_videos (int): Total number of videos to process.
|
| 104 |
+
chunk_size (int, optional): Chunk size for writing features, and not used. Defaults to 16.
|
| 105 |
+
"""
|
| 106 |
+
self.path = None
|
| 107 |
+
self.dir = None
|
| 108 |
+
self.data = None
|
| 109 |
+
self.chunk_size = chunk_size
|
| 110 |
+
self.num_videos = num_videos
|
| 111 |
+
self.dump_count = 0
|
| 112 |
+
|
| 113 |
+
def _init_video(self, video_name, dir):
|
| 114 |
+
self.path = path.join(dir, f"{video_name}.txt")
|
| 115 |
+
self.dir = dir
|
| 116 |
+
self.data = dict()
|
| 117 |
+
|
| 118 |
+
def has_video(self):
|
| 119 |
+
return self.data is not None
|
| 120 |
+
|
| 121 |
+
def dump(self):
|
| 122 |
+
logging.info(f'{self.dump_count} / {self.num_videos}: Dumping {self.path}')
|
| 123 |
+
self.dump_count += 1
|
| 124 |
+
if not path.exists(self.dir):
|
| 125 |
+
os.mkdir(self.dir)
|
| 126 |
+
features = to_segments([self.data[key] for key in sorted(self.data)])
|
| 127 |
+
with open(self.path, 'w') as fp:
|
| 128 |
+
for d in features:
|
| 129 |
+
d = [str(x) for x in d]
|
| 130 |
+
fp.write(' '.join(d) + '\n')
|
| 131 |
+
|
| 132 |
+
def _is_new_video(self, video_name, dir):
|
| 133 |
+
new_path = path.join(dir, f"{video_name}.txt")
|
| 134 |
+
if self.path != new_path and self.path is not None:
|
| 135 |
+
return True
|
| 136 |
+
|
| 137 |
+
return False
|
| 138 |
+
|
| 139 |
+
def store(self, feature, idx):
|
| 140 |
+
self.data[idx] = list(feature)
|
| 141 |
+
|
| 142 |
+
def write(self, feature, video_name, idx, dir):
|
| 143 |
+
if not self.has_video():
|
| 144 |
+
self._init_video(video_name, dir)
|
| 145 |
+
|
| 146 |
+
if self._is_new_video(video_name, dir):
|
| 147 |
+
self.dump()
|
| 148 |
+
self._init_video(video_name, dir)
|
| 149 |
+
|
| 150 |
+
self.store(feature, idx)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def get_features_loader(dataset_path, clip_length, frame_interval, batch_size, num_workers, save_dir, use_splits):
    """
    Get the data loader for extracting video features.

    Args:
        dataset_path (str): Path to the videos.
        clip_length (int): Number of frames in each input clip.
        frame_interval (int): Sampling interval between frames.
        batch_size (int): Batch size.
        num_workers (int): Number of workers used for loading videos.
        save_dir (str): Directory in which features are saved.
        use_splits (bool): Whether to use the full anomalous videos or their splits.

    Returns:
        data_loader (VideoIter): Video data loader.
        data_iter (DataLoader): Torch data loader for video feature extraction.
    """
    mean = [0.400, 0.388, 0.372]  # VAD mean and std in RGB
    std = [0.247, 0.245, 0.243]
    size = 224
    resize = size, size
    crop = size

    res = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.ResizeVideo(resize),
        transforms_video.CenterCropVideo(crop),
        transforms_video.NormalizeVideo(mean=mean, std=std)
    ])

    # Collect the videos whose features were already extracted so they can be
    # skipped. Initialized up front so VideoIter always receives a list, even
    # when save_dir does not exist yet.
    proc_v = []
    if os.path.exists(save_dir):
        for root, dirs, files in os.walk(save_dir):
            for file in files:
                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, save_dir)
                proc_v.append(relative_path)
        proc_v = [v.split(".")[0] for v in proc_v]
        if len(proc_v) > 0:
            logging.info(
                f"[Data] Already {len(proc_v)} files have been processed"
            )

    data_loader = VideoIter(
        dataset_path=dataset_path,
        proc_video=proc_v,
        clip_length=clip_length,
        frame_stride=frame_interval,
        video_transform=res,
        use_splits=use_splits,
        return_label=False,
    )

    data_iter = torch.utils.data.DataLoader(
        data_loader,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
    )

    return data_loader, data_iter


def load_VST(checkpoint, device):
    """Load the pretrained Video Swin Transformer (VST)."""
    config = 'utils/swin_config/recognition/swin/swin_base_patch244_window877_kinetics400_22k_VAD.py'
    cfg = Config.fromfile(config)
    model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.get('test_cfg'))
    load_checkpoint(model, checkpoint, map_location='cpu')

    return model.to(device)


def main():
    args = get_args()

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    torch.cuda.set_device(args.gpu)
    device = get_torch_device()
    register_logger(log_file=args.log_file)

    if args.seed is not None:
        set_random_seed(args.seed)

    cudnn.benchmark = True

    feature_path = os.path.join(args.save_dir, 'L' + str(args.clip_length))

    if not path.exists(feature_path):
        mkdir(feature_path)

    data_loader, data_iter = get_features_loader(args.dataset_path,
                                                 args.clip_length,
                                                 args.frame_interval,
                                                 args.batch_size,
                                                 args.num_workers,
                                                 feature_path,
                                                 args.use_splits)
    if data_loader.video_count == 0:
        return

    model = load_VST(args.pretrained_3d, device)

    features_writer = FeaturesWriter(num_videos=data_loader.video_count)
    loop_i = 0
    # Perform feature extraction on the dataset, one batch of clips at a time
    with torch.no_grad():
        for data, clip_idxs, dirs, vid_names in data_iter:
            outputs = model.extract_feat(data.to(device))
            outputs = outputs.mean(dim=[2, 3, 4])
            outputs = outputs.detach().cpu().numpy()

            for i, (dir, vid_name, clip_idx) in enumerate(zip(dirs, vid_names, clip_idxs)):
                if loop_i == 0:
                    logging.info(
                        f"Video {features_writer.dump_count} / {features_writer.num_videos} : writing clip {clip_idx} of video {vid_name}")

                loop_i += 1
                loop_i %= args.log_every

                dir = path.join(feature_path, dir)
                features_writer.write(feature=outputs[i],
                                      video_name=vid_name,
                                      idx=clip_idx,
                                      dir=dir)
    # Dump the remaining features to files
    features_writer.dump()


if __name__ == "__main__":
    main()
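For reference, the pooling step in `main` reduces the 5-D feature map returned by `model.extract_feat` to one vector per clip. A minimal sketch of that reduction (the shapes and the 1024-channel width of the Swin-B backbone are illustrative assumptions, not repo code):

import torch

# Stand-in for model.extract_feat(data): a (N, C, T', H', W') feature map.
backbone_out = torch.randn(8, 1024, 16, 7, 7)
clip_features = backbone_out.mean(dim=[2, 3, 4])  # spatio-temporal average pool
print(clip_features.shape)  # torch.Size([8, 1024]) -- one feature vector per clip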
utils/functional_video.py
ADDED
@@ -0,0 +1,102 @@
import torch


def _is_tensor_video_clip(clip):
    if not torch.is_tensor(clip):
        raise TypeError("clip should be Tensor. Got %s" % type(clip))

    if not clip.ndimension() == 4:
        raise ValueError("clip should be 4D. Got %dD" % clip.dim())

    return True


def crop(clip, i, j, h, w):
    """
    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
    """
    assert len(clip.size()) == 4, "clip should be a 4D tensor"
    return clip[..., i:i + h, j:j + w]


def resize(clip, target_size, interpolation_mode):
    assert len(target_size) == 2, "target size should be tuple (height, width)"
    return torch.nn.functional.interpolate(
        clip, size=target_size, mode=interpolation_mode, align_corners=False
    )


def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
    """
    Do spatial cropping and resizing to the video clip.
    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
        i (int): i in (i, j), i.e. coordinates of the upper-left corner.
        j (int): j in (i, j), i.e. coordinates of the upper-left corner.
        h (int): Height of the cropped region.
        w (int): Width of the cropped region.
        size (tuple(int, int)): Height and width of the resized clip.
    Returns:
        clip (torch.tensor): Resized and cropped clip. Size is (C, T, H, W)
    """
    assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor"
    clip = crop(clip, i, j, h, w)
    clip = resize(clip, size, interpolation_mode)
    return clip


def center_crop(clip, crop_size):
    assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor"
    h, w = clip.size(-2), clip.size(-1)
    th, tw = crop_size
    assert h >= th and w >= tw, "height and width must be no smaller than crop_size"

    i = int(round((h - th) / 2.0))
    j = int(round((w - tw) / 2.0))
    return crop(clip, i, j, th, tw)


def to_tensor(clip):
    """
    Convert tensor data type from uint8 to float, divide the values by 255.0 and
    permute the dimensions of the clip tensor.
    Args:
        clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C)
    Return:
        clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W)
    """
    _is_tensor_video_clip(clip)
    if not clip.dtype == torch.uint8:
        raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
    return clip.float().permute(3, 0, 1, 2) / 255.0


def normalize(clip, mean, std, inplace=False):
    """
    Args:
        clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W)
        mean (tuple): pixel RGB mean. Size is (3)
        std (tuple): pixel standard deviation. Size is (3)
    Returns:
        normalized clip (torch.tensor): Size is (C, T, H, W)
    """
    assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor"
    if not inplace:
        clip = clip.clone()
    mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
    std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
    clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
    return clip


def hflip(clip):
    """
    Args:
        clip (torch.tensor): Video clip to be flipped. Size is (C, T, H, W)
    Returns:
        flipped clip (torch.tensor): Size is (C, T, H, W)
    """
    assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor"
    return clip.flip(-1)
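These functional ops are presumably what the `transforms_video` classes used in `get_features_loader` wrap, mirroring torchvision's functional/transform split. A minimal usage sketch (the import path and the dummy clip are assumptions for illustration; the mean/std values come from feature_extractor.py above):

import torch
from utils import functional_video as F

raw = torch.randint(0, 256, (16, 240, 320, 3), dtype=torch.uint8)  # (T, H, W, C) frames
clip = F.to_tensor(raw)                        # float (C, T, H, W), values in [0, 1]
clip = F.resize(clip, (224, 224), "bilinear")  # resize the spatial dims
clip = F.center_crop(clip, (224, 224))
clip = F.normalize(clip, mean=[0.400, 0.388, 0.372], std=[0.247, 0.245, 0.243])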
utils/swin_config/_base_/default_runtime.py
ADDED
@@ -0,0 +1,13 @@
checkpoint_config = dict(interval=1)
log_config = dict(
    interval=20,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook'),
    ])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
utils/swin_config/_base_/models/audioonly_r50.py
ADDED
@@ -0,0 +1,18 @@
# model settings
model = dict(
    type='AudioRecognizer',
    backbone=dict(
        type='ResNetAudio',
        depth=50,
        pretrained=None,
        in_channels=1,
        norm_eval=False),
    cls_head=dict(
        type='AudioTSNHead',
        num_classes=400,
        in_channels=1024,
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/bmn_400x100.py
ADDED
@@ -0,0 +1,12 @@
# model settings
model = dict(
    type='BMN',
    temporal_dim=100,
    boundary_ratio=0.5,
    num_samples=32,
    num_samples_per_bin=3,
    feat_dim=400,
    soft_nms_alpha=0.4,
    soft_nms_low_threshold=0.5,
    soft_nms_high_threshold=0.9,
    post_process_top_k=100)
utils/swin_config/_base_/models/bsn_pem.py
ADDED
@@ -0,0 +1,13 @@
# model settings
model = dict(
    type='PEM',
    pem_feat_dim=32,
    pem_hidden_dim=256,
    pem_u_ratio_m=1,
    pem_u_ratio_l=2,
    pem_high_temporal_iou_threshold=0.6,
    pem_low_temporal_iou_threshold=0.2,
    soft_nms_alpha=0.75,
    soft_nms_low_threshold=0.65,
    soft_nms_high_threshold=0.9,
    post_process_top_k=100)
utils/swin_config/_base_/models/bsn_tem.py
ADDED
@@ -0,0 +1,8 @@
# model settings
model = dict(
    type='TEM',
    temporal_dim=100,
    boundary_ratio=0.1,
    tem_feat_dim=400,
    tem_hidden_dim=512,
    tem_match_threshold=0.5)
utils/swin_config/_base_/models/c3d_sports1m_pretrained.py
ADDED
@@ -0,0 +1,23 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='C3D',
        pretrained=  # noqa: E251
        'https://download.openmmlab.com/mmaction/recognition/c3d/c3d_sports1m_pretrain_20201016-dcc47ddc.pth',  # noqa: E501
        style='pytorch',
        conv_cfg=dict(type='Conv3d'),
        norm_cfg=None,
        act_cfg=dict(type='ReLU'),
        dropout_ratio=0.5,
        init_std=0.005),
    cls_head=dict(
        type='I3DHead',
        num_classes=101,
        in_channels=4096,
        spatial_type=None,
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='score'))
utils/swin_config/_base_/models/csn_ig65m_pretrained.py
ADDED
@@ -0,0 +1,23 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet3dCSN',
        pretrained2d=False,
        pretrained=  # noqa: E251
        'https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth',  # noqa: E501
        depth=152,
        with_pool2=False,
        bottleneck_mode='ir',
        norm_eval=False,
        zero_init_residual=False),
    cls_head=dict(
        type='I3DHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/i3d_r50.py
ADDED
@@ -0,0 +1,27 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet3d',
        pretrained2d=True,
        pretrained='torchvision://resnet50',
        depth=50,
        conv1_kernel=(5, 7, 7),
        conv1_stride_t=2,
        pool1_stride_t=2,
        conv_cfg=dict(type='Conv3d'),
        norm_eval=False,
        inflate=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
        zero_init_residual=False),
    cls_head=dict(
        type='I3DHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))

# This setting refers to https://github.com/open-mmlab/mmaction/blob/master/mmaction/models/tenons/backbones/resnet_i3d.py#L329-L332  # noqa: E501
utils/swin_config/_base_/models/r2plus1d_r34.py
ADDED
@@ -0,0 +1,28 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet2Plus1d',
        depth=34,
        pretrained=None,
        pretrained2d=False,
        norm_eval=False,
        conv_cfg=dict(type='Conv2plus1d'),
        norm_cfg=dict(type='SyncBN', requires_grad=True, eps=1e-3),
        conv1_kernel=(3, 7, 7),
        conv1_stride_t=1,
        pool1_stride_t=1,
        inflate=(1, 1, 1, 1),
        spatial_strides=(1, 2, 2, 2),
        temporal_strides=(1, 2, 2, 2),
        zero_init_residual=False),
    cls_head=dict(
        type='I3DHead',
        num_classes=400,
        in_channels=512,
        spatial_type='avg',
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/slowfast_r50.py
ADDED
@@ -0,0 +1,39 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet3dSlowFast',
        pretrained=None,
        resample_rate=8,  # tau
        speed_ratio=8,  # alpha
        channel_ratio=8,  # beta_inv
        slow_pathway=dict(
            type='resnet3d',
            depth=50,
            pretrained=None,
            lateral=True,
            conv1_kernel=(1, 7, 7),
            dilations=(1, 1, 1, 1),
            conv1_stride_t=1,
            pool1_stride_t=1,
            inflate=(0, 0, 1, 1),
            norm_eval=False),
        fast_pathway=dict(
            type='resnet3d',
            depth=50,
            pretrained=None,
            lateral=False,
            base_channels=8,
            conv1_kernel=(5, 7, 7),
            conv1_stride_t=1,
            pool1_stride_t=1,
            norm_eval=False)),
    cls_head=dict(
        type='SlowFastHead',
        in_channels=2304,  # 2048 + 256
        num_classes=400,
        spatial_type='avg',
        dropout_ratio=0.5),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/slowonly_r50.py
ADDED
@@ -0,0 +1,22 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet3dSlowOnly',
        depth=50,
        pretrained='torchvision://resnet50',
        lateral=False,
        conv1_kernel=(1, 7, 7),
        conv1_stride_t=1,
        pool1_stride_t=1,
        inflate=(0, 0, 1, 1),
        norm_eval=False),
    cls_head=dict(
        type='I3DHead',
        in_channels=2048,
        num_classes=400,
        spatial_type='avg',
        dropout_ratio=0.5),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/swin/swin_base.py
ADDED
@@ -0,0 +1,6 @@
# model settings
_base_ = "swin_tiny.py"
model = dict(backbone=dict(depths=[2, 2, 18, 2],
                           embed_dim=128,
                           num_heads=[4, 8, 16, 32]),
             cls_head=dict(in_channels=1024))
utils/swin_config/_base_/models/swin/swin_large.py
ADDED
@@ -0,0 +1,6 @@
# model settings
_base_ = "swin_tiny.py"
model = dict(backbone=dict(depths=[2, 2, 18, 2],
                           embed_dim=192,
                           num_heads=[6, 12, 24, 48]),
             cls_head=dict(in_channels=1536))
utils/swin_config/_base_/models/swin/swin_small.py
ADDED
@@ -0,0 +1,3 @@
# model settings
_base_ = "swin_tiny.py"
model = dict(backbone=dict(depths=[2, 2, 18, 2]))
utils/swin_config/_base_/models/swin/swin_tiny.py
ADDED
@@ -0,0 +1,24 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='SwinTransformer3D',
        patch_size=(4, 4, 4),
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=(8, 7, 7),
        mlp_ratio=4.,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True),
    cls_head=dict(
        type='I3DHead',
        in_channels=768,
        num_classes=18,
        spatial_type='avg',
        dropout_ratio=0.5),
    test_cfg=dict(average_clips='prob'))
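The swin_base/small/large variants above override only a few backbone and head fields of this file through mmcv's `_base_` inheritance; everything else is merged in from swin_tiny.py. A quick sketch of how the merge resolves (assuming the mmcv version pinned by the repo and that the path is run from the repo root):

from mmcv import Config

cfg = Config.fromfile('utils/swin_config/_base_/models/swin/swin_base.py')
print(cfg.model.backbone.embed_dim)   # 128, overridden by swin_base.py
print(cfg.model.backbone.patch_size)  # (4, 4, 4), inherited from swin_tiny.py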
utils/swin_config/_base_/models/swin/swin_tiny_backup.py
ADDED
@@ -0,0 +1,24 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='SwinTransformer3D',
        patch_size=(4, 4, 4),
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=(8, 7, 7),
        mlp_ratio=4.,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True),
    cls_head=dict(
        type='I3DHead',
        in_channels=768,
        num_classes=400,
        spatial_type='avg',
        dropout_ratio=0.5),
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tanet_r50.py
ADDED
@@ -0,0 +1,20 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='TANet',
        pretrained='torchvision://resnet50',
        depth=50,
        num_segments=8,
        tam_cfg=dict()),
    cls_head=dict(
        type='TSMHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.001),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tin_r50.py
ADDED
@@ -0,0 +1,21 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNetTIN',
        pretrained='torchvision://resnet50',
        depth=50,
        norm_eval=False,
        shift_div=4),
    cls_head=dict(
        type='TSMHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.001,
        is_shift=False),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips=None))
utils/swin_config/_base_/models/tpn_slowonly_r50.py
ADDED
@@ -0,0 +1,40 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet3dSlowOnly',
        depth=50,
        pretrained='torchvision://resnet50',
        lateral=False,
        out_indices=(2, 3),
        conv1_kernel=(1, 7, 7),
        conv1_stride_t=1,
        pool1_stride_t=1,
        inflate=(0, 0, 1, 1),
        norm_eval=False),
    neck=dict(
        type='TPN',
        in_channels=(1024, 2048),
        out_channels=1024,
        spatial_modulation_cfg=dict(
            in_channels=(1024, 2048), out_channels=2048),
        temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
        upsample_cfg=dict(scale_factor=(1, 1, 1)),
        downsample_cfg=dict(downsample_scale=(1, 1, 1)),
        level_fusion_cfg=dict(
            in_channels=(1024, 1024),
            mid_channels=(1024, 1024),
            out_channels=2048,
            downsample_scales=((1, 1, 1), (1, 1, 1))),
        aux_head_cfg=dict(out_channels=400, loss_weight=0.5)),
    cls_head=dict(
        type='TPNHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tpn_tsm_r50.py
ADDED
@@ -0,0 +1,36 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNetTSM',
        pretrained='torchvision://resnet50',
        depth=50,
        out_indices=(2, 3),
        norm_eval=False,
        shift_div=8),
    neck=dict(
        type='TPN',
        in_channels=(1024, 2048),
        out_channels=1024,
        spatial_modulation_cfg=dict(
            in_channels=(1024, 2048), out_channels=2048),
        temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
        upsample_cfg=dict(scale_factor=(1, 1, 1)),
        downsample_cfg=dict(downsample_scale=(1, 1, 1)),
        level_fusion_cfg=dict(
            in_channels=(1024, 1024),
            mid_channels=(1024, 1024),
            out_channels=2048,
            downsample_scales=((1, 1, 1), (1, 1, 1))),
        aux_head_cfg=dict(out_channels=174, loss_weight=0.5)),
    cls_head=dict(
        type='TPNHead',
        num_classes=174,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob', fcn_test=True))
utils/swin_config/_base_/models/trn_r50.py
ADDED
@@ -0,0 +1,22 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNet',
        pretrained='torchvision://resnet50',
        depth=50,
        norm_eval=False,
        partial_bn=True),
    cls_head=dict(
        type='TRNHead',
        num_classes=400,
        in_channels=2048,
        num_segments=8,
        spatial_type='avg',
        relation_type='TRNMultiScale',
        hidden_dim=256,
        dropout_ratio=0.8,
        init_std=0.001),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tsm_mobilenet_v2.py
ADDED
@@ -0,0 +1,22 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='MobileNetV2TSM',
        shift_div=8,
        num_segments=8,
        is_shift=True,
        pretrained='mmcls://mobilenet_v2'),
    cls_head=dict(
        type='TSMHead',
        num_segments=8,
        num_classes=400,
        in_channels=1280,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.001,
        is_shift=True),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tsm_r50.py
ADDED
@@ -0,0 +1,21 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNetTSM',
        pretrained='torchvision://resnet50',
        depth=50,
        norm_eval=False,
        shift_div=8),
    cls_head=dict(
        type='TSMHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.001,
        is_shift=True),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/tsn_r50.py
ADDED
@@ -0,0 +1,19 @@
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNet',
        pretrained='torchvision://resnet50',
        depth=50,
        norm_eval=False),
    cls_head=dict(
        type='TSNHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.4,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips=None))
utils/swin_config/_base_/models/tsn_r50_audio.py
ADDED
@@ -0,0 +1,13 @@
# model settings
model = dict(
    type='AudioRecognizer',
    backbone=dict(type='ResNet', depth=50, in_channels=1, norm_eval=False),
    cls_head=dict(
        type='AudioTSNHead',
        num_classes=400,
        in_channels=2048,
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/models/x3d.py
ADDED
@@ -0,0 +1,14 @@
# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(type='X3D', gamma_w=1, gamma_b=2.25, gamma_d=2.2),
    cls_head=dict(
        type='X3DHead',
        in_channels=432,
        num_classes=400,
        spatial_type='avg',
        dropout_ratio=0.5,
        fc1_bias=False),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
utils/swin_config/_base_/schedules/adam_20e.py
ADDED
@@ -0,0 +1,7 @@
# optimizer
optimizer = dict(
    type='Adam', lr=0.01, weight_decay=0.00001)  # this lr is used for 1 GPU
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=10)
total_epochs = 20
utils/swin_config/_base_/schedules/sgd_100e.py
ADDED
@@ -0,0 +1,10 @@
# optimizer
optimizer = dict(
    type='SGD',
    lr=0.01,  # this lr is used for 8 gpus
    momentum=0.9,
    weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[40, 80])
total_epochs = 100
utils/swin_config/_base_/schedules/sgd_150e_warmup.py
ADDED
@@ -0,0 +1,13 @@
# optimizer
optimizer = dict(
    type='SGD', lr=0.01, momentum=0.9,
    weight_decay=0.0001)  # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    step=[90, 130],
    warmup='linear',
    warmup_by_epoch=True,
    warmup_iters=10)
total_epochs = 150
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# optimizer
|
| 2 |
+
optimizer = dict(
|
| 3 |
+
type='SGD',
|
| 4 |
+
lr=0.01, # this lr is used for 8 gpus
|
| 5 |
+
momentum=0.9,
|
| 6 |
+
weight_decay=0.0001)
|
| 7 |
+
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
|
| 8 |
+
# learning policy
|
| 9 |
+
lr_config = dict(policy='step', step=[20, 40])
|
| 10 |
+
total_epochs = 50
|
utils/swin_config/_base_/schedules/sgd_tsm_100e.py
ADDED
@@ -0,0 +1,12 @@
# optimizer
optimizer = dict(
    type='SGD',
    constructor='TSMOptimizerConstructor',
    paramwise_cfg=dict(fc_lr5=True),
    lr=0.02,  # this lr is used for 8 gpus
    momentum=0.9,
    weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[40, 80])
total_epochs = 100
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# optimizer
|
| 2 |
+
optimizer = dict(
|
| 3 |
+
type='SGD',
|
| 4 |
+
constructor='TSMOptimizerConstructor',
|
| 5 |
+
paramwise_cfg=dict(fc_lr5=True),
|
| 6 |
+
lr=0.01, # this lr is used for 8 gpus
|
| 7 |
+
momentum=0.9,
|
| 8 |
+
weight_decay=0.0001)
|
| 9 |
+
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
|
| 10 |
+
# learning policy
|
| 11 |
+
lr_config = dict(policy='step', step=[20, 40])
|
| 12 |
+
total_epochs = 50
|
utils/swin_config/_base_/schedules/sgd_tsm_mobilenet_v2_100e.py
ADDED
@@ -0,0 +1,12 @@
# optimizer
optimizer = dict(
    type='SGD',
    constructor='TSMOptimizerConstructor',
    paramwise_cfg=dict(fc_lr5=True),
    lr=0.01,  # this lr is used for 8 gpus
    momentum=0.9,
    weight_decay=0.00002)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[40, 80])
total_epochs = 100
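In all of these schedule files, `policy='step'` multiplies the learning rate by a decay factor at each listed milestone epoch; the factor of 0.1 is an assumption here, taken from mmcv's default when no gamma is set. A rough sketch (not repo code) of the schedule the config above produces:

import math

def lr_at_epoch(epoch, base_lr=0.01, steps=(40, 80), gamma=0.1):
    # Step policy: divide the lr by 10 at each milestone epoch passed so far.
    return base_lr * gamma ** sum(epoch >= s for s in steps)

assert lr_at_epoch(10) == 0.01              # before the first milestone
assert math.isclose(lr_at_epoch(50), 1e-3)  # after epoch 40
assert math.isclose(lr_at_epoch(90), 1e-4)  # after epoch 80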