initial commit
Browse files- .gitattributes +5 -0
- LICENSE.txt +3 -0
- README.md +3 -0
- nervenc/480p_finetuned_baseline/cfg.yaml +103 -0
- nervenc/480p_finetuned_baseline/epoch-last.pth +3 -0
- nervenc/480p_finetuned_baseline_small/cfg.yaml +103 -0
- nervenc/480p_finetuned_baseline_small/epoch-last.pth +3 -0
- nervenc/720p_finetuned_baseline/cfg.yaml +104 -0
- nervenc/720p_finetuned_baseline/epoch-last.pth +3 -0
- nervenc/pre_finetune/pre_finetune_480p_baseline/cfg.yaml +101 -0
- nervenc/pre_finetune/pre_finetune_480p_baseline/epoch-last.pth +3 -0
- nervenc/pre_finetune/pre_finetune_480p_baseline_small/cfg.yaml +101 -0
- nervenc/pre_finetune/pre_finetune_480p_baseline_small/epoch-last.pth +3 -0
- nervenc/pre_finetune/pre_finetune_720p_baseline/cfg.yaml +101 -0
- nervenc/pre_finetune/pre_finetune_720p_baseline/epoch-last.pth +3 -0
- patch_tubelet/320x160_finetuned_patch/cfg.yaml +112 -0
- patch_tubelet/320x160_finetuned_patch/epoch-last.pth +3 -0
- patch_tubelet/320x160_finetuned_patch_small/cfg.yaml +112 -0
- patch_tubelet/320x160_finetuned_patch_small/epoch-last.pth +3 -0
- patch_tubelet/320x240_finetuned_patch_train_720p/cfg.yaml +112 -0
- patch_tubelet/320x240_finetuned_patch_train_720p/epoch-last.pth +3 -0
- patch_tubelet/pre_finetune/pre_finetune_320x160_patch/cfg.yaml +110 -0
- patch_tubelet/pre_finetune/pre_finetune_320x160_patch/epoch-last.pth +3 -0
- patch_tubelet/pre_finetune/pre_finetune_320x160_patch_small/cfg.yaml +110 -0
- patch_tubelet/pre_finetune/pre_finetune_320x160_patch_small/epoch-last.pth +3 -0
- patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_480p/cfg.yaml +110 -0
- patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_480p/epoch-last.pth +3 -0
- patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_720p/cfg.yaml +110 -0
- patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_720p/epoch-last.pth +3 -0
- patch_tubelet/pre_finetune/pre_finetune_384x270_patch_train_480p/cfg.yaml +110 -0
- patch_tubelet/pre_finetune/pre_finetune_384x270_patch_train_480p/epoch-last.pth +3 -0
- patch_tubelet/pre_finetune/pre_finetune_384x270_patch_train_720p/cfg.yaml +110 -0
- patch_tubelet/pre_finetune/pre_finetune_384x270_patch_train_720p/epoch-last.pth +3 -0
- teconerv/320x160_pairs_teco/cfg.yaml +115 -0
- teconerv/320x160_pairs_teco/epoch-last.pth +3 -0
- teconerv/320x160_pairs_teco_small/cfg.yaml +115 -0
- teconerv/320x160_pairs_teco_small/epoch-last.pth +3 -0
- teconerv/320x240_pairs_teco/cfg.yaml +115 -0
- teconerv/320x240_pairs_teco/epoch-last.pth +3 -0
- teconerv/320x240_pairs_teco_train_720p/cfg.yaml +115 -0
- teconerv/320x240_pairs_teco_train_720p/epoch-last.pth +3 -0
- teconerv/384x270_pairs_teco/cfg.yaml +115 -0
- teconerv/384x270_pairs_teco/epoch-last.pth +3 -0
- teconerv/384x270_pairs_teco_train_720p/cfg.yaml +115 -0
- teconerv/384x270_pairs_teco_train_720p/epoch-last.pth +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
LICENSE.txt filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
README.md filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
nervenc filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
patch_tubelet filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
teconerv filter=lfs diff=lfs merge=lfs -text
|
LICENSE.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:149d807167937014a4633d3a72ce59e201b01087b08c901257bd17c416481bc5
|
| 3 |
+
size 1071
|
README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:881c0ec81d89da3e6b8b32c9efd8a46e14a71ee5218de4b7c4908f151c34cb24
|
| 3 |
+
size 1519
|
nervenc/480p_finetuned_baseline/cfg.yaml
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_clip_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: '1_2_5'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
scale: 1
|
| 15 |
+
aspect_ratio: 1
|
| 16 |
+
rand_flip: 'no'
|
| 17 |
+
clips_per_video: 1
|
| 18 |
+
loader:
|
| 19 |
+
batch_size: 8
|
| 20 |
+
num_workers: 16
|
| 21 |
+
test_dataset:
|
| 22 |
+
name: vidrec_dataset_clip_inference_lazy_uvg
|
| 23 |
+
args:
|
| 24 |
+
root_path: data/dataset_meta
|
| 25 |
+
frame_num: 8
|
| 26 |
+
cls_vid_num: -1_-1
|
| 27 |
+
crop_size:
|
| 28 |
+
- 480
|
| 29 |
+
- 640
|
| 30 |
+
csv_paths:
|
| 31 |
+
uvg: uvg_hd.csv
|
| 32 |
+
frames:
|
| 33 |
+
input: none
|
| 34 |
+
output: none
|
| 35 |
+
loader:
|
| 36 |
+
batch_size: 8
|
| 37 |
+
num_workers: 16
|
| 38 |
+
model:
|
| 39 |
+
name: nerv_enc
|
| 40 |
+
args:
|
| 41 |
+
tokenizer:
|
| 42 |
+
name: vidrec_tokenizer
|
| 43 |
+
args:
|
| 44 |
+
input_size:
|
| 45 |
+
- 480
|
| 46 |
+
- 640
|
| 47 |
+
patch_size: 32
|
| 48 |
+
padding: 0
|
| 49 |
+
frame_num: 8
|
| 50 |
+
eval_frames: none
|
| 51 |
+
img_groups: 1
|
| 52 |
+
hyponet:
|
| 53 |
+
name: hypo_convnets_full_res
|
| 54 |
+
args:
|
| 55 |
+
in_dim: 1
|
| 56 |
+
out_dim: 3
|
| 57 |
+
out_bias: tanh
|
| 58 |
+
strds_h: '5_4_4_3_2'
|
| 59 |
+
strds_w: '5_4_4_4_2'
|
| 60 |
+
ks: '1_3'
|
| 61 |
+
hid_dim: 32
|
| 62 |
+
size: none
|
| 63 |
+
act: gelu
|
| 64 |
+
use_pe: true
|
| 65 |
+
pe_dim: 32
|
| 66 |
+
n_tokens: '32_256_32_24_0'
|
| 67 |
+
token_dims: '200_288_288_288_0'
|
| 68 |
+
transformer_encoder:
|
| 69 |
+
name: transformer_encoder
|
| 70 |
+
args:
|
| 71 |
+
dim: 720
|
| 72 |
+
depth: 6
|
| 73 |
+
n_head: 12
|
| 74 |
+
head_dim: 64
|
| 75 |
+
ff_dim: 2800
|
| 76 |
+
optimizer:
|
| 77 |
+
name: adam
|
| 78 |
+
args:
|
| 79 |
+
lr: 0.0001
|
| 80 |
+
lr_type: step
|
| 81 |
+
max_epoch: 50
|
| 82 |
+
eval_epoch: 200
|
| 83 |
+
vis_epoch: 2000
|
| 84 |
+
dump_ckt: 'no'
|
| 85 |
+
dump_pred: 'no'
|
| 86 |
+
dump_video: 'no'
|
| 87 |
+
generate_from_single_frame: false
|
| 88 |
+
finetune_model: checkpoints/nervenc/pre_finetune/pre_finetune_480p_baseline/epoch-last.pth
|
| 89 |
+
finetune_same_model: true
|
| 90 |
+
env:
|
| 91 |
+
exp_name: nervenc
|
| 92 |
+
save_dir: checkpoints/nervenc/480p_finetuned_baseline
|
| 93 |
+
instance_tag: 480p_finetuned_baseline
|
| 94 |
+
tot_gpus: 1
|
| 95 |
+
cudnn: false
|
| 96 |
+
port: '29600'
|
| 97 |
+
wandb_upload: false
|
| 98 |
+
wandb_exp_name: null
|
| 99 |
+
wandb_run_id: none
|
| 100 |
+
distributed: false
|
| 101 |
+
rank: 0
|
| 102 |
+
world_size: 1
|
| 103 |
+
gpu: null
|
nervenc/480p_finetuned_baseline/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b99efdba84aee0c111efe1ba651de06e342142478b6bdf32a06eb79ed4b9451a
|
| 3 |
+
size 514966594
|
nervenc/480p_finetuned_baseline_small/cfg.yaml
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_clip_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: '1_2_5'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
scale: 1
|
| 15 |
+
aspect_ratio: 1
|
| 16 |
+
rand_flip: 'no'
|
| 17 |
+
clips_per_video: 1
|
| 18 |
+
loader:
|
| 19 |
+
batch_size: 8
|
| 20 |
+
num_workers: 24
|
| 21 |
+
test_dataset:
|
| 22 |
+
name: vidrec_dataset_clip_inference_lazy_uvg
|
| 23 |
+
args:
|
| 24 |
+
root_path: data/dataset_meta
|
| 25 |
+
frame_num: 8
|
| 26 |
+
cls_vid_num: -1_-1
|
| 27 |
+
crop_size:
|
| 28 |
+
- 480
|
| 29 |
+
- 640
|
| 30 |
+
csv_paths:
|
| 31 |
+
uvg: uvg_hd.csv
|
| 32 |
+
frames:
|
| 33 |
+
input: none
|
| 34 |
+
output: none
|
| 35 |
+
loader:
|
| 36 |
+
batch_size: 8
|
| 37 |
+
num_workers: 24
|
| 38 |
+
model:
|
| 39 |
+
name: nerv_enc
|
| 40 |
+
args:
|
| 41 |
+
tokenizer:
|
| 42 |
+
name: vidrec_tokenizer
|
| 43 |
+
args:
|
| 44 |
+
input_size:
|
| 45 |
+
- 480
|
| 46 |
+
- 640
|
| 47 |
+
patch_size: 32
|
| 48 |
+
padding: 0
|
| 49 |
+
frame_num: 8
|
| 50 |
+
eval_frames: none
|
| 51 |
+
img_groups: 1
|
| 52 |
+
hyponet:
|
| 53 |
+
name: hypo_convnets_full_res
|
| 54 |
+
args:
|
| 55 |
+
in_dim: 1
|
| 56 |
+
out_dim: 3
|
| 57 |
+
out_bias: tanh
|
| 58 |
+
strds_h: '5_4_4_3_2'
|
| 59 |
+
strds_w: '5_4_4_4_2'
|
| 60 |
+
ks: '1_3'
|
| 61 |
+
hid_dim: 20
|
| 62 |
+
size: none
|
| 63 |
+
act: gelu
|
| 64 |
+
use_pe: true
|
| 65 |
+
pe_dim: 20
|
| 66 |
+
n_tokens: '20_160_20_20_0'
|
| 67 |
+
token_dims: '125_120_288_180_0'
|
| 68 |
+
transformer_encoder:
|
| 69 |
+
name: transformer_encoder
|
| 70 |
+
args:
|
| 71 |
+
dim: 720
|
| 72 |
+
depth: 6
|
| 73 |
+
n_head: 12
|
| 74 |
+
head_dim: 64
|
| 75 |
+
ff_dim: 2800
|
| 76 |
+
optimizer:
|
| 77 |
+
name: adam
|
| 78 |
+
args:
|
| 79 |
+
lr: 0.0001
|
| 80 |
+
lr_type: step
|
| 81 |
+
max_epoch: 50
|
| 82 |
+
eval_epoch: 200
|
| 83 |
+
vis_epoch: 2000
|
| 84 |
+
dump_ckt: 'no'
|
| 85 |
+
dump_pred: 'no'
|
| 86 |
+
dump_video: 'no'
|
| 87 |
+
generate_from_single_frame: false
|
| 88 |
+
finetune_model: checkpoints/nervenc/pre_finetune/pre_finetune_480p_baseline_small/epoch-last.pth
|
| 89 |
+
finetune_same_model: true
|
| 90 |
+
env:
|
| 91 |
+
exp_name: nervenc
|
| 92 |
+
save_dir: checkpoints/nervenc/480p_finetuned_baseline_small
|
| 93 |
+
instance_tag: 480p_finetuned_baseline_small
|
| 94 |
+
tot_gpus: 1
|
| 95 |
+
cudnn: false
|
| 96 |
+
port: '29600'
|
| 97 |
+
wandb_upload: false
|
| 98 |
+
wandb_exp_name: null
|
| 99 |
+
wandb_run_id: none
|
| 100 |
+
distributed: false
|
| 101 |
+
rank: 0
|
| 102 |
+
world_size: 1
|
| 103 |
+
gpu: null
|
nervenc/480p_finetuned_baseline_small/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e100335142ea79a40ae1259c0fb25a50512a8a9117313def7d618c52955f07d5
|
| 3 |
+
size 507680130
|
nervenc/720p_finetuned_baseline/cfg.yaml
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_clip_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: '1_2_5'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_720p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 720
|
| 13 |
+
- 1280
|
| 14 |
+
scale: 1
|
| 15 |
+
aspect_ratio: 1
|
| 16 |
+
rand_flip: 'no'
|
| 17 |
+
clips_per_video: 1
|
| 18 |
+
loader:
|
| 19 |
+
batch_size: 8
|
| 20 |
+
num_workers: 16
|
| 21 |
+
test_dataset:
|
| 22 |
+
name: vidrec_dataset_clip_inference_lazy_uvg
|
| 23 |
+
args:
|
| 24 |
+
root_path: data/dataset_meta
|
| 25 |
+
frame_num: 8
|
| 26 |
+
cls_vid_num: -1_-1
|
| 27 |
+
crop_size:
|
| 28 |
+
- 720
|
| 29 |
+
- 1280
|
| 30 |
+
csv_paths:
|
| 31 |
+
uvg: uvg_hd.csv
|
| 32 |
+
frames:
|
| 33 |
+
input: none
|
| 34 |
+
output: none
|
| 35 |
+
loader:
|
| 36 |
+
batch_size: 8
|
| 37 |
+
num_workers: 16
|
| 38 |
+
model:
|
| 39 |
+
name: nerv_enc
|
| 40 |
+
args:
|
| 41 |
+
tokenizer:
|
| 42 |
+
name: vidrec_tokenizer
|
| 43 |
+
args:
|
| 44 |
+
input_size:
|
| 45 |
+
- 720
|
| 46 |
+
- 1280
|
| 47 |
+
patch_size: 32
|
| 48 |
+
padding: 0
|
| 49 |
+
frame_num: 8
|
| 50 |
+
eval_frames: none
|
| 51 |
+
img_groups: 1
|
| 52 |
+
hyponet:
|
| 53 |
+
name: hypo_convnets_full_res
|
| 54 |
+
args:
|
| 55 |
+
in_dim: 1
|
| 56 |
+
out_dim: 3
|
| 57 |
+
out_bias: tanh
|
| 58 |
+
strds_h: '5_4_4_3_3'
|
| 59 |
+
strds_w: '5_4_4_4_4'
|
| 60 |
+
ks: '1_3'
|
| 61 |
+
hid_dim: 56
|
| 62 |
+
size: none
|
| 63 |
+
act: gelu
|
| 64 |
+
use_pe: true
|
| 65 |
+
pe_dim: 56
|
| 66 |
+
n_tokens: '56_448_112_112_0'
|
| 67 |
+
token_dims: '350_504_224_168_0'
|
| 68 |
+
transformer_encoder:
|
| 69 |
+
name: transformer_encoder
|
| 70 |
+
args:
|
| 71 |
+
dim: 720
|
| 72 |
+
depth: 6
|
| 73 |
+
n_head: 12
|
| 74 |
+
head_dim: 64
|
| 75 |
+
ff_dim: 2800
|
| 76 |
+
optimizer:
|
| 77 |
+
name: adam
|
| 78 |
+
args:
|
| 79 |
+
lr: 0.0001
|
| 80 |
+
lr_type: step
|
| 81 |
+
max_epoch: 50
|
| 82 |
+
eval_epoch: 50
|
| 83 |
+
vis_epoch: 2000
|
| 84 |
+
dump_ckt: 'no'
|
| 85 |
+
dump_pred: 'no'
|
| 86 |
+
dump_video: 'no'
|
| 87 |
+
generate_from_single_frame: false
|
| 88 |
+
finetune_model: checkpoints/nervenc/pre_finetune/pre_finetune_720p_baseline/epoch-last.pth
|
| 89 |
+
finetune_same_model: true
|
| 90 |
+
env:
|
| 91 |
+
exp_name: nervenc
|
| 92 |
+
save_dir: checkpoints/nervenc/720p_finetuned_baseline
|
| 93 |
+
instance_tag: 720p_finetuned_baseline
|
| 94 |
+
tot_gpus: 4
|
| 95 |
+
cudnn: false
|
| 96 |
+
port: '4645'
|
| 97 |
+
wandb_upload: false
|
| 98 |
+
wandb_exp_name: null
|
| 99 |
+
rank: 0
|
| 100 |
+
world_size: 4
|
| 101 |
+
gpu: 0
|
| 102 |
+
distributed: true
|
| 103 |
+
dist_backend: nccl
|
| 104 |
+
wandb_run_id: none
|
nervenc/720p_finetuned_baseline/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eea0687cabbee361487cd9a4c49128ca7be7cfb63c0865d9a3e6fbfededdb42c
|
| 3 |
+
size 570814594
|
nervenc/pre_finetune/pre_finetune_480p_baseline/cfg.yaml
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_clip_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: '1_2_5'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
scale: 1
|
| 15 |
+
aspect_ratio: 1
|
| 16 |
+
rand_flip: 'no'
|
| 17 |
+
clips_per_video: 1
|
| 18 |
+
loader:
|
| 19 |
+
batch_size: 8
|
| 20 |
+
num_workers: 16
|
| 21 |
+
test_dataset:
|
| 22 |
+
name: vidrec_dataset_clip_inference_lazy_uvg
|
| 23 |
+
args:
|
| 24 |
+
root_path: data/dataset_meta
|
| 25 |
+
frame_num: 8
|
| 26 |
+
cls_vid_num: -1_-1
|
| 27 |
+
crop_size:
|
| 28 |
+
- 480
|
| 29 |
+
- 640
|
| 30 |
+
csv_paths:
|
| 31 |
+
uvg: uvg_hd.csv
|
| 32 |
+
frames:
|
| 33 |
+
input: none
|
| 34 |
+
output: none
|
| 35 |
+
loader:
|
| 36 |
+
batch_size: 8
|
| 37 |
+
num_workers: 16
|
| 38 |
+
model:
|
| 39 |
+
name: nerv_enc
|
| 40 |
+
args:
|
| 41 |
+
tokenizer:
|
| 42 |
+
name: vidrec_tokenizer
|
| 43 |
+
args:
|
| 44 |
+
input_size:
|
| 45 |
+
- 480
|
| 46 |
+
- 640
|
| 47 |
+
patch_size: 32
|
| 48 |
+
padding: 0
|
| 49 |
+
frame_num: 8
|
| 50 |
+
eval_frames: none
|
| 51 |
+
img_groups: 1
|
| 52 |
+
hyponet:
|
| 53 |
+
name: hypo_convnets_full_res
|
| 54 |
+
args:
|
| 55 |
+
in_dim: 1
|
| 56 |
+
out_dim: 3
|
| 57 |
+
out_bias: tanh
|
| 58 |
+
strds_h: '5_4_4_3_2'
|
| 59 |
+
strds_w: '5_4_4_4_2'
|
| 60 |
+
ks: '1_3'
|
| 61 |
+
hid_dim: 32
|
| 62 |
+
size: none
|
| 63 |
+
act: gelu
|
| 64 |
+
use_pe: true
|
| 65 |
+
pe_dim: 32
|
| 66 |
+
n_tokens: '32_256_32_24_0'
|
| 67 |
+
token_dims: '200_288_288_288_0'
|
| 68 |
+
transformer_encoder:
|
| 69 |
+
name: transformer_encoder
|
| 70 |
+
args:
|
| 71 |
+
dim: 720
|
| 72 |
+
depth: 6
|
| 73 |
+
n_head: 12
|
| 74 |
+
head_dim: 64
|
| 75 |
+
ff_dim: 2800
|
| 76 |
+
optimizer:
|
| 77 |
+
name: adam
|
| 78 |
+
args:
|
| 79 |
+
lr: 0.0001
|
| 80 |
+
lr_type: step
|
| 81 |
+
max_epoch: 150
|
| 82 |
+
eval_epoch: 200
|
| 83 |
+
vis_epoch: 2000
|
| 84 |
+
dump_ckt: 'no'
|
| 85 |
+
dump_pred: 'no'
|
| 86 |
+
dump_video: 'no'
|
| 87 |
+
generate_from_single_frame: false
|
| 88 |
+
env:
|
| 89 |
+
exp_name: nervenc
|
| 90 |
+
save_dir: checkpoints/nervenc/pre_finetune/pre_finetune_480p_baseline
|
| 91 |
+
instance_tag: pre_finetune_480p_baseline
|
| 92 |
+
tot_gpus: 1
|
| 93 |
+
cudnn: false
|
| 94 |
+
port: '29600'
|
| 95 |
+
wandb_upload: false
|
| 96 |
+
wandb_exp_name: null
|
| 97 |
+
wandb_run_id: none
|
| 98 |
+
distributed: false
|
| 99 |
+
rank: 0
|
| 100 |
+
world_size: 1
|
| 101 |
+
gpu: null
|
nervenc/pre_finetune/pre_finetune_480p_baseline/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3dfb4c404d2e17ca15cff392a7ba43cb726b5f136c37dce70c6aecffe97ab9f
|
| 3 |
+
size 514966466
|
nervenc/pre_finetune/pre_finetune_480p_baseline_small/cfg.yaml
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_clip_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: '1_2_5'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
scale: 1
|
| 15 |
+
aspect_ratio: 1
|
| 16 |
+
rand_flip: 'no'
|
| 17 |
+
clips_per_video: 1
|
| 18 |
+
loader:
|
| 19 |
+
batch_size: 8
|
| 20 |
+
num_workers: 24
|
| 21 |
+
test_dataset:
|
| 22 |
+
name: vidrec_dataset_clip_inference_lazy_uvg
|
| 23 |
+
args:
|
| 24 |
+
root_path: data/dataset_meta
|
| 25 |
+
frame_num: 8
|
| 26 |
+
cls_vid_num: -1_-1
|
| 27 |
+
crop_size:
|
| 28 |
+
- 480
|
| 29 |
+
- 640
|
| 30 |
+
csv_paths:
|
| 31 |
+
uvg: uvg_hd.csv
|
| 32 |
+
frames:
|
| 33 |
+
input: none
|
| 34 |
+
output: none
|
| 35 |
+
loader:
|
| 36 |
+
batch_size: 8
|
| 37 |
+
num_workers: 24
|
| 38 |
+
model:
|
| 39 |
+
name: nerv_enc
|
| 40 |
+
args:
|
| 41 |
+
tokenizer:
|
| 42 |
+
name: vidrec_tokenizer
|
| 43 |
+
args:
|
| 44 |
+
input_size:
|
| 45 |
+
- 480
|
| 46 |
+
- 640
|
| 47 |
+
patch_size: 32
|
| 48 |
+
padding: 0
|
| 49 |
+
frame_num: 8
|
| 50 |
+
eval_frames: none
|
| 51 |
+
img_groups: 1
|
| 52 |
+
hyponet:
|
| 53 |
+
name: hypo_convnets_full_res
|
| 54 |
+
args:
|
| 55 |
+
in_dim: 1
|
| 56 |
+
out_dim: 3
|
| 57 |
+
out_bias: tanh
|
| 58 |
+
strds_h: '5_4_4_3_2'
|
| 59 |
+
strds_w: '5_4_4_4_2'
|
| 60 |
+
ks: '1_3'
|
| 61 |
+
hid_dim: 20
|
| 62 |
+
size: none
|
| 63 |
+
act: gelu
|
| 64 |
+
use_pe: true
|
| 65 |
+
pe_dim: 20
|
| 66 |
+
n_tokens: '20_160_20_20_0'
|
| 67 |
+
token_dims: '125_120_288_180_0'
|
| 68 |
+
transformer_encoder:
|
| 69 |
+
name: transformer_encoder
|
| 70 |
+
args:
|
| 71 |
+
dim: 720
|
| 72 |
+
depth: 6
|
| 73 |
+
n_head: 12
|
| 74 |
+
head_dim: 64
|
| 75 |
+
ff_dim: 2800
|
| 76 |
+
optimizer:
|
| 77 |
+
name: adam
|
| 78 |
+
args:
|
| 79 |
+
lr: 0.0001
|
| 80 |
+
lr_type: step
|
| 81 |
+
max_epoch: 150
|
| 82 |
+
eval_epoch: 150
|
| 83 |
+
vis_epoch: 2000
|
| 84 |
+
dump_ckt: 'no'
|
| 85 |
+
dump_pred: 'no'
|
| 86 |
+
dump_video: 'no'
|
| 87 |
+
generate_from_single_frame: false
|
| 88 |
+
env:
|
| 89 |
+
exp_name: nervenc
|
| 90 |
+
save_dir: checkpoints/nervenc/pre_finetune/pre_finetune_480p_baseline_small
|
| 91 |
+
instance_tag: pre_finetune_480p_baseline_small
|
| 92 |
+
tot_gpus: 1
|
| 93 |
+
cudnn: false
|
| 94 |
+
port: '29531'
|
| 95 |
+
wandb_upload: false
|
| 96 |
+
wandb_exp_name: null
|
| 97 |
+
wandb_run_id: none
|
| 98 |
+
distributed: false
|
| 99 |
+
rank: 0
|
| 100 |
+
world_size: 1
|
| 101 |
+
gpu: null
|
nervenc/pre_finetune/pre_finetune_480p_baseline_small/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff513e0f0a4537f91990e11e726ad0c0e2b83a9d280e97e8968999621d889478
|
| 3 |
+
size 507680002
|
nervenc/pre_finetune/pre_finetune_720p_baseline/cfg.yaml
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_clip_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: '1_2_5'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_720p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 720
|
| 13 |
+
- 1280
|
| 14 |
+
scale: 1
|
| 15 |
+
aspect_ratio: 1
|
| 16 |
+
rand_flip: 'no'
|
| 17 |
+
clips_per_video: 1
|
| 18 |
+
loader:
|
| 19 |
+
batch_size: 4
|
| 20 |
+
num_workers: 16
|
| 21 |
+
test_dataset:
|
| 22 |
+
name: vidrec_dataset_clip_inference_lazy_uvg
|
| 23 |
+
args:
|
| 24 |
+
root_path: data/dataset_meta
|
| 25 |
+
frame_num: 8
|
| 26 |
+
cls_vid_num: -1_-1
|
| 27 |
+
crop_size:
|
| 28 |
+
- 720
|
| 29 |
+
- 1280
|
| 30 |
+
csv_paths:
|
| 31 |
+
uvg: uvg_hd.csv
|
| 32 |
+
frames:
|
| 33 |
+
input: none
|
| 34 |
+
output: none
|
| 35 |
+
loader:
|
| 36 |
+
batch_size: 4
|
| 37 |
+
num_workers: 16
|
| 38 |
+
model:
|
| 39 |
+
name: nerv_enc
|
| 40 |
+
args:
|
| 41 |
+
tokenizer:
|
| 42 |
+
name: vidrec_tokenizer
|
| 43 |
+
args:
|
| 44 |
+
input_size:
|
| 45 |
+
- 720
|
| 46 |
+
- 1280
|
| 47 |
+
patch_size: 32
|
| 48 |
+
padding: 0
|
| 49 |
+
frame_num: 8
|
| 50 |
+
eval_frames: none
|
| 51 |
+
img_groups: 1
|
| 52 |
+
hyponet:
|
| 53 |
+
name: hypo_convnets_full_res
|
| 54 |
+
args:
|
| 55 |
+
in_dim: 1
|
| 56 |
+
out_dim: 3
|
| 57 |
+
out_bias: tanh
|
| 58 |
+
strds_h: '5_4_4_3_3'
|
| 59 |
+
strds_w: '5_4_4_4_4'
|
| 60 |
+
ks: '1_3'
|
| 61 |
+
hid_dim: 56
|
| 62 |
+
size: none
|
| 63 |
+
act: gelu
|
| 64 |
+
use_pe: true
|
| 65 |
+
pe_dim: 56
|
| 66 |
+
n_tokens: '56_448_112_112_0'
|
| 67 |
+
token_dims: '350_504_224_168_0'
|
| 68 |
+
transformer_encoder:
|
| 69 |
+
name: transformer_encoder
|
| 70 |
+
args:
|
| 71 |
+
dim: 720
|
| 72 |
+
depth: 6
|
| 73 |
+
n_head: 12
|
| 74 |
+
head_dim: 64
|
| 75 |
+
ff_dim: 2800
|
| 76 |
+
optimizer:
|
| 77 |
+
name: adam
|
| 78 |
+
args:
|
| 79 |
+
lr: 0.0001
|
| 80 |
+
lr_type: step
|
| 81 |
+
max_epoch: 150
|
| 82 |
+
eval_epoch: 200
|
| 83 |
+
vis_epoch: 2000
|
| 84 |
+
dump_ckt: 'no'
|
| 85 |
+
dump_pred: 'no'
|
| 86 |
+
dump_video: 'no'
|
| 87 |
+
generate_from_single_frame: false
|
| 88 |
+
env:
|
| 89 |
+
exp_name: nervenc
|
| 90 |
+
save_dir: checkpoints/nervenc/pre_finetune/pre_finetune_720p_baseline
|
| 91 |
+
instance_tag: pre_finetune_720p_baseline
|
| 92 |
+
tot_gpus: 1
|
| 93 |
+
cudnn: false
|
| 94 |
+
port: '29600'
|
| 95 |
+
wandb_upload: false
|
| 96 |
+
wandb_exp_name: null
|
| 97 |
+
wandb_run_id: none
|
| 98 |
+
distributed: false
|
| 99 |
+
rank: 0
|
| 100 |
+
world_size: 1
|
| 101 |
+
gpu: null
|
nervenc/pre_finetune/pre_finetune_720p_baseline/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10119e6ad5c1e0b27f2ae99a383913b991a5f15f0422cbb297776e9fd467cc69
|
| 3 |
+
size 570814466
|
patch_tubelet/320x160_finetuned_patch/cfg.yaml
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: 'no'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 160
|
| 16 |
+
- 320
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 480
|
| 33 |
+
- 640
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 160
|
| 36 |
+
- 320
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 160
|
| 53 |
+
- 320
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '5_4_4_2'
|
| 66 |
+
strds_w: '5_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 14
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 14
|
| 73 |
+
n_tokens: '5_56_4_0'
|
| 74 |
+
token_dims: '196_252_196_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 50
|
| 90 |
+
eval_epoch: 50
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
finetune_model: checkpoints/patch_tubelet/pre_finetune/pre_finetune_320x160_patch/epoch-last.pth
|
| 97 |
+
finetune_same_model: true
|
| 98 |
+
env:
|
| 99 |
+
exp_name: patch_tubelet
|
| 100 |
+
save_dir: checkpoints/patch_tubelet/320x160_finetuned_patch
|
| 101 |
+
instance_tag: 320x160_finetuned_patch
|
| 102 |
+
tot_gpus: 4
|
| 103 |
+
cudnn: false
|
| 104 |
+
port: '9503'
|
| 105 |
+
wandb_upload: false
|
| 106 |
+
rank: 0
|
| 107 |
+
world_size: 4
|
| 108 |
+
gpu: 0
|
| 109 |
+
distributed: true
|
| 110 |
+
dist_backend: nccl
|
| 111 |
+
wandb_exp_name: null
|
| 112 |
+
wandb_run_id: none
|
patch_tubelet/320x160_finetuned_patch/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:885d3aae37c82efc6547a924a225202fb6c641461afb3e9acb75a21633834153
|
| 3 |
+
size 495460270
|
patch_tubelet/320x160_finetuned_patch_small/cfg.yaml
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: 'no'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 160
|
| 16 |
+
- 320
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 480
|
| 33 |
+
- 640
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 160
|
| 36 |
+
- 320
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 160
|
| 53 |
+
- 320
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '5_4_4_2'
|
| 66 |
+
strds_w: '5_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 14
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 14
|
| 73 |
+
n_tokens: '5_16_4_0'
|
| 74 |
+
token_dims: '140_252_98_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 50
|
| 90 |
+
eval_epoch: 50
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
finetune_model: checkpoints/patch_tubelet/pre_finetune/pre_finetune_320x160_patch_small/epoch-last.pth
|
| 97 |
+
finetune_same_model: true
|
| 98 |
+
env:
|
| 99 |
+
exp_name: patch_tubelet
|
| 100 |
+
save_dir: checkpoints/patch_tubelet/320x160_finetuned_patch_small
|
| 101 |
+
instance_tag: 320x160_finetuned_patch_small
|
| 102 |
+
tot_gpus: 4
|
| 103 |
+
cudnn: false
|
| 104 |
+
port: '15419'
|
| 105 |
+
wandb_upload: false
|
| 106 |
+
rank: 0
|
| 107 |
+
world_size: 4
|
| 108 |
+
gpu: 0
|
| 109 |
+
distributed: true
|
| 110 |
+
dist_backend: nccl
|
| 111 |
+
wandb_exp_name: null
|
| 112 |
+
wandb_run_id: none
|
patch_tubelet/320x160_finetuned_patch_small/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:40204a65e856def367f0d2202d96fdb2f979092d36f4ada67301e161b12d4bc3
|
| 3 |
+
size 493782254
|
patch_tubelet/320x240_finetuned_patch_train_720p/cfg.yaml
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: 'no'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_720p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 720
|
| 13 |
+
- 1280
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 240
|
| 16 |
+
- 320
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 720
|
| 33 |
+
- 1280
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 240
|
| 36 |
+
- 320
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 240
|
| 53 |
+
- 320
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '5_4_4_3'
|
| 66 |
+
strds_w: '5_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 20
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 16
|
| 73 |
+
n_tokens: '10_80_16_0'
|
| 74 |
+
token_dims: '200_240_240_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 50
|
| 90 |
+
eval_epoch: 50
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
finetune_model: checkpoints/patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_720p/epoch-last.pth
|
| 97 |
+
finetune_same_model: true
|
| 98 |
+
env:
|
| 99 |
+
exp_name: patch_tubelet
|
| 100 |
+
save_dir: checkpoints/patch_tubelet/320x240_finetuned_patch_train_720p
|
| 101 |
+
instance_tag: 320x240_finetuned_patch_train_720p
|
| 102 |
+
tot_gpus: 8
|
| 103 |
+
cudnn: false
|
| 104 |
+
port: '15419'
|
| 105 |
+
wandb_upload: false
|
| 106 |
+
rank: 0
|
| 107 |
+
world_size: 8
|
| 108 |
+
gpu: 0
|
| 109 |
+
distributed: true
|
| 110 |
+
dist_backend: nccl
|
| 111 |
+
wandb_exp_name: null
|
| 112 |
+
wandb_run_id: none
|
patch_tubelet/320x240_finetuned_patch_train_720p/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ed7817598226c11e07ea5254e3761aa09c2cfd38594f84e3fb7eae6856767fe
|
| 3 |
+
size 498296558
|
patch_tubelet/pre_finetune/pre_finetune_320x160_patch/cfg.yaml
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: 'no'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 160
|
| 16 |
+
- 320
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 480
|
| 33 |
+
- 640
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 160
|
| 36 |
+
- 320
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 160
|
| 53 |
+
- 320
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '5_4_4_2'
|
| 66 |
+
strds_w: '5_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 14
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 14
|
| 73 |
+
n_tokens: '5_56_4_0'
|
| 74 |
+
token_dims: '196_252_196_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 50
|
| 90 |
+
eval_epoch: 50
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
env:
|
| 97 |
+
exp_name: patch_tubelet
|
| 98 |
+
save_dir: checkpoints/patch_tubelet/pre_finetune/pre_finetune_320x160_patch
|
| 99 |
+
instance_tag: pre_finetune_320x160_patch
|
| 100 |
+
tot_gpus: 4
|
| 101 |
+
cudnn: false
|
| 102 |
+
port: '9503'
|
| 103 |
+
wandb_upload: false
|
| 104 |
+
rank: 0
|
| 105 |
+
world_size: 4
|
| 106 |
+
gpu: 0
|
| 107 |
+
distributed: true
|
| 108 |
+
dist_backend: nccl
|
| 109 |
+
wandb_exp_name: null
|
| 110 |
+
wandb_run_id: none
|
patch_tubelet/pre_finetune/pre_finetune_320x160_patch/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bdf8df38940c13fa6a33943cfdd367e9e115c4dea622ddfdb77f4acb3baf6ef0
|
| 3 |
+
size 495460206
|
patch_tubelet/pre_finetune/pre_finetune_320x160_patch_small/cfg.yaml
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: 'no'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 160
|
| 16 |
+
- 320
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 480
|
| 33 |
+
- 640
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 160
|
| 36 |
+
- 320
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 160
|
| 53 |
+
- 320
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '5_4_4_2'
|
| 66 |
+
strds_w: '5_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 14
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 14
|
| 73 |
+
n_tokens: '5_16_4_0'
|
| 74 |
+
token_dims: '140_252_98_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 150
|
| 90 |
+
eval_epoch: 50
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
env:
|
| 97 |
+
exp_name: patch_tubelet
|
| 98 |
+
save_dir: checkpoints/patch_tubelet/pre_finetune/pre_finetune_320x160_patch_small
|
| 99 |
+
instance_tag: pre_finetune_320x160_patch_small
|
| 100 |
+
tot_gpus: 4
|
| 101 |
+
cudnn: false
|
| 102 |
+
port: '15419'
|
| 103 |
+
wandb_upload: false
|
| 104 |
+
rank: 0
|
| 105 |
+
world_size: 4
|
| 106 |
+
gpu: 0
|
| 107 |
+
distributed: true
|
| 108 |
+
dist_backend: nccl
|
| 109 |
+
wandb_exp_name: null
|
| 110 |
+
wandb_run_id: none
|
patch_tubelet/pre_finetune/pre_finetune_320x160_patch_small/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4541bd2866e16cf7b968ff9fca2594bc225f8f6fb822970c4a021060bf2fdc80
|
| 3 |
+
size 493782126
|
patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_480p/cfg.yaml
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: 'no'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 240
|
| 16 |
+
- 320
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 480
|
| 33 |
+
- 640
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 240
|
| 36 |
+
- 320
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 240
|
| 53 |
+
- 320
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '5_4_4_3'
|
| 66 |
+
strds_w: '5_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 20
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 16
|
| 73 |
+
n_tokens: '10_80_16_0'
|
| 74 |
+
token_dims: '200_240_240_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 150
|
| 90 |
+
eval_epoch: 150
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
env:
|
| 97 |
+
exp_name: patch_tubelet
|
| 98 |
+
save_dir: checkpoints/patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_480p
|
| 99 |
+
instance_tag: pre_finetune_320x240_patch_train_480p
|
| 100 |
+
tot_gpus: 4
|
| 101 |
+
cudnn: false
|
| 102 |
+
port: '29827'
|
| 103 |
+
wandb_upload: false
|
| 104 |
+
rank: 0
|
| 105 |
+
world_size: 4
|
| 106 |
+
gpu: 0
|
| 107 |
+
distributed: true
|
| 108 |
+
dist_backend: nccl
|
| 109 |
+
wandb_exp_name: null
|
| 110 |
+
wandb_run_id: none
|
patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_480p/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c00485b39f96c5bdb62bfda1ff5e1bd7c7f4854d0bbc1cb470a0f6deea717ad4
|
| 3 |
+
size 498296430
|
patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_720p/cfg.yaml
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: 'no'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_720p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 720
|
| 13 |
+
- 1280
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 240
|
| 16 |
+
- 320
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 720
|
| 33 |
+
- 1280
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 240
|
| 36 |
+
- 320
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg_720: uvg_hd_720p.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 240
|
| 53 |
+
- 320
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '5_4_4_3'
|
| 66 |
+
strds_w: '5_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 20
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 16
|
| 73 |
+
n_tokens: '10_80_16_0'
|
| 74 |
+
token_dims: '200_240_240_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 150
|
| 90 |
+
eval_epoch: 50
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
env:
|
| 97 |
+
exp_name: patch_tubelet
|
| 98 |
+
save_dir: checkpoints/patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_720p
|
| 99 |
+
instance_tag: pre_finetune_320x240_patch_train_720p
|
| 100 |
+
tot_gpus: 4
|
| 101 |
+
cudnn: false
|
| 102 |
+
port: '15419'
|
| 103 |
+
wandb_upload: false
|
| 104 |
+
rank: 0
|
| 105 |
+
world_size: 4
|
| 106 |
+
gpu: 0
|
| 107 |
+
distributed: true
|
| 108 |
+
dist_backend: nccl
|
| 109 |
+
wandb_exp_name: null
|
| 110 |
+
wandb_run_id: none
|
patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_720p/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a020901b01b660432d31d7709618a93e93091653855f25f0728c7a5089ae135
|
| 3 |
+
size 498296430
|
patch_tubelet/pre_finetune/pre_finetune_384x270_patch_train_480p/cfg.yaml
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: 'no'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 270
|
| 16 |
+
- 384
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 480
|
| 33 |
+
- 640
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 270
|
| 36 |
+
- 384
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 270
|
| 53 |
+
- 384
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '6_5_3_3'
|
| 66 |
+
strds_w: '6_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 20
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 20
|
| 73 |
+
n_tokens: '16_100_16_0'
|
| 74 |
+
token_dims: '180_240_180_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 150
|
| 90 |
+
eval_epoch: 150
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
env:
|
| 97 |
+
exp_name: patch_tubelet
|
| 98 |
+
save_dir: checkpoints/patch_tubelet/pre_finetune/pre_finetune_384x270_patch_train_480p
|
| 99 |
+
instance_tag: pre_finetune_384x270_patch_train_480p
|
| 100 |
+
tot_gpus: 4
|
| 101 |
+
cudnn: false
|
| 102 |
+
port: '15419'
|
| 103 |
+
wandb_upload: false
|
| 104 |
+
rank: 0
|
| 105 |
+
world_size: 4
|
| 106 |
+
gpu: 0
|
| 107 |
+
distributed: true
|
| 108 |
+
dist_backend: nccl
|
| 109 |
+
wandb_exp_name: null
|
| 110 |
+
wandb_run_id: none
|
patch_tubelet/pre_finetune/pre_finetune_384x270_patch_train_480p/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2301668741892201c35f9710155bac0f13437d26b39096902f17ab39a9f58d68
|
| 3 |
+
size 499705518
|
patch_tubelet/pre_finetune/pre_finetune_384x270_patch_train_720p/cfg.yaml
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: 'no'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_720p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 720
|
| 13 |
+
- 1280
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 270
|
| 16 |
+
- 384
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 720
|
| 33 |
+
- 1280
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 270
|
| 36 |
+
- 384
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 270
|
| 53 |
+
- 384
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '6_5_3_3'
|
| 66 |
+
strds_w: '6_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 20
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 20
|
| 73 |
+
n_tokens: '16_100_16_0'
|
| 74 |
+
token_dims: '180_240_180_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 150
|
| 90 |
+
eval_epoch: 150
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
env:
|
| 97 |
+
exp_name: patch_tubelet
|
| 98 |
+
save_dir: checkpoints/patch_tubelet/pre_finetune/pre_finetune_384x270_patch_train_720p
|
| 99 |
+
instance_tag: pre_finetune_384x270_patch_train_720p
|
| 100 |
+
tot_gpus: 4
|
| 101 |
+
cudnn: false
|
| 102 |
+
port: '34306'
|
| 103 |
+
wandb_upload: false
|
| 104 |
+
rank: 0
|
| 105 |
+
world_size: 4
|
| 106 |
+
gpu: 0
|
| 107 |
+
distributed: true
|
| 108 |
+
dist_backend: nccl
|
| 109 |
+
wandb_exp_name: null
|
| 110 |
+
wandb_run_id: none
|
patch_tubelet/pre_finetune/pre_finetune_384x270_patch_train_720p/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c2afa676bd450479b9ffc32a29af06b6ba499255dc98d3a996a1b2899ec0362
|
| 3 |
+
size 499705518
|
teconerv/320x160_pairs_teco/cfg.yaml
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res_pairs
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy_pairs
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: '1_2_5'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 160
|
| 16 |
+
- 320
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_pairs_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 480
|
| 33 |
+
- 640
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 160
|
| 36 |
+
- 320
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res_pairs
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 160
|
| 53 |
+
- 320
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '5_4_4_2'
|
| 66 |
+
strds_w: '5_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 14
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 14
|
| 73 |
+
n_tokens: '5_56_4_0'
|
| 74 |
+
token_dims: '196_252_196_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 50
|
| 90 |
+
eval_epoch: 50
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
finetune_model: checkpoints/patch_tubelet/pre_finetune/pre_finetune_320x160_patch/epoch-last.pth
|
| 97 |
+
finetune_same_model: false
|
| 98 |
+
param_reg_mode: mod
|
| 99 |
+
param_reg_lambda_l1: 0.1
|
| 100 |
+
param_reg_lambda_l2: 0.0
|
| 101 |
+
env:
|
| 102 |
+
exp_name: teconerv
|
| 103 |
+
save_dir: checkpoints/teconerv/320x160_pairs_teco
|
| 104 |
+
instance_tag: 320x160_pairs_teco
|
| 105 |
+
tot_gpus: 4
|
| 106 |
+
cudnn: false
|
| 107 |
+
port: '15419'
|
| 108 |
+
wandb_upload: false
|
| 109 |
+
rank: 0
|
| 110 |
+
world_size: 4
|
| 111 |
+
gpu: 0
|
| 112 |
+
distributed: true
|
| 113 |
+
dist_backend: nccl
|
| 114 |
+
wandb_exp_name: null
|
| 115 |
+
wandb_run_id: none
|
teconerv/320x160_pairs_teco/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d2ed76b258306ec03a2c14ed37d2fd223debdf84ff4589a5e3876ddef8436ff6
|
| 3 |
+
size 495460398
|
teconerv/320x160_pairs_teco_small/cfg.yaml
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res_pairs
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy_pairs
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: '1_2_5'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 160
|
| 16 |
+
- 320
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_pairs_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 480
|
| 33 |
+
- 640
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 160
|
| 36 |
+
- 320
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res_pairs
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 160
|
| 53 |
+
- 320
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '5_4_4_2'
|
| 66 |
+
strds_w: '5_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 14
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 14
|
| 73 |
+
n_tokens: '5_16_4_0'
|
| 74 |
+
token_dims: '140_252_98_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 50
|
| 90 |
+
eval_epoch: 50
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
finetune_model: checkpoints/patch_tubelet/pre_finetune/pre_finetune_320x160_patch_small/epoch-last.pth
|
| 97 |
+
finetune_same_model: false
|
| 98 |
+
param_reg_mode: mod
|
| 99 |
+
param_reg_lambda_l1: 0.1
|
| 100 |
+
param_reg_lambda_l2: 0.0
|
| 101 |
+
env:
|
| 102 |
+
exp_name: teconerv
|
| 103 |
+
save_dir: checkpoints/teconerv/320x160_pairs_teco_small
|
| 104 |
+
instance_tag: 320x160_pairs_teco_small
|
| 105 |
+
tot_gpus: 4
|
| 106 |
+
cudnn: false
|
| 107 |
+
port: '15419'
|
| 108 |
+
wandb_upload: false
|
| 109 |
+
rank: 0
|
| 110 |
+
world_size: 4
|
| 111 |
+
gpu: 0
|
| 112 |
+
distributed: true
|
| 113 |
+
dist_backend: nccl
|
| 114 |
+
wandb_exp_name: null
|
| 115 |
+
wandb_run_id: none
|
teconerv/320x160_pairs_teco_small/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64b09a04bdbe61ed92046b528cb73d6a72817c8a945c7514d59d29e094ed4627
|
| 3 |
+
size 493782382
|
teconerv/320x240_pairs_teco/cfg.yaml
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res_pairs
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy_pairs
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: '1_2_5'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 240
|
| 16 |
+
- 320
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_pairs_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 480
|
| 33 |
+
- 640
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 240
|
| 36 |
+
- 320
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res_pairs
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 240
|
| 53 |
+
- 320
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '5_4_4_3'
|
| 66 |
+
strds_w: '5_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 20
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 16
|
| 73 |
+
n_tokens: '10_80_16_0'
|
| 74 |
+
token_dims: '200_240_240_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 50
|
| 90 |
+
eval_epoch: 50
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
finetune_model: checkpoints/patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_480p/epoch-last.pth
|
| 97 |
+
finetune_same_model: false
|
| 98 |
+
param_reg_mode: mod
|
| 99 |
+
param_reg_lambda_l1: 0.1
|
| 100 |
+
param_reg_lambda_l2: 0.0
|
| 101 |
+
env:
|
| 102 |
+
exp_name: teconerv
|
| 103 |
+
save_dir: checkpoints/teconerv/320x240_pairs_teco
|
| 104 |
+
instance_tag: 320x240_pairs_teco
|
| 105 |
+
tot_gpus: 4
|
| 106 |
+
cudnn: false
|
| 107 |
+
port: '15419'
|
| 108 |
+
wandb_upload: false
|
| 109 |
+
rank: 0
|
| 110 |
+
world_size: 4
|
| 111 |
+
gpu: 0
|
| 112 |
+
distributed: true
|
| 113 |
+
dist_backend: nccl
|
| 114 |
+
wandb_exp_name: null
|
| 115 |
+
wandb_run_id: none
|
teconerv/320x240_pairs_teco/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0508493a47424adb8c317a2cef6adca4294063c6f8c79f487f616e1ff44739fa
|
| 3 |
+
size 498296686
|
teconerv/320x240_pairs_teco_train_720p/cfg.yaml
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res_pairs
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy_pairs
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: '1_2_5'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_720p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 720
|
| 13 |
+
- 1280
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 240
|
| 16 |
+
- 320
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_pairs_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 720
|
| 33 |
+
- 1280
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 240
|
| 36 |
+
- 320
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res_pairs
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 240
|
| 53 |
+
- 320
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '5_4_4_3'
|
| 66 |
+
strds_w: '5_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 20
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 16
|
| 73 |
+
n_tokens: '10_80_16_0'
|
| 74 |
+
token_dims: '200_240_240_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 50
|
| 90 |
+
eval_epoch: 50
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
finetune_model: checkpoints/patch_tubelet/pre_finetune/pre_finetune_320x240_patch_train_720p/epoch-last.pth
|
| 97 |
+
finetune_same_model: false
|
| 98 |
+
param_reg_mode: mod
|
| 99 |
+
param_reg_lambda_l1: 0.1
|
| 100 |
+
param_reg_lambda_l2: 0.0
|
| 101 |
+
env:
|
| 102 |
+
exp_name: teconerv
|
| 103 |
+
save_dir: checkpoints/teconerv/320x240_pairs_teco_train_720p
|
| 104 |
+
instance_tag: 320x240_pairs_teco_train_720p
|
| 105 |
+
tot_gpus: 4
|
| 106 |
+
cudnn: false
|
| 107 |
+
port: '15419'
|
| 108 |
+
wandb_upload: false
|
| 109 |
+
rank: 0
|
| 110 |
+
world_size: 4
|
| 111 |
+
gpu: 0
|
| 112 |
+
distributed: true
|
| 113 |
+
dist_backend: nccl
|
| 114 |
+
wandb_exp_name: null
|
| 115 |
+
wandb_run_id: none
|
teconerv/320x240_pairs_teco_train_720p/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3047d69f34fdaafeb71586eaf9d5c23dab3d540826a5b234f9003775d4c0df00
|
| 3 |
+
size 498296686
|
teconerv/384x270_pairs_teco/cfg.yaml
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res_pairs
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy_pairs
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: '1_2_5'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_480p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 480
|
| 13 |
+
- 640
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 270
|
| 16 |
+
- 384
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_pairs_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 480
|
| 33 |
+
- 640
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 270
|
| 36 |
+
- 384
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res_pairs
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 270
|
| 53 |
+
- 384
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '6_5_3_3'
|
| 66 |
+
strds_w: '6_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 20
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 20
|
| 73 |
+
n_tokens: '16_100_16_0'
|
| 74 |
+
token_dims: '180_240_180_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 50
|
| 90 |
+
eval_epoch: 50
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
finetune_model: checkpoints/patch_tubelet/pre_finetune/pre_finetune_384x270_patch_train_480p/epoch-last.pth
|
| 97 |
+
finetune_same_model: false
|
| 98 |
+
param_reg_mode: mod
|
| 99 |
+
param_reg_lambda_l1: 0.1
|
| 100 |
+
param_reg_lambda_l2: 0.0
|
| 101 |
+
env:
|
| 102 |
+
exp_name: teconerv
|
| 103 |
+
save_dir: checkpoints/teconerv/384x270_pairs_teco
|
| 104 |
+
instance_tag: 384x270_pairs_teco
|
| 105 |
+
tot_gpus: 4
|
| 106 |
+
cudnn: false
|
| 107 |
+
port: '15419'
|
| 108 |
+
wandb_upload: false
|
| 109 |
+
rank: 0
|
| 110 |
+
world_size: 4
|
| 111 |
+
gpu: 0
|
| 112 |
+
distributed: true
|
| 113 |
+
dist_backend: nccl
|
| 114 |
+
wandb_exp_name: null
|
| 115 |
+
wandb_run_id: none
|
teconerv/384x270_pairs_teco/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bbdb918f792590202bb56a40b89e684d9cd9a7e04eac73705a91562b5d33d7cb
|
| 3 |
+
size 499705774
|
teconerv/384x270_pairs_teco_train_720p/cfg.yaml
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
trainer: nerv_enc_trainer_full_res_pairs
|
| 2 |
+
train_dataset:
|
| 3 |
+
name: vidrec_dataset_patch_tubelet_sampler_lazy_pairs
|
| 4 |
+
args:
|
| 5 |
+
root_path: data/dataset_meta
|
| 6 |
+
split: train
|
| 7 |
+
frame_num: 8
|
| 8 |
+
rand_augment: '1_2_5'
|
| 9 |
+
csv_file: k400_2023_train_cls400_50_720p.js
|
| 10 |
+
cls_vid_num: '400_25'
|
| 11 |
+
crop_size:
|
| 12 |
+
- 720
|
| 13 |
+
- 1280
|
| 14 |
+
tubelet_size:
|
| 15 |
+
- 270
|
| 16 |
+
- 384
|
| 17 |
+
scale: 1
|
| 18 |
+
aspect_ratio: 1
|
| 19 |
+
rand_flip: 'no'
|
| 20 |
+
clips_per_video: 1
|
| 21 |
+
tubelets_per_clip: 1
|
| 22 |
+
loader:
|
| 23 |
+
batch_size: 32
|
| 24 |
+
num_workers: 16
|
| 25 |
+
test_dataset:
|
| 26 |
+
name: vidrec_dataset_patch_tubelet_inference_lazy_pairs_uvg
|
| 27 |
+
args:
|
| 28 |
+
root_path: data/dataset_meta
|
| 29 |
+
frame_num: 8
|
| 30 |
+
cls_vid_num: -1_-1
|
| 31 |
+
crop_size:
|
| 32 |
+
- 720
|
| 33 |
+
- 1280
|
| 34 |
+
tubelet_size:
|
| 35 |
+
- 270
|
| 36 |
+
- 384
|
| 37 |
+
csv_paths:
|
| 38 |
+
uvg: uvg_hd.csv
|
| 39 |
+
frames:
|
| 40 |
+
input: none
|
| 41 |
+
output: none
|
| 42 |
+
loader:
|
| 43 |
+
batch_size: 32
|
| 44 |
+
num_workers: 16
|
| 45 |
+
model:
|
| 46 |
+
name: nerv_enc_full_res_pairs
|
| 47 |
+
args:
|
| 48 |
+
tokenizer:
|
| 49 |
+
name: vidrec_tokenizer
|
| 50 |
+
args:
|
| 51 |
+
input_size:
|
| 52 |
+
- 270
|
| 53 |
+
- 384
|
| 54 |
+
patch_size: 32
|
| 55 |
+
padding: 0
|
| 56 |
+
frame_num: 8
|
| 57 |
+
eval_frames: none
|
| 58 |
+
img_groups: 1
|
| 59 |
+
hyponet:
|
| 60 |
+
name: hypo_convnets_full_res
|
| 61 |
+
args:
|
| 62 |
+
in_dim: 1
|
| 63 |
+
out_dim: 3
|
| 64 |
+
out_bias: tanh
|
| 65 |
+
strds_h: '6_5_3_3'
|
| 66 |
+
strds_w: '6_4_4_4'
|
| 67 |
+
ks: '1_3'
|
| 68 |
+
hid_dim: 20
|
| 69 |
+
size: none
|
| 70 |
+
act: gelu
|
| 71 |
+
use_pe: true
|
| 72 |
+
pe_dim: 20
|
| 73 |
+
n_tokens: '16_100_16_0'
|
| 74 |
+
token_dims: '180_240_180_0'
|
| 75 |
+
transformer_encoder:
|
| 76 |
+
name: transformer_encoder
|
| 77 |
+
args:
|
| 78 |
+
dim: 720
|
| 79 |
+
depth: 6
|
| 80 |
+
n_head: 12
|
| 81 |
+
head_dim: 64
|
| 82 |
+
ff_dim: 2880
|
| 83 |
+
optimizer:
|
| 84 |
+
name: adam
|
| 85 |
+
args:
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
weight_decay: 0.0
|
| 88 |
+
lr_type: step
|
| 89 |
+
max_epoch: 50
|
| 90 |
+
eval_epoch: 50
|
| 91 |
+
vis_epoch: 2000
|
| 92 |
+
dump_ckt: 'no'
|
| 93 |
+
dump_pred: 'no'
|
| 94 |
+
dump_video: 'no'
|
| 95 |
+
generate_from_single_frame: false
|
| 96 |
+
finetune_model: checkpoints/patch_tubelet/pre_finetune/pre_finetune_384x270_patch_train_720p/epoch-last.pth
|
| 97 |
+
finetune_same_model: false
|
| 98 |
+
param_reg_mode: mod
|
| 99 |
+
param_reg_lambda_l1: 0.1
|
| 100 |
+
param_reg_lambda_l2: 0.0
|
| 101 |
+
env:
|
| 102 |
+
exp_name: teconerv
|
| 103 |
+
save_dir: checkpoints/teconerv/384x270_pairs_teco_train_720p
|
| 104 |
+
instance_tag: 384x270_pairs_teco_train_720p
|
| 105 |
+
tot_gpus: 4
|
| 106 |
+
cudnn: false
|
| 107 |
+
port: '15419'
|
| 108 |
+
wandb_upload: false
|
| 109 |
+
rank: 0
|
| 110 |
+
world_size: 4
|
| 111 |
+
gpu: 0
|
| 112 |
+
distributed: true
|
| 113 |
+
dist_backend: nccl
|
| 114 |
+
wandb_exp_name: null
|
| 115 |
+
wandb_run_id: none
|
teconerv/384x270_pairs_teco_train_720p/epoch-last.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ac262b0a9e57ddf044c31693944a9d679d1aa79e28b850f2c8335f0f5abcb671
|
| 3 |
+
size 499705774
|