camenduru committed on
Commit
8540b52
·
1 Parent(s): 9bd91a4

thanks to brjathu ❤

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. lart_mvit.ckpt +3 -0
  3. lart_mvit.config +206 -0
  4. mvit.pyth +3 -0
  5. mvit.yaml +142 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ mvit.pyth filter=lfs diff=lfs merge=lfs -text
lart_mvit.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9985bc712a1ca1341ae1c4cfe26c0124be0222f08e1ca9b08afcc764be6a84d
3
+ size 582097545
lart_mvit.config ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ trainer:
2
+ _target_: lightning.pytorch.Trainer
3
+ default_root_dir: ${paths.output_dir}
4
+ min_epochs: 1
5
+ max_epochs: 30
6
+ accelerator: gpu
7
+ devices: 8
8
+ num_nodes: 8
9
+ check_val_every_n_epoch: 1
10
+ deterministic: false
11
+ benchmark: true
12
+ accumulate_grad_batches: 1
13
+ gradient_clip_val: 2.0
14
+ precision: 32
15
+ num_sanity_val_steps: 0
16
+ limit_train_batches: 1.0
17
+ limit_val_batches: 1.0
18
+ sync_batchnorm: true
19
+ strategy: ddp_find_unused_parameters_true
20
+ callbacks:
21
+ model_checkpoint:
22
+ _target_: lart.utils.ema_checkpoint.EMACheckpoint
23
+ dirpath: ${paths.output_dir}/checkpoints
24
+ filename: epoch_{epoch:03d}
25
+ monitor: step
26
+ mode: max
27
+ save_last: true
28
+ auto_insert_metric_name: false
29
+ verbose: false
30
+ save_top_k: -1
31
+ save_weights_only: false
32
+ every_n_train_steps: null
33
+ train_time_interval: null
34
+ every_n_epochs: 1
35
+ save_on_train_epoch_end: true
36
+ model_summary:
37
+ _target_: lightning.pytorch.callbacks.RichModelSummary
38
+ max_depth: 1
39
+ rich_progress_bar:
40
+ _target_: lightning.pytorch.callbacks.RichProgressBar
41
+ refresh_rate: 1
42
+ learning_rate_monitor:
43
+ _target_: lightning.pytorch.callbacks.LearningRateMonitor
44
+ timer:
45
+ _target_: lightning.pytorch.callbacks.Timer
46
+ ema:
47
+ _target_: lart.utils.ema.EMA
48
+ decay: 0.9999
49
+ cpu_offload: false
50
+ validate_original_weights: false
51
+ every_n_steps: 1
52
+ logger:
53
+ tensorboard:
54
+ _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
55
+ save_dir: ${paths.output_dir}/tensorboard/
56
+ version: 0
57
+ paths:
58
+ root_dir: ${oc.env:PROJECT_ROOT}
59
+ data_dir: ${paths.root_dir}/data/
60
+ log_dir: ${paths.root_dir}/logs/
61
+ output_dir: ${hydra:runtime.output_dir}
62
+ work_dir: ${hydra:runtime.cwd}
63
+ extras:
64
+ print_config: true
65
+ hydra_logging: colorlog
66
+ job_logging: colorlog
67
+ task_name: LART_mvit_1
68
+ tags:
69
+ - dev
70
+ train: true
71
+ test: true
72
+ ckpt_path: null
73
+ seed: null
74
+ datamodule:
75
+ _target_: lart.datamodules.phalp_datamodule.PHALPDataModule
76
+ cfg: ${configs}
77
+ train: ${train}
78
+ model:
79
+ _target_: lart.models.lart.LART_LitModule
80
+ cfg: ${configs}
81
+ configs:
82
+ data_dir: ${paths.data_dir}
83
+ storage_folder: ${paths.log_dir}/${task_name}/${hydra:sweep.subdir}
84
+ train_dataset: ava_train,kinetics_train
85
+ test_dataset: ava_val
86
+ map_on: AVA
87
+ train_batch_size: 8
88
+ train_num_workers: 8
89
+ test_batch_size: 8
90
+ test_num_workers: 8
91
+ test_class: ''
92
+ test_batch_id: -1
93
+ number_of_processes: 25
94
+ pin_memory: true
95
+ full_seq_render: false
96
+ frame_length: 125
97
+ max_people: 1
98
+ load_other_tracks: false
99
+ img_size: 256
100
+ load_images: false
101
+ use_mean_std: true
102
+ use_mean_std_mid: false
103
+ frame_rate_range: 1
104
+ num_smpl_heads: 1
105
+ finetune: false
106
+ bottle_neck: conv
107
+ pos_embedding: learned
108
+ mask_ratio: 0.4
109
+ in_feat: 512
110
+ one_euro_filter: pred_loca,pred_pose
111
+ loss_type: action_BCE
112
+ mask_type: random
113
+ mask_type_test: zero
114
+ test_type: track.fullframe@
115
+ encode_type: 4c
116
+ masked: false
117
+ weights_path: null
118
+ loss_on_others_action: true
119
+ debug: false
120
+ load_strict: true
121
+ mixed_training: 0
122
+ compute_map: true
123
+ compute_acc: true
124
+ log_frequency: 100
125
+ hmr_model: hmr2018
126
+ loca_l1_weight: 1
127
+ action_space: ava
128
+ solver:
129
+ name: AdamW
130
+ lr: 0.00012
131
+ momentum: 0.9
132
+ decay_steps:
133
+ - 10
134
+ - 20
135
+ decay_gamma: 0.1
136
+ layer_decay: null
137
+ ZERO_WD_1D_PARAM: true
138
+ warmup_epochs: 5
139
+ weight_decay: 0.05
140
+ scheduler: cosine
141
+ apply_linear_scaling: true
142
+ ava:
143
+ sampling_factor: 1
144
+ num_action_classes: 80
145
+ num_valid_action_classes: 60
146
+ gt_type: all
147
+ head_dropout: 0.0
148
+ predict_valid: true
149
+ map_on: AVA
150
+ kinetics:
151
+ sampling_factor: 1
152
+ num_action_classes: 400
153
+ loss:
154
+ focal:
155
+ gamma: 2
156
+ alpha: 0.25
157
+ extra_feat:
158
+ enable: joints_3D,apperance
159
+ pose_shape:
160
+ dim: 229
161
+ mid_dim: 256
162
+ en_dim: 128
163
+ joints_3D:
164
+ dim: 135
165
+ mid_dim: 256
166
+ en_dim: 128
167
+ apperance:
168
+ dim: 1152
169
+ mid_dim: 512
170
+ en_dim: 256
171
+ transformer:
172
+ model: legacy
173
+ depth: 16
174
+ heads: 16
175
+ mlp_dim: 512
176
+ dim_head: 64
177
+ dropout: 0.1
178
+ emb_dropout: 0.1
179
+ droppath: 0.4
180
+ use_interaction_module: false
181
+ use_perceiver: false
182
+ use_interaction_module_action_only: false
183
+ conv:
184
+ pad: 1
185
+ stride: 5
186
+ smpl_cfg:
187
+ SMPL:
188
+ MODEL_PATH: data/3D
189
+ GENDER: neutral
190
+ MODEL_TYPE: smpl
191
+ NUM_BODY_JOINTS: 23
192
+ JOINT_REGRESSOR_H36M: data/3D/J_regressor_h36m.npy
193
+ JOINT_REGRESSOR_EXTRA: data/3D/SMPL_to_J19.pkl
194
+ TEXTURE: data/3D/texture.npz
195
+ MODEL:
196
+ IMAGE_SIZE: 256
197
+ SMPL_HEAD:
198
+ TYPE: basic
199
+ POOL: max
200
+ SMPL_MEAN_PARAMS: data/3D/smpl_mean_params.npz
201
+ IN_CHANNELS: 2048
202
+ BACKBONE:
203
+ TYPE: resnet
204
+ NUM_LAYERS: 50
205
+ EXTRA:
206
+ FOCAL_LENGTH: 5000
mvit.pyth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acff8a726a3c2ee8e8b75914f2cc17d0ec14808a428f6ce05afceb6ca00732f7
3
+ size 1739882689
mvit.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LOG_MODEL_INFO: False
2
+ TRAIN:
3
+ DATASET: ava
4
+ BATCH_SIZE: 16
5
+ BATCH_SIZE: 8
6
+
7
+ EVAL_PERIOD: 2
8
+ CHECKPOINT_PERIOD: 1
9
+ AUTO_RESUME: True
10
+ CHECKPOINT_EPOCH_RESET: True
11
+ CHECKPOINT_IN_INIT: True
12
+ CHECKPOINT_FILE_PATH: ""
13
+ CHECKPOINT_TYPE: pytorch
14
+ CHECKPOINT_EPOCH_RESET: True
15
+
16
+ DATA:
17
+ USE_OFFSET_SAMPLING: True
18
+ DECODING_BACKEND: torchvision
19
+ NUM_FRAMES: 40
20
+ SAMPLING_RATE: 3
21
+ TRAIN_JITTER_SCALES: [356, 446]
22
+ TRAIN_CROP_SIZE: 312
23
+ TEST_CROP_SIZE: 312 # use if TEST.NUM_SPATIAL_CROPS: 1
24
+ INPUT_CHANNEL_NUM: [3]
25
+ PATH_TO_DATA_DIR: ""
26
+
27
+ TRAIN_JITTER_SCALES_RELATIVE: [0.08, 1.0]
28
+ TRAIN_JITTER_ASPECT_RELATIVE: [0.75, 1.3333]
29
+ MEAN: [0.485, 0.456, 0.406]
30
+ STD: [0.229, 0.224, 0.225]
31
+ MVIT:
32
+ ZERO_DECAY_POS_CLS: False
33
+ SEP_POS_EMBED: True
34
+ DEPTH: 24
35
+ NUM_HEADS: 1
36
+ EMBED_DIM: 96
37
+ PATCH_KERNEL: (3, 7, 7)
38
+ PATCH_STRIDE: (2, 4, 4)
39
+ PATCH_PADDING: (1, 3, 3)
40
+ MLP_RATIO: 4.0
41
+ QKV_BIAS: True
42
+ DROPPATH_RATE: 0.0
43
+ NORM: "layernorm"
44
+ EMBED_DIM: 144
45
+ NUM_HEADS: 2
46
+ DEPTH: 48 # [2, 6, 36, 2]
47
+ DIM_MUL: [[2, 2.0], [8, 2.0], [44, 2.0]]
48
+ HEAD_MUL: [[2, 2.0], [8, 2.0], [44, 2.0]]
49
+ POOL_Q_STRIDE: [[2, 1, 2, 2], [8, 1, 2, 2], [44, 1, 2, 2]]
50
+ DROPPATH_RATE: 0.0
51
+
52
+ POOL_KV_STRIDE_ADAPTIVE: [1, 8, 8]
53
+ POOL_KVQ_KERNEL: [3, 3, 3]
54
+ USE_ABS_POS: False # default: True
55
+ REL_POS_SPATIAL: True # default: false
56
+ REL_POS_TEMPORAL: True # default: false
57
+ MODE: "conv_unshared"
58
+
59
+ POOL_Q_STRIDE: [[0, 1, 1, 1], [1, 1, 1, 1], [2, 1, 2, 2], [3, 1, 1, 1], [4, 1, 1, 1], [5, 1, 1, 1], [6, 1, 1, 1], [7, 1, 1, 1], [8, 1, 2, 2], [9, 1, 1, 1], [10, 1, 1, 1],
60
+ [11, 1, 1, 1], [12, 1, 1, 1], [13, 1, 1, 1], [14, 1, 1, 1], [15, 1, 1, 1], [16, 1, 1, 1], [17, 1, 1, 1], [18, 1, 1, 1], [19, 1, 1, 1], [20, 1, 1, 1],
61
+ [21, 1, 1, 1], [22, 1, 1, 1], [23, 1, 1, 1], [24, 1, 1, 1], [25, 1, 1, 1], [26, 1, 1, 1], [27, 1, 1, 1], [28, 1, 1, 1], [29, 1, 1, 1], [30, 1, 1, 1],
62
+ [31, 1, 1, 1], [32, 1, 1, 1], [33, 1, 1, 1], [34, 1, 1, 1], [35, 1, 1, 1], [36, 1, 1, 1], [37, 1, 1, 1], [38, 1, 1, 1], [39, 1, 1, 1], [40, 1, 1, 1],
63
+ [41, 1, 1, 1], [42, 1, 1, 1], [43, 1, 1, 1], [44, 1, 2, 2], [45, 1, 1, 1], [46, 1, 1, 1], [47, 1, 1, 1] ]
64
+ MODE: "conv"
65
+ RESIDUAL_POOLING: True
66
+ SEPARATE_QKV: True
67
+ CLS_EMBED_ON: False # default: True
68
+
69
+ BN:
70
+ USE_PRECISE_STATS: False
71
+ NUM_BATCHES_PRECISE: 200
72
+
73
+ DETECTION:
74
+ ENABLE: True
75
+ ALIGNED: True
76
+ SPATIAL_SCALE_FACTOR: 32
77
+ AVA:
78
+ BGR: False
79
+ DETECTION_SCORE_THRESH: 0.9
80
+ TRAIN_PREDICT_BOX_LISTS: [
81
+ "ava_train_v2.2.csv",
82
+ "person_box_67091280_iou90/ava_detection_train_boxes_and_labels_include_negative_v2.2.csv",
83
+ ]
84
+ TEST_PREDICT_BOX_LISTS: ["person_box_67091280_iou90/ava_detection_val_boxes_and_labels.csv"]
85
+ ANNOTATION_DIR: /datasets01/AVA/080720/frame_list/
86
+ FRAME_LIST_DIR: /datasets01/AVA/080720/frame_list/
87
+ FRAME_DIR: /datasets01/AVA/080720/frames/
88
+ FULL_TEST_ON_VAL: True
89
+
90
+
91
+ SOLVER:
92
+ CLIP_GRAD_L2NORM: 2.0
93
+ ZERO_WD_1D_PARAM: True
94
+ BASE_LR_SCALE_NUM_SHARDS: True
95
+ BASE_LR: 0.075
96
+ COSINE_AFTER_WARMUP: True
97
+ COSINE_END_LR: 1e-7
98
+ WARMUP_START_LR: 1e-8
99
+ WARMUP_EPOCHS: 5.0
100
+ LR_POLICY: cosine
101
+ MAX_EPOCH: 20
102
+ MOMENTUM: 0.9
103
+ WEIGHT_DECAY: 1e-8
104
+ OPTIMIZING_METHOD: sgd
105
+ MODEL:
106
+ NUM_CLASSES: 80
107
+ HEAD_ACT: sigmoid
108
+ # NUM_CLASSES: 600
109
+ ARCH: mvit
110
+ MODEL_NAME: MViT
111
+ LOSS_FUNC: bce # soft_cross_entropy # default cross_entropy
112
+ DROPOUT_RATE: 0.0
113
+ ACT_CHECKPOINT: True # for test flops
114
+
115
+ TEST:
116
+ ENABLE: True
117
+ DATASET: ava
118
+ BATCH_SIZE: 1
119
+ NUM_SPATIAL_CROPS: 1
120
+ CHECKPOINT_FILE_PATH: /home/jathu/mvit.pyth
121
+
122
+ DATA_LOADER:
123
+ NUM_WORKERS: 4
124
+ PIN_MEMORY: True
125
+ NUM_GPUS: 1
126
+ NUM_SHARDS: 1
127
+ SHARD_ID: 0
128
+ RNG_SEED: 0
129
+ OUTPUT_DIR: .
130
+
131
+ DEMO:
132
+ ENABLE: True
133
+ LABEL_FILE_PATH: /private/home/jathushan/3D/slowfast/ava_names.json
134
+ WEBCAM: -1
135
+ INPUT_VIDEO: /private/home/jathushan/datasets/ttv/webm2/82FE8F069F1354550003607470080_1fcf1757309.4.7.mp4
136
+ OUTPUT_FILE: output.mp4
137
+
138
+
139
+ # #dbg
140
+ # DATA_LOADER:
141
+ # NUM_WORKERS: 0
142
+ # NUM_GPUS: 1