{
  "root_path": "/home/zli",
  "available_corpus": {
    "cc3m": {
      "anno_path": "your_path",
      "data_root": "",
      "media_type": "image"
    },
    "webvid_10m": {
      "anno_path": "your_path",
      "data_root": "",
      "media_type": "video"
    },
    "smol_test": {
      "anno_path": "/root/IV2/InternVideo2/multi_modality/data_test/smol_test.json",
      "data_root": "/root/IV2/InternVideo2/multi_modality/data_test/",
      "media_type": "video"
    },
    "slim_kinetics": {
      "anno_path": "/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json",
      "data_root": "/home/zli/kinetics-dataset/k600/train/train",
      "media_type": "video"
    },
    "slim_kinetics_act_val": {
      "anno_path": "/home/zli/kinetics-dataset/k600/test/kinetics-test.json",
      "data_root": "/home/zli/kinetics-dataset/k600/test/",
      "media_type": "video",
      "is_act_rec": true
    }
  },
  "VisionEncoders": {},
  "TextEncoders": {
    "bert": {
      "name": "bert_base",
      "pretrained": "bert-base-uncased",
      "config": "configs/config_bert.json",
      "d_model": 768,
      "fusion_layer": 9
    },
    "bert_large": {
      "name": "bert_large",
      "pretrained": "bert-large-uncased",
      "config": "configs/config_bert_large.json",
      "d_model": 1024,
      "fusion_layer": 19
    },
    "med_bert": {
      "name": "med_bert_base",
      "pretrained": "bert-base-uncased",
      "config": "configs/med_config.json",
      "d_model": 768
    },
    "med_bert_large": {
      "name": "med_bert_large",
      "pretrained": "bert-base-uncased",
      "config": "configs/med_large_config.json",
      "d_model": 768
    }
  },
  "train_corpus": "slim_kinetics",
  "train_file": {
    "anno_path": "/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json",
    "data_root": "/home/zli/kinetics-dataset/k600/train/train",
    "media_type": "video"
  },
  "test_file": {
    "act_val": {
      "anno_path": "/home/zli/kinetics-dataset/k600/test/kinetics-test.json",
      "data_root": "/home/zli/kinetics-dataset/k600/test/",
      "media_type": "video",
      "is_act_rec": true
    }
  },
  "test_types": [
    "act_val"
  ],
  "num_workers": 2,
  "stop_key": null,
  "num_frames": 8,
  "num_frames_test": 8,
  "batch_size": 16,
  "batch_size_test": 16,
  "max_txt_l": 32,
  "size_t": 224,
  "inputs": {
    "image_res": 224,
    "video_input": {
      "num_frames": 8,
      "sample_type": "all",
      "num_frames_test": 8,
      "sample_type_test": "all",
      "random_aug": false
    },
    "max_txt_l": {
      "image": 32,
      "video": 32
    },
    "batch_size": {
      "image": 16,
      "video": 16
    },
    "batch_size_test": {
      "image": 16,
      "video": 16
    }
  },
  "model": {
    "model_cls": "InternVideo2_CLIP_small",
    "vision_encoder": {
      "name": "internvideo2",
      "in_chans": 3,
      "patch_size": 14,
      "img_size": 224,
      "qkv_bias": false,
      "drop_path_rate": 0.0,
      "head_drop_path_rate": 0.0,
      "embed_dim": 768,
      "num_heads": 12,
      "mlp_ratio": 4,
      "init_values": 0.1,
      "qk_normalization": true,
      "depth": 12,
      "use_flash_attn": true,
      "use_fused_rmsnorm": true,
      "use_fused_mlp": true,
      "fused_mlp_heuristic": 1,
      "drop_cls_token": false,
      "attn_pool_num_heads": 16,
      "clip_embed_dim": 768,
      "layerscale_no_force_fp32": true,
      "num_frames": 8,
      "tubelet_size": 1,
      "sep_pos_embed": false,
      "use_checkpoint": false,
      "checkpoint_num": 0,
      "align_dim": 512
    },
    "streaming_vision_encoder": {
      "in_chans": 3,
      "patch_size": 14,
      "img_size": 224,
      "vit_qkv_bias": true,
      "vit_drop_path_rate": 0.05,
      "student_embed_dim": 384,
      "student_depth": 4,
      "student_num_heads": 6,
      "vit_mlp_ratio": 3.0,
      "vit_init_values": null,
      "vit_qk_normalization": false,
      "vit_sep_pos_embed": true,
      "vit_norm_layer_type": "rmsnorm",
      "rnn_type": "lstm",
      "rnn_hidden_size": 1024,
      "rnn_num_layers": 1,
      "fc_hidden_layers": [],
      "teacher_clip_embed_dim": 768,
      "student_num_frames_processed_by_vit": 1,
      "student_tubelet_size_for_vit": 1
    },
    "text_encoder": {
      "name": "mobileclip_b"
    },
    "temp": 0.01,
    "temp_min": 0.01,
    "freeze_vision": true,
    "open_vision_clip_projector": false,
    "freeze_text": true,
    "open_text_projection": false,
    "open_text_lora": false,
    "vision_ckpt_path": "/home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin",
    "load_vision_ckpt_from_internvideo2_stage2": false,
    "text_ckpt_path": "/home/zli/IV2/models/mobileclip_blt.pt",
    "extra_ckpt_path": "/home/zli/IV2/models/clip/B14/pytorch_model.bin"
  },
  "criterion": {
    "loss_weight": {
      "vtc": 1.0
    }
  },
  "optimizer": {
    "opt": "adamW",
    "lr": 1e-05,
    "opt_betas": [
      0.9,
      0.98
    ],
    "weight_decay": 0.01,
    "max_grad_norm": 0.7,
    "different_lr": {
      "enable": false,
      "module_names": [],
      "lr": 1e-05
    }
  },
  "scheduler": {
    "sched": "cosine",
    "epochs": 1,
    "min_lr_multi": 0.01,
    "warmup_epochs": 0.05
  },
  "evaluate": false,
  "deep_fusion": false,
  "evaluation": {
    "eval_frame_ensemble": "concat",
    "eval_x_only": false,
    "k_test": 128,
    "eval_offload": true
  },
  "use_half_precision": true,
  "use_bf16": true,
  "gradient_checkpointing": true,
  "wandb": {
    "enable": true,
    "entity": "qingy2019-conker-mobile-inc-",
    "project": "window_iv2"
  },
  "dist_url": "env://",
  "device": "cuda",
  "mode": "pt",
  "output_dir": "scripts/pretraining/clip/B14/B14",
  "resume": true,
  "debug": false,
  "log_freq": 1,
  "seed": 42,
  "save_latest": false,
  "save_iter": 5000,
  "eval_freq_steps": 1000,
  "eval_video_repo_id": "qingy2024/backflip_train",
  "eval_video_filename": "1.mp4",
  "eval_plot_output_dir": "scripts/pretraining/clip/B14/cosine_sim_graphs",
  "auto_resume": true,
  "pretrained_path": "",
  "deepspeed": {
    "enable": true,
    "stage": 1
  },
  "rank": 0,
  "world_size": 1,
  "gpu": 0,
  "distributed": true,
  "dist_backend": "nccl",
  "deepspeed_config": "scripts/pretraining/clip/B14/B14/deepspeed_config.json"
}