{
"root_path": "/home/zli",
"available_corpus": {
"cc3m": {
"anno_path": "your_path",
"data_root": "",
"media_type": "image"
},
"webvid_10m": {
"anno_path": "your_path",
"data_root": "",
"media_type": "video"
},
"smol_test": {
"anno_path": "/root/IV2/InternVideo2/multi_modality/data_test/smol_test.json",
"data_root": "/root/IV2/InternVideo2/multi_modality/data_test/",
"media_type": "video"
},
"slim_kinetics": {
"anno_path": "/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json",
"data_root": "/home/zli/kinetics-dataset/k600/train/train",
"media_type": "video"
},
"slim_kinetics_act_val": {
"anno_path": "/home/zli/kinetics-dataset/k600/test/kinetics-test.json",
"data_root": "/home/zli/kinetics-dataset/k600/test/",
"media_type": "video",
"is_act_rec": true
}
},
"VisionEncoders": {},
"TextEncoders": {
"bert": {
"name": "bert_base",
"pretrained": "bert-base-uncased",
"config": "configs/config_bert.json",
"d_model": 768,
"fusion_layer": 9
},
"bert_large": {
"name": "bert_large",
"pretrained": "bert-large-uncased",
"config": "configs/config_bert_large.json",
"d_model": 1024,
"fusion_layer": 19
},
"med_bert": {
"name": "med_bert_base",
"pretrained": "bert-base-uncased",
"config": "configs/med_config.json",
"d_model": 768
},
"med_bert_large": {
"name": "med_bert_large",
"pretrained": "bert-base-uncased",
"config": "configs/med_large_config.json",
"d_model": 768
}
},
"train_corpus": "slim_kinetics",
"train_file": {
"anno_path": "/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json",
"data_root": "/home/zli/kinetics-dataset/k600/train/train",
"media_type": "video"
},
"test_file": {
"act_val": {
"anno_path": "/home/zli/kinetics-dataset/k600/test/kinetics-test.json",
"data_root": "/home/zli/kinetics-dataset/k600/test/",
"media_type": "video",
"is_act_rec": true
}
},
"test_types": [
"act_val"
],
"num_workers": 2,
"stop_key": null,
"num_frames": 8,
"num_frames_test": 8,
"batch_size": 16,
"batch_size_test": 16,
"max_txt_l": 32,
"size_t": 224,
"inputs": {
"image_res": 224,
"video_input": {
"num_frames": 8,
"sample_type": "all",
"num_frames_test": 8,
"sample_type_test": "all",
"random_aug": false
},
"max_txt_l": {
"image": 32,
"video": 32
},
"batch_size": {
"image": 16,
"video": 16
},
"batch_size_test": {
"image": 16,
"video": 16
}
},
"model": {
"model_cls": "InternVideo2_CLIP_small",
"vision_encoder": {
"name": "internvideo2",
"in_chans": 3,
"patch_size": 14,
"img_size": 224,
"qkv_bias": false,
"drop_path_rate": 0.0,
"head_drop_path_rate": 0.0,
"embed_dim": 768,
"num_heads": 12,
"mlp_ratio": 4,
"init_values": 0.1,
"qk_normalization": true,
"depth": 12,
"use_flash_attn": true,
"use_fused_rmsnorm": true,
"use_fused_mlp": true,
"fused_mlp_heuristic": 1,
"drop_cls_token": false,
"attn_pool_num_heads": 16,
"clip_embed_dim": 768,
"layerscale_no_force_fp32": true,
"num_frames": 8,
"tubelet_size": 1,
"sep_pos_embed": false,
"use_checkpoint": false,
"checkpoint_num": 0,
"align_dim": 512
},
"streaming_vision_encoder": {
"in_chans": 3,
"patch_size": 14,
"img_size": 224,
"vit_qkv_bias": true,
"vit_drop_path_rate": 0.05,
"student_embed_dim": 384,
"student_depth": 4,
"student_num_heads": 6,
"vit_mlp_ratio": 3.0,
"vit_init_values": null,
"vit_qk_normalization": false,
"vit_sep_pos_embed": true,
"vit_norm_layer_type": "rmsnorm",
"rnn_type": "lstm",
"rnn_hidden_size": 1024,
"rnn_num_layers": 1,
"fc_hidden_layers": [],
"teacher_clip_embed_dim": 768,
"student_num_frames_processed_by_vit": 1,
"student_tubelet_size_for_vit": 1
},
"text_encoder": {
"name": "mobileclip_b"
},
"temp": 0.01,
"temp_min": 0.01,
"freeze_vision": true,
"open_vision_clip_projector": false,
"freeze_text": true,
"open_text_projection": false,
"open_text_lora": false,
"vision_ckpt_path": "/home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin",
"load_vision_ckpt_from_internvideo2_stage2": false,
"text_ckpt_path": "/home/zli/IV2/models/mobileclip_blt.pt",
"extra_ckpt_path": "/home/zli/IV2/models/clip/B14/pytorch_model.bin"
},
"criterion": {
"loss_weight": {
"vtc": 1.0
}
},
"optimizer": {
"opt": "adamW",
"lr": 1e-05,
"opt_betas": [
0.9,
0.98
],
"weight_decay": 0.01,
"max_grad_norm": 0.7,
"different_lr": {
"enable": false,
"module_names": [],
"lr": 1e-05
}
},
"scheduler": {
"sched": "cosine",
"epochs": 1,
"min_lr_multi": 0.01,
"warmup_epochs": 0.05
},
"evaluate": false,
"deep_fusion": false,
"evaluation": {
"eval_frame_ensemble": "concat",
"eval_x_only": false,
"k_test": 128,
"eval_offload": true
},
"use_half_precision": true,
"use_bf16": true,
"gradient_checkpointing": true,
"wandb": {
"enable": true,
"entity": "qingy2019-conker-mobile-inc-",
"project": "window_iv2"
},
"dist_url": "env://",
"device": "cuda",
"mode": "pt",
"output_dir": "scripts/pretraining/clip/B14/B14",
"resume": true,
"debug": false,
"log_freq": 1,
"seed": 42,
"save_latest": false,
"save_iter": 5000,
"eval_freq_steps": 1000,
"eval_video_repo_id": "qingy2024/backflip_train",
"eval_video_filename": "1.mp4",
"eval_plot_output_dir": "scripts/pretraining/clip/B14/cosine_sim_graphs",
"auto_resume": true,
"pretrained_path": "",
"deepspeed": {
"enable": true,
"stage": 1
},
"rank": 0,
"world_size": 1,
"gpu": 0,
"distributed": true,
"dist_backend": "nccl",
"deepspeed_config": "scripts/pretraining/clip/B14/B14/deepspeed_config.json"
}