| from sacred import Experiment |
|
|
# Sacred experiment handle; the @ex.config function below registers the
# experiment's configuration entries with it.
ex = Experiment("VLMo")
|
|
|
|
| def _loss_names(d): |
| ret = { |
| "itm": 0, |
| "itc": 0, |
| "caption": 0, |
| "mvlm": 0, |
| "textmlm": 0, |
| "imagemlm": 0, |
| "vqa": 0, |
| "nlvr2": 0, |
| "irtr": 0, |
| } |
| ret.update(d) |
| return ret |
|
|
|
|
@ex.config
def config():
    # Sacred config: every local variable assigned here becomes a config entry,
    # so the variable NAMES are part of the experiment's public interface.
    exp_name = "vlmo"
    seed = 1
    datasets = ["coco", "vg", "sbu", "gcc"]
    # Active losses (0 disables a loss); see _loss_names for the full set.
    loss_names = _loss_names({"itm": 0, "itc": 0, "mvlm": 0})
    # NOTE(review): presumably the desired *effective* batch size, reached by
    # gradient accumulation over per_gpu_batchsize * num_gpus — confirm in trainer.
    batch_size = 1024

    # Backbone / encoder architecture
    encoder_layers = 12
    encoder_embed_dim = 768
    out_embed_dim = 768
    beit_version = "base"
    beit3_vl_layers = 3
    deepnorm_init = True
    share_layer = False
    share_attn = False
    one_attn = False

    # Image settings
    train_transform_keys = ["square_transform_randaug"]
    val_transform_keys = ["square_transform"]
    image_size = 224
    reclip_image_size = None
    patch_size = 16
    draw_false_image = 0
    image_only = False
    text_only = False

    # Video settings
    video_num_frm = None

    # Visual tokenizer (used for masked image modeling targets)
    tokenizer_model = "beit2_visual_tokenizer"
    codebook_size = 8192
    codebook_dim = 32
    visual_mask_size = 14
    visual_mask_num = 80

    # Text settings
    lang = 'cn'
    vqav2_label_size = 3129
    max_text_len = 52
    max_text_len_of_initckpt = 196
    tokenizer_type = "BertTokenizer"
    vocab_size = 21128
    tokenizer = "./vocab.txt"  # path to the vocab file consumed by the tokenizer
    whole_word_masking = True
    mlm_prob = 0.15
    draw_false_text = 0
    mvlm_prob = 0.50
    mask_ratio = 0

    # Captioning
    cap_onlytext = False

    # Image-MLM
    split_data_for_imagemlm = False

    # Image-text contrastive (ITC)
    itc_mask = False
    aggregate_nodes = -1  # NOTE(review): -1 looks like "aggregate across all nodes" — confirm

    # Model variant
    model_arch = "vlmo_base_patch16"
    drop_path_rate = 0.1

    # Evaluation / retrieval options
    get_recall_metric = False
    get_recall_rerank_metric = False
    get_zeroshot_metric = False
    get_muge_feat = False
    get_f30k_feat = False
    k_test = 32

    # Trainer / checkpointing
    resume_from = None
    fast_dev_run = False
    val_check_interval = 1.0
    test_only = False
    use_sharded_training = False
    resume_during_training = False
    save_top_k = 10
    every_n_train_steps = 2000
    log_metric_steps = 100

    # pcache storage backend
    use_pcache = False
    pcache_root = ""

    # Execution environment
    gpu_env = "main_site"
    data_root = ""

    # Hardware / runtime
    log_dir = "result"
    per_gpu_batchsize = 4  # per-GPU micro-batch size actually fed to the model
    num_gpus = 1
    num_nodes = 1
    load_path = ""
    num_workers = 8
    precision = 16
    local_run = True
    flash_attn = False
    deepspeed_config = None
    coalesce_backbone = False
    mask_data = "v+l"
    communication_benchmark = False
    checkpoint_activations = False

    # Caption sampling
    single_cap = True
    random_one = False

    # ITC features / distillation
    itc_feats_name = "cls_vlffn_feats"
    itc_distill = ""
    itc_distill_dim = 1024
    itc_teacher_weights = ""

    # muP (maximal update parameterization) scaling knobs
    mup = False
    base_encoder_embed_dim = 1
    delta_encoder_embed_dim = 2
    mup_encoder_attention_heads = 1
    base_encoder_ffn_embed_dim = 1
    delta_encoder_ffn_embed_dim = 2

    # atorch / sharded state saving
    atorch_config = None
    compile_op = False
    optimizer_state_shard_save = False
    model_state_shard_save = False

    # Contrastive-loss options
    local_loss = False
    use_dual_softmax = False

    num_frames = 1

    deepnorm = False
|
|
|
|