add config directory
- configs/bert_config.json +21 -0
- configs/caption_coco.yaml +33 -0
- configs/med_config.json +21 -0
- configs/nlvr.yaml +21 -0
- configs/nocaps.yaml +15 -0
- configs/pretrain.yaml +27 -0
- configs/retrieval_coco.yaml +34 -0
- configs/retrieval_flickr.yaml +34 -0
- configs/retrieval_msrvtt.yaml +12 -0
- configs/vqa.yaml +25 -0
configs/bert_config.json
ADDED
@@ -0,0 +1,21 @@
+{
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "type_vocab_size": 2,
+  "vocab_size": 30522,
+  "encoder_width": 768,
+  "add_cross_attention": true
+}
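This file follows the Hugging Face BertConfig schema, so it loads directly with transformers; `encoder_width` is a BLIP-specific extra key that BertConfig simply keeps as an attribute. A minimal loading sketch (standard transformers API, path relative to the repo root):

from transformers import BertConfig

# Unknown keys such as encoder_width are retained as plain attributes.
config = BertConfig.from_json_file("configs/bert_config.json")
print(config.hidden_size)          # 768
print(config.add_cross_attention)  # True: adds cross-attention layers
print(config.encoder_width)        # 768, BLIP-specific extra field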
configs/caption_coco.yaml
ADDED
@@ -0,0 +1,33 @@
+image_root: '/export/share/datasets/vision/coco/images/'
+ann_root: 'annotation'
+coco_gt_root: 'annotation/coco_gt'
+
+# set pretrained as a file path or an url
+pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
+
+# size of vit model; base or large
+vit: 'base'
+vit_grad_ckpt: False
+vit_ckpt_layer: 0
+batch_size: 32
+init_lr: 1e-5
+
+# vit: 'large'
+# vit_grad_ckpt: True
+# vit_ckpt_layer: 5
+# batch_size: 16
+# init_lr: 2e-6
+
+image_size: 384
+
+# generation configs
+max_length: 20
+min_length: 5
+num_beams: 3
+prompt: 'a picture of '
+
+# optimizer
+weight_decay: 0.05
+min_lr: 0
+max_epoch: 5
+
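One caveat when consuming this file: PyYAML's YAML 1.1 resolver only recognizes scientific notation as a float when it contains a decimal point, so `init_lr: 1e-5` loads as the string '1e-5'. A minimal loading sketch (PyYAML assumed; the explicit float() cast guards against that):

import yaml

with open("configs/caption_coco.yaml") as f:
    config = yaml.safe_load(f)

# '1e-5' has no decimal point, so YAML 1.1 parses it as a string.
init_lr = float(config["init_lr"])
print(type(config["init_lr"]).__name__, init_lr)  # str 1e-05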
configs/med_config.json
ADDED
@@ -0,0 +1,21 @@
+{
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "type_vocab_size": 2,
+  "vocab_size": 30524,
+  "encoder_width": 768,
+  "add_cross_attention": true
+}
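med_config.json is identical to bert_config.json except for vocab_size (30524 vs. 30522); the two extra slots presumably make room for the special tokens BLIP adds to the tokenizer. A quick sketch to confirm the delta:

import json

with open("configs/bert_config.json") as f:
    bert = json.load(f)
with open("configs/med_config.json") as f:
    med = json.load(f)

# Print only the keys whose values differ between the two configs.
for key in bert:
    if bert[key] != med[key]:
        print(key, bert[key], "->", med[key])  # vocab_size 30522 -> 30524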
configs/nlvr.yaml
ADDED
@@ -0,0 +1,21 @@
+image_root: '/export/share/datasets/vision/NLVR2/'
+ann_root: 'annotation'
+
+# set pretrained as a file path or an url
+pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth'
+
+# size of vit model; base or large
+vit: 'base'
+batch_size_train: 16
+batch_size_test: 64
+vit_grad_ckpt: False
+vit_ckpt_layer: 0
+max_epoch: 15
+
+image_size: 384
+
+# optimizer
+weight_decay: 0.05
+init_lr: 3e-5
+min_lr: 0
+
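The init_lr / min_lr / max_epoch triple suggests a cosine learning-rate decay over training. A generic reimplementation of that reading (a sketch, not necessarily the repo's exact schedule):

import math

def cosine_lr(epoch, max_epoch=15, init_lr=3e-5, min_lr=0.0):
    """Cosine decay from init_lr at epoch 0 down to min_lr at max_epoch."""
    return min_lr + 0.5 * (init_lr - min_lr) * (1 + math.cos(math.pi * epoch / max_epoch))

for epoch in (0, 5, 10, 15):
    print(epoch, f"{cosine_lr(epoch):.2e}")  # 3.00e-05 ... 0.00e+00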
configs/nocaps.yaml
ADDED
@@ -0,0 +1,15 @@
+image_root: '/export/share/datasets/vision/nocaps/'
+ann_root: 'annotation'
+
+# set pretrained as a file path or an url
+pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
+
+vit: 'base'
+batch_size: 32
+
+image_size: 384
+
+max_length: 20
+min_length: 5
+num_beams: 3
+prompt: 'a picture of '
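The last four keys are decoding parameters: beam search with 3 beams, captions clipped to 5–20 tokens, and every caption conditioned on the prompt 'a picture of '. A hedged usage sketch (model, image, and config are assumed to come from the repo's captioning demo; the generate() keyword names mirroring these keys are an assumption here):

# Hypothetical call; sample=False selects beam search over sampling.
captions = model.generate(
    image,
    sample=False,
    num_beams=config["num_beams"],    # 3
    max_length=config["max_length"],  # 20
    min_length=config["min_length"],  # 5
)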
configs/pretrain.yaml
ADDED
@@ -0,0 +1,27 @@
+train_file: ['/export/share/junnan-li/VL_pretrain/annotation/coco_karpathy_train.json',
+             '/export/share/junnan-li/VL_pretrain/annotation/vg_caption.json',
+            ]
+laion_path: ''
+
+# size of vit model; base or large
+vit: 'base'
+vit_grad_ckpt: False
+vit_ckpt_layer: 0
+
+image_size: 224
+batch_size: 75
+
+queue_size: 57600
+alpha: 0.4
+
+# optimizer
+weight_decay: 0.05
+init_lr: 3e-4
+min_lr: 1e-6
+warmup_lr: 1e-6
+lr_decay_rate: 0.9
+max_epoch: 20
+warmup_steps: 3000
+
+
+
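The optimizer keys describe a two-phase schedule: linear warmup from warmup_lr to init_lr over warmup_steps, then a per-epoch step decay by lr_decay_rate with min_lr as a floor. A generic sketch of that reading (not the repo's code):

def pretrain_lr(step, epoch, warmup_steps=3000, warmup_lr=1e-6,
                init_lr=3e-4, min_lr=1e-6, decay_rate=0.9):
    """Linear warmup during epoch 0, then stepwise exponential decay."""
    if epoch == 0 and step < warmup_steps:
        return warmup_lr + (init_lr - warmup_lr) * step / warmup_steps
    return max(min_lr, init_lr * decay_rate ** epoch)

print(pretrain_lr(step=1500, epoch=0))  # ~1.5e-04, mid-warmup
print(pretrain_lr(step=0, epoch=10))    # ~1.05e-04 after decay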
configs/retrieval_coco.yaml
ADDED
@@ -0,0 +1,34 @@
+image_root: '/export/share/datasets/vision/coco/images/'
+ann_root: 'annotation'
+dataset: 'coco'
+
+# set pretrained as a file path or an url
+pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth'
+
+# size of vit model; base or large
+
+vit: 'base'
+batch_size_train: 32
+batch_size_test: 64
+vit_grad_ckpt: True
+vit_ckpt_layer: 4
+init_lr: 1e-5
+
+# vit: 'large'
+# batch_size_train: 16
+# batch_size_test: 32
+# vit_grad_ckpt: True
+# vit_ckpt_layer: 12
+# init_lr: 5e-6
+
+image_size: 384
+queue_size: 57600
+alpha: 0.4
+k_test: 256
+negative_all_rank: True
+
+# optimizer
+weight_decay: 0.05
+min_lr: 0
+max_epoch: 6
+
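k_test implements two-stage retrieval evaluation: score every image–text pair with the cheap ITC dot product, keep the top k_test candidates per query, then rerank only those with the expensive ITM head. A sketch of the shortlist step (PyTorch; the ITM rerank itself is left as a comment):

import torch

# sim: ITC similarity matrix, one row per query (placeholder values).
sim = torch.randn(8, 8)
k_test = 4  # the config uses 256; tiny here for the demo

topk_sim, topk_idx = sim.topk(k_test, dim=1)  # stage-1 shortlist
# Stage 2 (not shown): run the ITM head on just these k_test pairs
# and add its matching score to topk_sim before final ranking.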
configs/retrieval_flickr.yaml
ADDED
@@ -0,0 +1,34 @@
+image_root: '/export/share/datasets/vision/flickr30k/'
+ann_root: 'annotation'
+dataset: 'flickr'
+
+# set pretrained as a file path or an url
+pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_flickr.pth'
+
+# size of vit model; base or large
+
+vit: 'base'
+batch_size_train: 32
+batch_size_test: 64
+vit_grad_ckpt: True
+vit_ckpt_layer: 4
+init_lr: 1e-5
+
+# vit: 'large'
+# batch_size_train: 16
+# batch_size_test: 32
+# vit_grad_ckpt: True
+# vit_ckpt_layer: 10
+# init_lr: 5e-6
+
+image_size: 384
+queue_size: 57600
+alpha: 0.4
+k_test: 128
+negative_all_rank: False
+
+# optimizer
+weight_decay: 0.05
+min_lr: 0
+max_epoch: 6
+
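Relative to the COCO retrieval config, only three values change: the dataset paths, k_test drops to 128 (Flickr30k's test gallery is smaller, so fewer rerank candidates suffice), and negative_all_rank is False, presumably because mining hard negatives within each GPU's local batch is enough on the smaller Flickr30k training set.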
configs/retrieval_msrvtt.yaml
ADDED
@@ -0,0 +1,12 @@
+video_root: '/export/share/dongxuli/data/msrvtt_retrieval/videos'
+ann_root: 'annotation'
+
+# set pretrained as a file path or an url
+pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth'
+
+# size of vit model; base or large
+vit: 'base'
+batch_size: 64
+k_test: 128
+image_size: 384
+num_frm_test: 8
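This config has no training keys and reuses the COCO retrieval checkpoint, i.e. zero-shot transfer to video: the image model is applied to num_frm_test frames sampled per video and the frame features are aggregated. A generic sketch of uniform frame sampling (an assumption about the loader, not the repo's code):

import numpy as np

def sample_frame_indices(total_frames, num_frm_test=8):
    """Pick num_frm_test indices spread evenly across the video."""
    return np.linspace(0, total_frames - 1, num_frm_test).round().astype(int)

print(sample_frame_indices(300))  # [  0  43  85 128 171 214 256 299]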
configs/vqa.yaml
ADDED
@@ -0,0 +1,25 @@
+vqa_root: '/export/share/datasets/vision/VQA/Images/mscoco/' #followed by train2014/
+vg_root: '/export/share/datasets/vision/visual-genome/' #followed by image/
+train_files: ['vqa_train','vqa_val','vg_qa']
+ann_root: 'annotation'
+
+# set pretrained as a file path or an url
+pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth'
+
+# size of vit model; base or large
+vit: 'base'
+batch_size_train: 16
+batch_size_test: 32
+vit_grad_ckpt: False
+vit_ckpt_layer: 0
+init_lr: 2e-5
+
+image_size: 480
+
+k_test: 128
+inference: 'rank'
+
+# optimizer
+weight_decay: 0.05
+min_lr: 0
+max_epoch: 10
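inference: 'rank' means answers are ranked against a fixed candidate list rather than generated open-endedly, and k_test caps how many candidates get fully scored per question. A sketch of the two-stage idea (placeholder scores; the real ones would come from the answer decoder):

import torch

num_answers, k_test = 1000, 128  # candidate-list size is a placeholder

# Stage 1: a cheap per-candidate score, e.g. the decoder's probability
# of each answer's first token (random placeholder values here).
first_token_score = torch.randn(num_answers)
shortlist = first_token_score.topk(k_test).indices

# Stage 2: full answer log-likelihood, computed only for the k_test
# shortlisted candidates, then pick the best one.
full_logprob = torch.randn(k_test)
prediction = shortlist[full_logprob.argmax()]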