Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap.yaml +51 -0
- LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap_instruct.yaml +53 -0
- LAVIS-main/lavis/configs/datasets/snli_ve/defaults.yaml +25 -0
- LAVIS-main/lavis/configs/datasets/snli_ve/defaults_instruct.yaml +49 -0
- LAVIS-main/lavis/configs/datasets/textcaps/defaults.yaml +46 -0
- LAVIS-main/lavis/configs/datasets/textcaps/defaults_instruct.yaml +47 -0
- LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap.yaml +68 -0
- LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap_instruct.yaml +70 -0
- LAVIS-main/lavis/configs/datasets/vatex/defaults_cap.yaml +24 -0
- LAVIS-main/lavis/configs/datasets/vatex/defaults_cap_instruct.yaml +62 -0
- LAVIS-main/lavis/configs/datasets/vg/defaults_caption.yaml +18 -0
- LAVIS-main/lavis/configs/datasets/vg/defaults_caption_instruct.yaml +34 -0
- LAVIS-main/lavis/configs/datasets/vg/defaults_vqa.yaml +18 -0
- LAVIS-main/lavis/configs/datasets/vg/defaults_vqa_instruct.yaml +34 -0
- LAVIS-main/lavis/configs/datasets/violin/defaults_cap.yaml +51 -0
- LAVIS-main/lavis/configs/datasets/violin/defaults_cap_instruct.yaml +53 -0
- LAVIS-main/lavis/configs/datasets/violin/defaults_entail.yaml +52 -0
- LAVIS-main/lavis/configs/datasets/violin/defaults_entail_instruct.yaml +51 -0
- LAVIS-main/lavis/configs/datasets/visdial/defaults_dial.yaml +41 -0
- LAVIS-main/lavis/configs/datasets/visdial/defaults_dial_instruct.yaml +41 -0
- LAVIS-main/lavis/configs/datasets/vizwiz/defaults.yaml +43 -0
- LAVIS-main/lavis/configs/datasets/vlep/defaults_cap.yaml +51 -0
- LAVIS-main/lavis/configs/datasets/vlep/defaults_cap_instruct.yaml +53 -0
- LAVIS-main/lavis/configs/datasets/vsr/defaults.yaml +49 -0
- LAVIS-main/lavis/configs/datasets/vsr/defaults_classification.yaml +49 -0
- LAVIS-main/lavis/configs/datasets/vsr/defaults_classification_instruct.yaml +49 -0
- LAVIS-main/lavis/configs/datasets/vsr/defaults_instruct.yaml +53 -0
- LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap.yaml +63 -0
- LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml +63 -0
- LAVIS-main/lavis/configs/datasets/webvid/defaults_cap.yaml +41 -0
- LAVIS-main/lavis/configs/datasets/webvid/defaults_cap_instruct.yaml +43 -0
- LAVIS-main/lavis/configs/datasets/youcook/defaults_cap.yaml +51 -0
- LAVIS-main/lavis/configs/datasets/youcook/defaults_cap_instruct.yaml +53 -0
- LAVIS-main/lavis/configs/datasets/yt8m/defaults_mm_dial.yaml +62 -0
- LAVIS-main/lavis/configs/models/albef_classification_ve.yaml +40 -0
- LAVIS-main/lavis/configs/models/albef_feature_extractor.yaml +30 -0
- LAVIS-main/lavis/configs/models/albef_nlvr.yaml +42 -0
- LAVIS-main/lavis/configs/models/albef_pretrain_base.yaml +38 -0
- LAVIS-main/lavis/configs/models/albef_retrieval_coco.yaml +46 -0
- LAVIS-main/lavis/configs/models/albef_retrieval_flickr.yaml +46 -0
- LAVIS-main/lavis/configs/models/albef_vqav2.yaml +40 -0
- LAVIS-main/lavis/configs/models/alpro_qa_msrvtt.yaml +44 -0
- LAVIS-main/lavis/configs/models/alpro_qa_msvd.yaml +43 -0
- LAVIS-main/lavis/configs/models/alpro_retrieval_didemo.yaml +35 -0
- LAVIS-main/lavis/configs/models/alpro_retrieval_msrvtt.yaml +41 -0
- LAVIS-main/lavis/configs/models/bert_config.json +21 -0
- LAVIS-main/lavis/configs/models/bert_config_alpro.json +23 -0
- LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml +25 -0
- LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml +27 -0
- LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml +27 -0
LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
datasets:
  shapenet_mm_caption: # name of the dataset builder
    # Image processors, configured separately for the train and eval splits.
    vis_processor:
      train:
        name: "clip_image_train"
        image_size: 224
      eval:
        # Use the eval processor for the eval split rather than the training one.
        name: "clip_image_eval"
        image_size: 224

    # Point-cloud processors.
    pc_processor:
      train:
        name: "ulip_pc"
      eval:
        name: "ulip_pc"

    # Text processors applied to the captions.
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    # Each entry has a matching storage section under build_info.
    data_type: [pc, images] # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/train_ann.json
            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json
          storage:
            - shapenet/annotations/train_ann.json
            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json
        val:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/test_ann.json
            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json
          storage:
            - shapenet/annotations/test_ann.json
            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json

      templates: null

      pc:
        storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/shapenet_pc

      images:
        storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/rendered_images
|
LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap_instruct.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
datasets:
  shapenet_mm_caption_instruct: # name of the dataset builder
    # Image processors, configured separately for the train and eval splits.
    vis_processor:
      train:
        name: "clip_image_train"
        image_size: 224
      eval:
        # Use the eval processor for the eval split rather than the training one.
        name: "clip_image_eval"
        image_size: 224

    # Point-cloud processors.
    pc_processor:
      train:
        name: "ulip_pc"
      eval:
        name: "ulip_pc"

    # Training uses the instruction processor (pc / caption); eval uses the
    # plain caption processor.
    text_processor:
      train:
        name: "blip_instruction"
        modality: pc
        task: caption
      eval:
        name: "blip_caption"

    # Each entry has a matching storage section under build_info.
    data_type: [pc, images] # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/train_ann.json
            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json
          storage:
            - shapenet/annotations/train_ann.json
            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json
        val:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/test_ann.json
            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json
          storage:
            - shapenet/annotations/test_ann.json
            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json

      templates: null

      pc:
        storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/shapenet_pc

      images:
        storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/rendered_images
|
LAVIS-main/lavis/configs/datasets/snli_ve/defaults.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  snli_ve:
    # data_dir: ${env.data_dir}/datasets
    # One of: images | videos | features
    data_type: images

    build_info:
      # Split names are mapping keys; prefixing one with "-" would turn it
      # into a list item.
      annotations:
        train:
          storage: 'snli/annotations/ve_train.json'
          url: '/export/share/dongxuli/data/lavis/snli/annotation/ve_train.json'
        val:
          storage: 'snli/annotations/ve_dev.json'
          url: '/export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json'
        test:
          storage: 'snli/annotations/ve_test.json'
          url: '/export/share/dongxuli/data/lavis/snli/annotation/ve_test.json'

      images:
        storage: 'flickr30k/images/flickr30k-images'
        # Alternative left disabled:
        # storage: /export/share/datasets/vision/flickr30k/flickr30k-images
|
LAVIS-main/lavis/configs/datasets/snli_ve/defaults_instruct.yaml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  snli_ve_instruct:
    # data_dir: ${env.data_dir}/datasets
    # One of: images | videos | features
    data_type: images

    # Text processors: the caption processor is used for both splits.
    text_processor:
      train:
        name: 'blip_caption'
      eval:
        name: 'blip_caption'

    # Image processors, one per split, both at 224 px.
    vis_processor:
      train:
        image_size: 224
        name: 'clip_image_train'
      eval:
        image_size: 224
        name: 'clip_image_eval'

    build_info:
      # Split names are mapping keys; prefixing one with "-" would turn it
      # into a list item.
      annotations:
        train:
          storage:
            - 'snli/annotations/ve_train.json'
          url:
            - 'https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_train.json'
            # - /export/share/dongxuli/data/lavis/snli/ve_train.json
        val:
          storage:
            - 'snli/annotations/ve_dev.json'
          url:
            - 'https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_dev.json'
            # - /export/share/dongxuli/data/lavis/snli/ve_dev.json
        test:
          storage:
            - 'snli/annotations/ve_test.json'
          url:
            - 'https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_test.json'
            # - /export/share/dongxuli/data/lavis/snli/ve_test.json

      images:
        # Alternative left disabled:
        # storage: flickr30k/images/flickr30k-images
        storage: '/export/share/datasets/vision/flickr30k/flickr30k-images'
|
LAVIS-main/lavis/configs/datasets/textcaps/defaults.yaml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  textcaps_caption: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    # One of: images | videos | features
    data_type: images

    # Text processors: the caption processor is used for both splits.
    text_processor:
      train:
        name: 'blip_caption'
      eval:
        name: 'blip_caption'

    # Image processors, one per split, both at 224 px.
    vis_processor:
      train:
        image_size: 224
        name: 'clip_image_train'
      eval:
        image_size: 224
        name: 'clip_image_eval'

    build_info:
      # Split names are mapping keys; prefixing one with "-" would turn it
      # into a list item.
      annotations:
        train:
          storage:
            - 'TextCaps/TextCaps_0.1_train.json'
          url:
            - 'https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_train.json'
        val:
          storage:
            - 'TextCaps/TextCaps_0.1_val.json'
          url:
            - 'https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_val.json'
        test:
          storage:
            - 'TextCaps/TextCaps_0.1_test.json'
          url:
            - 'https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_test.json'

      images:
        # Alternative left disabled:
        # storage: nocaps/images
        storage: '/export/share/datasets/vision_language/TextCaps/images'
|
LAVIS-main/lavis/configs/datasets/textcaps/defaults_instruct.yaml
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  textcaps_caption_instruct: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    # One of: images | videos | features
    data_type: images

    # Training uses the instruction processor (image / caption); eval uses the
    # plain caption processor.
    text_processor:
      train:
        name: 'blip_instruction'
        task: caption
        modality: image
      eval:
        name: 'blip_caption'

    # Image processors, one per split, both at 224 px.
    vis_processor:
      train:
        image_size: 224
        name: 'clip_image_train'
      eval:
        image_size: 224
        name: 'clip_image_eval'

    build_info:
      # Split names are mapping keys; prefixing one with "-" would turn it
      # into a list item.
      annotations:
        train:
          storage:
            - 'TextCaps/TextCaps_0.1_train.json'
          url:
            - 'https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_train.json'
        val:
          storage:
            - 'TextCaps/TextCaps_0.1_val.json'
          url:
            - 'https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_val.json'
        test:
          storage:
            - 'TextCaps/TextCaps_0.1_test.json'
          url:
            - 'https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_test.json'

      images:
        # Alternative left disabled:
        # storage: nocaps/images
        storage: '/export/share/datasets/vision_language/TextCaps/images'
|
LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap.yaml
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  valor_mm_caption: # name of the dataset builder
    # Each entry has a matching storage section under build_info.
    data_type:
      - video
      - audio

    # Text processors: the caption processor is used for both splits.
    text_processor:
      train:
        name: 'blip_caption'
      eval:
        name: 'blip_caption'

    # Video processors; train and eval share every setting except the name.
    video_processor:
      train:
        name: 'alpro_video_train'
        image_size: 224
        n_frms: 4
        min_scale: 0.9
        max_scale: 1.0
        full_video: true
      eval:
        name: 'alpro_video_eval'
        image_size: 224
        n_frms: 4
        min_scale: 0.9
        max_scale: 1.0
        full_video: true

    # Audio processors at a 16000 sampling rate.
    audio_processor:
      train:
        name: 'beats_audio'
        sampling_rate: 16000
      eval:
        name: 'beats_audio'
        sampling_rate: 16000
        # NOTE(review): is_eval is false on the eval split - confirm intended.
        is_eval: false

    build_info:
      # Split names are mapping keys; prefixing one with "-" would turn it
      # into a list item. Only val and test are defined here.
      annotations:
        val:
          storage:
            - 'valor/annotations/desc_val.json'
            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json
          url:
            - 'https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_val.json'
            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json

        test:
          storage:
            - 'valor/annotations/desc_test.json'
            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json
          url:
            - 'https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_test.json'
            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json

      templates: null

      # Audio and video point at the same directory.
      video:
        storage: '/export/video-language-dataset/data/VALOR/videos'

      audio:
        storage: '/export/video-language-dataset/data/VALOR/videos'
|
| 68 |
+
|
LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap_instruct.yaml
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  valor_mm_caption_instruct: # name of the dataset builder
    # Each entry has a matching storage section under build_info.
    data_type:
      - video
      - audio

    # Training uses the instruction processor; eval uses the plain caption
    # processor.
    text_processor:
      train:
        name: 'blip_instruction'
        task: caption
        # NOTE(review): modality is "image" although data_type is
        # [video, audio] - confirm this is intended.
        modality: image
      eval:
        name: 'blip_caption'

    # Video processors; train and eval share every setting except the name.
    video_processor:
      train:
        name: 'alpro_video_train'
        image_size: 224
        n_frms: 4
        min_scale: 0.9
        max_scale: 1.0
        full_video: true
      eval:
        name: 'alpro_video_eval'
        image_size: 224
        n_frms: 4
        min_scale: 0.9
        max_scale: 1.0
        full_video: true

    # Audio processors at a 16000 sampling rate.
    audio_processor:
      train:
        name: 'beats_audio'
        sampling_rate: 16000
      eval:
        name: 'beats_audio'
        sampling_rate: 16000
        # NOTE(review): is_eval is false on the eval split - confirm intended.
        is_eval: false

    build_info:
      # Split names are mapping keys; prefixing one with "-" would turn it
      # into a list item. Only val and test are defined here.
      annotations:
        val:
          storage:
            - 'valor/annotations/desc_val.json'
            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json
          url:
            - 'https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_val.json'
            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json

        test:
          storage:
            - 'valor/annotations/desc_test.json'
            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json
          url:
            - 'https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_test.json'
            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json

      templates: null

      # Audio and video point at the same directory.
      video:
        storage: '/export/video-language-dataset/data/VALOR/videos'

      audio:
        storage: '/export/video-language-dataset/data/VALOR/videos'
|
| 70 |
+
|
LAVIS-main/lavis/configs/datasets/vatex/defaults_cap.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  vatex_caption: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
          storage: vatex/annotations/cap_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
          storage: vatex/annotations/cap_val.json
        test:
          # The remote file is cap_private_test.json; it is stored locally as
          # cap_test.json.
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
          storage: vatex/annotations/cap_test.json
      videos:
        storage: /export/share/dongxuli/data/vatex
|
LAVIS-main/lavis/configs/datasets/vatex/defaults_cap_instruct.yaml
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  vatex_caption_instruct: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    # Declared once: a mapping may not repeat a key. The entries match the
    # `video` and `audio` storage sections under build_info.
    data_type: [video, audio]

    # Video processors; train and eval share every setting except the name.
    video_processor:
      train:
        name: alpro_video_train
        n_frms: 4
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
        full_video: true
      eval:
        name: alpro_video_eval
        n_frms: 4
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
        full_video: true

    # Audio processors at a 16000 sampling rate.
    audio_processor:
      train:
        name: beats_audio
        sampling_rate: 16000
      eval:
        name: beats_audio
        sampling_rate: 16000
        # NOTE(review): is_eval is false on the eval split - confirm intended.
        is_eval: false

    # NOTE(review): no text_processor is configured in this file - confirm
    # whether one is expected.

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
          storage:
            - vatex/annotations/cap_train.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
          storage:
            - vatex/annotations/cap_val.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
          storage:
            - vatex/annotations/cap_test.json

      # Video and audio point at the same directory.
      video:
        storage: /export/video-language-dataset/data/vatex/

      audio:
        storage: /export/video-language-dataset/data/vatex/
|
LAVIS-main/lavis/configs/datasets/vg/defaults_caption.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  vg_caption:
    # data_dir: ${env.data_dir}/datasets
    # One of: images | videos | features
    data_type: images

    build_info:
      images:
        storage: 'vg/images/'

      # Split names are mapping keys; prefixing one with "-" would turn it
      # into a list item. Only a train split is defined.
      annotations:
        train:
          storage: 'vg/annotations/vg_caption.json'
          url: 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json'
|
LAVIS-main/lavis/configs/datasets/vg/defaults_caption_instruct.yaml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  vg_caption_instruct:
    # data_dir: ${env.data_dir}/datasets
    # One of: images | videos | features
    data_type: images

    # Training uses the instruction processor (image / caption); eval uses the
    # plain caption processor.
    text_processor:
      train:
        name: 'blip_instruction'
        modality: image
        task: caption
      eval:
        name: 'blip_caption'

    # Image processors, one per split, both at 224 px.
    vis_processor:
      train:
        image_size: 224
        name: 'clip_image_train'
      eval:
        image_size: 224
        name: 'clip_image_eval'

    build_info:
      images:
        # Alternative left disabled: vg/images/
        storage: '/export/share/datasets/vision/visual-genome/'

      # Split names are mapping keys; prefixing one with "-" would turn it
      # into a list item. Only a train split is defined.
      annotations:
        train:
          storage: 'vg/annotations/vg_caption.json'
          url: 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json'
|
LAVIS-main/lavis/configs/datasets/vg/defaults_vqa.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  vg_vqa:
    # data_dir: ${env.data_dir}/datasets
    # One of: images | videos | features
    data_type: images

    build_info:
      images:
        storage: 'vg/images/'

      # Split names are mapping keys; prefixing one with "-" would turn it
      # into a list item. Only a train split is defined.
      annotations:
        train:
          storage: 'vg/annotations/vg_qa.json'
          url: 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json'
|
LAVIS-main/lavis/configs/datasets/vg/defaults_vqa_instruct.yaml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  vg_vqa_instruct:
    # data_dir: ${env.data_dir}/datasets
    # One of: images | videos | features
    data_type: images

    # Training uses the instruction processor (image / qa); eval uses the
    # question processor.
    text_processor:
      train:
        name: 'blip_instruction'
        modality: image
        task: qa
      eval:
        name: 'blip_question'

    # Image processors, one per split, both at 224 px.
    vis_processor:
      train:
        image_size: 224
        name: 'clip_image_train'
      eval:
        image_size: 224
        name: 'clip_image_eval'

    build_info:
      images:
        # Alternative left disabled: vg/images/
        storage: '/export/share/datasets/vision/visual-genome/'

      # Split names are mapping keys; prefixing one with "-" would turn it
      # into a list item. Only a train split is defined.
      annotations:
        train:
          storage: 'vg/annotations/vg_qa.json'
          url: 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json'
|
LAVIS-main/lavis/configs/datasets/violin/defaults_cap.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
  violin_caption: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    # One of: images | videos | features
    data_type: videos

    # Text processors: the caption processor is used for both splits.
    text_processor:
      train:
        name: 'blip_caption'
      eval:
        name: 'blip_caption'

    # Video processors (configured under vis_processor); train and eval share
    # every setting except the name.
    vis_processor:
      train:
        name: 'alpro_video_train'
        image_size: 224
        n_frms: 4
        min_scale: 0.9
        max_scale: 1.0
        full_video: false
      eval:
        name: 'alpro_video_eval'
        image_size: 224
        n_frms: 4
        min_scale: 0.9
        max_scale: 1.0
        full_video: false

    build_info:
      # Split names are mapping keys; prefixing one with "-" would turn it
      # into a list item. Only train is active; val is left commented out.
      annotations:
        train:
          storage:
            - 'violin/annotations/train.json'
            # - /export/video-language-dataset/data/violin/annotations_lavis.json
          url:
            - 'https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json'
            # - /export/video-language-dataset/data/violin/annotations_lavis.json
        # val:
        #   url:
        #     # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json
        #     - /export/video-language-dataset/data/violin/annotations_lavis_test.json
        #   storage:
        #     # - violin/annotations/test.json
        #     - /export/video-language-dataset/data/violin/annotations_lavis_test.json

      videos:
        storage: '/export/video-language-dataset/data/violin/videos'
|
LAVIS-main/lavis/configs/datasets/violin/defaults_cap_instruct.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
violin_caption_instruct: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 4
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
full_video: False
|
| 19 |
+
eval:
|
| 20 |
+
name: alpro_video_eval
|
| 21 |
+
n_frms: 4
|
| 22 |
+
image_size: 224
|
| 23 |
+
min_scale: 0.9
|
| 24 |
+
max_scale: 1.0
|
| 25 |
+
full_video: False
|
| 26 |
+
|
| 27 |
+
text_processor:
|
| 28 |
+
train:
|
| 29 |
+
name: blip_instruction
|
| 30 |
+
modality: video
|
| 31 |
+
task: caption
|
| 32 |
+
eval:
|
| 33 |
+
name: blip_caption
|
| 34 |
+
|
| 35 |
+
build_info:
|
| 36 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 37 |
+
annotations:
|
| 38 |
+
train:
|
| 39 |
+
url:
|
| 40 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json
|
| 41 |
+
# - /export/video-language-dataset/data/violin/annotations_lavis.json
|
| 42 |
+
storage:
|
| 43 |
+
- violin/annotations/train.json
|
| 44 |
+
# - /export/video-language-dataset/data/violin/annotations_lavis.json
|
| 45 |
+
# val:
|
| 46 |
+
# url:
|
| 47 |
+
# # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json
|
| 48 |
+
# - /export/video-language-dataset/data/violin/annotations_lavis_test.json
|
| 49 |
+
# storage:
|
| 50 |
+
# # - violin/annotations/test.json
|
| 51 |
+
# - /export/video-language-dataset/data/violin/annotations_lavis_test.json
|
| 52 |
+
videos:
|
| 53 |
+
storage: /export/video-language-dataset/data/violin/videos
|
LAVIS-main/lavis/configs/datasets/violin/defaults_entail.yaml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
violin_entailment: # 22452
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 4
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
full_video: False
|
| 19 |
+
eval:
|
| 20 |
+
name: alpro_video_eval
|
| 21 |
+
n_frms: 4
|
| 22 |
+
image_size: 224
|
| 23 |
+
min_scale: 0.9
|
| 24 |
+
max_scale: 1.0
|
| 25 |
+
full_video: False
|
| 26 |
+
|
| 27 |
+
text_processor:
|
| 28 |
+
train:
|
| 29 |
+
name: blip_caption
|
| 30 |
+
eval:
|
| 31 |
+
name: blip_caption
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
build_info:
|
| 35 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 36 |
+
annotations:
|
| 37 |
+
train:
|
| 38 |
+
url:
|
| 39 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json
|
| 40 |
+
# - /export/video-language-dataset/data/violin/annotations_lavis.json
|
| 41 |
+
storage:
|
| 42 |
+
- violin/annotations/train.json
|
| 43 |
+
# - /export/video-language-dataset/data/violin/annotations_lavis.json
|
| 44 |
+
# val:
|
| 45 |
+
# url:
|
| 46 |
+
# # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json
|
| 47 |
+
# - /export/video-language-dataset/data/violin/annotations_lavis_test.json
|
| 48 |
+
# storage:
|
| 49 |
+
# # - violin/annotations/test.json
|
| 50 |
+
# - /export/video-language-dataset/data/violin/annotations_lavis_test.json
|
| 51 |
+
videos:
|
| 52 |
+
storage: /export/video-language-dataset/data/violin/videos
|
LAVIS-main/lavis/configs/datasets/violin/defaults_entail_instruct.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
violin_entailment_instruct: # 22452
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 4
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
full_video: False
|
| 19 |
+
eval:
|
| 20 |
+
name: alpro_video_eval
|
| 21 |
+
n_frms: 4
|
| 22 |
+
image_size: 224
|
| 23 |
+
min_scale: 0.9
|
| 24 |
+
max_scale: 1.0
|
| 25 |
+
full_video: False
|
| 26 |
+
|
| 27 |
+
text_processor:
|
| 28 |
+
train:
|
| 29 |
+
name: blip_caption
|
| 30 |
+
eval:
|
| 31 |
+
name: blip_caption
|
| 32 |
+
|
| 33 |
+
build_info:
|
| 34 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 35 |
+
annotations:
|
| 36 |
+
train:
|
| 37 |
+
url:
|
| 38 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json
|
| 39 |
+
# - /export/video-language-dataset/data/violin/annotations_lavis.json
|
| 40 |
+
storage:
|
| 41 |
+
- violin/annotations/train.json
|
| 42 |
+
# - /export/video-language-dataset/data/violin/annotations_lavis.json
|
| 43 |
+
# val:
|
| 44 |
+
# url:
|
| 45 |
+
# # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json
|
| 46 |
+
# - /export/video-language-dataset/data/violin/annotations_lavis_test.json
|
| 47 |
+
# storage:
|
| 48 |
+
# # - violin/annotations/test.json
|
| 49 |
+
# - /export/video-language-dataset/data/violin/annotations_lavis_test.json
|
| 50 |
+
videos:
|
| 51 |
+
storage: /export/video-language-dataset/data/violin/videos
|
LAVIS-main/lavis/configs/datasets/visdial/defaults_dial.yaml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
visdial: # name of the dataset builder
|
| 8 |
+
data_type: images # [images|videos|features]
|
| 9 |
+
|
| 10 |
+
vis_processor:
|
| 11 |
+
train:
|
| 12 |
+
name: "clip_image_train"
|
| 13 |
+
image_size: 224
|
| 14 |
+
eval:
|
| 15 |
+
name: "clip_image_eval"
|
| 16 |
+
image_size: 224
|
| 17 |
+
|
| 18 |
+
text_processor:
|
| 19 |
+
train:
|
| 20 |
+
name: blip_caption
|
| 21 |
+
eval:
|
| 22 |
+
name: blip_caption
|
| 23 |
+
|
| 24 |
+
build_info:
|
| 25 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 26 |
+
annotations:
|
| 27 |
+
train:
|
| 28 |
+
url:
|
| 29 |
+
- /export/share/datasets/vision_language/visdial/visdial_1.0_train.json
|
| 30 |
+
storage:
|
| 31 |
+
- /export/share/datasets/vision_language/visdial/visdial_1.0_train.json
|
| 32 |
+
val:
|
| 33 |
+
url:
|
| 34 |
+
- /export/share/datasets/vision_language/visdial/visdial_1.0_val.json
|
| 35 |
+
storage:
|
| 36 |
+
- /export/share/datasets/vision_language/visdial/visdial_1.0_val.json
|
| 37 |
+
# test:
|
| 38 |
+
# url: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json
|
| 39 |
+
# storage: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json
|
| 40 |
+
images:
|
| 41 |
+
storage: /export/share/datasets/vision_language/visdial/
|
LAVIS-main/lavis/configs/datasets/visdial/defaults_dial_instruct.yaml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
visdial_instruct: # name of the dataset builder
|
| 8 |
+
data_type: images # [images|videos|features]
|
| 9 |
+
|
| 10 |
+
vis_processor:
|
| 11 |
+
train:
|
| 12 |
+
name: "clip_image_train"
|
| 13 |
+
image_size: 224
|
| 14 |
+
eval:
|
| 15 |
+
name: "clip_image_eval"
|
| 16 |
+
image_size: 224
|
| 17 |
+
|
| 18 |
+
text_processor:
|
| 19 |
+
train:
|
| 20 |
+
name: blip_caption
|
| 21 |
+
eval:
|
| 22 |
+
name: blip_caption
|
| 23 |
+
|
| 24 |
+
build_info:
|
| 25 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 26 |
+
annotations:
|
| 27 |
+
train:
|
| 28 |
+
url:
|
| 29 |
+
- /export/share/datasets/vision_language/visdial/visdial_1.0_train.json
|
| 30 |
+
storage:
|
| 31 |
+
- /export/share/datasets/vision_language/visdial/visdial_1.0_train.json
|
| 32 |
+
val:
|
| 33 |
+
url:
|
| 34 |
+
- /export/share/datasets/vision_language/visdial/visdial_1.0_val.json
|
| 35 |
+
storage:
|
| 36 |
+
- /export/share/datasets/vision_language/visdial/visdial_1.0_val.json
|
| 37 |
+
# test:
|
| 38 |
+
# url: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json
|
| 39 |
+
# storage: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json
|
| 40 |
+
images:
|
| 41 |
+
storage: /export/share/datasets/vision_language/visdial/
|
LAVIS-main/lavis/configs/datasets/vizwiz/defaults.yaml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
vizwiz_vqa:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: blip_question
|
| 22 |
+
eval:
|
| 23 |
+
name: blip_question
|
| 24 |
+
|
| 25 |
+
build_info:
|
| 26 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 27 |
+
annotations:
|
| 28 |
+
val:
|
| 29 |
+
url:
|
| 30 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vizwiz/val.json
|
| 31 |
+
# - /export/share/datasets/vision/vizwiz/Annotations/val.json
|
| 32 |
+
storage:
|
| 33 |
+
- vizwiz/annotations/val.json
|
| 34 |
+
# - /export/share/datasets/vision/vizwiz/Annotations/val.json
|
| 35 |
+
test:
|
| 36 |
+
url:
|
| 37 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vizwiz/test.json
|
| 38 |
+
# - /export/share/datasets/vision/vizwiz/Annotations/test.json
|
| 39 |
+
storage:
|
| 40 |
+
- vizwiz/annotations/test.json
|
| 41 |
+
# - /export/share/datasets/vision/vizwiz/Annotations/test.json
|
| 42 |
+
images:
|
| 43 |
+
storage: /export/share/datasets/vision/vizwiz/images
|
LAVIS-main/lavis/configs/datasets/vlep/defaults_cap.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
vlep_caption: # 4900
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 4
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
full_video: False
|
| 19 |
+
eval:
|
| 20 |
+
name: alpro_video_eval
|
| 21 |
+
n_frms: 4
|
| 22 |
+
image_size: 224
|
| 23 |
+
min_scale: 0.9
|
| 24 |
+
max_scale: 1.0
|
| 25 |
+
full_video: False
|
| 26 |
+
|
| 27 |
+
text_processor:
|
| 28 |
+
train:
|
| 29 |
+
name: blip_caption
|
| 30 |
+
eval:
|
| 31 |
+
name: blip_caption
|
| 32 |
+
|
| 33 |
+
build_info:
|
| 34 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 35 |
+
annotations:
|
| 36 |
+
train:
|
| 37 |
+
url:
|
| 38 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_train_existing.json
|
| 39 |
+
# - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json
|
| 40 |
+
storage:
|
| 41 |
+
- vlep/annotations/annotations_train_existing.json
|
| 42 |
+
# - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json
|
| 43 |
+
val:
|
| 44 |
+
url:
|
| 45 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_dev_existing.json
|
| 46 |
+
# - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json
|
| 47 |
+
storage:
|
| 48 |
+
- vlep/annotations/annotations_dev_existing.json
|
| 49 |
+
# - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json
|
| 50 |
+
videos:
|
| 51 |
+
storage: /export/video-language-dataset/data/vlep/videos
|
LAVIS-main/lavis/configs/datasets/vlep/defaults_cap_instruct.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
vlep_caption_instruct: # 4900
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 4
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
full_video: False
|
| 19 |
+
eval:
|
| 20 |
+
name: alpro_video_eval
|
| 21 |
+
n_frms: 4
|
| 22 |
+
image_size: 224
|
| 23 |
+
min_scale: 0.9
|
| 24 |
+
max_scale: 1.0
|
| 25 |
+
full_video: False
|
| 26 |
+
|
| 27 |
+
text_processor:
|
| 28 |
+
train:
|
| 29 |
+
name: blip_instruction
|
| 30 |
+
modality: video
|
| 31 |
+
task: caption
|
| 32 |
+
eval:
|
| 33 |
+
name: blip_caption
|
| 34 |
+
|
| 35 |
+
build_info:
|
| 36 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 37 |
+
annotations:
|
| 38 |
+
train:
|
| 39 |
+
url:
|
| 40 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_train_existing.json
|
| 41 |
+
# - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json
|
| 42 |
+
storage:
|
| 43 |
+
- vlep/annotations/annotations_train_existing.json
|
| 44 |
+
# - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json
|
| 45 |
+
val:
|
| 46 |
+
url:
|
| 47 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_dev_existing.json
|
| 48 |
+
# - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json
|
| 49 |
+
storage:
|
| 50 |
+
- vlep/annotations/annotations_dev_existing.json
|
| 51 |
+
# - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json
|
| 52 |
+
videos:
|
| 53 |
+
storage: /export/video-language-dataset/data/vlep/videos
|
LAVIS-main/lavis/configs/datasets/vsr/defaults.yaml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
vsr_caption:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: blip_caption
|
| 22 |
+
eval:
|
| 23 |
+
name: blip_caption
|
| 24 |
+
|
| 25 |
+
build_info:
|
| 26 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 27 |
+
train:
|
| 28 |
+
url:
|
| 29 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl
|
| 30 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
|
| 31 |
+
storage:
|
| 32 |
+
- vsr/annotations/train.jsonl
|
| 33 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
|
| 34 |
+
val:
|
| 35 |
+
url:
|
| 36 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl
|
| 37 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
|
| 38 |
+
storage:
|
| 39 |
+
- vsr/annotations/dev.jsonl
|
| 40 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
|
| 41 |
+
test:
|
| 42 |
+
url:
|
| 43 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl
|
| 44 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
|
| 45 |
+
storage:
|
| 46 |
+
- vsr/annotations/test.jsonl
|
| 47 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
|
| 48 |
+
images:
|
| 49 |
+
storage: /export/share/datasets/vision_language/VSR/images
|
LAVIS-main/lavis/configs/datasets/vsr/defaults_classification.yaml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
vsr_classification:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: blip_caption
|
| 22 |
+
eval:
|
| 23 |
+
name: blip_caption
|
| 24 |
+
|
| 25 |
+
build_info:
|
| 26 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 27 |
+
train:
|
| 28 |
+
url:
|
| 29 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl
|
| 30 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
|
| 31 |
+
storage:
|
| 32 |
+
- vsr/annotations/train.jsonl
|
| 33 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
|
| 34 |
+
val:
|
| 35 |
+
url:
|
| 36 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl
|
| 37 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
|
| 38 |
+
storage:
|
| 39 |
+
- vsr/annotations/dev.jsonl
|
| 40 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
|
| 41 |
+
test:
|
| 42 |
+
url:
|
| 43 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl
|
| 44 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
|
| 45 |
+
storage:
|
| 46 |
+
- vsr/annotations/test.jsonl
|
| 47 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
|
| 48 |
+
images:
|
| 49 |
+
storage: /export/share/datasets/vision_language/VSR/images
|
LAVIS-main/lavis/configs/datasets/vsr/defaults_classification_instruct.yaml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
vsr_classification_instruct:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: blip_caption
|
| 22 |
+
eval:
|
| 23 |
+
name: blip_caption
|
| 24 |
+
|
| 25 |
+
build_info:
|
| 26 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 27 |
+
train:
|
| 28 |
+
url:
|
| 29 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl
|
| 30 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
|
| 31 |
+
storage:
|
| 32 |
+
- vsr/annotations/train.jsonl
|
| 33 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
|
| 34 |
+
val:
|
| 35 |
+
url:
|
| 36 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl
|
| 37 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
|
| 38 |
+
storage:
|
| 39 |
+
- vsr/annotations/dev.jsonl
|
| 40 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
|
| 41 |
+
test:
|
| 42 |
+
url:
|
| 43 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl
|
| 44 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
|
| 45 |
+
storage:
|
| 46 |
+
- vsr/annotations/test.jsonl
|
| 47 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
|
| 48 |
+
images:
|
| 49 |
+
storage: /export/share/datasets/vision_language/VSR/images
|
LAVIS-main/lavis/configs/datasets/vsr/defaults_instruct.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
vsr_caption_instruct:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
vis_processor:
|
| 13 |
+
train:
|
| 14 |
+
name: "clip_image_train"
|
| 15 |
+
image_size: 224
|
| 16 |
+
eval:
|
| 17 |
+
name: "clip_image_eval"
|
| 18 |
+
image_size: 224
|
| 19 |
+
|
| 20 |
+
text_processor:
|
| 21 |
+
train:
|
| 22 |
+
name: blip_instruction
|
| 23 |
+
task: caption
|
| 24 |
+
modality: image
|
| 25 |
+
eval:
|
| 26 |
+
name: blip_caption
|
| 27 |
+
|
| 28 |
+
build_info:
|
| 29 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 30 |
+
annotations:
|
| 31 |
+
train:
|
| 32 |
+
url:
|
| 33 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl
|
| 34 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
|
| 35 |
+
storage:
|
| 36 |
+
- vsr/annotations/train.jsonl
|
| 37 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
|
| 38 |
+
val:
|
| 39 |
+
url:
|
| 40 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl
|
| 41 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
|
| 42 |
+
storage:
|
| 43 |
+
- vsr/annotations/dev.jsonl
|
| 44 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
|
| 45 |
+
test:
|
| 46 |
+
url:
|
| 47 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl
|
| 48 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
|
| 49 |
+
storage:
|
| 50 |
+
- vsr/annotations/test.jsonl
|
| 51 |
+
# - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
|
| 52 |
+
images:
|
| 53 |
+
storage: /export/share/datasets/vision_language/VSR/images
|
LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap.yaml
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
datasets:
|
| 6 |
+
wavcaps_mm_caption: # name of the dataset builder
|
| 7 |
+
audio_processor:
|
| 8 |
+
train:
|
| 9 |
+
name: beats_audio
|
| 10 |
+
sampling_rate: 16000
|
| 11 |
+
n_frames: 2
|
| 12 |
+
frame_length: 512
|
| 13 |
+
eval:
|
| 14 |
+
name: beats_audio
|
| 15 |
+
sampling_rate: 16000
|
| 16 |
+
n_frames: 2
|
| 17 |
+
frame_length: 512
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: blip_caption
|
| 22 |
+
eval:
|
| 23 |
+
name: blip_caption
|
| 24 |
+
|
| 25 |
+
data_type: [audio]
|
| 26 |
+
|
| 27 |
+
build_info:
|
| 28 |
+
kwargs:
|
| 29 |
+
cached: False
|
| 30 |
+
cached_dir: /export/share/datasets/audio/WavCaps/beats_features/
|
| 31 |
+
|
| 32 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 33 |
+
annotations:
|
| 34 |
+
train:
|
| 35 |
+
url:
|
| 36 |
+
- https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/BBC_Sound_Effects/bbc_final.json
|
| 37 |
+
- https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/FreeSound/fsd_final.json
|
| 38 |
+
- https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/SoundBible/sb_final.json
|
| 39 |
+
- https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/AudioSet_SL/as_final.json
|
| 40 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/wavcaps/json_data.json
|
| 41 |
+
storage:
|
| 42 |
+
- wavcaps/json_files/BBC_Sound_Effects/bbc_final.json
|
| 43 |
+
- wavcaps/json_files/FreeSound/fsd_final.json
|
| 44 |
+
- wavcaps/json_files/SoundBible/sb_final.json
|
| 45 |
+
- wavcaps/json_files/AudioSet_SL/as_final.json
|
| 46 |
+
- wavcaps/annotations/json_data.json
|
| 47 |
+
# train:
|
| 48 |
+
# url:
|
| 49 |
+
# - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
|
| 50 |
+
# - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
|
| 51 |
+
# - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
|
| 52 |
+
# - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
|
| 53 |
+
# - /export/share/datasets/audio/WavCaps/json_data.json
|
| 54 |
+
# storage:
|
| 55 |
+
# - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
|
| 56 |
+
# - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
|
| 57 |
+
# - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
|
| 58 |
+
# - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
|
| 59 |
+
# - /export/share/datasets/audio/WavCaps/json_data.json
|
| 60 |
+
|
| 61 |
+
audio:
|
| 62 |
+
storage: /export/share/datasets/audio/WavCaps/
|
| 63 |
+
|
LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
datasets:
|
| 6 |
+
wavcaps_mm_caption_instruct: # name of the dataset builder
|
| 7 |
+
audio_processor:
|
| 8 |
+
train:
|
| 9 |
+
name: beats_audio
|
| 10 |
+
sampling_rate: 16000
|
| 11 |
+
n_frames: 2
|
| 12 |
+
frame_length: 512
|
| 13 |
+
eval:
|
| 14 |
+
name: beats_audio
|
| 15 |
+
sampling_rate: 16000
|
| 16 |
+
n_frames: 2
|
| 17 |
+
frame_length: 512
|
| 18 |
+
text_processor:
|
| 19 |
+
train:
|
| 20 |
+
name: "blip_instruction"
|
| 21 |
+
modality: audio
|
| 22 |
+
task: caption
|
| 23 |
+
eval:
|
| 24 |
+
name: "blip_caption"
|
| 25 |
+
|
| 26 |
+
data_type: [audio]
|
| 27 |
+
|
| 28 |
+
build_info:
|
| 29 |
+
kwargs:
|
| 30 |
+
cached: True
|
| 31 |
+
cached_dir: /export/share/datasets/audio/WavCaps/beats_features/
|
| 32 |
+
|
| 33 |
+
# Do not put a minus sign (-) in front of a split name (e.g. train); YAML would then parse it as a list item.
|
| 34 |
+
annotations:
|
| 35 |
+
train:
|
| 36 |
+
# url:
|
| 37 |
+
# - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
|
| 38 |
+
# - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
|
| 39 |
+
# - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
|
| 40 |
+
# - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
|
| 41 |
+
# - /export/share/datasets/audio/WavCaps/json_data.json
|
| 42 |
+
# storage:
|
| 43 |
+
# - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
|
| 44 |
+
# - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
|
| 45 |
+
# - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
|
| 46 |
+
# - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
|
| 47 |
+
# - /export/share/datasets/audio/WavCaps/json_data.json
|
| 48 |
+
url:
|
| 49 |
+
- https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/BBC_Sound_Effects/bbc_final.json
|
| 50 |
+
- https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/FreeSound/fsd_final.json
|
| 51 |
+
- https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/SoundBible/sb_final.json
|
| 52 |
+
- https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/AudioSet_SL/as_final.json
|
| 53 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/wavcaps/json_data.json
|
| 54 |
+
storage:
|
| 55 |
+
- wavcaps/json_files/BBC_Sound_Effects/bbc_final.json
|
| 56 |
+
- wavcaps/json_files/FreeSound/fsd_final.json
|
| 57 |
+
- wavcaps/json_files/SoundBible/sb_final.json
|
| 58 |
+
- wavcaps/json_files/AudioSet_SL/as_final.json
|
| 59 |
+
- wavcaps/annotations/json_data.json
|
| 60 |
+
|
| 61 |
+
audio:
|
| 62 |
+
storage: /export/share/datasets/audio/WavCaps/
|
| 63 |
+
|
LAVIS-main/lavis/configs/datasets/webvid/defaults_cap.yaml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
webvid2m_caption: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 5
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
eval:
|
| 19 |
+
name: alpro_video_eval
|
| 20 |
+
n_frms: 5
|
| 21 |
+
image_size: 224
|
| 22 |
+
min_scale: 0.9
|
| 23 |
+
max_scale: 1.0
|
| 24 |
+
text_processor:
|
| 25 |
+
train:
|
| 26 |
+
name: "blip_caption"
|
| 27 |
+
eval:
|
| 28 |
+
name: "blip_caption"
|
| 29 |
+
|
| 30 |
+
build_info:
|
| 31 |
+
# Do not put a minus sign (-) in front of a split name (e.g. train); YAML would then parse it as a list item.
|
| 32 |
+
annotations:
|
| 33 |
+
train:
|
| 34 |
+
url:
|
| 35 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/webvid2m/train.json
|
| 36 |
+
# - /export/home/LAVIS/webvid_annotation.json
|
| 37 |
+
storage:
|
| 38 |
+
- webvid2m/annotations/train.json
|
| 39 |
+
# - /export/home/LAVIS/webvid_annotation.json
|
| 40 |
+
images:
|
| 41 |
+
storage: /export/video-language-dataset/data/webvid2m/postprocess/downsampled_videos
|
LAVIS-main/lavis/configs/datasets/webvid/defaults_cap_instruct.yaml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
webvid2m_caption_instruct: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 5
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
eval:
|
| 19 |
+
name: alpro_video_eval
|
| 20 |
+
n_frms: 5
|
| 21 |
+
image_size: 224
|
| 22 |
+
min_scale: 0.9
|
| 23 |
+
max_scale: 1.0
|
| 24 |
+
text_processor:
|
| 25 |
+
train:
|
| 26 |
+
name: "blip_instruction"
|
| 27 |
+
modality: video
|
| 28 |
+
task: caption
|
| 29 |
+
eval:
|
| 30 |
+
name: "blip_caption"
|
| 31 |
+
|
| 32 |
+
build_info:
|
| 33 |
+
# Do not put a minus sign (-) in front of a split name (e.g. train); YAML would then parse it as a list item.
|
| 34 |
+
annotations:
|
| 35 |
+
train:
|
| 36 |
+
url:
|
| 37 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/webvid2m/train.json
|
| 38 |
+
# - /export/home/LAVIS/webvid_annotation.json
|
| 39 |
+
storage:
|
| 40 |
+
- webvid2m/annotations/train.json
|
| 41 |
+
# - /export/home/LAVIS/webvid_annotation.json
|
| 42 |
+
images:
|
| 43 |
+
storage: /export/video-language-dataset/data/webvid2m/postprocess/downsampled_videos
|
LAVIS-main/lavis/configs/datasets/youcook/defaults_cap.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
youcook_caption: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 4
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
full_video: False
|
| 19 |
+
eval:
|
| 20 |
+
name: alpro_video_eval
|
| 21 |
+
n_frms: 4
|
| 22 |
+
image_size: 224
|
| 23 |
+
min_scale: 0.9
|
| 24 |
+
max_scale: 1.0
|
| 25 |
+
full_video: False
|
| 26 |
+
|
| 27 |
+
text_processor:
|
| 28 |
+
train:
|
| 29 |
+
name: blip_caption
|
| 30 |
+
eval:
|
| 31 |
+
name: blip_caption
|
| 32 |
+
|
| 33 |
+
build_info:
|
| 34 |
+
# Do not put a minus sign (-) in front of a split name (e.g. train, val); YAML would then parse it as a list item.
|
| 35 |
+
annotations:
|
| 36 |
+
train:
|
| 37 |
+
url:
|
| 38 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/train_annotations.json
|
| 39 |
+
# - /export/video-language-dataset/data/youcook/annotations/train_annotations.json
|
| 40 |
+
storage:
|
| 41 |
+
- youcook/annotations/train_annotations.json
|
| 42 |
+
# - /export/video-language-dataset/data/youcook/annotations/train_annotations.json
|
| 43 |
+
val:
|
| 44 |
+
url:
|
| 45 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/val_annotations.json
|
| 46 |
+
# - /export/video-language-dataset/data/youcook/annotations/val_annotations.json
|
| 47 |
+
storage:
|
| 48 |
+
- youcook/annotations/val_annotations.json
|
| 49 |
+
# - /export/video-language-dataset/data/youcook/annotations/val_annotations.json
|
| 50 |
+
videos:
|
| 51 |
+
storage: /export/video-language-dataset/data/youcook/raw_videos
|
LAVIS-main/lavis/configs/datasets/youcook/defaults_cap_instruct.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
youcook_caption_instruct: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 4
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
full_video: False
|
| 19 |
+
eval:
|
| 20 |
+
name: alpro_video_eval
|
| 21 |
+
n_frms: 4
|
| 22 |
+
image_size: 224
|
| 23 |
+
min_scale: 0.9
|
| 24 |
+
max_scale: 1.0
|
| 25 |
+
full_video: False
|
| 26 |
+
|
| 27 |
+
text_processor:
|
| 28 |
+
train:
|
| 29 |
+
name: blip_instruction
|
| 30 |
+
modality: video
|
| 31 |
+
task: caption
|
| 32 |
+
eval:
|
| 33 |
+
name: blip_caption
|
| 34 |
+
|
| 35 |
+
build_info:
|
| 36 |
+
# Do not put a minus sign (-) in front of a split name (e.g. train, val); YAML would then parse it as a list item.
|
| 37 |
+
annotations:
|
| 38 |
+
train:
|
| 39 |
+
url:
|
| 40 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/train_annotations.json
|
| 41 |
+
# - /export/video-language-dataset/data/youcook/annotations/train_annotations.json
|
| 42 |
+
storage:
|
| 43 |
+
- youcook/annotations/train_annotations.json
|
| 44 |
+
# - /export/video-language-dataset/data/youcook/annotations/train_annotations.json
|
| 45 |
+
val:
|
| 46 |
+
url:
|
| 47 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/val_annotations.json
|
| 48 |
+
# - /export/video-language-dataset/data/youcook/annotations/val_annotations.json
|
| 49 |
+
storage:
|
| 50 |
+
- youcook/annotations/val_annotations.json
|
| 51 |
+
# - /export/video-language-dataset/data/youcook/annotations/val_annotations.json
|
| 52 |
+
videos:
|
| 53 |
+
storage: /export/video-language-dataset/data/youcook/raw_videos
|
LAVIS-main/lavis/configs/datasets/yt8m/defaults_mm_dial.yaml
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
yt8m_mm_dialogue: # name of the dataset builder
|
| 8 |
+
data_type: [video] #extracted features of videos (I3D, VGGish) # [images|videos|features]
|
| 9 |
+
|
| 10 |
+
video_processor:
|
| 11 |
+
train:
|
| 12 |
+
name: alpro_video_train
|
| 13 |
+
n_frms: 4
|
| 14 |
+
image_size: 224
|
| 15 |
+
min_scale: 0.9
|
| 16 |
+
max_scale: 1.0
|
| 17 |
+
full_video: False
|
| 18 |
+
eval:
|
| 19 |
+
name: alpro_video_eval
|
| 20 |
+
n_frms: 4
|
| 21 |
+
image_size: 224
|
| 22 |
+
min_scale: 0.9
|
| 23 |
+
max_scale: 1.0
|
| 24 |
+
full_video: False
|
| 25 |
+
|
| 26 |
+
audio_processor:
|
| 27 |
+
train:
|
| 28 |
+
name: beats_audio
|
| 29 |
+
# sampling_rate: 16000
|
| 30 |
+
eval:
|
| 31 |
+
name: beats_audio
|
| 32 |
+
# sampling_rate: 16000
|
| 33 |
+
is_eval: True
|
| 34 |
+
|
| 35 |
+
text_processor:
|
| 36 |
+
train:
|
| 37 |
+
name: blip_caption
|
| 38 |
+
eval:
|
| 39 |
+
name: blip_caption
|
| 40 |
+
|
| 41 |
+
build_info:
|
| 42 |
+
# Do not put a minus sign (-) in front of a split name (e.g. train, val); YAML would then parse it as a list item.
|
| 43 |
+
annotations:
|
| 44 |
+
train:
|
| 45 |
+
url:
|
| 46 |
+
- /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/train.json
|
| 47 |
+
storage:
|
| 48 |
+
- /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/train.json
|
| 49 |
+
val:
|
| 50 |
+
url:
|
| 51 |
+
- /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/validation.json
|
| 52 |
+
storage:
|
| 53 |
+
- /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/validation.json
|
| 54 |
+
|
| 55 |
+
templates: null
|
| 56 |
+
|
| 57 |
+
audio:
|
| 58 |
+
storage: /export/video-language-dataset/data/yt-8m/audios
|
| 59 |
+
|
| 60 |
+
video:
|
| 61 |
+
storage: /export/video-language-dataset/data/yt-8m/videos
|
| 62 |
+
|
LAVIS-main/lavis/configs/models/albef_classification_ve.yaml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
model:
|
| 7 |
+
arch: albef_classification
|
| 8 |
+
load_finetuned: True
|
| 9 |
+
|
| 10 |
+
finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt"
|
| 11 |
+
pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
|
| 12 |
+
|
| 13 |
+
num_classes: 3
|
| 14 |
+
|
| 15 |
+
use_distill: True
|
| 16 |
+
momentum: 0.995
|
| 17 |
+
alpha: 0.4
|
| 18 |
+
|
| 19 |
+
# vit encoder
|
| 20 |
+
vit_type: "base"
|
| 21 |
+
vit_grad_ckpt: False
|
| 22 |
+
vit_ckpt_layer: 0
|
| 23 |
+
vit_layer_norm_epsilon: 1e-6
|
| 24 |
+
|
| 25 |
+
image_size: 384
|
| 26 |
+
|
| 27 |
+
# bert config
|
| 28 |
+
med_config_path: "configs/models/med_config_albef.json"
|
| 29 |
+
|
| 30 |
+
preprocess:
|
| 31 |
+
vis_processor:
|
| 32 |
+
train:
|
| 33 |
+
name: "blip_image_train"
|
| 34 |
+
eval:
|
| 35 |
+
name: "blip_image_eval"
|
| 36 |
+
text_processor:
|
| 37 |
+
train:
|
| 38 |
+
name: "blip_caption"
|
| 39 |
+
eval:
|
| 40 |
+
name: "blip_caption"
|
LAVIS-main/lavis/configs/models/albef_feature_extractor.yaml
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
model:
|
| 7 |
+
arch: albef_pretrain
|
| 8 |
+
pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
|
| 9 |
+
|
| 10 |
+
# vit encoder
|
| 11 |
+
vit_type: "base"
|
| 12 |
+
image_size: 224
|
| 13 |
+
vit_ckpt_layer: 0
|
| 14 |
+
vit_drop_path_rate: 0
|
| 15 |
+
vit_layer_norm_epsilon: 1e-6
|
| 16 |
+
vit_grad_ckpt: False
|
| 17 |
+
|
| 18 |
+
# bert config
|
| 19 |
+
med_config_path: "configs/models/med_config_albef.json"
|
| 20 |
+
|
| 21 |
+
embed_dim: 256
|
| 22 |
+
|
| 23 |
+
preprocess:
|
| 24 |
+
vis_processor:
|
| 25 |
+
eval:
|
| 26 |
+
name: "blip_image_eval"
|
| 27 |
+
image_size: 224
|
| 28 |
+
text_processor:
|
| 29 |
+
eval:
|
| 30 |
+
name: "blip_caption"
|
LAVIS-main/lavis/configs/models/albef_nlvr.yaml
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
model:
|
| 7 |
+
arch: albef_nlvr
|
| 8 |
+
load_finetuned: True
|
| 9 |
+
|
| 10 |
+
pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth"
|
| 11 |
+
finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt"
|
| 12 |
+
|
| 13 |
+
num_classes: 2
|
| 14 |
+
|
| 15 |
+
use_distill: True
|
| 16 |
+
momentum: 0.995
|
| 17 |
+
alpha: 0.4
|
| 18 |
+
|
| 19 |
+
# vit encoder
|
| 20 |
+
vit_type: "base"
|
| 21 |
+
vit_grad_ckpt: False
|
| 22 |
+
vit_ckpt_layer: 0
|
| 23 |
+
vit_layer_norm_epsilon: 1e-6
|
| 24 |
+
|
| 25 |
+
image_size: 384
|
| 26 |
+
|
| 27 |
+
# bert config
|
| 28 |
+
med_config_path: "configs/models/med_config_albef.json"
|
| 29 |
+
|
| 30 |
+
preprocess:
|
| 31 |
+
vis_processor:
|
| 32 |
+
train:
|
| 33 |
+
name: "blip_image_train"
|
| 34 |
+
image_size: 384
|
| 35 |
+
eval:
|
| 36 |
+
name: "blip_image_eval"
|
| 37 |
+
image_size: 384
|
| 38 |
+
text_processor:
|
| 39 |
+
train:
|
| 40 |
+
name: "blip_caption"
|
| 41 |
+
eval:
|
| 42 |
+
name: "blip_caption"
|
LAVIS-main/lavis/configs/models/albef_pretrain_base.yaml
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
model:
|
| 7 |
+
arch: albef_pretrain
|
| 8 |
+
|
| 9 |
+
load_pretrained: True
|
| 10 |
+
pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
|
| 11 |
+
|
| 12 |
+
# vit encoder
|
| 13 |
+
vit_type: "base"
|
| 14 |
+
image_size: 224
|
| 15 |
+
vit_ckpt_layer: 0
|
| 16 |
+
vit_drop_path_rate: 0
|
| 17 |
+
vit_layer_norm_epsilon: 1e-6
|
| 18 |
+
vit_grad_ckpt: False
|
| 19 |
+
|
| 20 |
+
# bert config
|
| 21 |
+
med_config_path: "configs/models/med_config_albef.json"
|
| 22 |
+
mlm_mask_prob: 0.15
|
| 23 |
+
|
| 24 |
+
embed_dim: 256
|
| 25 |
+
momentum: 0.995
|
| 26 |
+
alpha: 0.4
|
| 27 |
+
temp: 0.07
|
| 28 |
+
|
| 29 |
+
max_txt_len: 30
|
| 30 |
+
|
| 31 |
+
preprocess:
|
| 32 |
+
vis_processor:
|
| 33 |
+
train:
|
| 34 |
+
name: "blip_image_train"
|
| 35 |
+
image_size: 256
|
| 36 |
+
text_processor:
|
| 37 |
+
train:
|
| 38 |
+
name: "blip_caption"
|
LAVIS-main/lavis/configs/models/albef_retrieval_coco.yaml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
model:
|
| 7 |
+
arch: albef_retrieval
|
| 8 |
+
load_finetuned: True
|
| 9 |
+
|
| 10 |
+
pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
|
| 11 |
+
finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt"
|
| 12 |
+
|
| 13 |
+
queue_size: 65536
|
| 14 |
+
|
| 15 |
+
# vit encoder
|
| 16 |
+
vit_type: "base"
|
| 17 |
+
image_size: 384
|
| 18 |
+
vit_ckpt_layer: 0
|
| 19 |
+
vit_drop_path_rate: 0
|
| 20 |
+
vit_layer_norm_epsilon: 1e-6
|
| 21 |
+
vit_grad_ckpt: False
|
| 22 |
+
|
| 23 |
+
# bert config
|
| 24 |
+
med_config_path: "configs/models/med_config_albef.json"
|
| 25 |
+
|
| 26 |
+
embed_dim: 256
|
| 27 |
+
momentum: 0.995
|
| 28 |
+
alpha: 0.4
|
| 29 |
+
temp: 0.07
|
| 30 |
+
use_distill: True
|
| 31 |
+
|
| 32 |
+
max_txt_len: 30
|
| 33 |
+
|
| 34 |
+
preprocess:
|
| 35 |
+
vis_processor:
|
| 36 |
+
train:
|
| 37 |
+
name: "blip_image_train"
|
| 38 |
+
image_size: 384
|
| 39 |
+
eval:
|
| 40 |
+
name: "blip_image_eval"
|
| 41 |
+
image_size: 384
|
| 42 |
+
text_processor:
|
| 43 |
+
train:
|
| 44 |
+
name: "blip_caption"
|
| 45 |
+
eval:
|
| 46 |
+
name: "blip_caption"
|
LAVIS-main/lavis/configs/models/albef_retrieval_flickr.yaml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
model:
|
| 7 |
+
arch: albef_retrieval
|
| 8 |
+
load_finetuned: True
|
| 9 |
+
|
| 10 |
+
pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
|
| 11 |
+
finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt
|
| 12 |
+
|
| 13 |
+
queue_size: 65536
|
| 14 |
+
|
| 15 |
+
# vit encoder
|
| 16 |
+
vit_type: "base"
|
| 17 |
+
image_size: 384
|
| 18 |
+
vit_ckpt_layer: 0
|
| 19 |
+
vit_drop_path_rate: 0
|
| 20 |
+
vit_layer_norm_epsilon: 1e-6
|
| 21 |
+
vit_grad_ckpt: False
|
| 22 |
+
|
| 23 |
+
# bert config
|
| 24 |
+
med_config_path: "configs/models/med_config_albef.json"
|
| 25 |
+
|
| 26 |
+
embed_dim: 256
|
| 27 |
+
momentum: 0.995
|
| 28 |
+
alpha: 0.4
|
| 29 |
+
temp: 0.07
|
| 30 |
+
use_distill: True
|
| 31 |
+
|
| 32 |
+
max_txt_len: 30
|
| 33 |
+
|
| 34 |
+
preprocess:
|
| 35 |
+
vis_processor:
|
| 36 |
+
train:
|
| 37 |
+
name: "blip_image_train"
|
| 38 |
+
image_size: 384
|
| 39 |
+
eval:
|
| 40 |
+
name: "blip_image_eval"
|
| 41 |
+
image_size: 384
|
| 42 |
+
text_processor:
|
| 43 |
+
train:
|
| 44 |
+
name: "blip_caption"
|
| 45 |
+
eval:
|
| 46 |
+
name: "blip_caption"
|
LAVIS-main/lavis/configs/models/albef_vqav2.yaml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
model:
|
| 7 |
+
arch: albef_vqa
|
| 8 |
+
load_finetuned: True
|
| 9 |
+
|
| 10 |
+
pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
|
| 11 |
+
finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt"
|
| 12 |
+
|
| 13 |
+
use_distill: True
|
| 14 |
+
momentum: 0.995
|
| 15 |
+
alpha: 0.4
|
| 16 |
+
|
| 17 |
+
# vit encoder
|
| 18 |
+
vit_type: "base"
|
| 19 |
+
vit_grad_ckpt: False
|
| 20 |
+
vit_ckpt_layer: 0
|
| 21 |
+
vit_layer_norm_epsilon: 1e-6
|
| 22 |
+
|
| 23 |
+
image_size: 384
|
| 24 |
+
|
| 25 |
+
# bert config
|
| 26 |
+
med_config_path: "configs/models/med_config_albef.json"
|
| 27 |
+
|
| 28 |
+
preprocess:
|
| 29 |
+
vis_processor:
|
| 30 |
+
train:
|
| 31 |
+
name: "blip_image_train"
|
| 32 |
+
image_size: 384
|
| 33 |
+
eval:
|
| 34 |
+
name: "blip_image_eval"
|
| 35 |
+
image_size: 384
|
| 36 |
+
text_processor:
|
| 37 |
+
train:
|
| 38 |
+
name: "blip_question"
|
| 39 |
+
eval:
|
| 40 |
+
name: "blip_question"
|
LAVIS-main/lavis/configs/models/alpro_qa_msrvtt.yaml
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
model:
|
| 7 |
+
arch: alpro_qa
|
| 8 |
+
num_classes: 1500
|
| 9 |
+
|
| 10 |
+
load_finetuned: True
|
| 11 |
+
|
| 12 |
+
finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth"
|
| 13 |
+
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
|
| 14 |
+
|
| 15 |
+
timesformer:
|
| 16 |
+
n_frms: 16
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
patch_size: 16
|
| 20 |
+
attn_drop_rate: 0.
|
| 21 |
+
drop_rate: 0.
|
| 22 |
+
drop_path_rate: 0.1
|
| 23 |
+
|
| 24 |
+
use_grad_ckpt: True
|
| 25 |
+
ckpt_layer: 12
|
| 26 |
+
|
| 27 |
+
# bert config
|
| 28 |
+
med_config_path: "configs/models/bert_config_alpro.json"
|
| 29 |
+
|
| 30 |
+
preprocess:
|
| 31 |
+
vis_processor:
|
| 32 |
+
train:
|
| 33 |
+
name: "alpro_video_train"
|
| 34 |
+
n_frms: 16
|
| 35 |
+
image_size: 224
|
| 36 |
+
eval:
|
| 37 |
+
name: "alpro_video_eval"
|
| 38 |
+
n_frms: 16
|
| 39 |
+
image_size: 224
|
| 40 |
+
text_processor:
|
| 41 |
+
train:
|
| 42 |
+
name: "blip_caption"
|
| 43 |
+
eval:
|
| 44 |
+
name: "blip_caption"
|
LAVIS-main/lavis/configs/models/alpro_qa_msvd.yaml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
model:
|
| 7 |
+
arch: alpro_qa
|
| 8 |
+
num_classes: 2423
|
| 9 |
+
|
| 10 |
+
load_finetuned: True
|
| 11 |
+
|
| 12 |
+
finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth"
|
| 13 |
+
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
|
| 14 |
+
|
| 15 |
+
timesformer:
|
| 16 |
+
n_frms: 16
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
patch_size: 16
|
| 20 |
+
attn_drop_rate: 0.
|
| 21 |
+
drop_rate: 0.
|
| 22 |
+
drop_path_rate: 0.1
|
| 23 |
+
use_grad_ckpt: True
|
| 24 |
+
ckpt_layer: 12
|
| 25 |
+
|
| 26 |
+
# bert config
|
| 27 |
+
med_config_path: "configs/models/bert_config_alpro.json"
|
| 28 |
+
|
| 29 |
+
preprocess:
|
| 30 |
+
vis_processor:
|
| 31 |
+
train:
|
| 32 |
+
name: "alpro_video_train"
|
| 33 |
+
n_frms: 16
|
| 34 |
+
image_size: 224
|
| 35 |
+
eval:
|
| 36 |
+
name: "alpro_video_eval"
|
| 37 |
+
n_frms: 16
|
| 38 |
+
image_size: 224
|
| 39 |
+
text_processor:
|
| 40 |
+
train:
|
| 41 |
+
name: "blip_caption"
|
| 42 |
+
eval:
|
| 43 |
+
name: "blip_caption"
|
LAVIS-main/lavis/configs/models/alpro_retrieval_didemo.yaml
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
model:
|
| 7 |
+
arch: alpro_retrieval
|
| 8 |
+
|
| 9 |
+
load_finetuned: True
|
| 10 |
+
|
| 11 |
+
finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt
|
| 12 |
+
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
|
| 13 |
+
|
| 14 |
+
timesformer:
|
| 15 |
+
n_frms: 8
|
| 16 |
+
image_size: 224
|
| 17 |
+
|
| 18 |
+
patch_size: 16
|
| 19 |
+
attn_drop_rate: 0.
|
| 20 |
+
drop_rate: 0.
|
| 21 |
+
drop_path_rate: 0.1
|
| 22 |
+
use_grad_ckpt: False
|
| 23 |
+
|
| 24 |
+
# bert config
|
| 25 |
+
med_config_path: "configs/models/bert_config_alpro.json"
|
| 26 |
+
|
| 27 |
+
preprocess:
|
| 28 |
+
vis_processor:
|
| 29 |
+
eval:
|
| 30 |
+
name: "alpro_video_eval"
|
| 31 |
+
n_frms: 8
|
| 32 |
+
image_size: 224
|
| 33 |
+
text_processor:
|
| 34 |
+
eval:
|
| 35 |
+
name: "blip_caption"
|
LAVIS-main/lavis/configs/models/alpro_retrieval_msrvtt.yaml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
model:
|
| 7 |
+
arch: alpro_retrieval
|
| 8 |
+
|
| 9 |
+
load_finetuned: True
|
| 10 |
+
|
| 11 |
+
finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt"
|
| 12 |
+
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
|
| 13 |
+
|
| 14 |
+
timesformer:
|
| 15 |
+
n_frms: 8
|
| 16 |
+
image_size: 224
|
| 17 |
+
|
| 18 |
+
patch_size: 16
|
| 19 |
+
attn_drop_rate: 0.
|
| 20 |
+
drop_rate: 0.
|
| 21 |
+
drop_path_rate: 0.1
|
| 22 |
+
use_grad_ckpt: False
|
| 23 |
+
|
| 24 |
+
# bert config
|
| 25 |
+
med_config_path: "configs/models/bert_config_alpro.json"
|
| 26 |
+
|
| 27 |
+
preprocess:
|
| 28 |
+
vis_processor:
|
| 29 |
+
train:
|
| 30 |
+
name: "alpro_video_train"
|
| 31 |
+
n_frms: 8
|
| 32 |
+
image_size: 224
|
| 33 |
+
eval:
|
| 34 |
+
name: "alpro_video_eval"
|
| 35 |
+
n_frms: 8
|
| 36 |
+
image_size: 224
|
| 37 |
+
text_processor:
|
| 38 |
+
train:
|
| 39 |
+
name: "blip_caption"
|
| 40 |
+
eval:
|
| 41 |
+
name: "blip_caption"
|
LAVIS-main/lavis/configs/models/bert_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertModel"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"hidden_act": "gelu",
|
| 7 |
+
"hidden_dropout_prob": 0.1,
|
| 8 |
+
"hidden_size": 768,
|
| 9 |
+
"initializer_range": 0.02,
|
| 10 |
+
"intermediate_size": 3072,
|
| 11 |
+
"layer_norm_eps": 1e-12,
|
| 12 |
+
"max_position_embeddings": 512,
|
| 13 |
+
"model_type": "bert",
|
| 14 |
+
"num_attention_heads": 12,
|
| 15 |
+
"num_hidden_layers": 12,
|
| 16 |
+
"pad_token_id": 0,
|
| 17 |
+
"add_type_embeddings": false,
|
| 18 |
+
"vocab_size": 30522,
|
| 19 |
+
"encoder_width": 768,
|
| 20 |
+
"add_cross_attention": true
|
| 21 |
+
}
|
LAVIS-main/lavis/configs/models/bert_config_alpro.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertModel"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"hidden_act": "gelu",
|
| 7 |
+
"hidden_dropout_prob": 0.1,
|
| 8 |
+
"hidden_size": 768,
|
| 9 |
+
"initializer_range": 0.02,
|
| 10 |
+
"intermediate_size": 3072,
|
| 11 |
+
"layer_norm_eps": 1e-12,
|
| 12 |
+
"max_position_embeddings": 512,
|
| 13 |
+
"model_type": "bert",
|
| 14 |
+
"num_attention_heads": 12,
|
| 15 |
+
"num_hidden_layers": 12,
|
| 16 |
+
"pad_token_id": 0,
|
| 17 |
+
"add_type_embeddings": true,
|
| 18 |
+
"type_vocab_size": 2,
|
| 19 |
+
"vocab_size": 30522,
|
| 20 |
+
"encoder_width": 768,
|
| 21 |
+
"add_cross_attention": false,
|
| 22 |
+
"fusion_layer": 6
|
| 23 |
+
}
|
LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
vit_model: "clip_L"
|
| 3 |
+
|
| 4 |
+
qformer_num_query_token: 16
|
| 5 |
+
qformer_cross_attention_freq: 1
|
| 6 |
+
|
| 7 |
+
sd_train_text_encoder: False
|
| 8 |
+
sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
|
| 9 |
+
|
| 10 |
+
load_finetuned: False
|
| 11 |
+
load_pretrained: True
|
| 12 |
+
# pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
|
| 13 |
+
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz"
|
| 14 |
+
|
| 15 |
+
preprocess:
|
| 16 |
+
vis_processor:
|
| 17 |
+
train:
|
| 18 |
+
name: "blip_diffusion_inp_image_eval"
|
| 19 |
+
eval:
|
| 20 |
+
name: "blip_diffusion_inp_image_eval"
|
| 21 |
+
text_processor:
|
| 22 |
+
train:
|
| 23 |
+
name: "blip_caption"
|
| 24 |
+
eval:
|
| 25 |
+
name: "blip_caption"
|
LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
vit_model: "clip_L"
|
| 3 |
+
|
| 4 |
+
qformer_num_query_token: 16
|
| 5 |
+
qformer_cross_attention_freq: 1
|
| 6 |
+
|
| 7 |
+
sd_train_text_encoder: False
|
| 8 |
+
sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
|
| 9 |
+
|
| 10 |
+
load_finetuned: False
|
| 11 |
+
load_pretrained: True
|
| 12 |
+
# pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
|
| 13 |
+
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz"
|
| 14 |
+
|
| 15 |
+
controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-canny"
|
| 16 |
+
|
| 17 |
+
preprocess:
|
| 18 |
+
vis_processor:
|
| 19 |
+
train:
|
| 20 |
+
name: "blip_diffusion_inp_image_eval"
|
| 21 |
+
eval:
|
| 22 |
+
name: "blip_diffusion_inp_image_eval"
|
| 23 |
+
text_processor:
|
| 24 |
+
train:
|
| 25 |
+
name: "blip_caption"
|
| 26 |
+
eval:
|
| 27 |
+
name: "blip_caption"
|
LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
vit_model: "clip_L"
|
| 3 |
+
|
| 4 |
+
qformer_num_query_token: 16
|
| 5 |
+
qformer_cross_attention_freq: 1
|
| 6 |
+
|
| 7 |
+
sd_train_text_encoder: False
|
| 8 |
+
sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
|
| 9 |
+
|
| 10 |
+
load_finetuned: False
|
| 11 |
+
load_pretrained: True
|
| 12 |
+
# pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
|
| 13 |
+
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz"
|
| 14 |
+
|
| 15 |
+
controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-depth"
|
| 16 |
+
|
| 17 |
+
preprocess:
|
| 18 |
+
vis_processor:
|
| 19 |
+
train:
|
| 20 |
+
name: "blip_diffusion_inp_image_eval"
|
| 21 |
+
eval:
|
| 22 |
+
name: "blip_diffusion_inp_image_eval"
|
| 23 |
+
text_processor:
|
| 24 |
+
train:
|
| 25 |
+
name: "blip_caption"
|
| 26 |
+
eval:
|
| 27 |
+
name: "blip_caption"
|