yuccaaa committed on Sep 3, 2025

Commit

5c8f92e

verified ·

1 Parent(s): 9627ce0

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap.yaml +51 -0
LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap_instruct.yaml +53 -0
LAVIS-main/lavis/configs/datasets/snli_ve/defaults.yaml +25 -0
LAVIS-main/lavis/configs/datasets/snli_ve/defaults_instruct.yaml +49 -0
LAVIS-main/lavis/configs/datasets/textcaps/defaults.yaml +46 -0
LAVIS-main/lavis/configs/datasets/textcaps/defaults_instruct.yaml +47 -0
LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap.yaml +68 -0
LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap_instruct.yaml +70 -0
LAVIS-main/lavis/configs/datasets/vatex/defaults_cap.yaml +24 -0
LAVIS-main/lavis/configs/datasets/vatex/defaults_cap_instruct.yaml +62 -0
LAVIS-main/lavis/configs/datasets/vg/defaults_caption.yaml +18 -0
LAVIS-main/lavis/configs/datasets/vg/defaults_caption_instruct.yaml +34 -0
LAVIS-main/lavis/configs/datasets/vg/defaults_vqa.yaml +18 -0
LAVIS-main/lavis/configs/datasets/vg/defaults_vqa_instruct.yaml +34 -0
LAVIS-main/lavis/configs/datasets/violin/defaults_cap.yaml +51 -0
LAVIS-main/lavis/configs/datasets/violin/defaults_cap_instruct.yaml +53 -0
LAVIS-main/lavis/configs/datasets/violin/defaults_entail.yaml +52 -0
LAVIS-main/lavis/configs/datasets/violin/defaults_entail_instruct.yaml +51 -0
LAVIS-main/lavis/configs/datasets/visdial/defaults_dial.yaml +41 -0
LAVIS-main/lavis/configs/datasets/visdial/defaults_dial_instruct.yaml +41 -0
LAVIS-main/lavis/configs/datasets/vizwiz/defaults.yaml +43 -0
LAVIS-main/lavis/configs/datasets/vlep/defaults_cap.yaml +51 -0
LAVIS-main/lavis/configs/datasets/vlep/defaults_cap_instruct.yaml +53 -0
LAVIS-main/lavis/configs/datasets/vsr/defaults.yaml +49 -0
LAVIS-main/lavis/configs/datasets/vsr/defaults_classification.yaml +49 -0
LAVIS-main/lavis/configs/datasets/vsr/defaults_classification_instruct.yaml +49 -0
LAVIS-main/lavis/configs/datasets/vsr/defaults_instruct.yaml +53 -0
LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap.yaml +63 -0
LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml +63 -0
LAVIS-main/lavis/configs/datasets/webvid/defaults_cap.yaml +41 -0
LAVIS-main/lavis/configs/datasets/webvid/defaults_cap_instruct.yaml +43 -0
LAVIS-main/lavis/configs/datasets/youcook/defaults_cap.yaml +51 -0
LAVIS-main/lavis/configs/datasets/youcook/defaults_cap_instruct.yaml +53 -0
LAVIS-main/lavis/configs/datasets/yt8m/defaults_mm_dial.yaml +62 -0
LAVIS-main/lavis/configs/models/albef_classification_ve.yaml +40 -0
LAVIS-main/lavis/configs/models/albef_feature_extractor.yaml +30 -0
LAVIS-main/lavis/configs/models/albef_nlvr.yaml +42 -0
LAVIS-main/lavis/configs/models/albef_pretrain_base.yaml +38 -0
LAVIS-main/lavis/configs/models/albef_retrieval_coco.yaml +46 -0
LAVIS-main/lavis/configs/models/albef_retrieval_flickr.yaml +46 -0
LAVIS-main/lavis/configs/models/albef_vqav2.yaml +40 -0
LAVIS-main/lavis/configs/models/alpro_qa_msrvtt.yaml +44 -0
LAVIS-main/lavis/configs/models/alpro_qa_msvd.yaml +43 -0
LAVIS-main/lavis/configs/models/alpro_retrieval_didemo.yaml +35 -0
LAVIS-main/lavis/configs/models/alpro_retrieval_msrvtt.yaml +41 -0
LAVIS-main/lavis/configs/models/bert_config.json +21 -0
LAVIS-main/lavis/configs/models/bert_config_alpro.json +23 -0
LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml +25 -0
LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml +27 -0
LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml +27 -0

LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap.yaml ADDED Viewed

	@@ -0,0 +1,51 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  shapenet_mm_caption:  # dataset builder name
+    vis_processor:
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_train  # NOTE(review): eval uses the same processor name as train -- confirm this is intended
+        image_size: 224
+    pc_processor:
+      train:
+        name: ulip_pc
+      eval:
+        name: ulip_pc
+    text_processor:
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    data_type: [pc, images]  # template hint lists [images|videos|features]; this config uses pc + images
+    build_info:
+      # Split names (train/val) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/train_ann.json
+            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json
+          storage:
+            - shapenet/annotations/train_ann.json
+            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/test_ann.json
+            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json
+          storage:
+            - shapenet/annotations/test_ann.json
+            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json
+      templates: null
+      pc:
+        storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/shapenet_pc
+      images:
+        storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/rendered_images

LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap_instruct.yaml ADDED Viewed

	@@ -0,0 +1,53 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  shapenet_mm_caption_instruct:  # dataset builder name
+    vis_processor:
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_train  # NOTE(review): eval uses the same processor name as train -- confirm this is intended
+        image_size: 224
+    pc_processor:
+      train:
+        name: ulip_pc
+      eval:
+        name: ulip_pc
+    text_processor:
+      train:
+        name: blip_instruction
+        modality: pc
+        task: caption
+      eval:
+        name: blip_caption
+    data_type: [pc, images]  # template hint lists [images|videos|features]; this config uses pc + images
+    build_info:
+      # Split names (train/val) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/train_ann.json
+            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json
+          storage:
+            - shapenet/annotations/train_ann.json
+            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/test_ann.json
+            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json
+          storage:
+            - shapenet/annotations/test_ann.json
+            # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json
+      templates: null
+      pc:
+        storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/shapenet_pc
+      images:
+        storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/rendered_images

LAVIS-main/lavis/configs/datasets/snli_ve/defaults.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  snli_ve:
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    build_info:
+      # Split names (train/val/test) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json
+          storage: snli/annotations/ve_train.json
+        val:
+          url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json
+          storage: snli/annotations/ve_dev.json
+        test:
+          url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json
+          storage: snli/annotations/ve_test.json
+      images:
+        storage: flickr30k/images/flickr30k-images
+        # storage: /export/share/datasets/vision/flickr30k/flickr30k-images

LAVIS-main/lavis/configs/datasets/snli_ve/defaults_instruct.yaml ADDED Viewed

	@@ -0,0 +1,49 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  snli_ve_instruct:
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    vis_processor:
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_eval
+        image_size: 224
+    text_processor:
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names (train/val/test) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_train.json
+            # - /export/share/dongxuli/data/lavis/snli/ve_train.json
+          storage:
+            - snli/annotations/ve_train.json
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_dev.json
+            # - /export/share/dongxuli/data/lavis/snli/ve_dev.json
+          storage:
+            - snli/annotations/ve_dev.json
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_test.json
+            # - /export/share/dongxuli/data/lavis/snli/ve_test.json
+          storage:
+            - snli/annotations/ve_test.json
+      images:
+        # storage: flickr30k/images/flickr30k-images
+        storage: /export/share/datasets/vision/flickr30k/flickr30k-images

LAVIS-main/lavis/configs/datasets/textcaps/defaults.yaml ADDED Viewed

	@@ -0,0 +1,46 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  textcaps_caption:  # dataset builder name
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    vis_processor:
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_eval
+        image_size: 224
+    text_processor:
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names (train/val/test) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url:
+            - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_train.json
+          storage:
+            - TextCaps/TextCaps_0.1_train.json
+        val:
+          url:
+            - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_val.json
+          storage:
+            - TextCaps/TextCaps_0.1_val.json
+        test:
+          url:
+            - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_test.json
+          storage:
+            - TextCaps/TextCaps_0.1_test.json
+      images:
+        # storage: nocaps/images
+        storage: /export/share/datasets/vision_language/TextCaps/images

LAVIS-main/lavis/configs/datasets/textcaps/defaults_instruct.yaml ADDED Viewed

	@@ -0,0 +1,47 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  textcaps_caption_instruct:  # dataset builder name
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    vis_processor:
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_eval
+        image_size: 224
+    text_processor:
+      train:
+        name: blip_instruction
+        modality: image
+        task: caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names (train/val/test) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url:
+            - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_train.json
+          storage:
+            - TextCaps/TextCaps_0.1_train.json
+        val:
+          url:
+            - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_val.json
+          storage:
+            - TextCaps/TextCaps_0.1_val.json
+        test:
+          url:
+            - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_test.json
+          storage:
+            - TextCaps/TextCaps_0.1_test.json
+      images:
+        # storage: nocaps/images
+        storage: /export/share/datasets/vision_language/TextCaps/images

LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap.yaml ADDED Viewed

	@@ -0,0 +1,68 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  valor_mm_caption:  # dataset builder name
+    data_type: [video, audio]
+    video_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: true
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: true
+    audio_processor:
+      train:
+        name: beats_audio
+        sampling_rate: 16000
+      eval:
+        name: beats_audio
+        sampling_rate: 16000
+        is_eval: false  # NOTE(review): set to false inside the eval processor -- confirm this is intended
+    text_processor:
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names (val/test) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_val.json
+            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json
+          storage:
+            - valor/annotations/desc_val.json
+            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_test.json
+            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json
+          storage:
+            - valor/annotations/desc_test.json
+            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json
+      templates: null
+      audio:
+        storage: /export/video-language-dataset/data/VALOR/videos
+      video:
+        storage: /export/video-language-dataset/data/VALOR/videos

LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap_instruct.yaml ADDED Viewed

	@@ -0,0 +1,70 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  valor_mm_caption_instruct:  # dataset builder name
+    data_type: [video, audio]
+    video_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: true
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: true
+    audio_processor:
+      train:
+        name: beats_audio
+        sampling_rate: 16000
+      eval:
+        name: beats_audio
+        sampling_rate: 16000
+        is_eval: false  # NOTE(review): set to false inside the eval processor -- confirm this is intended
+    text_processor:
+      train:
+        name: blip_instruction
+        modality: image  # NOTE(review): data_type above is [video, audio] -- confirm `image` is the intended modality
+        task: caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names (val/test) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_val.json
+            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json
+          storage:
+            - valor/annotations/desc_val.json
+            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_test.json
+            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json
+          storage:
+            - valor/annotations/desc_test.json
+            # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json
+      templates: null
+      audio:
+        storage: /export/video-language-dataset/data/VALOR/videos
+      video:
+        storage: /export/video-language-dataset/data/VALOR/videos

LAVIS-main/lavis/configs/datasets/vatex/defaults_cap.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  msvd_cap:  # dataset builder name -- NOTE(review): key says msvd but every path below is vatex; confirm the intended name
+    # data_dir: ${env.data_dir}/datasets
+    data_type: videos  # one of: images | videos | features
+    build_info:
+      # Split names (train/val/test) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
+          storage: vatex/annotations/cap_train.json
+        val:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
+          storage: vatex/annotations/cap_val.json
+        test:
+          # The remote file is cap_private_test.json; it is stored locally as cap_test.json.
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
+          storage: vatex/annotations/cap_test.json
+      videos:
+        storage: /export/share/dongxuli/data/vatex

LAVIS-main/lavis/configs/datasets/vatex/defaults_cap_instruct.yaml ADDED Viewed

	@@ -0,0 +1,62 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  vatex_caption_instruct:  # dataset builder name
+    # data_dir: ${env.data_dir}/datasets
+    # `data_type` is declared exactly once. YAML mapping keys must be unique: strict loaders
+    # reject a repeated key and lenient ones silently keep only the last value. The list form
+    # is the one kept because it matches the `video` / `audio` entries under build_info.
+    # (Template hint for single-modality configs: images | videos | features.)
+    data_type: [video, audio]
+    video_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: true
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: true
+    audio_processor:
+      train:
+        name: beats_audio
+        sampling_rate: 16000
+      eval:
+        name: beats_audio
+        sampling_rate: 16000
+        is_eval: false  # NOTE(review): set to false inside the eval processor -- confirm this is intended
+    build_info:
+      # Split names (train/val/test) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
+          storage:
+            - vatex/annotations/cap_train.json
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
+          storage:
+            - vatex/annotations/cap_val.json
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
+          storage:
+            - vatex/annotations/cap_test.json
+      video:
+        storage: /export/video-language-dataset/data/vatex/
+      audio:
+        storage: /export/video-language-dataset/data/vatex/

LAVIS-main/lavis/configs/datasets/vg/defaults_caption.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  vg_caption:
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    build_info:
+      # Split names (train) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json
+          storage: vg/annotations/vg_caption.json
+      images:
+        storage: vg/images/

LAVIS-main/lavis/configs/datasets/vg/defaults_caption_instruct.yaml ADDED Viewed

	@@ -0,0 +1,34 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  vg_caption_instruct:
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    vis_processor:
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_eval
+        image_size: 224
+    text_processor:
+      train:
+        name: blip_instruction
+        task: caption
+        modality: image
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names (train) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json
+          storage: vg/annotations/vg_caption.json
+      images:
+        storage: /export/share/datasets/vision/visual-genome/  # vg/images/

LAVIS-main/lavis/configs/datasets/vg/defaults_vqa.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  vg_vqa:
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    build_info:
+      # Split names (train) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json
+          storage: vg/annotations/vg_qa.json
+      images:
+        storage: vg/images/

LAVIS-main/lavis/configs/datasets/vg/defaults_vqa_instruct.yaml ADDED Viewed

	@@ -0,0 +1,34 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  vg_vqa_instruct:
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    vis_processor:
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_eval
+        image_size: 224
+    text_processor:
+      train:
+        name: blip_instruction
+        task: qa
+        modality: image
+      eval:
+        name: blip_question
+    build_info:
+      # Split names (train) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json
+          storage: vg/annotations/vg_qa.json
+      images:
+        storage: /export/share/datasets/vision/visual-genome/  # vg/images/

LAVIS-main/lavis/configs/datasets/violin/defaults_cap.yaml ADDED Viewed

	@@ -0,0 +1,51 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  violin_caption:  # dataset builder name
+    # data_dir: ${env.data_dir}/datasets
+    data_type: videos  # one of: images | videos | features
+    vis_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+    text_processor:
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names (train) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json
+            # - /export/video-language-dataset/data/violin/annotations_lavis.json
+          storage:
+            - violin/annotations/train.json
+            # - /export/video-language-dataset/data/violin/annotations_lavis.json
+        # val:
+        #   url:
+        #     # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json
+        #     - /export/video-language-dataset/data/violin/annotations_lavis_test.json
+        #   storage:
+        #     # - violin/annotations/test.json
+        #     - /export/video-language-dataset/data/violin/annotations_lavis_test.json
+      videos:
+        storage: /export/video-language-dataset/data/violin/videos

LAVIS-main/lavis/configs/datasets/violin/defaults_cap_instruct.yaml ADDED Viewed

	@@ -0,0 +1,53 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  violin_caption_instruct:  # dataset builder name
+    # data_dir: ${env.data_dir}/datasets
+    data_type: videos  # one of: images | videos | features
+    vis_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+    text_processor:
+      train:
+        name: blip_instruction
+        modality: video
+        task: caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names (train) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json
+            # - /export/video-language-dataset/data/violin/annotations_lavis.json
+          storage:
+            - violin/annotations/train.json
+            # - /export/video-language-dataset/data/violin/annotations_lavis.json
+        # val:
+        #   url:
+        #     # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json
+        #     - /export/video-language-dataset/data/violin/annotations_lavis_test.json
+        #   storage:
+        #     # - violin/annotations/test.json
+        #     - /export/video-language-dataset/data/violin/annotations_lavis_test.json
+      videos:
+        storage: /export/video-language-dataset/data/violin/videos

LAVIS-main/lavis/configs/datasets/violin/defaults_entail.yaml ADDED Viewed

	@@ -0,0 +1,52 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  violin_entailment:  # 22452
+    # data_dir: ${env.data_dir}/datasets
+    data_type: videos  # one of: images | videos | features
+    vis_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+    text_processor:
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names (train) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json
+            # - /export/video-language-dataset/data/violin/annotations_lavis.json
+          storage:
+            - violin/annotations/train.json
+            # - /export/video-language-dataset/data/violin/annotations_lavis.json
+        # val:
+        #   url:
+        #     # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json
+        #     - /export/video-language-dataset/data/violin/annotations_lavis_test.json
+        #   storage:
+        #     # - violin/annotations/test.json
+        #     - /export/video-language-dataset/data/violin/annotations_lavis_test.json
+      videos:
+        storage: /export/video-language-dataset/data/violin/videos

LAVIS-main/lavis/configs/datasets/violin/defaults_entail_instruct.yaml ADDED Viewed

	@@ -0,0 +1,51 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  violin_entailment_instruct:  # 22452
+    # data_dir: ${env.data_dir}/datasets
+    data_type: videos  # one of: images | videos | features
+    vis_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+    text_processor:
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names (train) are mapping keys: prefixing one with '-' would turn it into a list item.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json
+            # - /export/video-language-dataset/data/violin/annotations_lavis.json
+          storage:
+            - violin/annotations/train.json
+            # - /export/video-language-dataset/data/violin/annotations_lavis.json
+        # val:
+        #   url:
+        #     # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json
+        #     - /export/video-language-dataset/data/violin/annotations_lavis_test.json
+        #   storage:
+        #     # - violin/annotations/test.json
+        #     - /export/video-language-dataset/data/violin/annotations_lavis_test.json
+      videos:
+        storage: /export/video-language-dataset/data/violin/videos

LAVIS-main/lavis/configs/datasets/visdial/defaults_dial.yaml ADDED Viewed

	@@ -0,0 +1,41 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  visdial:  # dataset builder name
+    data_type: images  # one of: images | videos | features
+    vis_processor:  # both phases use image_size 224
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_eval
+        image_size: 224
+    text_processor:  # same processor for both phases
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:
+        train:  # url and storage name the same local file
+          url:
+            - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json
+          storage:
+            - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json
+        val:  # url and storage name the same local file
+          url:
+            - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json
+          storage:
+            - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json
+        # test:
+        #   url: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json
+        #   storage: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json
+      images:
+        storage: /export/share/datasets/vision_language/visdial/

LAVIS-main/lavis/configs/datasets/visdial/defaults_dial_instruct.yaml ADDED Viewed

	@@ -0,0 +1,41 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  visdial_instruct:  # dataset builder name
+    data_type: images  # one of: images | videos | features
+    vis_processor:  # both phases use image_size 224
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_eval
+        image_size: 224
+    text_processor:  # same processor for both phases
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:
+        train:  # url and storage name the same local file
+          url:
+            - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json
+          storage:
+            - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json
+        val:  # url and storage name the same local file
+          url:
+            - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json
+          storage:
+            - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json
+        # test:
+        #   url: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json
+        #   storage: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json
+      images:
+        storage: /export/share/datasets/vision_language/visdial/

LAVIS-main/lavis/configs/datasets/vizwiz/defaults.yaml ADDED Viewed

	@@ -0,0 +1,43 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  vizwiz_vqa:
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    vis_processor:  # both phases use image_size 224
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_eval
+        image_size: 224
+    text_processor:  # same processor for both phases
+      train:
+        name: blip_question
+      eval:
+        name: blip_question
+    build_info:
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:  # only val and test splits are listed; there is no train split here
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vizwiz/val.json
+            # - /export/share/datasets/vision/vizwiz/Annotations/val.json
+          storage:
+            - vizwiz/annotations/val.json
+            # - /export/share/datasets/vision/vizwiz/Annotations/val.json
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vizwiz/test.json
+            # - /export/share/datasets/vision/vizwiz/Annotations/test.json
+          storage:
+            - vizwiz/annotations/test.json
+            # - /export/share/datasets/vision/vizwiz/Annotations/test.json
+      images:
+        storage: /export/share/datasets/vision/vizwiz/images

LAVIS-main/lavis/configs/datasets/vlep/defaults_cap.yaml ADDED Viewed

	@@ -0,0 +1,51 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  vlep_caption:  # 4900
+    # data_dir: ${env.data_dir}/datasets
+    data_type: videos  # one of: images | videos | features
+    vis_processor:  # train and eval share every setting except the processor name
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+    text_processor:  # same processor for both phases
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_train_existing.json
+            # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json
+          storage:
+            - vlep/annotations/annotations_train_existing.json
+            # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json
+        val:  # the val split is backed by the "dev" annotation file
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_dev_existing.json
+            # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json
+          storage:
+            - vlep/annotations/annotations_dev_existing.json
+            # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json
+      videos:
+        storage: /export/video-language-dataset/data/vlep/videos

LAVIS-main/lavis/configs/datasets/vlep/defaults_cap_instruct.yaml ADDED Viewed

	@@ -0,0 +1,53 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  vlep_caption_instruct:  # 4900
+    # data_dir: ${env.data_dir}/datasets
+    data_type: videos  # one of: images | videos | features
+    vis_processor:  # train and eval share every setting except the processor name
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+    text_processor:  # instruction prompts for training, plain captions for eval
+      train:
+        name: blip_instruction
+        modality: video  # agrees with data_type: videos and the alpro_video_* processors above
+        task: caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_train_existing.json
+            # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json
+          storage:
+            - vlep/annotations/annotations_train_existing.json
+            # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json
+        val:  # the val split is backed by the "dev" annotation file
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_dev_existing.json
+            # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json
+          storage:
+            - vlep/annotations/annotations_dev_existing.json
+            # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json
+      videos:
+        storage: /export/video-language-dataset/data/vlep/videos

LAVIS-main/lavis/configs/datasets/vsr/defaults.yaml ADDED Viewed

	@@ -0,0 +1,49 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  vsr_classification_instruct:  # NOTE(review): key does not match this file's name (vsr/defaults.yaml) — confirm the intended builder name
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    vis_processor:  # both phases use image_size 224
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_eval
+        image_size: 224
+    text_processor:  # same processor for both phases
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      annotations:  # split names are plain mapping keys; a leading "-" would turn them into list items
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
+          storage:
+            - vsr/annotations/train.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
+        val:  # the val split is backed by dev.jsonl
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
+          storage:
+            - vsr/annotations/dev.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
+          storage:
+            - vsr/annotations/test.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
+      images:  # sibling of annotations under build_info
+        storage: /export/share/datasets/vision_language/VSR/images

LAVIS-main/lavis/configs/datasets/vsr/defaults_classification.yaml ADDED Viewed

	@@ -0,0 +1,49 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  vsr_classification:
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    vis_processor:  # both phases use image_size 224
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_eval
+        image_size: 224
+    text_processor:  # same processor for both phases
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      annotations:  # split names are plain mapping keys; a leading "-" would turn them into list items
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
+          storage:
+            - vsr/annotations/train.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
+        val:  # the val split is backed by dev.jsonl
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
+          storage:
+            - vsr/annotations/dev.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
+          storage:
+            - vsr/annotations/test.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
+      images:  # sibling of annotations under build_info
+        storage: /export/share/datasets/vision_language/VSR/images

LAVIS-main/lavis/configs/datasets/vsr/defaults_classification_instruct.yaml ADDED Viewed

	@@ -0,0 +1,49 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  vsr_caption_instruct:  # NOTE(review): same key as vsr/defaults_instruct.yaml, but this file is defaults_classification_instruct.yaml — confirm
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    vis_processor:  # both phases use image_size 224
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_eval
+        image_size: 224
+    text_processor:  # same processor for both phases
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      annotations:  # split names are plain mapping keys; a leading "-" would turn them into list items
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
+          storage:
+            - vsr/annotations/train.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
+        val:  # the val split is backed by dev.jsonl
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
+          storage:
+            - vsr/annotations/dev.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
+          storage:
+            - vsr/annotations/test.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
+      images:  # sibling of annotations under build_info
+        storage: /export/share/datasets/vision_language/VSR/images

LAVIS-main/lavis/configs/datasets/vsr/defaults_instruct.yaml ADDED Viewed

	@@ -0,0 +1,53 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  vsr_caption_instruct:
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features
+    vis_processor:  # both phases use image_size 224
+      train:
+        name: clip_image_train
+        image_size: 224
+      eval:
+        name: clip_image_eval
+        image_size: 224
+    text_processor:  # instruction prompts for training, plain captions for eval
+      train:
+        name: blip_instruction
+        task: caption
+        modality: image
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
+          storage:
+            - vsr/annotations/train.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
+        val:  # the val split is backed by dev.jsonl
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
+          storage:
+            - vsr/annotations/dev.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
+          storage:
+            - vsr/annotations/test.jsonl
+            # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
+      images:
+        storage: /export/share/datasets/vision_language/VSR/images

LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap.yaml ADDED Viewed

	@@ -0,0 +1,63 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  wavcaps_mm_caption:  # dataset builder name
+    audio_processor:  # identical settings for train and eval
+      train:
+        name: beats_audio
+        sampling_rate: 16000
+        n_frames: 2
+        frame_length: 512
+      eval:
+        name: beats_audio
+        sampling_rate: 16000
+        n_frames: 2
+        frame_length: 512
+    text_processor:  # same processor for both phases
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    data_type: [audio]
+    build_info:
+      kwargs:
+        cached: false
+        cached_dir: /export/share/datasets/audio/WavCaps/beats_features/
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:
+        train:  # five url entries and five storage entries, listed in the same order
+          url:
+            - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/BBC_Sound_Effects/bbc_final.json
+            - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/FreeSound/fsd_final.json
+            - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/SoundBible/sb_final.json
+            - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/AudioSet_SL/as_final.json
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/wavcaps/json_data.json
+          storage:
+            - wavcaps/json_files/BBC_Sound_Effects/bbc_final.json
+            - wavcaps/json_files/FreeSound/fsd_final.json
+            - wavcaps/json_files/SoundBible/sb_final.json
+            - wavcaps/json_files/AudioSet_SL/as_final.json
+            - wavcaps/annotations/json_data.json
+        # train:
+        #   url:
+        #     - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
+        #     - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
+        #     - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
+        #     - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
+        #     - /export/share/datasets/audio/WavCaps/json_data.json
+        #   storage:
+        #     - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
+        #     - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
+        #     - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
+        #     - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
+        #     - /export/share/datasets/audio/WavCaps/json_data.json
+      audio:
+        storage: /export/share/datasets/audio/WavCaps/

LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml ADDED Viewed

	@@ -0,0 +1,63 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  wavcaps_mm_caption_instruct:  # dataset builder name
+    audio_processor:  # identical settings for train and eval
+      train:
+        name: beats_audio
+        sampling_rate: 16000
+        n_frames: 2
+        frame_length: 512
+      eval:
+        name: beats_audio
+        sampling_rate: 16000
+        n_frames: 2
+        frame_length: 512
+    text_processor:  # instruction prompts for training, plain captions for eval
+      train:
+        name: blip_instruction
+        modality: audio
+        task: caption
+      eval:
+        name: blip_caption
+    data_type: [audio]
+    build_info:
+      kwargs:
+        cached: true  # NOTE(review): wavcaps/defaults_mm_cap.yaml sets cached: False — confirm the difference is intended
+        cached_dir: /export/share/datasets/audio/WavCaps/beats_features/
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:
+        train:  # the active url/storage lists follow the commented-out local alternative
+          # url:
+          #   - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
+          #   - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
+          #   - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
+          #   - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
+          #   - /export/share/datasets/audio/WavCaps/json_data.json
+          # storage:
+          #   - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
+          #   - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
+          #   - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
+          #   - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
+          #   - /export/share/datasets/audio/WavCaps/json_data.json
+          url:
+            - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/BBC_Sound_Effects/bbc_final.json
+            - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/FreeSound/fsd_final.json
+            - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/SoundBible/sb_final.json
+            - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/AudioSet_SL/as_final.json
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/wavcaps/json_data.json
+          storage:
+            - wavcaps/json_files/BBC_Sound_Effects/bbc_final.json
+            - wavcaps/json_files/FreeSound/fsd_final.json
+            - wavcaps/json_files/SoundBible/sb_final.json
+            - wavcaps/json_files/AudioSet_SL/as_final.json
+            - wavcaps/annotations/json_data.json
+      audio:
+        storage: /export/share/datasets/audio/WavCaps/

LAVIS-main/lavis/configs/datasets/webvid/defaults_cap.yaml ADDED Viewed

	@@ -0,0 +1,41 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  webvid2m_caption:  # dataset builder name
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features. NOTE(review): processors below are alpro_video_* — confirm "images" is intended
+    vis_processor:  # train and eval share every setting except the processor name
+      train:
+        name: alpro_video_train
+        n_frms: 5
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+      eval:
+        name: alpro_video_eval
+        n_frms: 5
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+    text_processor:  # same processor for both phases
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:
+        train:  # the only split listed
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/webvid2m/train.json
+            # - /export/home/LAVIS/webvid_annotation.json
+          storage:
+            - webvid2m/annotations/train.json
+            # - /export/home/LAVIS/webvid_annotation.json
+      images:  # media key matches data_type; the path itself points at downsampled videos
+        storage: /export/video-language-dataset/data/webvid2m/postprocess/downsampled_videos

LAVIS-main/lavis/configs/datasets/webvid/defaults_cap_instruct.yaml ADDED Viewed

	@@ -0,0 +1,43 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  webvid2m_caption_instruct:  # dataset builder name
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images  # one of: images | videos | features. NOTE(review): processors below are alpro_video_* — confirm "images" is intended
+    vis_processor:  # train and eval share every setting except the processor name
+      train:
+        name: alpro_video_train
+        n_frms: 5
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+      eval:
+        name: alpro_video_eval
+        n_frms: 5
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+    text_processor:  # instruction prompts for training, plain captions for eval
+      train:
+        name: blip_instruction
+        modality: video
+        task: caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:
+        train:  # the only split listed
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/webvid2m/train.json
+            # - /export/home/LAVIS/webvid_annotation.json
+          storage:
+            - webvid2m/annotations/train.json
+            # - /export/home/LAVIS/webvid_annotation.json
+      images:  # media key matches data_type; the path itself points at downsampled videos
+        storage: /export/video-language-dataset/data/webvid2m/postprocess/downsampled_videos

LAVIS-main/lavis/configs/datasets/youcook/defaults_cap.yaml ADDED Viewed

	@@ -0,0 +1,51 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  youcook_caption:  # dataset builder name
+    # data_dir: ${env.data_dir}/datasets
+    data_type: videos  # one of: images | videos | features
+    vis_processor:  # train and eval share every setting except the processor name
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+    text_processor:  # same processor for both phases
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/train_annotations.json
+            # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json
+          storage:
+            - youcook/annotations/train_annotations.json
+            # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/val_annotations.json
+            # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json
+          storage:
+            - youcook/annotations/val_annotations.json
+            # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json
+      videos:
+        storage: /export/video-language-dataset/data/youcook/raw_videos

LAVIS-main/lavis/configs/datasets/youcook/defaults_cap_instruct.yaml ADDED Viewed

	@@ -0,0 +1,53 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  youcook_caption_instruct:  # dataset builder name
+    # data_dir: ${env.data_dir}/datasets
+    data_type: videos  # one of: images | videos | features
+    vis_processor:  # train and eval share every setting except the processor name
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+    text_processor:  # instruction prompts for training, plain captions for eval
+      train:
+        name: blip_instruction
+        modality: video
+        task: caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:
+        train:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/train_annotations.json
+            # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json
+          storage:
+            - youcook/annotations/train_annotations.json
+            # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/val_annotations.json
+            # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json
+          storage:
+            - youcook/annotations/val_annotations.json
+            # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json
+      videos:
+        storage: /export/video-language-dataset/data/youcook/raw_videos

LAVIS-main/lavis/configs/datasets/yt8m/defaults_mm_dial.yaml ADDED Viewed

	@@ -0,0 +1,62 @@

+ # Copyright (c) 2023, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+datasets:
+  yt8m_mm_dialogue:  # dataset builder name
+    data_type: [video]  # list-valued. NOTE(review): the I3D/VGGish wording formerly here looked stale (processors are alpro_video_* / beats_audio) — confirm
+    video_processor:  # train and eval share every setting except the processor name
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: false
+    audio_processor:  # sampling_rate is left commented out for both phases
+      train:
+        name: beats_audio
+        # sampling_rate: 16000
+      eval:
+        name: beats_audio
+        # sampling_rate: 16000
+        is_eval: true
+    text_processor:  # same processor for both phases
+      train:
+        name: blip_caption
+      eval:
+        name: blip_caption
+    build_info:
+      # Split names are plain mapping keys; a leading "-" would turn them into list items.
+      annotations:
+        train:  # url and storage name the same local file
+          url:
+            - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/train.json
+          storage:
+            - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/train.json
+        val:  # url and storage name the same local file
+          url:
+            - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/validation.json
+          storage:
+            - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/validation.json
+      templates: null
+      audio:
+        storage: /export/video-language-dataset/data/yt-8m/audios
+      video:
+        storage: /export/video-language-dataset/data/yt-8m/videos

LAVIS-main/lavis/configs/models/albef_classification_ve.yaml ADDED Viewed

	@@ -0,0 +1,40 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+# Model config: arch `albef_classification`, 3 output classes.
+model:
+  arch: albef_classification
+  # Checkpoint URLs; `load_finetuned` is enabled, and both URLs are provided.
+  load_finetuned: true
+  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt"
+  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
+  num_classes: 3
+  use_distill: true
+  momentum: 0.995
+  alpha: 0.4
+  # ViT encoder
+  vit_type: "base"
+  vit_grad_ckpt: false
+  vit_ckpt_layer: 0
+  # Written with a mantissa dot: a bare `1e-6` is resolved as a *string* by
+  # YAML 1.1 loaders such as plain PyYAML, whereas `1.0e-6` is always a float.
+  vit_layer_norm_epsilon: 1.0e-6
+  image_size: 384
+  # BERT config
+  med_config_path: "configs/models/med_config_albef.json"
+# Processors applied to inputs, per split.
+# NOTE(review): unlike the other ALBEF configs, no `image_size` is given for
+# the image processors - presumably their default applies; confirm it matches
+# model.image_size (384).
+preprocess:
+  vis_processor:
+    train:
+      name: "blip_image_train"
+    eval:
+      name: "blip_image_eval"
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"

LAVIS-main/lavis/configs/models/albef_feature_extractor.yaml ADDED Viewed

	@@ -0,0 +1,30 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+# Model config: arch `albef_pretrain` with eval-only processors.
+model:
+  arch: albef_pretrain
+  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
+  # ViT encoder
+  vit_type: "base"
+  image_size: 224
+  vit_ckpt_layer: 0
+  vit_drop_path_rate: 0
+  # Written with a mantissa dot: a bare `1e-6` is resolved as a *string* by
+  # YAML 1.1 loaders such as plain PyYAML, whereas `1.0e-6` is always a float.
+  vit_layer_norm_epsilon: 1.0e-6
+  vit_grad_ckpt: false
+  # BERT config
+  med_config_path: "configs/models/med_config_albef.json"
+  embed_dim: 256
+# Only `eval` processors are declared; image_size matches model.image_size.
+preprocess:
+  vis_processor:
+    eval:
+      name: "blip_image_eval"
+      image_size: 224
+  text_processor:
+    eval:
+      name: "blip_caption"

LAVIS-main/lavis/configs/models/albef_nlvr.yaml ADDED Viewed

	@@ -0,0 +1,42 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+# Model config: arch `albef_nlvr`, 2 output classes.
+model:
+  arch: albef_nlvr
+  # Checkpoint URLs; `load_finetuned` is enabled, and both URLs are provided.
+  load_finetuned: true
+  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth"
+  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt"
+  num_classes: 2
+  use_distill: true
+  momentum: 0.995
+  alpha: 0.4
+  # ViT encoder
+  vit_type: "base"
+  vit_grad_ckpt: false
+  vit_ckpt_layer: 0
+  # Written with a mantissa dot: a bare `1e-6` is resolved as a *string* by
+  # YAML 1.1 loaders such as plain PyYAML, whereas `1.0e-6` is always a float.
+  vit_layer_norm_epsilon: 1.0e-6
+  image_size: 384
+  # BERT config
+  med_config_path: "configs/models/med_config_albef.json"
+# Processors per split; image_size matches model.image_size (384).
+preprocess:
+  vis_processor:
+    train:
+      name: "blip_image_train"
+      image_size: 384
+    eval:
+      name: "blip_image_eval"
+      image_size: 384
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"

LAVIS-main/lavis/configs/models/albef_pretrain_base.yaml ADDED Viewed

	@@ -0,0 +1,38 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+# Model config: arch `albef_pretrain` with train-only processors.
+model:
+  arch: albef_pretrain
+  load_pretrained: true
+  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
+  # ViT encoder
+  vit_type: "base"
+  image_size: 224
+  vit_ckpt_layer: 0
+  vit_drop_path_rate: 0
+  # Written with a mantissa dot: a bare `1e-6` is resolved as a *string* by
+  # YAML 1.1 loaders such as plain PyYAML, whereas `1.0e-6` is always a float.
+  vit_layer_norm_epsilon: 1.0e-6
+  vit_grad_ckpt: false
+  # BERT config
+  med_config_path: "configs/models/med_config_albef.json"
+  mlm_mask_prob: 0.15
+  embed_dim: 256
+  momentum: 0.995
+  alpha: 0.4
+  temp: 0.07
+  max_txt_len: 30
+# Only `train` processors are declared.
+# NOTE(review): the processor image_size (256) differs from model.image_size
+# (224) - confirm this mismatch is intentional.
+preprocess:
+  vis_processor:
+    train:
+      name: "blip_image_train"
+      image_size: 256
+  text_processor:
+    train:
+      name: "blip_caption"

LAVIS-main/lavis/configs/models/albef_retrieval_coco.yaml ADDED Viewed

	@@ -0,0 +1,46 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+# Model config: arch `albef_retrieval`, COCO retrieval checkpoint.
+model:
+  arch: albef_retrieval
+  # Checkpoint URLs; `load_finetuned` is enabled, and both URLs are provided.
+  load_finetuned: true
+  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
+  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt"
+  queue_size: 65536
+  # ViT encoder
+  vit_type: "base"
+  image_size: 384
+  vit_ckpt_layer: 0
+  vit_drop_path_rate: 0
+  # Written with a mantissa dot: a bare `1e-6` is resolved as a *string* by
+  # YAML 1.1 loaders such as plain PyYAML, whereas `1.0e-6` is always a float.
+  vit_layer_norm_epsilon: 1.0e-6
+  vit_grad_ckpt: false
+  # BERT config
+  med_config_path: "configs/models/med_config_albef.json"
+  embed_dim: 256
+  momentum: 0.995
+  alpha: 0.4
+  temp: 0.07
+  use_distill: true
+  max_txt_len: 30
+# Processors per split; image_size matches model.image_size (384).
+preprocess:
+  vis_processor:
+    train:
+      name: "blip_image_train"
+      image_size: 384
+    eval:
+      name: "blip_image_eval"
+      image_size: 384
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"

LAVIS-main/lavis/configs/models/albef_retrieval_flickr.yaml ADDED Viewed

	@@ -0,0 +1,46 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+# Model config: arch `albef_retrieval`, Flickr retrieval checkpoint.
+model:
+  arch: albef_retrieval
+  # Checkpoint URLs; `load_finetuned` is enabled, and both URLs are provided.
+  # Both are quoted for consistency (the value of `finetuned` is unchanged).
+  load_finetuned: true
+  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
+  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt"
+  queue_size: 65536
+  # ViT encoder
+  vit_type: "base"
+  image_size: 384
+  vit_ckpt_layer: 0
+  vit_drop_path_rate: 0
+  # Written with a mantissa dot: a bare `1e-6` is resolved as a *string* by
+  # YAML 1.1 loaders such as plain PyYAML, whereas `1.0e-6` is always a float.
+  vit_layer_norm_epsilon: 1.0e-6
+  vit_grad_ckpt: false
+  # BERT config
+  med_config_path: "configs/models/med_config_albef.json"
+  embed_dim: 256
+  momentum: 0.995
+  alpha: 0.4
+  temp: 0.07
+  use_distill: true
+  max_txt_len: 30
+# Processors per split; image_size matches model.image_size (384).
+preprocess:
+  vis_processor:
+    train:
+      name: "blip_image_train"
+      image_size: 384
+    eval:
+      name: "blip_image_eval"
+      image_size: 384
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"

LAVIS-main/lavis/configs/models/albef_vqav2.yaml ADDED Viewed

	@@ -0,0 +1,40 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+# Model config: arch `albef_vqa`, VQAv2 checkpoint.
+model:
+  arch: albef_vqa
+  # Checkpoint URLs; `load_finetuned` is enabled, and both URLs are provided.
+  load_finetuned: true
+  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
+  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt"
+  use_distill: true
+  momentum: 0.995
+  alpha: 0.4
+  # ViT encoder
+  vit_type: "base"
+  vit_grad_ckpt: false
+  vit_ckpt_layer: 0
+  # Written with a mantissa dot: a bare `1e-6` is resolved as a *string* by
+  # YAML 1.1 loaders such as plain PyYAML, whereas `1.0e-6` is always a float.
+  vit_layer_norm_epsilon: 1.0e-6
+  image_size: 384
+  # BERT config
+  med_config_path: "configs/models/med_config_albef.json"
+# Processors per split; image_size matches model.image_size (384).
+# Text goes through the `blip_question` processor here.
+preprocess:
+  vis_processor:
+    train:
+      name: "blip_image_train"
+      image_size: 384
+    eval:
+      name: "blip_image_eval"
+      image_size: 384
+  text_processor:
+    train:
+      name: "blip_question"
+    eval:
+      name: "blip_question"

LAVIS-main/lavis/configs/models/alpro_qa_msrvtt.yaml ADDED Viewed

	@@ -0,0 +1,44 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+# Model config: arch `alpro_qa`, MSRVTT QA checkpoint, 1500 classes.
+model:
+  arch: alpro_qa
+  num_classes: 1500
+  # Checkpoint URLs; `load_finetuned` is enabled, and both URLs are provided.
+  load_finetuned: true
+  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth"
+  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
+  # timesformer settings
+  timesformer:
+    n_frms: 16
+    image_size: 224
+    patch_size: 16
+    attn_drop_rate: 0.0
+    drop_rate: 0.0
+    drop_path_rate: 0.1
+    use_grad_ckpt: true
+    ckpt_layer: 12
+  # BERT config
+  med_config_path: "configs/models/bert_config_alpro.json"
+# Processors per split; n_frms and image_size match model.timesformer.
+preprocess:
+  vis_processor:
+    train:
+      name: "alpro_video_train"
+      n_frms: 16
+      image_size: 224
+    eval:
+      name: "alpro_video_eval"
+      n_frms: 16
+      image_size: 224
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"

LAVIS-main/lavis/configs/models/alpro_qa_msvd.yaml ADDED Viewed

	@@ -0,0 +1,43 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+# Model config: arch `alpro_qa`, MSVD QA checkpoint, 2423 classes.
+model:
+  arch: alpro_qa
+  num_classes: 2423
+  # Checkpoint URLs; `load_finetuned` is enabled, and both URLs are provided.
+  load_finetuned: true
+  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth"
+  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
+  # timesformer settings
+  timesformer:
+    n_frms: 16
+    image_size: 224
+    patch_size: 16
+    attn_drop_rate: 0.0
+    drop_rate: 0.0
+    drop_path_rate: 0.1
+    use_grad_ckpt: true
+    ckpt_layer: 12
+  # BERT config
+  med_config_path: "configs/models/bert_config_alpro.json"
+# Processors per split; n_frms and image_size match model.timesformer.
+preprocess:
+  vis_processor:
+    train:
+      name: "alpro_video_train"
+      n_frms: 16
+      image_size: 224
+    eval:
+      name: "alpro_video_eval"
+      n_frms: 16
+      image_size: 224
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"

LAVIS-main/lavis/configs/models/alpro_retrieval_didemo.yaml ADDED Viewed

	@@ -0,0 +1,35 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+# Model config: arch `alpro_retrieval`, DiDeMo retrieval checkpoint.
+model:
+  arch: alpro_retrieval
+  # Checkpoint URLs; `load_finetuned` is enabled, and both URLs are provided.
+  load_finetuned: true
+  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt"
+  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
+  # timesformer settings
+  timesformer:
+    n_frms: 8
+    image_size: 224
+    patch_size: 16
+    attn_drop_rate: 0.0
+    drop_rate: 0.0
+    drop_path_rate: 0.1
+    use_grad_ckpt: false
+  # BERT config
+  med_config_path: "configs/models/bert_config_alpro.json"
+# Only `eval` processors are declared; n_frms and image_size match
+# model.timesformer.
+preprocess:
+  vis_processor:
+    eval:
+      name: "alpro_video_eval"
+      n_frms: 8
+      image_size: 224
+  text_processor:
+    eval:
+      name: "blip_caption"

LAVIS-main/lavis/configs/models/alpro_retrieval_msrvtt.yaml ADDED Viewed

	@@ -0,0 +1,41 @@

+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+# Model config: arch `alpro_retrieval`, MSRVTT retrieval checkpoint.
+model:
+  arch: alpro_retrieval
+  # Checkpoint URLs; `load_finetuned` is enabled, and both URLs are provided.
+  load_finetuned: true
+  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt"
+  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
+  # timesformer settings
+  timesformer:
+    n_frms: 8
+    image_size: 224
+    patch_size: 16
+    attn_drop_rate: 0.0
+    drop_rate: 0.0
+    drop_path_rate: 0.1
+    use_grad_ckpt: false
+  # BERT config
+  med_config_path: "configs/models/bert_config_alpro.json"
+# Processors per split; n_frms and image_size match model.timesformer.
+preprocess:
+  vis_processor:
+    train:
+      name: "alpro_video_train"
+      n_frms: 8
+      image_size: 224
+    eval:
+      name: "alpro_video_eval"
+      n_frms: 8
+      image_size: 224
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"

LAVIS-main/lavis/configs/models/bert_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "add_type_embeddings": false,
+  "vocab_size": 30522,
+  "encoder_width": 768,
+  "add_cross_attention": true
+}

LAVIS-main/lavis/configs/models/bert_config_alpro.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "add_type_embeddings": true,
+  "type_vocab_size": 2,
+  "vocab_size": 30522,
+  "encoder_width": 768,
+  "add_cross_attention": false,
+  "fusion_layer": 6
+}

LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+# BLIP-Diffusion base model config.
+model:
+  vit_model: "clip_L"
+  qformer_num_query_token: 16
+  qformer_cross_attention_freq: 1
+  sd_train_text_encoder: false
+  sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
+  # Only the pretrained checkpoint is loaded; no finetuned one is configured.
+  load_finetuned: false
+  load_pretrained: true
+  # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
+  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz"
+# NOTE(review): the `train` image processor uses the same `*_eval` processor
+# name as `eval` - confirm this is intended.
+preprocess:
+  vis_processor:
+    train:
+      name: "blip_diffusion_inp_image_eval"
+    eval:
+      name: "blip_diffusion_inp_image_eval"
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"

LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml ADDED Viewed

	@@ -0,0 +1,27 @@

+# BLIP-Diffusion model config with the `sd-controlnet-canny` ControlNet.
+model:
+  vit_model: "clip_L"
+  qformer_num_query_token: 16
+  qformer_cross_attention_freq: 1
+  sd_train_text_encoder: false
+  sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
+  # Only the pretrained checkpoint is loaded; no finetuned one is configured.
+  load_finetuned: false
+  load_pretrained: true
+  # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
+  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz"
+  controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-canny"
+# NOTE(review): the `train` image processor uses the same `*_eval` processor
+# name as `eval` - confirm this is intended.
+preprocess:
+  vis_processor:
+    train:
+      name: "blip_diffusion_inp_image_eval"
+    eval:
+      name: "blip_diffusion_inp_image_eval"
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"

LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml ADDED Viewed

	@@ -0,0 +1,27 @@

+# BLIP-Diffusion model config with the `sd-controlnet-depth` ControlNet.
+model:
+  vit_model: "clip_L"
+  qformer_num_query_token: 16
+  qformer_cross_attention_freq: 1
+  sd_train_text_encoder: false
+  sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
+  # Only the pretrained checkpoint is loaded; no finetuned one is configured.
+  load_finetuned: false
+  load_pretrained: true
+  # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
+  # This config points at the `blip-diffusion-openimage` archive.
+  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz"
+  controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-depth"
+# NOTE(review): the `train` image processor uses the same `*_eval` processor
+# name as `eval` - confirm this is intended.
+preprocess:
+  vis_processor:
+    train:
+      name: "blip_diffusion_inp_image_eval"
+    eval:
+      name: "blip_diffusion_inp_image_eval"
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"