diff --git a/LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap.yaml b/LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c6fb08a21f7cd245139f83f4e6f840f03606211 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +datasets: + shapenet_mm_caption: # name of the dataset builder + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_train" + image_size: 224 + pc_processor: + train: + name: "ulip_pc" + eval: + name: "ulip_pc" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + + data_type: [pc, images] # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/train_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json + storage: + - shapenet/annotations/train_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/test_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json + storage: + - shapenet/annotations/test_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json + + templates: null + + pc: + storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/shapenet_pc + + images: + storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/rendered_images \ No newline at end of file diff --git 
a/LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap_instruct.yaml b/LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..002379fc0bef72c8e327c3284967f4f234c76303 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +datasets: + shapenet_mm_caption_instruct: # name of the dataset builder + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_train" + image_size: 224 + pc_processor: + train: + name: "ulip_pc" + eval: + name: "ulip_pc" + text_processor: + train: + name: "blip_instruction" + modality: pc + task: caption + eval: + name: "blip_caption" + + data_type: [pc, images] # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/train_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json + storage: + - shapenet/annotations/train_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/test_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json + storage: + - shapenet/annotations/test_ann.json + # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json + + templates: null + + pc: + storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/shapenet_pc + + images: + storage: 
/export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/rendered_images \ No newline at end of file diff --git a/LAVIS-main/lavis/configs/datasets/snli_ve/defaults.yaml b/LAVIS-main/lavis/configs/datasets/snli_ve/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..91b6cf7fd9b79b1d6a26ae25eed38cda61b83d01 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/snli_ve/defaults.yaml @@ -0,0 +1,25 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + snli_ve: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json + storage: snli/annotations/ve_train.json + val: + url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json + storage: snli/annotations/ve_dev.json + test: + url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json + storage: snli/annotations/ve_test.json + images: + storage: flickr30k/images/flickr30k-images + # storage: /export/share/datasets/vision/flickr30k/flickr30k-images diff --git a/LAVIS-main/lavis/configs/datasets/snli_ve/defaults_instruct.yaml b/LAVIS-main/lavis/configs/datasets/snli_ve/defaults_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..32d30846c11f85c042416d34b08024325cf443b9 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/snli_ve/defaults_instruct.yaml @@ -0,0 +1,49 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + snli_ve_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_train.json + # - /export/share/dongxuli/data/lavis/snli/ve_train.json + storage: + - snli/annotations/ve_train.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_dev.json + # - /export/share/dongxuli/data/lavis/snli/ve_dev.json + storage: + - snli/annotations/ve_dev.json + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_test.json + # - /export/share/dongxuli/data/lavis/snli/ve_test.json + storage: + - snli/annotations/ve_test.json + images: + # storage: flickr30k/images/flickr30k-images + storage: /export/share/datasets/vision/flickr30k/flickr30k-images diff --git a/LAVIS-main/lavis/configs/datasets/textcaps/defaults.yaml b/LAVIS-main/lavis/configs/datasets/textcaps/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36b30915de6dfe86195b45dd8246e2d5affcb1d7 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/textcaps/defaults.yaml @@ -0,0 +1,46 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + textcaps_caption: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_train.json + storage: + - TextCaps/TextCaps_0.1_train.json + val: + url: + - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_val.json + storage: + - TextCaps/TextCaps_0.1_val.json + test: + url: + - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_test.json + storage: + - TextCaps/TextCaps_0.1_test.json + images: + # storage: nocaps/images + storage: /export/share/datasets/vision_language/TextCaps/images diff --git a/LAVIS-main/lavis/configs/datasets/textcaps/defaults_instruct.yaml b/LAVIS-main/lavis/configs/datasets/textcaps/defaults_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..109da7c897df72f05125fe657127c7784b78785a --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/textcaps/defaults_instruct.yaml @@ -0,0 +1,47 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + textcaps_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + modality: image + task: caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_train.json + storage: + - TextCaps/TextCaps_0.1_train.json + val: + url: + - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_val.json + storage: + - TextCaps/TextCaps_0.1_val.json + test: + url: + - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_test.json + storage: + - TextCaps/TextCaps_0.1_test.json + images: + # storage: nocaps/images + storage: /export/share/datasets/vision_language/TextCaps/images diff --git a/LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap.yaml b/LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7451e6e0aa7004e79d204bc6f5e4f8cdd4bcd4de --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap.yaml @@ -0,0 +1,68 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + valor_mm_caption: # name of the dataset builder + data_type: [video, audio] + + video_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + is_eval: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_val.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json + storage: + - valor/annotations/desc_val.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json + + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_test.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json + storage: + - valor/annotations/desc_test.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json + + templates: null + + audio: + storage: /export/video-language-dataset/data/VALOR/videos + + video: + storage: /export/video-language-dataset/data/VALOR/videos + diff --git a/LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap_instruct.yaml b/LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a47aa8570a655536681f4c1f427a2c90844ca2d --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap_instruct.yaml 
@@ -0,0 +1,70 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + valor_mm_caption_instruct: # name of the dataset builder + data_type: [video, audio] + + video_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + is_eval: False + + text_processor: + train: + name: blip_instruction + modality: video + task: caption + eval: + name: blip_caption + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_val.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json + storage: + - valor/annotations/desc_val.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json + + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_test.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json + storage: + - valor/annotations/desc_test.json + # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json + + templates: null + + audio: + storage: /export/video-language-dataset/data/VALOR/videos + + video: + storage: /export/video-language-dataset/data/VALOR/videos + diff --git a/LAVIS-main/lavis/configs/datasets/vatex/defaults_cap.yaml b/LAVIS-main/lavis/configs/datasets/vatex/defaults_cap.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..a5dc1c7ff6d30e8235582c44b9c284a90ab5b3b3 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/vatex/defaults_cap.yaml @@ -0,0 +1,24 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vatex_caption: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json + storage: vatex/annotations/cap_train.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json + storage: vatex/annotations/cap_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json + storage: vatex/annotations/cap_test.json + videos: + storage: /export/share/dongxuli/data/vatex diff --git a/LAVIS-main/lavis/configs/datasets/vatex/defaults_cap_instruct.yaml b/LAVIS-main/lavis/configs/datasets/vatex/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7fd5f0038c693e671afb397735e1529265d40f7d --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/vatex/defaults_cap_instruct.yaml @@ -0,0 +1,62 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vatex_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + # data_type: videos # duplicate key; superseded by data_type: [video, audio] below + + video_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: True + + data_type: [video, audio] + + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + eval: + name: beats_audio + sampling_rate: 16000 + is_eval: False + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json + storage: + - vatex/annotations/cap_train.json + val: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json + storage: + - vatex/annotations/cap_val.json + test: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json + storage: + - vatex/annotations/cap_test.json + + video: + storage: /export/video-language-dataset/data/vatex/ + + audio: + storage: /export/video-language-dataset/data/vatex/ diff --git a/LAVIS-main/lavis/configs/datasets/vg/defaults_caption.yaml b/LAVIS-main/lavis/configs/datasets/vg/defaults_caption.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed303b58d8976ab5a4b1da7c234405a14d559fff --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/vg/defaults_caption.yaml @@ -0,0 +1,18 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vg_caption: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json + storage: vg/annotations/vg_caption.json + images: + storage: vg/images/ diff --git a/LAVIS-main/lavis/configs/datasets/vg/defaults_caption_instruct.yaml b/LAVIS-main/lavis/configs/datasets/vg/defaults_caption_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8015e94ae6ee97bc821ab152b3897944aec2aaf4 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/vg/defaults_caption_instruct.yaml @@ -0,0 +1,34 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vg_caption_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + task: caption + modality: image + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json + storage: vg/annotations/vg_caption.json + images: + storage: /export/share/datasets/vision/visual-genome/ #vg/images/ diff --git a/LAVIS-main/lavis/configs/datasets/vg/defaults_vqa.yaml b/LAVIS-main/lavis/configs/datasets/vg/defaults_vqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e12e5c860a0db616a80967f7515b47abedba519e --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/vg/defaults_vqa.yaml @@ -0,0 +1,18 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vg_vqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json + storage: vg/annotations/vg_qa.json + images: + storage: vg/images/ diff --git a/LAVIS-main/lavis/configs/datasets/vg/defaults_vqa_instruct.yaml b/LAVIS-main/lavis/configs/datasets/vg/defaults_vqa_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..787c9529c21cf37fbdfcfd7f1c06593fec76163d --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/vg/defaults_vqa_instruct.yaml @@ -0,0 +1,34 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vg_vqa_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + task: qa + modality: image + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json + storage: vg/annotations/vg_qa.json + images: + storage: /export/share/datasets/vision/visual-genome/ #vg/images/ diff --git a/LAVIS-main/lavis/configs/datasets/violin/defaults_cap.yaml b/LAVIS-main/lavis/configs/datasets/violin/defaults_cap.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..dc90d482333cf2aaf6770a1ef83faef737101d1b --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/violin/defaults_cap.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + violin_caption: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + storage: + - violin/annotations/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + # val: + # url: + # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + # storage: + # # - violin/annotations/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + videos: + storage: /export/video-language-dataset/data/violin/videos diff --git a/LAVIS-main/lavis/configs/datasets/violin/defaults_cap_instruct.yaml b/LAVIS-main/lavis/configs/datasets/violin/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d43317f497b2aa2cf4db0322a3003ddb13ca76d --- /dev/null +++ 
b/LAVIS-main/lavis/configs/datasets/violin/defaults_cap_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + violin_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_instruction + modality: video + task: caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + storage: + - violin/annotations/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + # val: + # url: + # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + # storage: + # # - violin/annotations/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + videos: + storage: /export/video-language-dataset/data/violin/videos diff --git a/LAVIS-main/lavis/configs/datasets/violin/defaults_entail.yaml b/LAVIS-main/lavis/configs/datasets/violin/defaults_entail.yaml new file mode 100644 index 0000000000000000000000000000000000000000..82c33bd7ec84bafa4628eec6e6ac6b350a85c7a0 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/violin/defaults_entail.yaml @@ -0,0 +1,52 @@ + # Copyright (c) 2023, 
salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + violin_entailment: # 22452 + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + storage: + - violin/annotations/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + # val: + # url: + # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + # storage: + # # - violin/annotations/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + videos: + storage: /export/video-language-dataset/data/violin/videos diff --git a/LAVIS-main/lavis/configs/datasets/violin/defaults_entail_instruct.yaml b/LAVIS-main/lavis/configs/datasets/violin/defaults_entail_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8eda1a263b8a51c0a425bbf6e5bdabc61a3de717 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/violin/defaults_entail_instruct.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + violin_entailment_instruct: # 22452 + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + storage: + - violin/annotations/train.json + # - /export/video-language-dataset/data/violin/annotations_lavis.json + # val: + # url: + # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + # storage: + # # - violin/annotations/test.json + # - /export/video-language-dataset/data/violin/annotations_lavis_test.json + videos: + storage: /export/video-language-dataset/data/violin/videos diff --git a/LAVIS-main/lavis/configs/datasets/visdial/defaults_dial.yaml b/LAVIS-main/lavis/configs/datasets/visdial/defaults_dial.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4aaf71584c32c1566658c1439180030fcf0f2e6 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/visdial/defaults_dial.yaml @@ -0,0 +1,41 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + visdial: # name of the dataset builder + data_type: images #extracted features of videos (I3D, VGGish) # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json + storage: + - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json + val: + url: + - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json + storage: + - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json + # test: + # url: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json + # storage: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json + images: + storage: /export/share/datasets/vision_language/visdial/ diff --git a/LAVIS-main/lavis/configs/datasets/visdial/defaults_dial_instruct.yaml b/LAVIS-main/lavis/configs/datasets/visdial/defaults_dial_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4849c914822acf73a45440c6b2c432e3423c3261 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/visdial/defaults_dial_instruct.yaml @@ -0,0 +1,41 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + visdial_instruct: # name of the dataset builder + data_type: images #extracted features of videos (I3D, VGGish) # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json + storage: + - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json + val: + url: + - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json + storage: + - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json + # test: + # url: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json + # storage: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json + images: + storage: /export/share/datasets/vision_language/visdial/ diff --git a/LAVIS-main/lavis/configs/datasets/vizwiz/defaults.yaml b/LAVIS-main/lavis/configs/datasets/vizwiz/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e44fa7761305da83331bbcd06bbf43db794c03cf --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/vizwiz/defaults.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vizwiz_vqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_question + eval: + name: blip_question + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vizwiz/val.json + # - /export/share/datasets/vision/vizwiz/Annotations/val.json + storage: + - vizwiz/annotations/val.json + # - /export/share/datasets/vision/vizwiz/Annotations/val.json + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vizwiz/test.json + # - /export/share/datasets/vision/vizwiz/Annotations/test.json + storage: + - vizwiz/annotations/test.json + # - /export/share/datasets/vision/vizwiz/Annotations/test.json + images: + storage: /export/share/datasets/vision/vizwiz/images diff --git a/LAVIS-main/lavis/configs/datasets/vlep/defaults_cap.yaml b/LAVIS-main/lavis/configs/datasets/vlep/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0987a49e21e7427046bc9b0aead656819ba1b533 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/vlep/defaults_cap.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vlep_caption: # 4900 + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_train_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json + storage: + - vlep/annotations/annotations_train_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_dev_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json + storage: + - vlep/annotations/annotations_dev_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json + videos: + storage: /export/video-language-dataset/data/vlep/videos diff --git a/LAVIS-main/lavis/configs/datasets/vlep/defaults_cap_instruct.yaml b/LAVIS-main/lavis/configs/datasets/vlep/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d5e703ca5daa65a58e369643599da13dc71edcf --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/vlep/defaults_cap_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vlep_caption_instruct: # 4900 + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_instruction + modality: video + task: caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_train_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json + storage: + - vlep/annotations/annotations_train_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_dev_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json + storage: + - vlep/annotations/annotations_dev_existing.json + # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json + videos: + storage: /export/video-language-dataset/data/vlep/videos diff --git a/LAVIS-main/lavis/configs/datasets/vsr/defaults.yaml b/LAVIS-main/lavis/configs/datasets/vsr/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f9e29b847b82fa03fc1b68fd821fc80b845a19ae --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/vsr/defaults.yaml @@ -0,0 +1,49 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vsr_classification_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + storage: + - vsr/annotations/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + storage: + - vsr/annotations/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + storage: + - vsr/annotations/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + images: + storage: /export/share/datasets/vision_language/VSR/images diff --git a/LAVIS-main/lavis/configs/datasets/vsr/defaults_classification.yaml b/LAVIS-main/lavis/configs/datasets/vsr/defaults_classification.yaml new file mode 100644 index 0000000000000000000000000000000000000000..11edcfd285a188c47a702f150d2673c7d96a21f8 --- /dev/null +++ 
b/LAVIS-main/lavis/configs/datasets/vsr/defaults_classification.yaml @@ -0,0 +1,49 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vsr_classification: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + storage: + - vsr/annotations/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + storage: + - vsr/annotations/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + storage: + - vsr/annotations/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + images: + storage: /export/share/datasets/vision_language/VSR/images diff --git a/LAVIS-main/lavis/configs/datasets/vsr/defaults_classification_instruct.yaml b/LAVIS-main/lavis/configs/datasets/vsr/defaults_classification_instruct.yaml new file mode 
100644 index 0000000000000000000000000000000000000000..b09c521ef4e3e003edd604e9d93125372f11660a --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/vsr/defaults_classification_instruct.yaml @@ -0,0 +1,49 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vsr_classification_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + storage: + - vsr/annotations/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + storage: + - vsr/annotations/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + storage: + - vsr/annotations/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + images: + storage: /export/share/datasets/vision_language/VSR/images diff --git 
a/LAVIS-main/lavis/configs/datasets/vsr/defaults_instruct.yaml b/LAVIS-main/lavis/configs/datasets/vsr/defaults_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e96c6e765bda4de503a7f05a3adf91f3db5a40c7 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/vsr/defaults_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + vsr_caption_instruct: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + + vis_processor: + train: + name: "clip_image_train" + image_size: 224 + eval: + name: "clip_image_eval" + image_size: 224 + + text_processor: + train: + name: blip_instruction + task: caption + modality: image + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + storage: + - vsr/annotations/train.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + storage: + - vsr/annotations/dev.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl + test: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl + # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + storage: + - vsr/annotations/test.jsonl + # - 
/export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl + images: + storage: /export/share/datasets/vision_language/VSR/images diff --git a/LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap.yaml b/LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..463a8a5d91890f41abfa6c6e6fa200931c7e639d --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap.yaml @@ -0,0 +1,63 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +datasets: + wavcaps_mm_caption: # name of the dataset builder + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + n_frames: 2 + frame_length: 512 + eval: + name: beats_audio + sampling_rate: 16000 + n_frames: 2 + frame_length: 512 + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + data_type: [audio] + + build_info: + kwargs: + cached: False + cached_dir: /export/share/datasets/audio/WavCaps/beats_features/ + + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/BBC_Sound_Effects/bbc_final.json + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/FreeSound/fsd_final.json + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/SoundBible/sb_final.json + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/AudioSet_SL/as_final.json + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/wavcaps/json_data.json + storage: + - wavcaps/json_files/BBC_Sound_Effects/bbc_final.json + - wavcaps/json_files/FreeSound/fsd_final.json + - wavcaps/json_files/SoundBible/sb_final.json + - 
wavcaps/json_files/AudioSet_SL/as_final.json + - wavcaps/annotations/json_data.json + # train: + # url: + # - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json + # - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json + # - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json + # - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json + # - /export/share/datasets/audio/WavCaps/json_data.json + # storage: + # - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json + # - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json + # - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json + # - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json + # - /export/share/datasets/audio/WavCaps/json_data.json + + audio: + storage: /export/share/datasets/audio/WavCaps/ + \ No newline at end of file diff --git a/LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml b/LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..227a475701cd73e122fb4ed8a13461b18c114284 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml @@ -0,0 +1,63 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +datasets: + wavcaps_mm_caption_instruct: # name of the dataset builder + audio_processor: + train: + name: beats_audio + sampling_rate: 16000 + n_frames: 2 + frame_length: 512 + eval: + name: beats_audio + sampling_rate: 16000 + n_frames: 2 + frame_length: 512 + text_processor: + train: + name: "blip_instruction" + modality: audio + task: caption + eval: + name: "blip_caption" + + data_type: [audio] + + build_info: + kwargs: + cached: True + cached_dir: /export/share/datasets/audio/WavCaps/beats_features/ + + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + # url: + # - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json + # - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json + # - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json + # - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json + # - /export/share/datasets/audio/WavCaps/json_data.json + # storage: + # - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json + # - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json + # - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json + # - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json + # - /export/share/datasets/audio/WavCaps/json_data.json + url: + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/BBC_Sound_Effects/bbc_final.json + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/FreeSound/fsd_final.json + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/SoundBible/sb_final.json + - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/AudioSet_SL/as_final.json + - 
https://storage.googleapis.com/sfr-xinstructblip-data-research/data/wavcaps/json_data.json + storage: + - wavcaps/json_files/BBC_Sound_Effects/bbc_final.json + - wavcaps/json_files/FreeSound/fsd_final.json + - wavcaps/json_files/SoundBible/sb_final.json + - wavcaps/json_files/AudioSet_SL/as_final.json + - wavcaps/annotations/json_data.json + + audio: + storage: /export/share/datasets/audio/WavCaps/ + \ No newline at end of file diff --git a/LAVIS-main/lavis/configs/datasets/webvid/defaults_cap.yaml b/LAVIS-main/lavis/configs/datasets/webvid/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94203848e1baad44ea41ff22942e21762c61fe43 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/webvid/defaults_cap.yaml @@ -0,0 +1,41 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + webvid2m_caption: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 5 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + eval: + name: alpro_video_eval + n_frms: 5 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/webvid2m/train.json + # - /export/home/LAVIS/webvid_annotation.json + storage: + - webvid2m/annotations/train.json + # - /export/home/LAVIS/webvid_annotation.json + images: + storage: /export/video-language-dataset/data/webvid2m/postprocess/downsampled_videos diff --git a/LAVIS-main/lavis/configs/datasets/webvid/defaults_cap_instruct.yaml 
b/LAVIS-main/lavis/configs/datasets/webvid/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a98325f148504423583265895fdb6013b783055d --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/webvid/defaults_cap_instruct.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + webvid2m_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 5 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + eval: + name: alpro_video_eval + n_frms: 5 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + text_processor: + train: + name: "blip_instruction" + modality: video + task: caption + eval: + name: "blip_caption" + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/webvid2m/train.json + # - /export/home/LAVIS/webvid_annotation.json + storage: + - webvid2m/annotations/train.json + # - /export/home/LAVIS/webvid_annotation.json + images: + storage: /export/video-language-dataset/data/webvid2m/postprocess/downsampled_videos diff --git a/LAVIS-main/lavis/configs/datasets/youcook/defaults_cap.yaml b/LAVIS-main/lavis/configs/datasets/youcook/defaults_cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..035c897e898acb8e7d5ad6c1ba29d9d0149712a8 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/youcook/defaults_cap.yaml @@ -0,0 +1,51 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + youcook_caption: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/train_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json + storage: + - youcook/annotations/train_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/val_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json + storage: + - youcook/annotations/val_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json + videos: + storage: /export/video-language-dataset/data/youcook/raw_videos diff --git a/LAVIS-main/lavis/configs/datasets/youcook/defaults_cap_instruct.yaml b/LAVIS-main/lavis/configs/datasets/youcook/defaults_cap_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45b371c72238eed9f996efb0b03a89cb80446632 --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/youcook/defaults_cap_instruct.yaml @@ -0,0 +1,53 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + youcook_caption_instruct: # name of the dataset builder + # data_dir: ${env.data_dir}/datasets + data_type: videos # [images|videos|features] + + vis_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + text_processor: + train: + name: blip_instruction + modality: video + task: caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/train_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json + storage: + - youcook/annotations/train_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json + val: + url: + - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/val_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json + storage: + - youcook/annotations/val_annotations.json + # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json + videos: + storage: /export/video-language-dataset/data/youcook/raw_videos diff --git a/LAVIS-main/lavis/configs/datasets/yt8m/defaults_mm_dial.yaml b/LAVIS-main/lavis/configs/datasets/yt8m/defaults_mm_dial.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e46f417073a37c1d8f11c480cee6191f0a0bc9da --- /dev/null +++ b/LAVIS-main/lavis/configs/datasets/yt8m/defaults_mm_dial.yaml @@ -0,0 +1,62 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + yt8m_mm_dialogue: # name of the dataset builder + data_type: [video] #extracted features of videos (I3D, VGGish) # [images|videos|features] + + video_processor: + train: + name: alpro_video_train + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + eval: + name: alpro_video_eval + n_frms: 4 + image_size: 224 + min_scale: 0.9 + max_scale: 1.0 + full_video: False + + audio_processor: + train: + name: beats_audio + # sampling_rate: 16000 + eval: + name: beats_audio + # sampling_rate: 16000 + is_eval: True + + text_processor: + train: + name: blip_caption + eval: + name: blip_caption + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/train.json + storage: + - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/train.json + val: + url: + - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/validation.json + storage: + - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/validation.json + + templates: null + + audio: + storage: /export/video-language-dataset/data/yt-8m/audios + + video: + storage: /export/video-language-dataset/data/yt-8m/videos + diff --git a/LAVIS-main/lavis/configs/models/albef_classification_ve.yaml b/LAVIS-main/lavis/configs/models/albef_classification_ve.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a2accab99fad7e2a880944515baefab496b18a7 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/albef_classification_ve.yaml @@ -0,0 +1,40 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_classification + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" + + num_classes: 3 + + use_distill: True + momentum: 0.995 + alpha: 0.4 + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_layer_norm_epsilon: 1e-6 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config_albef.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + eval: + name: "blip_image_eval" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/albef_feature_extractor.yaml b/LAVIS-main/lavis/configs/models/albef_feature_extractor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7def58e04a7b567e0a836e54f3dffdc62e1748ee --- /dev/null +++ b/LAVIS-main/lavis/configs/models/albef_feature_extractor.yaml @@ -0,0 +1,30 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_pretrain + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" + + # vit encoder + vit_type: "base" + image_size: 224 + vit_ckpt_layer: 0 + vit_drop_path_rate: 0 + vit_layer_norm_epsilon: 1e-6 + vit_grad_ckpt: False + + # bert config + med_config_path: "configs/models/med_config_albef.json" + + embed_dim: 256 + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/albef_nlvr.yaml b/LAVIS-main/lavis/configs/models/albef_nlvr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86f17224aa0dfaa4739725e7c0516df4c679aa2d --- /dev/null +++ b/LAVIS-main/lavis/configs/models/albef_nlvr.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_nlvr + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" + + num_classes: 2 + + use_distill: True + momentum: 0.995 + alpha: 0.4 + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_layer_norm_epsilon: 1e-6 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config_albef.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/albef_pretrain_base.yaml b/LAVIS-main/lavis/configs/models/albef_pretrain_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26e00efa423345b4a78332635d1a7c2e368fb02e --- /dev/null +++ b/LAVIS-main/lavis/configs/models/albef_pretrain_base.yaml @@ -0,0 +1,38 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_pretrain + + load_pretrained: True + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" + + # vit encoder + vit_type: "base" + image_size: 224 + vit_ckpt_layer: 0 + vit_drop_path_rate: 0 + vit_layer_norm_epsilon: 1e-6 + vit_grad_ckpt: False + + # bert config + med_config_path: "configs/models/med_config_albef.json" + mlm_mask_prob: 0.15 + + embed_dim: 256 + momentum: 0.995 + alpha: 0.4 + temp: 0.07 + + max_txt_len: 30 + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 256 + text_processor: + train: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/albef_retrieval_coco.yaml b/LAVIS-main/lavis/configs/models/albef_retrieval_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9971e6ca5d9aa85790ee2aefd9b7251e8a8b200c --- /dev/null +++ b/LAVIS-main/lavis/configs/models/albef_retrieval_coco.yaml @@ -0,0 +1,46 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_retrieval + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt" + + queue_size: 65536 + + # vit encoder + vit_type: "base" + image_size: 384 + vit_ckpt_layer: 0 + vit_drop_path_rate: 0 + vit_layer_norm_epsilon: 1e-6 + vit_grad_ckpt: False + + # bert config + med_config_path: "configs/models/med_config_albef.json" + + embed_dim: 256 + momentum: 0.995 + alpha: 0.4 + temp: 0.07 + use_distill: True + + max_txt_len: 30 + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/albef_retrieval_flickr.yaml b/LAVIS-main/lavis/configs/models/albef_retrieval_flickr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5f77f0f99912d0f2c501e567dd0360e5c2b9336 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/albef_retrieval_flickr.yaml @@ -0,0 +1,46 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_retrieval + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" + finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt + + queue_size: 65536 + + # vit encoder + vit_type: "base" + image_size: 384 + vit_ckpt_layer: 0 + vit_drop_path_rate: 0 + vit_layer_norm_epsilon: 1e-6 + vit_grad_ckpt: False + + # bert config + med_config_path: "configs/models/med_config_albef.json" + + embed_dim: 256 + momentum: 0.995 + alpha: 0.4 + temp: 0.07 + use_distill: True + + max_txt_len: 30 + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/albef_vqav2.yaml b/LAVIS-main/lavis/configs/models/albef_vqav2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e35559f356bd77f9eedaa76b43d393a142f40239 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/albef_vqav2.yaml @@ -0,0 +1,40 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: albef_vqa + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" + + use_distill: True + momentum: 0.995 + alpha: 0.4 + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_layer_norm_epsilon: 1e-6 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config_albef.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_question" + eval: + name: "blip_question" diff --git a/LAVIS-main/lavis/configs/models/alpro_qa_msrvtt.yaml b/LAVIS-main/lavis/configs/models/alpro_qa_msrvtt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3f58a1308c0d2a2075c037f6defcd4500e29b1b --- /dev/null +++ b/LAVIS-main/lavis/configs/models/alpro_qa_msrvtt.yaml @@ -0,0 +1,44 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: alpro_qa + num_classes: 1500 + + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" + + timesformer: + n_frms: 16 + image_size: 224 + + patch_size: 16 + attn_drop_rate: 0. + drop_rate: 0. 
+ drop_path_rate: 0.1 + + use_grad_ckpt: True + ckpt_layer: 12 + + # bert config + med_config_path: "configs/models/bert_config_alpro.json" + +preprocess: + vis_processor: + train: + name: "alpro_video_train" + n_frms: 16 + image_size: 224 + eval: + name: "alpro_video_eval" + n_frms: 16 + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/alpro_qa_msvd.yaml b/LAVIS-main/lavis/configs/models/alpro_qa_msvd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17d606fcc0fd8fb8adedbb992db49f6e56e67c5f --- /dev/null +++ b/LAVIS-main/lavis/configs/models/alpro_qa_msvd.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: alpro_qa + num_classes: 2423 + + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" + + timesformer: + n_frms: 16 + image_size: 224 + + patch_size: 16 + attn_drop_rate: 0. + drop_rate: 0. 
+ drop_path_rate: 0.1 + use_grad_ckpt: True + ckpt_layer: 12 + + # bert config + med_config_path: "configs/models/bert_config_alpro.json" + +preprocess: + vis_processor: + train: + name: "alpro_video_train" + n_frms: 16 + image_size: 224 + eval: + name: "alpro_video_eval" + n_frms: 16 + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/alpro_retrieval_didemo.yaml b/LAVIS-main/lavis/configs/models/alpro_retrieval_didemo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd021c5a5d2e93e53e74ef4cf2a94bb921a6cd83 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/alpro_retrieval_didemo.yaml @@ -0,0 +1,35 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: alpro_retrieval + + load_finetuned: True + + finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" + + timesformer: + n_frms: 8 + image_size: 224 + + patch_size: 16 + attn_drop_rate: 0. + drop_rate: 0. 
+ drop_path_rate: 0.1 + use_grad_ckpt: False + + # bert config + med_config_path: "configs/models/bert_config_alpro.json" + +preprocess: + vis_processor: + eval: + name: "alpro_video_eval" + n_frms: 8 + image_size: 224 + text_processor: + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/alpro_retrieval_msrvtt.yaml b/LAVIS-main/lavis/configs/models/alpro_retrieval_msrvtt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..431aa3ea65f83a6213c88ae07465e0c1ff7cb3ea --- /dev/null +++ b/LAVIS-main/lavis/configs/models/alpro_retrieval_msrvtt.yaml @@ -0,0 +1,41 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: alpro_retrieval + + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" + + timesformer: + n_frms: 8 + image_size: 224 + + patch_size: 16 + attn_drop_rate: 0. + drop_rate: 0. 
+ drop_path_rate: 0.1 + use_grad_ckpt: False + + # bert config + med_config_path: "configs/models/bert_config_alpro.json" + +preprocess: + vis_processor: + train: + name: "alpro_video_train" + n_frms: 8 + image_size: 224 + eval: + name: "alpro_video_eval" + n_frms: 8 + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/bert_config.json b/LAVIS-main/lavis/configs/models/bert_config.json new file mode 100644 index 0000000000000000000000000000000000000000..477a9f42513d0afb774735f07177161bdd1ae94b --- /dev/null +++ b/LAVIS-main/lavis/configs/models/bert_config.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "add_type_embeddings": false, + "vocab_size": 30522, + "encoder_width": 768, + "add_cross_attention": true +} \ No newline at end of file diff --git a/LAVIS-main/lavis/configs/models/bert_config_alpro.json b/LAVIS-main/lavis/configs/models/bert_config_alpro.json new file mode 100644 index 0000000000000000000000000000000000000000..a21b3a2c9344651c1d88797338de5830ca3fc043 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/bert_config_alpro.json @@ -0,0 +1,23 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "add_type_embeddings": true, + "type_vocab_size": 2, + "vocab_size": 30522, + "encoder_width": 
768, + "add_cross_attention": false, + "fusion_layer": 6 +} \ No newline at end of file diff --git a/LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml b/LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45bd52ad02917fdb9cf67b209e2c1f3b65d4384a --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml @@ -0,0 +1,25 @@ +model: + vit_model: "clip_L" + + qformer_num_query_token: 16 + qformer_cross_attention_freq: 1 + + sd_train_text_encoder: False + sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" + + load_finetuned: False + load_pretrained: True + # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz" + +preprocess: + vis_processor: + train: + name: "blip_diffusion_inp_image_eval" + eval: + name: "blip_diffusion_inp_image_eval" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml b/LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3a65ecc70041fee85c4d1f2db0c82c95f211355 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml @@ -0,0 +1,27 @@ +model: + vit_model: "clip_L" + + qformer_num_query_token: 16 + qformer_cross_attention_freq: 1 + + sd_train_text_encoder: False + sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" + + load_finetuned: False + load_pretrained: True + # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz" + + 
controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-canny" + +preprocess: + vis_processor: + train: + name: "blip_diffusion_inp_image_eval" + eval: + name: "blip_diffusion_inp_image_eval" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml b/LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29421a72565a63a9d60d5c9980a84219fce80155 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml @@ -0,0 +1,27 @@ +model: + vit_model: "clip_L" + + qformer_num_query_token: 16 + qformer_cross_attention_freq: 1 + + sd_train_text_encoder: False + sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" + + load_finetuned: False + load_pretrained: True + # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz" + + controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-depth" + +preprocess: + vis_processor: + train: + name: "blip_diffusion_inp_image_eval" + eval: + name: "blip_diffusion_inp_image_eval" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_hed.yaml b/LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_hed.yaml new file mode 100644 index 0000000000000000000000000000000000000000..275eba088a93654ef69304ff127879e50296a910 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_hed.yaml @@ -0,0 +1,27 @@ +model: + vit_model: "clip_L" + + qformer_num_query_token: 16 + qformer_cross_attention_freq: 1 + + sd_train_text_encoder: False + 
sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" + + load_finetuned: False + load_pretrained: True + # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz" + + controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-hed" + +preprocess: + vis_processor: + train: + name: "blip_diffusion_inp_image_eval" + eval: + name: "blip_diffusion_inp_image_eval" + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6591e15c1a5c9c6052a95caba26c2b635a842785 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: caption_coco_flant5xl + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" + + # vit encoder + image_size: 364 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp32" + freeze_vit: False + + # Q-Former + num_query_token: 32 + + # T5 + t5_model: "google/flan-t5-xl" + + # generation configs + prompt: "a photo of" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 364 + eval: + name: "blip_image_eval" + image_size: 364 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5005fb72ada67d0e304483e5b98428f4be7c0236 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: caption_coco_opt2.7b + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" + + # vit encoder + image_size: 364 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp32" + freeze_vit: False + + # Q-Former + num_query_token: 32 + + # OPT + opt_model: "facebook/opt-2.7b" + + # generation configs + prompt: "a photo of" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 364 + eval: + name: "blip_image_eval" + image_size: 364 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..464da1bb28668f6aa9106b3aac44cb500f85d727 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: caption_coco_opt6.7b + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" + + # vit encoder + image_size: 364 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp32" + freeze_vit: False + + # Q-Former + num_query_token: 32 + + # OPT + opt_model: "facebook/opt-6.7b" + + # generation configs + prompt: "a photo of" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 364 + eval: + name: "blip_image_eval" + image_size: 364 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_coco.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03abc369b866db180c4e7bff8b00de637bc55cf0 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_coco.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: coco + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" + + # vit encoder + image_size: 364 + drop_path_rate: 0 + use_grad_checkpoint: True + vit_precision: "fp32" + freeze_vit: False + + # Q-Former + num_query_token: 32 + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 364 + eval: + name: "blip_image_eval" + image_size: 364 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c2e3de96890d7a73aa75d4a35a4ff5928deb24d --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: flant5xl + load_finetuned: False + load_pretrained: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # T5 + t5_model: "google/flan-t5-xl" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c26cce2fc251d91400412ebbdbb66f00ddf77e54 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: flant5xxl + load_finetuned: False + load_pretrained: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxxl_trimmed.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # T5 + t5_model: "google/flan-t5-xxl" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10365394c0374595cf59d12ef25da3e64ea496f6 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: instruct_vicuna13b + load_finetuned: False + load_pretrained: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # path to Vicuna checkpoint + llm_model: "./llm/vicuna-13b" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af67777d3940c7e6b75ea9ee7cac6a1f56b13b62 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: instruct_vicuna7b + load_finetuned: False + load_pretrained: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # path to Vicuna checkpoint + llm_model: "./llm/vicuna-7b" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain.yaml new file mode 100644 index 0000000000000000000000000000000000000000..126025ebaeb20ec88ebc2af61d16acd37843125d --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf90da225618de43a3b5fa70954b363227fcd804 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain_flant5xl + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # T5 + t5_model: "google/flan-t5-xl" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fca3e9a0aa053245d08d376594f75336ba0150b7 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml @@ -0,0 +1,43 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain_flant5xl + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" + finetuned: "" + + # vit encoder + vit_model: "clip_L" + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # T5 + t5_model: "google/flan-t5-xl" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8240904d01dde5b1dfd74baca6bb83421d92ac3e --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain_flant5xxl + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # T5 + t5_model: "google/flan-t5-xxl" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_llama7b.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_llama7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4adfa5cd42752d01b4c8126d3b21ec85df000eee --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_llama7b.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip2_llama + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # LLM + llm_model: "/export/home/project/stanford_alpaca/llama_7B" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6e0bccd3fa69814bbcc294bb0a28089f3a62e5a --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain_opt2.7b + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # OPT + opt_model: "facebook/opt-2.7b" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..89adbfe363272a90c5bc80fbdb8ca33f05e0033c --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain_opt6.7b + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" + finetuned: "" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + # OPT + opt_model: "facebook/opt-6.7b" + + # generation configs + prompt: "" + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0a0fc6464abcfea3e08655e43e381c9456f62b5 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml @@ -0,0 +1,37 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pretrain + load_finetuned: False + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" + finetuned: "" + + # vit encoder + vit_model: "clip_L" + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + freeze_vit: True + + # Q-Former + num_query_token: 32 + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_xinstruct_vicuna13b.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_xinstruct_vicuna13b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7502033dfadd81c710b5fdf92ef18ad36049a34a --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_xinstruct_vicuna13b.yaml @@ -0,0 +1,74 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip2_vicuna_xinstruct + model_type: vicuna13b + load_pretrained: True + pretrained: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth + load_finetuned: False + finetuned: "" + stage1_url_or_filename: null + image_model: "eva_clip_g" + pc_model: "ulip2_pointbert" + video_model: "eva_clip_g" + audio_model: "beats" + pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/models/xinstructblip_checkpoints/vicuna13b/image_qformer.pth + pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/models/xinstructblip_checkpoints/vicuna13b/pc_qformer.pth + pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/models/xinstructblip_checkpoints/vicuna13b/video_qformer.pth + pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/models/xinstructblip_checkpoints/vicuna13b/audio_qformer.pth + load_attention_image_qformer: True + load_attention_pc_qformer: True + load_attention_video_qformer: True + load_attention_audio_qformer: True + load_ln_type_image: "image" + load_ln_type_video: "video" + load_ln_type_audio: "audio" + load_qformer_type_image: "image" + load_qformer_type_pc: "pc" + load_qformer_type_video: "video" + load_qformer_type_audio: "audio" + load_projection_image: True + load_projection_pc: True + load_projection_video: True + load_projection_audio: True + load_projection_type_image: "image" + load_projection_type_pc: "pc" + load_projection_type_video: "video" + load_projection_type_audio: "audio" + image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False} + pc_encoder_kwargs : {} + video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False} + 
audio_encoder_kwargs : {} + image_precision: "fp16" + pc_precision: "fp16" + video_precision: "fp16" + audio_precision: "fp16" + freeze_image: True + freeze_pc: True + freeze_video: True + freeze_audio: True + num_query_token: 32 + llm_model: "/path/to/vicuna-13b" + prompt: "" + max_txt_len: 128 + max_output_txt_len: 256 + apply_lemmatizer: False + num_few_shot_examples: 0 + few_shot_prob: 0 + qformer_text_input: True + llm_text_input: True + modalities : ["image", "video", "audio", "pc"] + use_cues: True + shared_qformer: False + pretrained_shared_qformer: Null + load_attention_shared_qformer: False + load_qformer_type_shared: "" + load_projection_shared: False + load_projection_type_shaped: "" + load_ln_type_shared: "" + shared_qformer_num_features: 512 + prefix: "" + postfix: "" diff --git a/LAVIS-main/lavis/configs/models/blip2/blip2_xinstruct_vicuna7b.yaml b/LAVIS-main/lavis/configs/models/blip2/blip2_xinstruct_vicuna7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf736815c3914093e534906f871714efc2bf078a --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip2/blip2_xinstruct_vicuna7b.yaml @@ -0,0 +1,77 @@ + # Copyright (c) 2023, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip2_vicuna_xinstruct + model_type: vicuna7b + load_pretrained: True + pretrained: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth + load_finetuned: False + finetuned: "" + stage1_url_or_filename: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth + image_model: "eva_clip_g" + pc_model: "ulip2_pointbert" + video_model: "eva_clip_g" + audio_model: "beats" + pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth + pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth + pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth + pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer_improved.pth + load_attention_image_qformer: True + load_attention_pc_qformer: True + load_attention_video_qformer: True + load_attention_audio_qformer: True + load_ln_type_image: "image" + load_ln_type_video: "video" + load_ln_type_pc: "pc" + load_ln_type_audio: "audio" + load_qformer_type_image: "image" + load_qformer_type_pc: "pc" + load_qformer_type_video: "video" + load_qformer_type_audio: "audio" + load_projection_image: True + load_projection_pc: True + load_projection_video: True + load_projection_audio: True + load_projection_type_image: "image" + load_projection_type_pc: "pc" + load_projection_type_video: "video" + load_projection_type_audio: "audio" + image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False} + 
pc_encoder_kwargs : {} + video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False} + audio_encoder_kwargs : {} + image_precision: "fp16" + pc_precision: "fp16" + video_precision: "fp16" + audio_precision: "fp16" + freeze_image: True + freeze_pc: True + freeze_video: True + freeze_audio: True + num_query_token: 32 + llm_model: "/path/to/vicuna-7b" + prompt: "" + max_txt_len: 128 + max_output_txt_len: 256 + apply_lemmatizer: False + num_few_shot_examples: 0 + few_shot_prob: 0 + qformer_text_input: True + llm_text_input: True + modalities : ["audio", "video", "pc", "image"] + use_cues: True + shared_qformer: False + pretrained_shared_qformer: Null + load_attention_shared_qformer: False + load_qformer_type_shared: "" + load_projection_shared: False + load_projection_type_shaped: "" + load_ln_type_shared: "" + shared_qformer_num_features: 512 + prefix: "USER: " + postfix: "\nASSISTANT:" + predict_with_gen: False + clean_tokenization: True \ No newline at end of file diff --git a/LAVIS-main/lavis/configs/models/blip_caption_base_coco.yaml b/LAVIS-main/lavis/configs/models/blip_caption_base_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ee481c234290fef7d74667c2ce3e8c66fc7a3ab --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_caption_base_coco.yaml @@ -0,0 +1,38 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_caption + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config.json" + + # generation configs + prompt: "a picture of " + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + eval: + name: "blip_image_eval" + text_processor: + train: + name: "blip_caption" + prompt: "a picture of " + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip_caption_large_coco.yaml b/LAVIS-main/lavis/configs/models/blip_caption_large_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0e8ae93c3f5236aac93669c53db448d312aa5eb --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_caption_large_coco.yaml @@ -0,0 +1,37 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_caption + load_finetuned: True + + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" + + vit_type: "large" + vit_grad_ckpt: True + vit_ckpt_layer: 5 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + # generation configs + prompt: "a picture of " + + +preprocess: + vis_processor: + train: + name: "blip_image_train" + eval: + name: "blip_image_eval" + text_processor: + train: + name: "blip_caption" + prompt: "a picture of " + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip_classification_base.yaml b/LAVIS-main/lavis/configs/models/blip_classification_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bad38f200daeb3177dce269807ffada275e61ac3 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_classification_base.yaml @@ -0,0 +1,22 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_classification + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" + + use_distill: True + momentum: 0.995 + alpha: 0.4 + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config.json" diff --git a/LAVIS-main/lavis/configs/models/blip_feature_extractor_base.yaml b/LAVIS-main/lavis/configs/models/blip_feature_extractor_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eaee381415c9eb7e0bf787ad5cf9b61bf2690489 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_feature_extractor_base.yaml @@ -0,0 +1,29 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_pretrain + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 224 + + # bert config + med_config_path: "configs/models/med_config.json" + + embed_dim: 256 + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 224 + text_processor: + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip_itm_base.yaml b/LAVIS-main/lavis/configs/models/blip_itm_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c79db89d3cb55575b5f4b8aa499859c5915b183 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_itm_base.yaml @@ -0,0 +1,31 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_image_text_matching + + load_finetuned: True + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config.json" + + embed_dim: 256 + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip_itm_large.yaml b/LAVIS-main/lavis/configs/models/blip_itm_large.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9bcbf4850d2eb159c506e52a8fa88de59d3a87d7 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_itm_large.yaml @@ -0,0 +1,31 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_image_text_matching + + load_finetuned: True + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" + + # vit encoder + vit_type: "large" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + embed_dim: 256 + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip_nlvr.yaml b/LAVIS-main/lavis/configs/models/blip_nlvr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02ecb13f11bdd02b161633d0d8c3c74eab64ba21 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_nlvr.yaml @@ -0,0 +1,39 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_nlvr + model_type: nlvr + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + num_classes: 2 + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_layer_norm_epsilon: 1e-6 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip_pretrain_base.yaml b/LAVIS-main/lavis/configs/models/blip_pretrain_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e265b832a618304d50e17a9dbf242bfe4df720db --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_pretrain_base.yaml @@ -0,0 +1,35 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_pretrain + + load_pretrained: True + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 224 + alpha: 0.4 + + # bert config + med_config_path: "configs/models/bert_config.json" + + embed_dim: 256 + + # generation configs + prompt: "a picture of " + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 224 + text_processor: + train: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip_pretrain_large.yaml b/LAVIS-main/lavis/configs/models/blip_pretrain_large.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d01cbe3baf09dd118d3e127c1ce1d8e3ea2238a6 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_pretrain_large.yaml @@ -0,0 +1,22 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_pretrain + + # vit encoder + vit_type: "large" + vit_grad_ckpt: True + vit_ckpt_layer: 5 + + image_size: 224 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + embed_dim: 256 + + # generation configs + prompt: "a picture of " diff --git a/LAVIS-main/lavis/configs/models/blip_retrieval_coco.yaml b/LAVIS-main/lavis/configs/models/blip_retrieval_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30eb79028f12266224e5286e563381ba963bd756 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_retrieval_coco.yaml @@ -0,0 +1,39 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_retrieval + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + queue_size: 57600 + + # vit encoder + vit_type: "base" + vit_grad_ckpt: True + vit_ckpt_layer: 4 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config.json" + + embed_dim: 256 + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip_retrieval_flickr.yaml b/LAVIS-main/lavis/configs/models/blip_retrieval_flickr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e4bf1fbc2db796a3ce0f08dfa357fe982856d8a0 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_retrieval_flickr.yaml @@ -0,0 +1,42 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_retrieval + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + queue_size: 57600 + alpha: 0.4 + + negative_all_rank: False + + # vit encoder + vit_type: "base" + vit_grad_ckpt: True + vit_ckpt_layer: 4 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_config.json" + + embed_dim: 256 + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 384 + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/blip_vqa_aokvqa.yaml b/LAVIS-main/lavis/configs/models/blip_vqa_aokvqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b3afe3e7a2e3a55c569a8c7fce3d83d1ef3ddabe --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_vqa_aokvqa.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_vqa + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_drop_path_rate: 0.1 + + image_size: 480 + + # bert config + med_config_path: "configs/models/med_config.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 480 + eval: + name: "blip_image_eval" + image_size: 480 + text_processor: + train: + name: "blip_question" + eval: + name: "blip_question" diff --git a/LAVIS-main/lavis/configs/models/blip_vqa_okvqa.yaml b/LAVIS-main/lavis/configs/models/blip_vqa_okvqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb66ccbbf1f2faed4dfe916b042263861798d951 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_vqa_okvqa.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_vqa + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_drop_path_rate: 0.1 + + image_size: 480 + + # bert config + med_config_path: "configs/models/med_config.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 480 + eval: + name: "blip_image_eval" + image_size: 480 + text_processor: + train: + name: "blip_question" + eval: + name: "blip_question" diff --git a/LAVIS-main/lavis/configs/models/blip_vqav2.yaml b/LAVIS-main/lavis/configs/models/blip_vqav2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f0ce8daac2d23d47d342f17630ca86f7002cc50 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/blip_vqav2.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip_vqa + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + # vit encoder + vit_type: "base" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + vit_drop_path_rate: 0.1 + + image_size: 480 + + # bert config + med_config_path: "configs/models/med_config.json" + +preprocess: + vis_processor: + train: + name: "blip_image_train" + image_size: 480 + eval: + name: "blip_image_eval" + image_size: 480 + text_processor: + train: + name: "blip_question" + eval: + name: "blip_question" diff --git a/LAVIS-main/lavis/configs/models/clip/RN101-quickgelu.json b/LAVIS-main/lavis/configs/models/clip/RN101-quickgelu.json new file mode 100644 index 0000000000000000000000000000000000000000..1dbd19be9d289887b4e41bd50acdbdc78709efd3 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/RN101-quickgelu.json @@ -0,0 +1,22 @@ +{ + "embed_dim": 512, + "quick_gelu": true, + "vision_cfg": { + "image_size": 224, + "layers": [ + 3, + 4, + 23, + 3 + ], + "width": 64, + "patch_size": null + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/RN101.json b/LAVIS-main/lavis/configs/models/clip/RN101.json new file mode 100644 index 0000000000000000000000000000000000000000..bf5babbc5a3ef48653083f10a549f42afe14727a --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/RN101.json @@ -0,0 +1,21 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "image_size": 224, + "layers": [ + 3, + 4, + 23, + 3 + ], + "width": 64, + "patch_size": null + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 
512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/RN50-quickgelu.json b/LAVIS-main/lavis/configs/models/clip/RN50-quickgelu.json new file mode 100644 index 0000000000000000000000000000000000000000..8c2f91260cdeb043434dc1e893cce81d4ce7f0d1 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/RN50-quickgelu.json @@ -0,0 +1,22 @@ +{ + "embed_dim": 1024, + "quick_gelu": true, + "vision_cfg": { + "image_size": 224, + "layers": [ + 3, + 4, + 6, + 3 + ], + "width": 64, + "patch_size": null + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/RN50.json b/LAVIS-main/lavis/configs/models/clip/RN50.json new file mode 100644 index 0000000000000000000000000000000000000000..ad98b4b8822d72b5196ddafcb732329ecad2ce56 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/RN50.json @@ -0,0 +1,21 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "image_size": 224, + "layers": [ + 3, + 4, + 6, + 3 + ], + "width": 64, + "patch_size": null + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/RN50x16.json b/LAVIS-main/lavis/configs/models/clip/RN50x16.json new file mode 100644 index 0000000000000000000000000000000000000000..66576383a0cbd2ffcdd7a050e5fcbab420c7fecb --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/RN50x16.json @@ -0,0 +1,21 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 384, + "layers": [ + 6, + 8, + 18, + 8 + ], + "width": 96, + "patch_size": null + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 12, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/RN50x4.json b/LAVIS-main/lavis/configs/models/clip/RN50x4.json new file mode 100644 index 
0000000000000000000000000000000000000000..a41cb630517cc155c1ee6aa8660f6c7948f3ee4b --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/RN50x4.json @@ -0,0 +1,21 @@ +{ + "embed_dim": 640, + "vision_cfg": { + "image_size": 288, + "layers": [ + 4, + 6, + 10, + 6 + ], + "width": 80, + "patch_size": null + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 640, + "heads": 10, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-B-16-plus-240.json b/LAVIS-main/lavis/configs/models/clip/ViT-B-16-plus-240.json new file mode 100644 index 0000000000000000000000000000000000000000..9347280c60a2a19233ac027d810ded21c26ea867 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-B-16-plus-240.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 640, + "vision_cfg": { + "image_size": 240, + "layers": 12, + "width": 896, + "patch_size": 16 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 640, + "heads": 10, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-B-16-plus.json b/LAVIS-main/lavis/configs/models/clip/ViT-B-16-plus.json new file mode 100644 index 0000000000000000000000000000000000000000..f9cc3e3b0084590581d1ec3e81b930a9a190e036 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-B-16-plus.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 640, + "vision_cfg": { + "image_size": 224, + "layers": 12, + "width": 896, + "patch_size": 16 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 640, + "heads": 10, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-B-16.json b/LAVIS-main/lavis/configs/models/clip/ViT-B-16.json new file mode 100644 index 0000000000000000000000000000000000000000..9afeef0fbc807f130f2b2bc65c1dd85abc9eba72 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-B-16.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "image_size": 224, + "layers": 12, + "width": 768, + "patch_size": 16 + }, 
+ "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-B-32-plus-256.json b/LAVIS-main/lavis/configs/models/clip/ViT-B-32-plus-256.json new file mode 100644 index 0000000000000000000000000000000000000000..27ae13857a0bdf0c7825ba7768de0071bda3e82e --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-B-32-plus-256.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 640, + "vision_cfg": { + "image_size": 256, + "layers": 12, + "width": 896, + "patch_size": 32 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 640, + "heads": 10, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-B-32-quickgelu.json b/LAVIS-main/lavis/configs/models/clip/ViT-B-32-quickgelu.json new file mode 100644 index 0000000000000000000000000000000000000000..f5a063adbf96df9e169706286643ab9a261b251c --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-B-32-quickgelu.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 512, + "quick_gelu": true, + "vision_cfg": { + "image_size": 224, + "layers": 12, + "width": 768, + "patch_size": 32 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-B-32.json b/LAVIS-main/lavis/configs/models/clip/ViT-B-32.json new file mode 100644 index 0000000000000000000000000000000000000000..abd1f7973dc856ba56004ad0538f4f74f5e08a6d --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-B-32.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "image_size": 224, + "layers": 12, + "width": 768, + "patch_size": 32 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-H-14.json b/LAVIS-main/lavis/configs/models/clip/ViT-H-14.json new file mode 100644 index 
0000000000000000000000000000000000000000..d2c01733dcab1293858bf8aa200f05cdb0b6f56c --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-H-14.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "image_size": 224, + "layers": 32, + "width": 1280, + "head_width": 80, + "patch_size": 14 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 1024, + "heads": 16, + "layers": 24 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-H-16.json b/LAVIS-main/lavis/configs/models/clip/ViT-H-16.json new file mode 100644 index 0000000000000000000000000000000000000000..942ed56bf6e24a0c19a41fad87db304444402b4f --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-H-16.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "image_size": 224, + "layers": 32, + "width": 1280, + "head_width": 80, + "patch_size": 16 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 1024, + "heads": 16, + "layers": 24 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-L-14-280.json b/LAVIS-main/lavis/configs/models/clip/ViT-L-14-280.json new file mode 100644 index 0000000000000000000000000000000000000000..c8e5fbac8a14c4c66c57df166ffe5dceb188e436 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-L-14-280.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 280, + "layers": 24, + "width": 1024, + "patch_size": 14 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 12, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-L-14-336.json b/LAVIS-main/lavis/configs/models/clip/ViT-L-14-336.json new file mode 100644 index 0000000000000000000000000000000000000000..4db3a1e77c891cda4d32ea3b9da9bef2c2aade0c --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-L-14-336.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 336, + "layers": 24, + "width": 1024, + "patch_size": 
14 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 12, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-L-14.json b/LAVIS-main/lavis/configs/models/clip/ViT-L-14.json new file mode 100644 index 0000000000000000000000000000000000000000..98951b0cbff3776e90b0c2685ce4d04f1f874343 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-L-14.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 224, + "layers": 24, + "width": 1024, + "patch_size": 14 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 12, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-L-16-320.json b/LAVIS-main/lavis/configs/models/clip/ViT-L-16-320.json new file mode 100644 index 0000000000000000000000000000000000000000..cc09c4877d27597fb0f50332e7cbcf8028586ce2 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-L-16-320.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 320, + "layers": 24, + "width": 1024, + "patch_size": 16 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 12, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-L-16.json b/LAVIS-main/lavis/configs/models/clip/ViT-L-16.json new file mode 100644 index 0000000000000000000000000000000000000000..78601e7a6822382e3466c1c00459392ee7768024 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-L-16.json @@ -0,0 +1,16 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 224, + "layers": 24, + "width": 1024, + "patch_size": 16 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 12, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/ViT-g-14.json b/LAVIS-main/lavis/configs/models/clip/ViT-g-14.json new file mode 100644 index 
0000000000000000000000000000000000000000..b5c4231a67a82d1c30b675719f3004daed84299b --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/ViT-g-14.json @@ -0,0 +1,18 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "image_size": 224, + "layers": 40, + "width": 1408, + "head_width": 88, + "mlp_ratio": 4.3637, + "patch_size": 14 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 1024, + "heads": 16, + "layers": 24 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json b/LAVIS-main/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json new file mode 100644 index 0000000000000000000000000000000000000000..fa4bfb1df0240d72552e7b09dd4d17ee48a1c0e6 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "timm_model_name": "efficientnetv2_rw_s", + "timm_model_pretrained": false, + "timm_pool": "abs_attn", + "timm_proj": "", + "image_size": 288 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 768, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/timm-resnet50d.json b/LAVIS-main/lavis/configs/models/clip/timm-resnet50d.json new file mode 100644 index 0000000000000000000000000000000000000000..7bb0957cd23e3dd0fb461764c959a75e04cae743 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/timm-resnet50d.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "timm_model_name": "resnet50d", + "timm_model_pretrained": false, + "timm_pool": "abs_attn", + "timm_proj": "", + "image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/timm-resnetaa50d.json b/LAVIS-main/lavis/configs/models/clip/timm-resnetaa50d.json new file mode 100644 index 0000000000000000000000000000000000000000..c011e0c02b5d63b1ace51e4625d383adc6aedb50 --- 
/dev/null +++ b/LAVIS-main/lavis/configs/models/clip/timm-resnetaa50d.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "timm_model_name": "resnetaa50d", + "timm_model_pretrained": false, + "timm_pool": "abs_attn", + "timm_proj": "", + "image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/timm-resnetblur50.json b/LAVIS-main/lavis/configs/models/clip/timm-resnetblur50.json new file mode 100644 index 0000000000000000000000000000000000000000..05d0b209ac44198bd0b45c6931dee71eac9b1eab --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/timm-resnetblur50.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "timm_model_name": "resnetblur50", + "timm_model_pretrained": false, + "timm_pool": "abs_attn", + "timm_proj": "", + "image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json b/LAVIS-main/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json new file mode 100644 index 0000000000000000000000000000000000000000..bc08f2b78543857445d22eec7d288c5fe86391a9 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "timm_model_name": "swin_base_patch4_window7_224", + "timm_model_pretrained": false, + "timm_pool": "", + "timm_proj": "linear", + "image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/timm-vit_base_patch16_224.json b/LAVIS-main/lavis/configs/models/clip/timm-vit_base_patch16_224.json new file mode 100644 index 0000000000000000000000000000000000000000..133b88f2f919de44c19df8318c7297824accbdce --- 
/dev/null +++ b/LAVIS-main/lavis/configs/models/clip/timm-vit_base_patch16_224.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "timm_model_name": "vit_base_patch16_224", + "timm_model_pretrained": false, + "timm_pool": "", + "timm_proj": "linear", + "image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/timm-vit_base_patch32_224.json b/LAVIS-main/lavis/configs/models/clip/timm-vit_base_patch32_224.json new file mode 100644 index 0000000000000000000000000000000000000000..9dcc6ffbfda4fb9d206bb693f6c3d53f2757aff8 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/timm-vit_base_patch32_224.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "timm_model_name": "vit_base_patch32_224", + "timm_model_pretrained": false, + "timm_pool": "", + "timm_proj": "linear", + "image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip/timm-vit_small_patch16_224.json b/LAVIS-main/lavis/configs/models/clip/timm-vit_small_patch16_224.json new file mode 100644 index 0000000000000000000000000000000000000000..8c3ae01ab318ce07c19b7b6326c07aaec1f321a4 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip/timm-vit_small_patch16_224.json @@ -0,0 +1,17 @@ +{ + "embed_dim": 512, + "vision_cfg": { + "timm_model_name": "vit_small_patch16_224", + "timm_model_pretrained": false, + "timm_pool": "", + "timm_proj": "linear", + "image_size": 224 + }, + "text_cfg": { + "context_length": 77, + "vocab_size": 49408, + "width": 512, + "heads": 8, + "layers": 12 + } +} diff --git a/LAVIS-main/lavis/configs/models/clip_resnet50.yaml b/LAVIS-main/lavis/configs/models/clip_resnet50.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ce3a2d429646b4b58706715d07da0ecb6c0d767b --- /dev/null +++ 
b/LAVIS-main/lavis/configs/models/clip_resnet50.yaml @@ -0,0 +1,11 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: clip + + model_type: RN50 + + pretrained: openai diff --git a/LAVIS-main/lavis/configs/models/clip_vit_base16.yaml b/LAVIS-main/lavis/configs/models/clip_vit_base16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a06fa180993c42e63cecee38ec01134c18de7c8 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip_vit_base16.yaml @@ -0,0 +1,17 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: clip + + model_type: ViT-B-16 + + pretrained: openai + +preprocess: + vis_processor: + eval: + name: "clip_image_eval" + image_size: 224 diff --git a/LAVIS-main/lavis/configs/models/clip_vit_base32.yaml b/LAVIS-main/lavis/configs/models/clip_vit_base32.yaml new file mode 100644 index 0000000000000000000000000000000000000000..056e3d967853f5c01426514a9f98622bc92241b8 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip_vit_base32.yaml @@ -0,0 +1,52 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: clip + + model_type: ViT-B-32 +# ['RN50', +# 'RN50-quickgelu', +# 'RN50x4', +# 'RN50x16', +# 'RN101', +# 'RN101-quickgelu', +# 'timm-efficientnetv2_rw_s', +# 'timm-resnet50d', +# 'timm-resnetaa50d', +# 'timm-resnetblur50', +# 'timm-swin_base_patch4_window7_224', +# 'timm-vit_base_patch16_224', +# 'timm-vit_base_patch32_224', +# 'timm-vit_small_patch16_224', +# 'ViT-B-16', +# 'ViT-B-16-plus', +# 'ViT-B-16-plus-240', +# 'ViT-B-32', +# 'ViT-B-32-plus-256', +# 'ViT-B-32-quickgelu', +# 'ViT-g-14', +# 'ViT-H-14', +# 'ViT-H-16', +# 'ViT-L-14', +# 'ViT-L-14-280', +# 'ViT-L-14-336', +# 'ViT-L-16', +# 'ViT-L-16-320'] + + pretrained: openai + # "openai" + # following not available for all models + # "yfcc15m" + # "cc12m" + # "laion400m_e31" + # "laion400m_e32" + # "laion400m_avg" + +preprocess: + vis_processor: + eval: + name: "clip_image_eval" + image_size: 224 diff --git a/LAVIS-main/lavis/configs/models/clip_vit_large14.yaml b/LAVIS-main/lavis/configs/models/clip_vit_large14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ab9f2610f1ae9e0164f39565a8302ab33123548 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip_vit_large14.yaml @@ -0,0 +1,52 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: clip + + model_type: ViT-L-14 +# ['RN50', +# 'RN50-quickgelu', +# 'RN50x4', +# 'RN50x16', +# 'RN101', +# 'RN101-quickgelu', +# 'timm-efficientnetv2_rw_s', +# 'timm-resnet50d', +# 'timm-resnetaa50d', +# 'timm-resnetblur50', +# 'timm-swin_base_patch4_window7_224', +# 'timm-vit_base_patch16_224', +# 'timm-vit_base_patch32_224', +# 'timm-vit_small_patch16_224', +# 'ViT-B-16', +# 'ViT-B-16-plus', +# 'ViT-B-16-plus-240', +# 'ViT-B-32', +# 'ViT-B-32-plus-256', +# 'ViT-B-32-quickgelu', +# 'ViT-g-14', +# 'ViT-H-14', +# 'ViT-H-16', +# 'ViT-L-14', +# 'ViT-L-14-280', +# 'ViT-L-14-336', +# 'ViT-L-16', +# 'ViT-L-16-320'] + + pretrained: openai + # "openai" + # following not available for all models + # "yfcc15m" + # "cc12m" + # "laion400m_e31" + # "laion400m_e32" + # "laion400m_avg" + +preprocess: + vis_processor: + eval: + name: "clip_image_eval" + image_size: 224 diff --git a/LAVIS-main/lavis/configs/models/clip_vit_large14_336.yaml b/LAVIS-main/lavis/configs/models/clip_vit_large14_336.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6510d73763fd4f0e5c6512c10c5c0ad8242499b --- /dev/null +++ b/LAVIS-main/lavis/configs/models/clip_vit_large14_336.yaml @@ -0,0 +1,52 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: clip + + model_type: ViT-L-14-336 +# ['RN50', +# 'RN50-quickgelu', +# 'RN50x4', +# 'RN50x16', +# 'RN101', +# 'RN101-quickgelu', +# 'timm-efficientnetv2_rw_s', +# 'timm-resnet50d', +# 'timm-resnetaa50d', +# 'timm-resnetblur50', +# 'timm-swin_base_patch4_window7_224', +# 'timm-vit_base_patch16_224', +# 'timm-vit_base_patch32_224', +# 'timm-vit_small_patch16_224', +# 'ViT-B-16', +# 'ViT-B-16-plus', +# 'ViT-B-16-plus-240', +# 'ViT-B-32', +# 'ViT-B-32-plus-256', +# 'ViT-B-32-quickgelu', +# 'ViT-g-14', +# 'ViT-H-14', +# 'ViT-H-16', +# 'ViT-L-14', +# 'ViT-L-14-280', +# 'ViT-L-14-336', +# 'ViT-L-16', +# 'ViT-L-16-320'] + + pretrained: openai + # "openai" + # following not available for all models + # "yfcc15m" + # "cc12m" + # "laion400m_e31" + # "laion400m_e32" + # "laion400m_avg" + +preprocess: + vis_processor: + eval: + name: "clip_image_eval" + image_size: 336 diff --git a/LAVIS-main/lavis/configs/models/gpt_dialogue_base.yaml b/LAVIS-main/lavis/configs/models/gpt_dialogue_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7bbdae83fbe10b7e7d9001292eb88ba3da4e2e04 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/gpt_dialogue_base.yaml @@ -0,0 +1,25 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: gpt_dialogue + # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" + # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens + + len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 + +preprocess: + vis_processor: + train: + name: "gpt_video_ft" + eval: + name: "gpt_video_ft" + text_processor: + train: + name: "gpt_dialogue" + eval: + name: "gpt_dialogue" \ No newline at end of file diff --git a/LAVIS-main/lavis/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml b/LAVIS-main/lavis/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fac355c4312bf54d3d87057d9bc7d665f1f03a06 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml @@ -0,0 +1,58 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: img2prompt_vqa + model_type: base + + image_question_matching_model: + arch: blip_image_text_matching + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" + + # vit encoder + vit_type: "large" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + embed_dim: 256 + + image_captioning_model: + arch: blip_caption + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" + + vit_type: "large" + vit_grad_ckpt: True + vit_ckpt_layer: 5 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + # generation configs + prompt: "a picture of " + + question_generation_moodel: + pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/projects/img2prompt/T5_large_QG.pth" + + + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/med_config.json b/LAVIS-main/lavis/configs/models/med_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a566c17bbc185f5bf8b83c7ed7dcb02e1a0ba1f9 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/med_config.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + 
"pad_token_id": 0, + "add_type_embeddings": false, + "vocab_size": 30524, + "encoder_width": 768, + "add_cross_attention": true +} \ No newline at end of file diff --git a/LAVIS-main/lavis/configs/models/med_config_albef.json b/LAVIS-main/lavis/configs/models/med_config_albef.json new file mode 100644 index 0000000000000000000000000000000000000000..529636d733bf35cdb82ec4c7950ede79a5ce80fc --- /dev/null +++ b/LAVIS-main/lavis/configs/models/med_config_albef.json @@ -0,0 +1,22 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "add_type_embeddings": false, + "vocab_size": 30522, + "encoder_width": 768, + "add_cross_attention": true, + "fusion_layer": 6 +} \ No newline at end of file diff --git a/LAVIS-main/lavis/configs/models/med_large_config.json b/LAVIS-main/lavis/configs/models/med_large_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d5090b06f13c6c1e42d91e30d2cd76c2b6264d3a --- /dev/null +++ b/LAVIS-main/lavis/configs/models/med_large_config.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "add_type_embeddings": false, + "vocab_size": 30524, + "encoder_width": 1024, + "add_cross_attention": true +} \ No newline at end of file diff --git a/LAVIS-main/lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml b/LAVIS-main/lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml new 
file mode 100644 index 0000000000000000000000000000000000000000..31f43778865db534e0070249db1512f50d937238 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml @@ -0,0 +1,60 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pnp_vqa + model_type: 3b + + image_question_matching_model: + arch: blip_image_text_matching + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" + + # vit encoder + vit_type: "large" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + embed_dim: 256 + + image_captioning_model: + arch: blip_caption + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" + + vit_type: "large" + vit_grad_ckpt: True + vit_ckpt_layer: 5 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + # generation configs + prompt: "a picture of " + + question_answering_model: + arch: pnp_unifiedqav2_fid + + pretrained: "allenai/unifiedqa-v2-t5-3b-1363200" + + t5_config_path: "configs/models/pnp-vqa/unifiedqav2_3b_config.json" + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml b/LAVIS-main/lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5630578bbe24f4788396fbe40ae365580911d1aa --- /dev/null +++ b/LAVIS-main/lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml @@ -0,0 +1,59 @@ + # Copyright (c) 2022, salesforce.com, inc. 
+ # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pnp_vqa + model_type: base + + image_question_matching_model: + arch: blip_image_text_matching + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" + + # vit encoder + vit_type: "large" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + embed_dim: 256 + + image_captioning_model: + arch: blip_caption + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" + + vit_type: "large" + vit_grad_ckpt: True + vit_ckpt_layer: 5 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + # generation configs + prompt: "a picture of " + question_answering_model: + arch: pnp_unifiedqav2_fid + + pretrained: "allenai/unifiedqa-v2-t5-base-1363200" + + t5_config_path: "configs/models/pnp-vqa/unifiedqav2_base_config.json" + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml b/LAVIS-main/lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bea044c9079c33a7f7ec3a31c13f2da311d042e0 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml @@ -0,0 +1,60 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: pnp_vqa + model_type: large + + image_question_matching_model: + arch: blip_image_text_matching + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" + + # vit encoder + vit_type: "large" + vit_grad_ckpt: False + vit_ckpt_layer: 0 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + embed_dim: 256 + + image_captioning_model: + arch: blip_caption + load_finetuned: True + + finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" + + vit_type: "large" + vit_grad_ckpt: True + vit_ckpt_layer: 5 + + image_size: 384 + + # bert config + med_config_path: "configs/models/med_large_config.json" + + # generation configs + prompt: "a picture of " + + question_answering_model: + arch: pnp_unifiedqav2_fid + + pretrained: "allenai/unifiedqa-v2-t5-large-1363200" + + t5_config_path: "configs/models/pnp-vqa/unifiedqav2_large_config.json" + +preprocess: + vis_processor: + eval: + name: "blip_image_eval" + image_size: 384 + text_processor: + eval: + name: "blip_caption" diff --git a/LAVIS-main/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json b/LAVIS-main/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e5220dc592c03afd94f1a9d2077a2a87a3320856 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "T5ForConditionalGeneration" + ], + "d_ff": 16384, + "d_kv": 128, + "d_model": 1024, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "gradient_checkpointing": false, + 
"initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 24, + "num_heads": 32, + "num_layers": 24, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.21.3", + "use_cache": true, + "vocab_size": 32128 +} \ No newline at end of file diff --git a/LAVIS-main/lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json b/LAVIS-main/lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json new file mode 100644 index 0000000000000000000000000000000000000000..24ffa8d18a0f317f3c18e5c67bf97ede953d6436 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json @@ -0,0 +1,59 @@ +{ + "architectures": [ + "T5ForConditionalGeneration" + ], + "d_ff": 3072, + "d_kv": 64, + "d_model": 768, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "gradient_checkpointing": false, + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 12, + "num_heads": 12, + 
"num_layers": 12, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "transformers_version": "4.21.3", + "use_cache": true, + "vocab_size": 32128 +} \ No newline at end of file diff --git a/LAVIS-main/lavis/configs/models/pnp-vqa/unifiedqav2_large_config.json b/LAVIS-main/lavis/configs/models/pnp-vqa/unifiedqav2_large_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4f87ec69734d35cdc0d76b1b3f11f9e80df3cdc1 --- /dev/null +++ b/LAVIS-main/lavis/configs/models/pnp-vqa/unifiedqav2_large_config.json @@ -0,0 +1,59 @@ +{ + "architectures": [ + "T5ForConditionalGeneration" + ], + "d_ff": 4096, + "d_kv": 64, + "d_model": 1024, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "gradient_checkpointing": false, + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 24, + "num_heads": 16, + "num_layers": 24, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 
2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "transformers_version": "4.21.3", + "use_cache": true, + "vocab_size": 32128 +} \ No newline at end of file diff --git a/LAVIS-main/lavis/datasets/builders/__init__.py b/LAVIS-main/lavis/datasets/builders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8c77f4e402c9efa61f25a07418990962cd68bdb8 --- /dev/null +++ b/LAVIS-main/lavis/datasets/builders/__init__.py @@ -0,0 +1,279 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.datasets.builders.base_dataset_builder import load_dataset_config +from lavis.datasets.builders.caption_builder import ( + COCOCapBuilder, + MSRVTTCapBuilder, + MSVDCapBuilder, + VATEXCapBuilder, + MSRVTTCapInstructBuilder, + MSVDCapInstructBuilder, + VATEXCapInstructBuilder, + WebVid2MCapBuilder, + WebVid2MCapInstructBuilder, + VALORCaptionBuilder, + VALORCaptionInstructBuilder, + ViolinCapBuilder, + ViolinCapInstructBuilder, + VlepCaptionInstructBuilder, + VlepCaptionBuilder, + YouCookCaptionBuilder, + YouCookCaptionInstructBuilder, + COINCaptionBuilder, + COINCaptionInstructBuilder, + CharadeCaptionBuilder, + CharadeCaptionInstructBuilder, + TextCapsCapBuilder, + TextCapsCapInstructBuilder, + Flickr30kCapBuilder, + Flickr30kCapInstructBuilder + +) +from lavis.datasets.builders.image_text_pair_builder import ( + ConceptualCaption12MBuilder, + ConceptualCaption12MInstructBuilder, + ConceptualCaption3MBuilder, + ConceptualCaption3MInstructBuilder, + VGCaptionBuilder, + VGCaptionInstructBuilder, + SBUCaptionBuilder, + SBUCaptionInstructBuilder, + Laion400MBuilder, + Laion400MInstructBuilder +) +from lavis.datasets.builders.classification_builder import ( + NLVRBuilder, + SNLIVisualEntailmentBuilder, + SNLIVisualEntailmentInstructBuilder, + ViolinEntailmentInstructBuilder, + ViolinEntailmentBuilder, + ESC50ClassificationBuilder +) +from lavis.datasets.builders.imagefolder_builder import ImageNetBuilder +from lavis.datasets.builders.video_qa_builder import ( + MSRVTTQABuilder, + MSVDQABuilder, + MSRVTTQAInstructBuilder, + MSVDQAInstructBuilder, + MusicAVQABuilder, + MusicAVQAInstructBuilder +) + +from lavis.datasets.builders.vqa_builder import ( + COCOVQABuilder, + COCOVQAInstructBuilder, + OKVQABuilder, + OKVQAInstructBuilder, + AOKVQABuilder, + AOKVQAInstructBuilder, + VGVQABuilder, + 
VGVQAInstructBuilder, + GQABuilder, + GQAInstructBuilder, + IconQABuilder, + IconQAInstructBuilder, + ScienceQABuilder, + ScienceQAInstructBuilder, + OCRVQABuilder, + OCRVQAInstructBuilder, + VizWizVQABuilder +) +from lavis.datasets.builders.retrieval_builder import ( + MSRVTTRetrievalBuilder, + DiDeMoRetrievalBuilder, + COCORetrievalBuilder, + Flickr30kBuilder, +) + +from lavis.datasets.builders.audio_caption_builder import ( + AudioSetBuilder, + AudioCapsCapBuilder, + AudioSetInstructBuilder, + AudioCapsInstructCapBuilder, + WavCapsCapInstructBuilder, + WavCapsCapBuilder +) + +from lavis.datasets.builders.object3d_caption_builder import ( + ObjaverseCaptionInstructBuilder, + ShapenetCaptionInstructBuilder, + ObjaverseCaptionBuilder, + ShapenetCaptionBuilder +) +from lavis.datasets.builders.object3d_qa_builder import ObjaverseQABuilder +from lavis.datasets.builders.object3d_classification_builder import ModelNetClassificationBuilder + +from lavis.datasets.builders.audio_qa_builder import AudioCapsQABuilder, ClothoQABuilder + +from lavis.datasets.builders.dialogue_builder import ( + AVSDDialBuilder, + AVSDDialInstructBuilder, + YT8MDialBuilder, + LLaVA150kDialInstructBuilder, + VisDialBuilder, + VisDialInstructBuilder +) +from lavis.datasets.builders.text_to_image_generation_builder import BlipDiffusionFinetuneBuilder + +from lavis.datasets.builders.discrn_builders import DiscrnImagePcBuilder, DiscrnAudioVideoBuilder + +from lavis.common.registry import registry + +__all__ = [ + "BlipDiffusionFinetuneBuilder", + "COCOCapBuilder", + "COCORetrievalBuilder", + "COCOVQABuilder", + "ConceptualCaption12MBuilder", + "ConceptualCaption3MBuilder", + "DiDeMoRetrievalBuilder", + "Flickr30kBuilder", + "GQABuilder", + "ImageNetBuilder", + "MSRVTTCapBuilder", + "MSRVTTQABuilder", + "MSRVTTRetrievalBuilder", + "MSVDCapBuilder", + "MSVDQABuilder", + "NLVRBuilder", + "OKVQABuilder", + "AOKVQABuilder", + "SBUCaptionBuilder", + "SNLIVisualEntailmentBuilder", + "VATEXCapBuilder", + 
"VGCaptionBuilder", + "VGVQABuilder", + "AVSDDialBuilder", + "Laion400MBuilder", + + "ViolinCapBuilder", + "ViolinEntailmentBuilder", + "VlepCaptionBuilder", + "YouCookCaptionBuilder", + "COINCaptionBuilder", + "CharadeCaptionBuilder", + "YT8MDialBuilder", + "IconQABuilder", + "ScienceQABuilder", + "VisDialBuilder", + "OCRVQABuilder", + "VizWizVQABuilder", + "TextCapsCapBuilder", + "Flickr30kCapBuilder", + "AudioSetBuilder", + "AudioCapsCapBuilder", + "WavCapsCapBuilder", + "WebVid2MCapBuilder", + "VALORCaptionBuilder", + "ObjaverseCaptionBuilder", + "ShapenetCaptionBuilder", + "ObjaverseQABuilder", + "MusicAVQABuilder", + "ESC50ClassificationBuilder", + + ## Instruction Builders + "AOKVQAInstructBuilder", + "OKVQAInstructBuilder", + "AudioSetInstructBuilder", + "AudioCapsInstructCapBuilder", + "AudioCapsQABuilder", + "WavCapsCapInstructBuilder", + "ObjaverseCaptionInstructBuilder", + "ShapenetCaptionInstructBuilder", + "ModelNetClassificationBuilder", + "ObjaverseCaptionInstructBuilder", + "MSRVTTCapInstructBuilder", + "MSVDCapInstructBuilder", + "VATEXCapInstructBuilder", + "WebVid2MCapInstructBuilder", + "MSRVTTQAInstructBuilder", + "MSVDQAInstructBuilder", + "VALORCaptionInstructBuilder", + "AVSDDialInstructBuilder", + "VisDialInstructBuilder", + "MusicAVQAInstructBuilder", + "ViolinCapInstructBuilder", + "ViolinEntailmentInstructBuilder", + "VlepCaptionInstructBuilder", + "YouCookCaptionInstructBuilder", + "COINCaptionInstructBuilder", + "CharadeCaptionInstructBuilder", + "COCOVQAInstructBuilder", + "VGVQAInstructBuilder", + "GQAInstructBuilder", + "IconQAInstructBuilder", + "SNLIVisualEntailmentInstructBuilder", + "Laion400MInstructBuilder", + "LLaVA150kDialInstructBuilder", + "ScienceQAInstructBuilder", + "OCRVQAInstructBuilder", + "TextCapsCapInstructBuilder", + "Flickr30kCapInstructBuilder", + "ConceptualCaption12MInstructBuilder", + "ConceptualCaption3MInstructBuilder", + "VGCaptionInstructBuilder", + "SBUCaptionInstructBuilder", + "ClothoQABuilder", + + 
# DisCRN + "DiscrnImagePcBuilder", + "DiscrnAudioVideoBuilder" + +] + + +def load_dataset(name, cfg_path=None, vis_path=None, data_type=None): + """ + Example + + >>> dataset = load_dataset("coco_caption", cfg=None) + >>> splits = dataset.keys() + >>> print([len(dataset[split]) for split in splits]) + + """ + if cfg_path is None: + cfg = None + else: + cfg = load_dataset_config(cfg_path) + + try: + builder = registry.get_builder_class(name)(cfg) + except TypeError: + print( + f"Dataset {name} not found. Available datasets:\n" + + ", ".join([str(k) for k in dataset_zoo.get_names()]) + ) + exit(1) + + if vis_path is not None: + if data_type is None: + # use default data type in the config + data_type = builder.config.data_type + + assert ( + data_type in builder.config.build_info + ), f"Invalid data_type {data_type} for {name}." + + builder.config.build_info.get(data_type).storage = vis_path + + dataset = builder.build_datasets() + return dataset + + +class DatasetZoo: + def __init__(self) -> None: + self.dataset_zoo = { + k: list(v.DATASET_CONFIG_DICT.keys()) + for k, v in sorted(registry.mapping["builder_name_mapping"].items()) + } + + def get_names(self): + return list(self.dataset_zoo.keys()) + + +dataset_zoo = DatasetZoo() diff --git a/LAVIS-main/lavis/datasets/builders/audio_caption_builder.py b/LAVIS-main/lavis/datasets/builders/audio_caption_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..b4e3828e6ea1226e49d3dd8a212d32e844765416 --- /dev/null +++ b/LAVIS-main/lavis/datasets/builders/audio_caption_builder.py @@ -0,0 +1,123 @@ +""" + Copyright (c) 2023, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder + +from lavis.datasets.datasets.audio_captioning_datasets import ( + AudioSetDataset, + AudioSetEvalDataset, + AudioSetInstructDataset, + AudioCapsDataset, + AudioCapsEvalDataset, + AudioCapsInstructDataset, + ClothoV2Dataset, + ClothoV2InstructDataset, + ClothoV2EvalDataset, + AudioLanguagePretrainDataset, + AudioLanguagePretrainEvalDataset, + AudioLanguagePretrainInstructDataset +) + + +class AudioCapBuilder(MultiModalDatasetBuilder): + train_dataset_cls = AudioSetDataset + eval_dataset_cls = AudioSetEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/audioset/defaults_mm_cap.yaml", + } + + + def build(self): + datasets = super().build() + build_info = self.config.build_info + for split,ds in datasets.items(): + # TODO: add option to download templates + templates = build_info.get('templates') + if templates == None: + ds._build_templates(None) + else: + ds._build_templates(build_info.templates.storage) + return datasets + +@registry.register_builder("audioset_mm_caption") +class AudioSetBuilder(AudioCapBuilder): + train_dataset_cls = AudioSetDataset + eval_dataset_cls = AudioSetEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/audioset/defaults_mm_cap.yaml", + } + +@registry.register_builder("audioset_mm_caption_instruct") +class AudioSetInstructBuilder(AudioCapBuilder): + train_dataset_cls = AudioSetInstructDataset + eval_dataset_cls = AudioSetEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/audioset/defaults_mm_cap_instruct.yaml", + } + +@registry.register_builder("audiocaps_mm_caption") +class AudioCapsCapBuilder(AudioCapBuilder): + train_dataset_cls = AudioCapsDataset + eval_dataset_cls = 
AudioCapsEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/audiocaps/defaults_mm_cap.yaml", + } + +@registry.register_builder("audiocaps_mm_caption_instruct") +class AudioCapsInstructCapBuilder(AudioCapBuilder): + train_dataset_cls = AudioCapsInstructDataset + eval_dataset_cls = AudioCapsEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/audiocaps/defaults_mm_cap_instruct.yaml", + } + +@registry.register_builder("clothov2") +class ClothoCapInstructBuilder(MultiModalDatasetBuilder): + train_dataset_cls = ClothoV2Dataset + eval_dataset_cls = ClothoV2EvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/clotho/defaults_mm_cap.yaml", + } + +@registry.register_builder("clothov2_instruct") +class ClothoCapInstructBuilder(MultiModalDatasetBuilder): + train_dataset_cls = ClothoV2InstructDataset + eval_dataset_cls = ClothoV2EvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/clotho/defaults_mm_cap_instruct.yaml", + } + + +@registry.register_builder("wavcaps_mm_caption") +class WavCapsCapBuilder(AudioCapBuilder): + train_dataset_cls = AudioLanguagePretrainDataset + eval_dataset_cls = AudioLanguagePretrainEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/wavcaps/defaults_mm_cap.yaml", + } + + + +@registry.register_builder("wavcaps_mm_caption_instruct") +class WavCapsCapInstructBuilder(AudioCapBuilder): + train_dataset_cls = AudioLanguagePretrainInstructDataset + eval_dataset_cls = AudioLanguagePretrainEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml", + } + + diff --git a/LAVIS-main/lavis/datasets/builders/audio_qa_builder.py b/LAVIS-main/lavis/datasets/builders/audio_qa_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..8e9435a2da3eeadd74c92ab73944d46bb4b5ca19 --- /dev/null +++ b/LAVIS-main/lavis/datasets/builders/audio_qa_builder.py @@ -0,0 +1,28 @@ +""" + Copyright (c) 2023, 
salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.audio_caption_builder import AudioCapBuilder +from lavis.datasets.datasets.audio_qa_datasets import AudioCapsQADataset, ClothoQADataset + +@registry.register_builder("audiocaps_mm_qa") +class AudioCapsQABuilder(AudioCapBuilder): + train_dataset_cls = AudioCapsQADataset + eval_dataset_cls = AudioCapsQADataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/audiocaps/defaults_mm_qa.yaml", + } + +@registry.register_builder("clotho_qa") +class ClothoQABuilder(AudioCapBuilder): + train_dataset_cls = ClothoQADataset + eval_dataset_cls = ClothoQADataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/clotho/defaults_mm_qa.yaml", + } \ No newline at end of file diff --git a/LAVIS-main/lavis/datasets/builders/base_dataset_builder.py b/LAVIS-main/lavis/datasets/builders/base_dataset_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..c5b0d549a39ba0616608c6f5fc45338e6571e2ce --- /dev/null +++ b/LAVIS-main/lavis/datasets/builders/base_dataset_builder.py @@ -0,0 +1,327 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import logging +import os +import shutil +import warnings + +import lavis.common.utils as utils +import torch.distributed as dist +from lavis.common.dist_utils import is_dist_avail_and_initialized, is_main_process +from lavis.common.registry import registry +from lavis.datasets.data_utils import extract_archive +from lavis.processors.base_processor import BaseProcessor +from omegaconf import OmegaConf +from torchvision.datasets.utils import download_url + + +class BaseDatasetBuilder: + train_dataset_cls, eval_dataset_cls = None, None + + def __init__(self, cfg=None): + super().__init__() + + if cfg is None: + # help to create datasets from default config. + self.config = load_dataset_config(self.default_config_path()) + elif isinstance(cfg, str): + self.config = load_dataset_config(cfg) + else: + # when called from task.build_dataset() + self.config = cfg + + self.data_type = self.config.data_type + + self.vis_processors = {"train": BaseProcessor(), "eval": BaseProcessor()} + self.text_processors = {"train": BaseProcessor(), "eval": BaseProcessor()} + + # additional processors, each specified by a name in string. + self.kw_processors = {} + + def build_datasets(self): + # download, split, etc... + # only called on 1 GPU/TPU in distributed + + if is_main_process(): + self._download_data() + + if is_dist_avail_and_initialized(): + dist.barrier() + + # at this point, all the annotations and image/videos should be all downloaded to the specified locations. 
+ logging.info("Building datasets...") + datasets = self.build() # dataset['train'/'val'/'test'] + + return datasets + + def build_processors(self): + vis_proc_cfg = self.config.get("vis_processor") + txt_proc_cfg = self.config.get("text_processor") + + if vis_proc_cfg is not None: + vis_train_cfg = vis_proc_cfg.get("train") + vis_eval_cfg = vis_proc_cfg.get("eval") + + self.vis_processors["train"] = self._build_proc_from_cfg(vis_train_cfg) + self.vis_processors["eval"] = self._build_proc_from_cfg(vis_eval_cfg) + + if txt_proc_cfg is not None: + txt_train_cfg = txt_proc_cfg.get("train") + txt_eval_cfg = txt_proc_cfg.get("eval") + + self.text_processors["train"] = self._build_proc_from_cfg(txt_train_cfg) + self.text_processors["eval"] = self._build_proc_from_cfg(txt_eval_cfg) + + kw_proc_cfg = self.config.get("kw_processor") + if kw_proc_cfg is not None: + for name, cfg in kw_proc_cfg.items(): + self.kw_processors[name] = self._build_proc_from_cfg(cfg) + + @staticmethod + def _build_proc_from_cfg(cfg): + return ( + registry.get_processor_class(cfg.name).from_config(cfg) + if cfg is not None + else None + ) + + @classmethod + def default_config_path(cls, type="default"): + return utils.get_abs_path(cls.DATASET_CONFIG_DICT[type]) + + def _download_data(self): + self._download_ann() + self._download_vis() + + def _download_ann(self): + """ + Download annotation files if necessary. + All the vision-language datasets should have annotations of unified format. + + storage_path can be: + (1) relative/absolute: will be prefixed with env.cache_root to make full path if relative. + (2) basename/dirname: will be suffixed with base name of URL if dirname is provided. + + Local annotation paths should be relative. 
+ """ + anns = self.config.build_info.annotations + + splits = anns.keys() + + cache_root = registry.get_path("cache_root") + + for split in splits: + info = anns[split] + + urls, storage_paths = info.get("url", None), info.storage + + if isinstance(urls, str): + urls = [urls] + if isinstance(storage_paths, str): + storage_paths = [storage_paths] + + assert len(urls) == len(storage_paths) + + for url_or_filename, storage_path in zip(urls, storage_paths): + # if storage_path is relative, make it full by prefixing with cache_root. + if not os.path.isabs(storage_path): + storage_path = os.path.join(cache_root, storage_path) + + dirname = os.path.dirname(storage_path) + if not os.path.exists(dirname): + os.makedirs(dirname) + + if os.path.isfile(url_or_filename): + src, dst = url_or_filename, storage_path + if not os.path.exists(dst): + shutil.copyfile(src=src, dst=dst) + else: + logging.info("Using existing file {}.".format(dst)) + else: + if os.path.isdir(storage_path): + # if only dirname is provided, suffix with basename of URL. + raise ValueError( + "Expecting storage_path to be a file path, got directory {}".format( + storage_path + ) + ) + else: + filename = os.path.basename(storage_path) + + download_url(url=url_or_filename, root=dirname, filename=filename) + + def _download_vis(self): + + storage_path = self.config.build_info.get(self.data_type).storage + storage_path = utils.get_cache_path(storage_path) + + if not os.path.exists(storage_path): + warnings.warn( + f""" + The specified path {storage_path} for visual inputs does not exist. + Please provide a correct path to the visual inputs or + refer to datasets/download_scripts/README.md for downloading instructions. + """ + ) + + def build(self): + """ + Create by split datasets inheriting torch.utils.data.Datasets. + + # build() can be dataset-specific. Overwrite to customize. 
+ """ + self.build_processors() + + build_info = self.config.build_info + + ann_info = build_info.annotations + vis_info = build_info.get(self.data_type) + + datasets = dict() + for split in ann_info.keys(): + if split not in ["train", "val", "test"]: + continue + + is_train = split == "train" + + # processors + vis_processor = ( + self.vis_processors["train"] + if is_train + else self.vis_processors["eval"] + ) + text_processor = ( + self.text_processors["train"] + if is_train + else self.text_processors["eval"] + ) + + # annotation path + ann_paths = ann_info.get(split).storage + if isinstance(ann_paths, str): + ann_paths = [ann_paths] + + abs_ann_paths = [] + for ann_path in ann_paths: + if not os.path.isabs(ann_path): + ann_path = utils.get_cache_path(ann_path) + abs_ann_paths.append(ann_path) + ann_paths = abs_ann_paths + + # visual data storage path + vis_path = vis_info.storage + + if not os.path.isabs(vis_path): + # vis_path = os.path.join(utils.get_cache_path(), vis_path) + vis_path = utils.get_cache_path(vis_path) + + if not os.path.exists(vis_path): + warnings.warn("storage path {} does not exist.".format(vis_path)) + + # create datasets + dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls + datasets[split] = dataset_cls( + vis_processor=vis_processor, + text_processor=text_processor, + ann_paths=ann_paths, + vis_root=vis_path, + ) + + return datasets + + +class MultiModalDatasetBuilder(BaseDatasetBuilder): + """ + MultiModalDatasetBuilder is a utility class designed to construct datasets + suitable for multi-modal tasks. This class simplifies the creation of + datasets that incorporate data of multiple modalities, such as text, + images, video, or audio. 
+ """ + train_dataset_cls, eval_dataset_cls = None, None + + def __init__(self, cfg=None): + super().__init__(cfg) + if isinstance(self.data_type, str): + self.data_type = [self.data_type] + + def _build_processor(self, cfg_name): + cfg = self.config.get(cfg_name) + return { + split: self._build_proc_from_cfg(cfg.get(split)) + if cfg is not None + else None + for split in ['train', 'eval'] + } + + def build_processors(self): + self.text_processors = self._build_processor("text_processor") + + self.processors = { + split: { + modality: self._build_proc_from_cfg( + self.config.get(f"{'vis' if 'image' in modality else modality}_processor").get(split) + ) + for modality in self.data_type + } + for split in ['train', 'eval'] + } + + def _download_multimodal(self, modality): + storage_path = utils.get_cache_path(self.config.build_info.get(modality).storage) + if not os.path.exists(storage_path): + warnings.warn(f"The specified path {storage_path} for {modality} inputs does not exist.") + + def _download_data(self): + self._download_ann() + for modality in self.data_type: + self._download_multimodal(modality) + + def _get_absolute_path(self, path): + if not os.path.isabs(path): + return utils.get_cache_path(path) + return path + + def build(self): + self.build_processors() + build_info = self.config.build_info + datasets = {} + + for split, info in build_info.annotations.items(): + if split not in ["train", "val", "test"]: + continue + + is_train = split == "train" + dataset_args = self._get_dataset_args(info, is_train) + + dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls + datasets[split] = dataset_cls(**dataset_args) + + return datasets + + def _get_dataset_args(self, info, is_train): + dataset_args = dict(self.config.build_info.get('kwargs', {})) + + for modality in self.data_type: + proc_name = f"{'vis' if 'image' in modality else modality}_processor" + dataset_args[proc_name] = self.processors["train" if is_train else "eval"][modality] + 
mm_path = self._get_absolute_path(self.config.build_info.get(modality).storage) + dataset_args[f"{'vis' if 'image' in modality else modality}_root"] = mm_path + + dataset_args['text_processor'] = self.text_processors["train" if is_train else "eval"] + dataset_args["ann_paths"] = [self._get_absolute_path(path) for path in info.storage] + dataset_args['modalities'] = self.data_type + + # Conform to base + for key in ['vis_processor', 'vis_root', 'test_processor']: + dataset_args.setdefault(key, None) + + return dataset_args + +def load_dataset_config(cfg_path): + cfg = OmegaConf.load(cfg_path).datasets + return next(iter(cfg.values())) \ No newline at end of file diff --git a/LAVIS-main/lavis/datasets/builders/caption_builder.py b/LAVIS-main/lavis/datasets/builders/caption_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..c3d9663858be9c34d894116422dff4e3dc19b967 --- /dev/null +++ b/LAVIS-main/lavis/datasets/builders/caption_builder.py @@ -0,0 +1,321 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder +from lavis.datasets.datasets.capfilt_dataset import CapFiltCaptionInstructDataset, CapFiltCaptionDataset +from lavis.datasets.datasets.coco_caption_datasets import ( + COCOCapDataset, + COCOCapInstructDataset, + COCOCapEvalDataset, + NoCapsEvalDataset, +) + +from lavis.common.registry import registry +from lavis.datasets.datasets.video_caption_datasets import ( + VideoCaptionDataset, + VideoCaptionEvalDataset, + ClipCaptionDataset, + ClipCaptionInstructDataset, + ClipCaptionEvalDataset, + VideoCaptionInstructDataset, + WebVideoCaptionDataset, + WebVideoCaptionInstructDataset, +) +from lavis.datasets.datasets.violin_dataset import ( + ViolinVideoCaptionDataset, + ViolinVideoCaptionInstructDataset, + ViolinVideoCaptionEvalDataset +) +from lavis.datasets.datasets.valor_caption import VALORCaptionInstuctDataset, VALORCaptionEvalDataset, VALORCaptionDataset +from lavis.datasets.datasets.vatex_captioning_datasets import VATEXCaptionInstuctDataset, VATEXCaptionEvalDataset, VATEXCaptionDataset +from lavis.datasets.datasets.vlep_dataset import VlepVideoDataset, VlepVideoInstructDataset, VlepVideoEvalDataset +from lavis.datasets.datasets.vsr_datasets import VSRCaptionDataset, VSRCaptionInstructDataset, VSRCaptionEvalDataset +from lavis.datasets.datasets.textcaps_datasets import TextCapsCapDataset, TextCapsCapInstructDataset, TextCapsCapEvalDataset + +@registry.register_builder("coco_caption") +class COCOCapBuilder(BaseDatasetBuilder): + train_dataset_cls = COCOCapDataset + eval_dataset_cls = COCOCapEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco/defaults_cap.yaml", + } + +@registry.register_builder("coco_caption_instruct") +class COCOCapInstructBuilder(BaseDatasetBuilder): + 
train_dataset_cls = COCOCapInstructDataset + eval_dataset_cls = COCOCapEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco/defaults_cap_instruct.yaml", + } + + +@registry.register_builder("flickr30k_caption") +class Flickr30kCapBuilder(BaseDatasetBuilder): + train_dataset_cls = COCOCapDataset + eval_dataset_cls = COCOCapEvalDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/flickr30k/defaults_cap.yaml", + } + +@registry.register_builder("flickr30k_caption_instruct") +class Flickr30kCapInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = COCOCapInstructDataset + eval_dataset_cls = COCOCapEvalDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/flickr30k/defaults_cap_instuct.yaml", + } + +@registry.register_builder("nocaps") +class COCOCapBuilder(BaseDatasetBuilder): + eval_dataset_cls = NoCapsEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/nocaps/defaults.yaml", + } + +@registry.register_builder("vsr_caption") +class VSRCapBuilder(BaseDatasetBuilder): + train_dataset_cls = VSRCaptionDataset + eval_dataset_cls = VSRCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/vsr/defaults.yaml", + } + +@registry.register_builder("vsr_caption_instruct") +class VSRCapInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = VSRCaptionInstructDataset + eval_dataset_cls = VSRCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/vsr/defaults.yaml", + } + +@registry.register_builder("textcaps_caption") +class TextCapsCapBuilder(BaseDatasetBuilder): + train_dataset_cls = TextCapsCapDataset + eval_dataset_cls = TextCapsCapEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/textcaps/defaults.yaml", + } + +@registry.register_builder("textcaps_caption_instruct") +class TextCapsCapInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = TextCapsCapInstructDataset + eval_dataset_cls = TextCapsCapEvalDataset + + DATASET_CONFIG_DICT = { + 
"default": "configs/datasets/textcaps/defaults_instruct.yaml", + } + + +@registry.register_builder("capfilt14m") +class CapFiltCapBuilder(BaseDatasetBuilder): + train_dataset_cls = CapFiltCaptionDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/capfilt14m/defaults_cap.yaml", + } + +@registry.register_builder("capfilt14m_instruct") +class CapFiltCapBuilder(BaseDatasetBuilder): + train_dataset_cls = CapFiltCaptionInstructDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/capfilt14m/defaults_cap_instruct.yaml", + } + + +@registry.register_builder("msrvtt_caption") +class MSRVTTCapBuilder(BaseDatasetBuilder): + train_dataset_cls = VideoCaptionDataset + eval_dataset_cls = VideoCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msrvtt/defaults_cap.yaml", + } + + +@registry.register_builder("msvd_caption") +class MSVDCapBuilder(BaseDatasetBuilder): + train_dataset_cls = VideoCaptionDataset + eval_dataset_cls = VideoCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msvd/defaults_cap.yaml", + } + + +@registry.register_builder("vatex_caption") +class VATEXCapBuilder(MultiModalDatasetBuilder): + train_dataset_cls = VATEXCaptionDataset + eval_dataset_cls = VATEXCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/vatex/defaults_cap.yaml", + } + +@registry.register_builder("msrvtt_caption_instruct") +class MSRVTTCapInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = VideoCaptionInstructDataset + eval_dataset_cls = VideoCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msrvtt/defaults_cap_instruct.yaml", + } + +@registry.register_builder("msvd_caption_instruct") +class MSVDCapInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = VideoCaptionInstructDataset + eval_dataset_cls = VideoCaptionEvalDataset + + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msvd/defaults_cap_instruct.yaml", + } + + 
# ---------------------------------------------------------------------------
# Video-captioning dataset builders.
#
# Each builder only wires a registry key to (a) the dataset classes used for
# the train/eval splits and (b) the default YAML config describing where the
# data lives; all loading logic is in the dataset classes and the
# (Base|MultiModal)DatasetBuilder base classes.
# ---------------------------------------------------------------------------


@registry.register_builder("vatex_caption_instruct")
class VATEXCapInstructBuilder(MultiModalDatasetBuilder):
    """VATEX captioning, instruction-tuned variant."""

    # NOTE(review): "Instuct" spelling mirrors the imported class name.
    train_dataset_cls = VATEXCaptionInstuctDataset
    eval_dataset_cls = VATEXCaptionEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/vatex/defaults_cap_instruct.yaml",
    }


@registry.register_builder("webvid2m_caption")
class WebVid2MCapBuilder(BaseDatasetBuilder):
    """WebVid-2M video captioning (only a train dataset class is declared)."""

    train_dataset_cls = WebVideoCaptionDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/webvid/defaults_cap.yaml",
    }


@registry.register_builder("webvid2m_caption_instruct")
class WebVid2MCapInstructBuilder(BaseDatasetBuilder):
    """WebVid-2M captioning, instruction-tuned variant (train only)."""

    train_dataset_cls = WebVideoCaptionInstructDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/webvid/defaults_cap_instruct.yaml",
    }


@registry.register_builder("violin_caption")
class ViolinCapBuilder(BaseDatasetBuilder):
    """VIOLIN video captioning."""

    train_dataset_cls = ViolinVideoCaptionDataset
    eval_dataset_cls = ViolinVideoCaptionEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/violin/defaults_cap.yaml",
    }


@registry.register_builder("violin_caption_instruct")
class ViolinCapInstructBuilder(BaseDatasetBuilder):
    """VIOLIN captioning, instruction-tuned variant."""

    train_dataset_cls = ViolinVideoCaptionInstructDataset
    eval_dataset_cls = ViolinVideoCaptionEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/violin/defaults_cap_instruct.yaml",
    }


@registry.register_builder("valor_mm_caption")
class VALORCaptionBuilder(MultiModalDatasetBuilder):
    """VALOR multimodal captioning."""

    train_dataset_cls = VALORCaptionDataset
    eval_dataset_cls = VALORCaptionEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/valor/defaults_mm_cap.yaml"
    }


@registry.register_builder("valor_mm_caption_instruct")
class VALORCaptionInstructBuilder(MultiModalDatasetBuilder):
    """VALOR multimodal captioning, instruction-tuned variant."""

    # NOTE(review): "Instuct" spelling mirrors the imported class name.
    train_dataset_cls = VALORCaptionInstuctDataset
    eval_dataset_cls = VALORCaptionEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/valor/defaults_mm_cap_instruct.yaml"
    }


@registry.register_builder("vlep_caption")
class VlepCaptionBuilder(BaseDatasetBuilder):
    """VLEP video captioning."""

    train_dataset_cls = VlepVideoDataset
    eval_dataset_cls = VlepVideoEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/vlep/defaults_cap.yaml"
    }


@registry.register_builder("vlep_caption_instruct")
class VlepCaptionInstructBuilder(BaseDatasetBuilder):
    """VLEP captioning, instruction-tuned variant."""

    train_dataset_cls = VlepVideoInstructDataset
    eval_dataset_cls = VlepVideoEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/vlep/defaults_cap_instruct.yaml"
    }


# YouCook / COIN / Charade all reuse the generic ClipCaption* dataset
# classes; only the default config path differs between them.

@registry.register_builder("youcook_caption")
class YouCookCaptionBuilder(BaseDatasetBuilder):
    """YouCook clip-level captioning."""

    train_dataset_cls = ClipCaptionDataset
    eval_dataset_cls = ClipCaptionEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/youcook/defaults_cap.yaml",
    }


@registry.register_builder("youcook_caption_instruct")
class YouCookCaptionInstructBuilder(BaseDatasetBuilder):
    """YouCook captioning, instruction-tuned variant."""

    train_dataset_cls = ClipCaptionInstructDataset
    eval_dataset_cls = ClipCaptionEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/youcook/defaults_cap_instruct.yaml",
    }


@registry.register_builder("coin_caption")
class COINCaptionBuilder(BaseDatasetBuilder):
    """COIN clip-level captioning."""

    train_dataset_cls = ClipCaptionDataset
    eval_dataset_cls = ClipCaptionEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coin/defaults_cap.yaml",
    }


@registry.register_builder("coin_caption_instruct")
class COINCaptionInstructBuilder(BaseDatasetBuilder):
    """COIN captioning, instruction-tuned variant."""

    train_dataset_cls = ClipCaptionInstructDataset
    eval_dataset_cls = ClipCaptionEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coin/defaults_cap_instruct.yaml",
    }


@registry.register_builder("charade_caption")
class CharadeCaptionBuilder(BaseDatasetBuilder):
    """Charades clip-level captioning."""

    train_dataset_cls = ClipCaptionDataset
    eval_dataset_cls = ClipCaptionEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/charade/defaults_cap.yaml",
    }
@registry.register_builder("charade_caption_instruct")
class CharadeCaptionInstructBuilder(BaseDatasetBuilder):
    """Charades captioning, instruction-tuned variant."""

    train_dataset_cls = ClipCaptionInstructDataset
    eval_dataset_cls = ClipCaptionEvalDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/charade/defaults_cap_instruct.yaml",
    }
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from lavis.common.registry import registry
from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder
from lavis.datasets.datasets.nlvr_datasets import NLVRDataset, NLVREvalDataset
from lavis.datasets.datasets.snli_ve_datasets import SNLIVisualEntialmentDataset, SNLIVisualEntialmentInstructDataset
from lavis.datasets.datasets.violin_dataset import ViolinVideoEntailmentDataset, ViolinVideoEntailmentInstructDataset
from lavis.datasets.datasets.vsr_datasets import VSRClassificationDataset, VSRClassificationInstructDataset
from lavis.datasets.datasets.audio_classification_datasets import ESC50


# Classification / entailment builders: each one maps a registry key to the
# dataset classes for its splits and a default YAML config path.


@registry.register_builder("violin_entailment")
class ViolinEntailmentBuilder(BaseDatasetBuilder):
    """VIOLIN video-text entailment."""

    # NOTE(review): eval reuses the train dataset class (no dedicated eval
    # class is imported for this dataset) — confirm intended.
    train_dataset_cls = ViolinVideoEntailmentDataset
    eval_dataset_cls = ViolinVideoEntailmentDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/violin/defaults_entail.yaml",
    }


@registry.register_builder("violin_entailment_instruct")
class ViolinEntailmentInstructBuilder(BaseDatasetBuilder):
    """VIOLIN entailment, instruction-tuned variant."""

    train_dataset_cls = ViolinVideoEntailmentInstructDataset
    eval_dataset_cls = ViolinVideoEntailmentInstructDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/violin/defaults_entail_instruct.yaml",
    }


@registry.register_builder("nlvr")
class NLVRBuilder(BaseDatasetBuilder):
    """NLVR2 two-image reasoning classification."""

    train_dataset_cls = NLVRDataset
    eval_dataset_cls = NLVREvalDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/nlvr/defaults.yaml"}


@registry.register_builder("snli_ve")
class SNLIVisualEntailmentBuilder(BaseDatasetBuilder):
    """SNLI-VE visual entailment."""

    # NOTE(review): "Entialment" spelling mirrors the imported class names.
    train_dataset_cls = SNLIVisualEntialmentDataset
    eval_dataset_cls = SNLIVisualEntialmentDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults.yaml"}


@registry.register_builder("snli_ve_instruct")
class SNLIVisualEntailmentInstructBuilder(BaseDatasetBuilder):
    """SNLI-VE visual entailment, instruction-tuned variant."""

    train_dataset_cls = SNLIVisualEntialmentInstructDataset
    eval_dataset_cls = SNLIVisualEntialmentInstructDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults_instruct.yaml"}


@registry.register_builder("vsr_classification")
class VSRClassificationBuilder(BaseDatasetBuilder):
    """VSR (visual spatial reasoning) classification."""

    train_dataset_cls = VSRClassificationDataset
    eval_dataset_cls = VSRClassificationDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/vsr/defaults_classification.yaml"}


@registry.register_builder("vsr_classification_instruct")
class VSRClassificationInstructBuilder(BaseDatasetBuilder):
    """VSR classification, instruction-tuned variant.

    Fixed: this class was previously (mis)named
    ``SNLIVisualEntailmentInstructBuilder`` — a copy/paste duplicate that
    shadowed the real SNLI-VE instruct builder class in the module
    namespace.  The registry key is unchanged, so no caller is affected.
    """

    train_dataset_cls = VSRClassificationInstructDataset
    eval_dataset_cls = VSRClassificationInstructDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/vsr/defaults_classification_instruct.yaml"}


@registry.register_builder("esc50_cls")
class ESC50ClassificationBuilder(MultiModalDatasetBuilder):
    """ESC-50 audio classification (evaluation only: no train class)."""

    eval_dataset_cls = ESC50

    DATASET_CONFIG_DICT = {"default": "configs/datasets/esc50/defaults_mm_cls.yaml"}
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from lavis.common.registry import registry
from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder
from lavis.datasets.datasets.avsd_dialogue_datasets import (
    AVSDDialDataset,
    AVSDDialEvalDataset,
    AVSDDialInstructEvalDataset
)
from lavis.datasets.datasets.visdial_dialogue_datasets import (
    VisDialDataset,
    VisDialInstructDataset,
    VisDialEvalDataset,
)

from lavis.datasets.datasets.yt8m_video_dialogue_datasets import YT8MDialDataset
from lavis.datasets.datasets.llava150k_dataset import LLaVA150kInstructDataset


# Dialogue dataset builders: each maps a registry key to dataset classes and
# a default YAML config path; loading logic lives in the dataset classes.


@registry.register_builder("avsd_dialogue")
class AVSDDialBuilder(BaseDatasetBuilder):
    """AVSD audio-visual scene-aware dialogue."""

    train_dataset_cls = AVSDDialDataset
    eval_dataset_cls = AVSDDialEvalDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_dial.yaml"}


@registry.register_builder("visdial")
class VisDialBuilder(BaseDatasetBuilder):
    """VisDial visual dialogue."""

    train_dataset_cls = VisDialDataset
    eval_dataset_cls = VisDialEvalDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/visdial/defaults_dial.yaml"}


@registry.register_builder("visdial_instruct")
class VisDialInstructBuilder(BaseDatasetBuilder):
    """VisDial, instruction-tuned variant (same eval dataset as `visdial`)."""

    train_dataset_cls = VisDialInstructDataset
    eval_dataset_cls = VisDialEvalDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/visdial/defaults_dial_instruct.yaml"}


@registry.register_builder("avsd_mm_dialogue_instruct")
class AVSDDialInstructBuilder(MultiModalDatasetBuilder):
    """AVSD multimodal dialogue, instruction-tuned variant."""

    # NOTE(review): the *Eval* dataset class is used for both train and
    # eval splits here — confirm that is intended and not a typo.
    train_dataset_cls = AVSDDialInstructEvalDataset
    eval_dataset_cls = AVSDDialInstructEvalDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_mm_dial_instruct.yaml"}


@registry.register_builder("llava150k_dialogue_instruct")
class LLaVA150kDialInstructBuilder(BaseDatasetBuilder):
    """LLaVA-150k instruction dialogue (same class for train and eval)."""

    train_dataset_cls = LLaVA150kInstructDataset
    eval_dataset_cls = LLaVA150kInstructDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/llava150k/defaults_dial.yaml"}


@registry.register_builder("yt8m_mm_dialogue")
class YT8MDialBuilder(MultiModalDatasetBuilder):
    """YT8M multimodal video dialogue (same class for train and eval)."""

    train_dataset_cls = YT8MDialDataset
    eval_dataset_cls = YT8MDialDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/yt8m/defaults_mm_dial.yaml"}
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from lavis.common.registry import registry
# NOTE(review): BaseDatasetBuilder appears unused in this module.
from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder
from lavis.datasets.datasets.discriminatory_reasoning_datasets import DisCRnDataset


# DisCRn (discriminatory reasoning) builders.  Both are evaluation-only:
# no train_dataset_cls is declared, only an eval dataset class.


@registry.register_builder("image_pc_discrn")
class DiscrnImagePcBuilder(MultiModalDatasetBuilder):
    """DisCRn over image / point-cloud inputs (eval only)."""

    eval_dataset_cls = DisCRnDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/discriminatory_reasoning/defaults_mm_image_pc.yaml",
    }


@registry.register_builder("audio_video_discrn")
class DiscrnAudioVideoBuilder(MultiModalDatasetBuilder):
    """DisCRn over audio / video inputs (eval only)."""

    eval_dataset_cls = DisCRnDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/discriminatory_reasoning/defaults_mm_audio_video.yaml",
    }
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

# NOTE(review): `os` appears unused in this module.
import os
from lavis.common.registry import registry

from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
from lavis.datasets.datasets.image_text_pair_datasets import ImageTextPairDataset, ImageTextPairInstructDataset
from lavis.datasets.datasets.laion_dataset import LaionDataset, LaionInstructDataset


# Image-text pair builders: web-scale caption datasets.  Most are train-only
# (no eval_dataset_cls); each maps a registry key to a dataset class and a
# default YAML config path.

@registry.register_builder("conceptual_caption_3m")
class ConceptualCaption3MBuilder(BaseDatasetBuilder):
    """Conceptual Captions 3M (train only)."""

    train_dataset_cls = ImageTextPairDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/conceptual_caption/defaults_3m.yaml"
    }


@registry.register_builder("conceptual_caption_3m_instruct")
class ConceptualCaption3MInstructBuilder(BaseDatasetBuilder):
    """Conceptual Captions 3M, instruction-tuned variant (train only)."""

    train_dataset_cls = ImageTextPairInstructDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/conceptual_caption/defaults_3m_instruct.yaml"
    }


@registry.register_builder("conceptual_caption_12m")
class ConceptualCaption12MBuilder(BaseDatasetBuilder):
    """Conceptual Captions 12M (train only)."""

    train_dataset_cls = ImageTextPairDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/conceptual_caption/defaults_12m.yaml"
    }


@registry.register_builder("conceptual_caption_12m_instruct")
class ConceptualCaption12MInstructBuilder(BaseDatasetBuilder):
    """Conceptual Captions 12M, instruction-tuned variant (train only)."""

    train_dataset_cls = ImageTextPairInstructDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/conceptual_caption/defaults_12m_instruct.yaml"
    }


@registry.register_builder("sbu_caption")
class SBUCaptionBuilder(BaseDatasetBuilder):
    """SBU captions (train only)."""

    train_dataset_cls = ImageTextPairDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/sbu_caption/defaults.yaml"}


@registry.register_builder("sbu_caption_instruct")
class SBUCaptionInstructBuilder(BaseDatasetBuilder):
    """SBU captions, instruction-tuned variant (train only)."""

    train_dataset_cls = ImageTextPairInstructDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/sbu_caption/defaults_instruct.yaml"}


@registry.register_builder("vg_caption")
class VGCaptionBuilder(BaseDatasetBuilder):
    """Visual Genome captions (train only)."""

    train_dataset_cls = ImageTextPairDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/vg/defaults_caption.yaml"}


@registry.register_builder("vg_caption_instruct")
class VGCaptionInstructBuilder(BaseDatasetBuilder):
    """Visual Genome captions, instruction-tuned variant (train only)."""

    train_dataset_cls = ImageTextPairInstructDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/vg/defaults_caption_instruct.yaml"}


@registry.register_builder("laion2B_multi")
class Laion2BMultiBuilder(BaseDatasetBuilder):
    """LAION-2B (multilingual) webdataset builder.

    Overrides the generic build: the webdataset location from
    ``build_info.storage`` is handed straight to the dataset class, so no
    annotation/vision download step is performed.
    """

    train_dataset_cls = LaionDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/laion/defaults_2B_multi.yaml"}

    def _download_ann(self):
        # No-op: no annotation files to fetch for the webdataset format.
        pass

    def _download_vis(self):
        # No-op: images live inside the webdataset shards.
        pass

    def build(self):
        """Build and return ``{"train": <wds.DataPipeline>}``."""
        self.build_processors()

        build_info = self.config.build_info

        datasets = dict()
        split = "train"  # laion dataset only has train split

        # create datasets
        # [NOTE] return inner_datasets (wds.DataPipeline)
        dataset_cls = self.train_dataset_cls
        datasets[split] = dataset_cls(
            vis_processor=self.vis_processors[split],
            text_processor=self.text_processors[split],
            location=build_info.storage,
        ).inner_dataset

        return datasets


@registry.register_builder("laion400M")
class Laion400MBuilder(Laion2BMultiBuilder):
    """LAION-400M; inherits the webdataset build from Laion2BMultiBuilder."""

    train_dataset_cls = LaionDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/laion/defaults_400M.yaml"}


@registry.register_builder("laion400M_instruct")
class Laion400MInstructBuilder(Laion2BMultiBuilder):
    """LAION-400M, instruction-tuned variant."""

    train_dataset_cls = LaionInstructDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/laion/defaults_400M_instruct.yaml"}
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import os

from lavis.common.registry import registry
from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
from lavis.datasets.datasets.imagefolder_dataset import ImageFolderDataset


@registry.register_builder("imagenet")
class ImageNetBuilder(BaseDatasetBuilder):
    """ImageNet classification built from an image-folder directory layout.

    Each requested split (``train`` / ``val``) is expected to live under
    ``build_info.<data_type>.storage/<split>`` and is wrapped in an
    ``ImageFolderDataset`` together with the human-readable class names.
    """

    train_dataset_cls = ImageFolderDataset
    eval_dataset_cls = ImageFolderDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/imagenet/defaults.yaml"}

    def _download_ann(self):
        # No-op: there are no annotation files — labels come from the
        # image-folder directory structure itself.
        pass

    def build(self):
        """Build and return ``{split: ImageFolderDataset}`` for each
        configured split.

        Raises:
            AssertionError: if a configured split is not 'train' or 'val'.
        """
        self.build_processors()

        build_info = self.config.build_info
        vis_info = build_info.get(self.data_type)

        datasets = dict()
        for split in build_info.splits:
            # Fixed: the old message advertised 'test' (which is rejected
            # by the check) and contained a bare '{}' that was never
            # interpolated with the offending split name.
            assert split in (
                "train",
                "val",
            ), f"Invalid split name {split}, must be one of 'train' or 'val'."

            is_train = split == "train"

            vis_processor = (
                self.vis_processors["train"]
                if is_train
                else self.vis_processors["eval"]
            )

            # Images for a split sit in a subdirectory named after the split.
            vis_path = os.path.join(vis_info.storage, split)

            dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls
            datasets[split] = dataset_cls(
                vis_processor=vis_processor,
                vis_root=vis_path,
                classnames=imagenet_classnames,
            )

        return datasets
grouse", + "prairie grouse", + "peafowl", + "quail", + "partridge", + "african grey parrot", + "macaw", + "sulphur-crested cockatoo", + "lorikeet", + "coucal", + "bee eater", + "hornbill", + "hummingbird", + "jacamar", + "toucan", + "duck", + "red-breasted merganser", + "goose", + "black swan", + "tusker", + "echidna", + "platypus", + "wallaby", + "koala", + "wombat", + "jellyfish", + "sea anemone", + "brain coral", + "flatworm", + "nematode", + "conch", + "snail", + "slug", + "sea slug", + "chiton", + "chambered nautilus", + "Dungeness crab", + "rock crab", + "fiddler crab", + "red king crab", + "American lobster", + "spiny lobster", + "crayfish", + "hermit crab", + "isopod", + "white stork", + "black stork", + "spoonbill", + "flamingo", + "little blue heron", + "great egret", + "bittern bird", + "crane bird", + "limpkin", + "common gallinule", + "American coot", + "bustard", + "ruddy turnstone", + "dunlin", + "common redshank", + "dowitcher", + "oystercatcher", + "pelican", + "king penguin", + "albatross", + "grey whale", + "killer whale", + "dugong", + "sea lion", + "Chihuahua", + "Japanese Chin", + "Maltese", + "Pekingese", + "Shih Tzu", + "King Charles Spaniel", + "Papillon", + "toy terrier", + "Rhodesian Ridgeback", + "Afghan Hound", + "Basset Hound", + "Beagle", + "Bloodhound", + "Bluetick Coonhound", + "Black and Tan Coonhound", + "Treeing Walker Coonhound", + "English foxhound", + "Redbone Coonhound", + "borzoi", + "Irish Wolfhound", + "Italian Greyhound", + "Whippet", + "Ibizan Hound", + "Norwegian Elkhound", + "Otterhound", + "Saluki", + "Scottish Deerhound", + "Weimaraner", + "Staffordshire Bull Terrier", + "American Staffordshire Terrier", + "Bedlington Terrier", + "Border Terrier", + "Kerry Blue Terrier", + "Irish Terrier", + "Norfolk Terrier", + "Norwich Terrier", + "Yorkshire Terrier", + "Wire Fox Terrier", + "Lakeland Terrier", + "Sealyham Terrier", + "Airedale Terrier", + "Cairn Terrier", + "Australian Terrier", + "Dandie Dinmont Terrier", + 
"Boston Terrier", + "Miniature Schnauzer", + "Giant Schnauzer", + "Standard Schnauzer", + "Scottish Terrier", + "Tibetan Terrier", + "Australian Silky Terrier", + "Soft-coated Wheaten Terrier", + "West Highland White Terrier", + "Lhasa Apso", + "Flat-Coated Retriever", + "Curly-coated Retriever", + "Golden Retriever", + "Labrador Retriever", + "Chesapeake Bay Retriever", + "German Shorthaired Pointer", + "Vizsla", + "English Setter", + "Irish Setter", + "Gordon Setter", + "Brittany dog", + "Clumber Spaniel", + "English Springer Spaniel", + "Welsh Springer Spaniel", + "Cocker Spaniel", + "Sussex Spaniel", + "Irish Water Spaniel", + "Kuvasz", + "Schipperke", + "Groenendael dog", + "Malinois", + "Briard", + "Australian Kelpie", + "Komondor", + "Old English Sheepdog", + "Shetland Sheepdog", + "collie", + "Border Collie", + "Bouvier des Flandres dog", + "Rottweiler", + "German Shepherd Dog", + "Dobermann", + "Miniature Pinscher", + "Greater Swiss Mountain Dog", + "Bernese Mountain Dog", + "Appenzeller Sennenhund", + "Entlebucher Sennenhund", + "Boxer", + "Bullmastiff", + "Tibetan Mastiff", + "French Bulldog", + "Great Dane", + "St. 
Bernard", + "husky", + "Alaskan Malamute", + "Siberian Husky", + "Dalmatian", + "Affenpinscher", + "Basenji", + "pug", + "Leonberger", + "Newfoundland dog", + "Great Pyrenees dog", + "Samoyed", + "Pomeranian", + "Chow Chow", + "Keeshond", + "brussels griffon", + "Pembroke Welsh Corgi", + "Cardigan Welsh Corgi", + "Toy Poodle", + "Miniature Poodle", + "Standard Poodle", + "Mexican hairless dog (xoloitzcuintli)", + "grey wolf", + "Alaskan tundra wolf", + "red wolf or maned wolf", + "coyote", + "dingo", + "dhole", + "African wild dog", + "hyena", + "red fox", + "kit fox", + "Arctic fox", + "grey fox", + "tabby cat", + "tiger cat", + "Persian cat", + "Siamese cat", + "Egyptian Mau", + "cougar", + "lynx", + "leopard", + "snow leopard", + "jaguar", + "lion", + "tiger", + "cheetah", + "brown bear", + "American black bear", + "polar bear", + "sloth bear", + "mongoose", + "meerkat", + "tiger beetle", + "ladybug", + "ground beetle", + "longhorn beetle", + "leaf beetle", + "dung beetle", + "rhinoceros beetle", + "weevil", + "fly", + "bee", + "ant", + "grasshopper", + "cricket insect", + "stick insect", + "cockroach", + "praying mantis", + "cicada", + "leafhopper", + "lacewing", + "dragonfly", + "damselfly", + "red admiral butterfly", + "ringlet butterfly", + "monarch butterfly", + "small white butterfly", + "sulphur butterfly", + "gossamer-winged butterfly", + "starfish", + "sea urchin", + "sea cucumber", + "cottontail rabbit", + "hare", + "Angora rabbit", + "hamster", + "porcupine", + "fox squirrel", + "marmot", + "beaver", + "guinea pig", + "common sorrel horse", + "zebra", + "pig", + "wild boar", + "warthog", + "hippopotamus", + "ox", + "water buffalo", + "bison", + "ram (adult male sheep)", + "bighorn sheep", + "Alpine ibex", + "hartebeest", + "impala (antelope)", + "gazelle", + "arabian camel", + "llama", + "weasel", + "mink", + "European polecat", + "black-footed ferret", + "otter", + "skunk", + "badger", + "armadillo", + "three-toed sloth", + "orangutan", + "gorilla", 
+ "chimpanzee", + "gibbon", + "siamang", + "guenon", + "patas monkey", + "baboon", + "macaque", + "langur", + "black-and-white colobus", + "proboscis monkey", + "marmoset", + "white-headed capuchin", + "howler monkey", + "titi monkey", + "Geoffroy's spider monkey", + "common squirrel monkey", + "ring-tailed lemur", + "indri", + "Asian elephant", + "African bush elephant", + "red panda", + "giant panda", + "snoek fish", + "eel", + "silver salmon", + "rock beauty fish", + "clownfish", + "sturgeon", + "gar fish", + "lionfish", + "pufferfish", + "abacus", + "abaya", + "academic gown", + "accordion", + "acoustic guitar", + "aircraft carrier", + "airliner", + "airship", + "altar", + "ambulance", + "amphibious vehicle", + "analog clock", + "apiary", + "apron", + "trash can", + "assault rifle", + "backpack", + "bakery", + "balance beam", + "balloon", + "ballpoint pen", + "Band-Aid", + "banjo", + "baluster / handrail", + "barbell", + "barber chair", + "barbershop", + "barn", + "barometer", + "barrel", + "wheelbarrow", + "baseball", + "basketball", + "bassinet", + "bassoon", + "swimming cap", + "bath towel", + "bathtub", + "station wagon", + "lighthouse", + "beaker", + "military hat (bearskin or shako)", + "beer bottle", + "beer glass", + "bell tower", + "baby bib", + "tandem bicycle", + "bikini", + "ring binder", + "binoculars", + "birdhouse", + "boathouse", + "bobsleigh", + "bolo tie", + "poke bonnet", + "bookcase", + "bookstore", + "bottle cap", + "hunting bow", + "bow tie", + "brass memorial plaque", + "bra", + "breakwater", + "breastplate", + "broom", + "bucket", + "buckle", + "bulletproof vest", + "high-speed train", + "butcher shop", + "taxicab", + "cauldron", + "candle", + "cannon", + "canoe", + "can opener", + "cardigan", + "car mirror", + "carousel", + "tool kit", + "cardboard box / carton", + "car wheel", + "automated teller machine", + "cassette", + "cassette player", + "castle", + "catamaran", + "CD player", + "cello", + "mobile phone", + "chain", + "chain-link 
fence", + "chain mail", + "chainsaw", + "storage chest", + "chiffonier", + "bell or wind chime", + "china cabinet", + "Christmas stocking", + "church", + "movie theater", + "cleaver", + "cliff dwelling", + "cloak", + "clogs", + "cocktail shaker", + "coffee mug", + "coffeemaker", + "spiral or coil", + "combination lock", + "computer keyboard", + "candy store", + "container ship", + "convertible", + "corkscrew", + "cornet", + "cowboy boot", + "cowboy hat", + "cradle", + "construction crane", + "crash helmet", + "crate", + "infant bed", + "Crock Pot", + "croquet ball", + "crutch", + "cuirass", + "dam", + "desk", + "desktop computer", + "rotary dial telephone", + "diaper", + "digital clock", + "digital watch", + "dining table", + "dishcloth", + "dishwasher", + "disc brake", + "dock", + "dog sled", + "dome", + "doormat", + "drilling rig", + "drum", + "drumstick", + "dumbbell", + "Dutch oven", + "electric fan", + "electric guitar", + "electric locomotive", + "entertainment center", + "envelope", + "espresso machine", + "face powder", + "feather boa", + "filing cabinet", + "fireboat", + "fire truck", + "fire screen", + "flagpole", + "flute", + "folding chair", + "football helmet", + "forklift", + "fountain", + "fountain pen", + "four-poster bed", + "freight car", + "French horn", + "frying pan", + "fur coat", + "garbage truck", + "gas mask or respirator", + "gas pump", + "goblet", + "go-kart", + "golf ball", + "golf cart", + "gondola", + "gong", + "gown", + "grand piano", + "greenhouse", + "radiator grille", + "grocery store", + "guillotine", + "hair clip", + "hair spray", + "half-track", + "hammer", + "hamper", + "hair dryer", + "hand-held computer", + "handkerchief", + "hard disk drive", + "harmonica", + "harp", + "combine harvester", + "hatchet", + "holster", + "home theater", + "honeycomb", + "hook", + "hoop skirt", + "gymnastic horizontal bar", + "horse-drawn vehicle", + "hourglass", + "iPod", + "clothes iron", + "carved pumpkin", + "jeans", + "jeep", + "T-shirt", + 
"jigsaw puzzle", + "rickshaw", + "joystick", + "kimono", + "knee pad", + "knot", + "lab coat", + "ladle", + "lampshade", + "laptop computer", + "lawn mower", + "lens cap", + "letter opener", + "library", + "lifeboat", + "lighter", + "limousine", + "ocean liner", + "lipstick", + "slip-on shoe", + "lotion", + "music speaker", + "loupe magnifying glass", + "sawmill", + "magnetic compass", + "messenger bag", + "mailbox", + "tights", + "one-piece bathing suit", + "manhole cover", + "maraca", + "marimba", + "mask", + "matchstick", + "maypole", + "maze", + "measuring cup", + "medicine cabinet", + "megalith", + "microphone", + "microwave oven", + "military uniform", + "milk can", + "minibus", + "miniskirt", + "minivan", + "missile", + "mitten", + "mixing bowl", + "mobile home", + "ford model t", + "modem", + "monastery", + "monitor", + "moped", + "mortar and pestle", + "graduation cap", + "mosque", + "mosquito net", + "vespa", + "mountain bike", + "tent", + "computer mouse", + "mousetrap", + "moving van", + "muzzle", + "metal nail", + "neck brace", + "necklace", + "baby pacifier", + "notebook computer", + "obelisk", + "oboe", + "ocarina", + "odometer", + "oil filter", + "pipe organ", + "oscilloscope", + "overskirt", + "bullock cart", + "oxygen mask", + "product packet / packaging", + "paddle", + "paddle wheel", + "padlock", + "paintbrush", + "pajamas", + "palace", + "pan flute", + "paper towel", + "parachute", + "parallel bars", + "park bench", + "parking meter", + "railroad car", + "patio", + "payphone", + "pedestal", + "pencil case", + "pencil sharpener", + "perfume", + "Petri dish", + "photocopier", + "plectrum", + "Pickelhaube", + "picket fence", + "pickup truck", + "pier", + "piggy bank", + "pill bottle", + "pillow", + "ping-pong ball", + "pinwheel", + "pirate ship", + "drink pitcher", + "block plane", + "planetarium", + "plastic bag", + "plate rack", + "farm plow", + "plunger", + "Polaroid camera", + "pole", + "police van", + "poncho", + "pool table", + "soda 
bottle", + "plant pot", + "potter's wheel", + "power drill", + "prayer rug", + "printer", + "prison", + "missile", + "projector", + "hockey puck", + "punching bag", + "purse", + "quill", + "quilt", + "race car", + "racket", + "radiator", + "radio", + "radio telescope", + "rain barrel", + "recreational vehicle", + "fishing casting reel", + "reflex camera", + "refrigerator", + "remote control", + "restaurant", + "revolver", + "rifle", + "rocking chair", + "rotisserie", + "eraser", + "rugby ball", + "ruler measuring stick", + "sneaker", + "safe", + "safety pin", + "salt shaker", + "sandal", + "sarong", + "saxophone", + "scabbard", + "weighing scale", + "school bus", + "schooner", + "scoreboard", + "CRT monitor", + "screw", + "screwdriver", + "seat belt", + "sewing machine", + "shield", + "shoe store", + "shoji screen / room divider", + "shopping basket", + "shopping cart", + "shovel", + "shower cap", + "shower curtain", + "ski", + "balaclava ski mask", + "sleeping bag", + "slide rule", + "sliding door", + "slot machine", + "snorkel", + "snowmobile", + "snowplow", + "soap dispenser", + "soccer ball", + "sock", + "solar thermal collector", + "sombrero", + "soup bowl", + "keyboard space bar", + "space heater", + "space shuttle", + "spatula", + "motorboat", + "spider web", + "spindle", + "sports car", + "spotlight", + "stage", + "steam locomotive", + "through arch bridge", + "steel drum", + "stethoscope", + "scarf", + "stone wall", + "stopwatch", + "stove", + "strainer", + "tram", + "stretcher", + "couch", + "stupa", + "submarine", + "suit", + "sundial", + "sunglasses", + "sunglasses", + "sunscreen", + "suspension bridge", + "mop", + "sweatshirt", + "swim trunks / shorts", + "swing", + "electrical switch", + "syringe", + "table lamp", + "tank", + "tape player", + "teapot", + "teddy bear", + "television", + "tennis ball", + "thatched roof", + "front curtain", + "thimble", + "threshing machine", + "throne", + "tile roof", + "toaster", + "tobacco shop", + "toilet seat", + 
"torch", + "totem pole", + "tow truck", + "toy store", + "tractor", + "semi-trailer truck", + "tray", + "trench coat", + "tricycle", + "trimaran", + "tripod", + "triumphal arch", + "trolleybus", + "trombone", + "hot tub", + "turnstile", + "typewriter keyboard", + "umbrella", + "unicycle", + "upright piano", + "vacuum cleaner", + "vase", + "vaulted or arched ceiling", + "velvet fabric", + "vending machine", + "vestment", + "viaduct", + "violin", + "volleyball", + "waffle iron", + "wall clock", + "wallet", + "wardrobe", + "military aircraft", + "sink", + "washing machine", + "water bottle", + "water jug", + "water tower", + "whiskey jug", + "whistle", + "hair wig", + "window screen", + "window shade", + "Windsor tie", + "wine bottle", + "airplane wing", + "wok", + "wooden spoon", + "wool", + "split-rail fence", + "shipwreck", + "sailboat", + "yurt", + "website", + "comic book", + "crossword", + "traffic or street sign", + "traffic light", + "dust jacket", + "menu", + "plate", + "guacamole", + "consomme", + "hot pot", + "trifle", + "ice cream", + "popsicle", + "baguette", + "bagel", + "pretzel", + "cheeseburger", + "hot dog", + "mashed potatoes", + "cabbage", + "broccoli", + "cauliflower", + "zucchini", + "spaghetti squash", + "acorn squash", + "butternut squash", + "cucumber", + "artichoke", + "bell pepper", + "cardoon", + "mushroom", + "Granny Smith apple", + "strawberry", + "orange", + "lemon", + "fig", + "pineapple", + "banana", + "jackfruit", + "cherimoya (custard apple)", + "pomegranate", + "hay", + "carbonara", + "chocolate syrup", + "dough", + "meatloaf", + "pizza", + "pot pie", + "burrito", + "red wine", + "espresso", + "tea cup", + "eggnog", + "mountain", + "bubble", + "cliff", + "coral reef", + "geyser", + "lakeshore", + "promontory", + "sandbar", + "beach", + "valley", + "volcano", + "baseball player", + "bridegroom", + "scuba diver", + "rapeseed", + "daisy", + "yellow lady's slipper", + "corn", + "acorn", + "rose hip", + "horse chestnut seed", + "coral 
fungus", + "agaric", + "gyromitra", + "stinkhorn mushroom", + "earth star fungus", + "hen of the woods mushroom", + "bolete", + "corn cob", + "toilet paper", +] diff --git a/LAVIS-main/lavis/datasets/builders/object3d_caption_builder.py b/LAVIS-main/lavis/datasets/builders/object3d_caption_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..fd764cb765ea77f1d0ebbe6abd084cb6c506e7e1 --- /dev/null +++ b/LAVIS-main/lavis/datasets/builders/object3d_caption_builder.py @@ -0,0 +1,65 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.base_dataset_builder import MultiModalDatasetBuilder +from lavis.datasets.datasets.object3d_captioning_datasets import ( + ObjaverseCaptionDataset, + ObjaverseCaptionEvalDataset, + ObjaverseCaptionInstructDataset, + ShapenetCaptionDataset, + ShapenetCaptionEvalDataset, + ShapenetCaptionInstructDataset, +) + +@registry.register_builder("objaverse_mm_caption") +class ObjaverseCaptionBuilder(MultiModalDatasetBuilder): + train_dataset_cls = ObjaverseCaptionDataset + eval_dataset_cls = ObjaverseCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/objaverse/defaults_mm_cap.yaml", + } + + def build(self): + datasets = super().build() + build_info = self.config.build_info + for split,ds in datasets.items(): + # TODO: add option to download templates + templates = build_info.get('templates') + if templates == None: + ds._build_templates(None) + else: + ds._build_templates(build_info.templates.storage) + return datasets + +@registry.register_builder("objaverse_mm_caption_instruct") +class ObjaverseCaptionInstructBuilder(ObjaverseCaptionBuilder): + train_dataset_cls = ObjaverseCaptionInstructDataset + eval_dataset_cls = ObjaverseCaptionEvalDataset + + 
DATASET_CONFIG_DICT = { + "default": "configs/datasets/objaverse/defaults_mm_cap_instruct.yaml", + } + +@registry.register_builder("shapenet_mm_caption") +class ShapenetCaptionBuilder(ObjaverseCaptionBuilder): + train_dataset_cls = ShapenetCaptionDataset + eval_dataset_cls = ShapenetCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/shapenet/defaults_mm_cap.yaml", + } + +@registry.register_builder("shapenet_mm_caption_instruct") +class ShapenetCaptionInstructBuilder(ObjaverseCaptionBuilder): + train_dataset_cls = ShapenetCaptionInstructDataset + eval_dataset_cls = ShapenetCaptionEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/shapenet/defaults_mm_cap_instruct.yaml", + } \ No newline at end of file diff --git a/LAVIS-main/lavis/datasets/builders/object3d_classification_builder.py b/LAVIS-main/lavis/datasets/builders/object3d_classification_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..b214d5df726929a3c319a4ac21144f492d42b9b0 --- /dev/null +++ b/LAVIS-main/lavis/datasets/builders/object3d_classification_builder.py @@ -0,0 +1,19 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder +from lavis.datasets.datasets.object3d_classification_datasets import ModelNetClassificationDataset + +@registry.register_builder("modelnet40_cls") +class ModelNetClassificationBuilder(MultiModalDatasetBuilder): + train_dataset_cls = ModelNetClassificationDataset + eval_dataset_cls = ModelNetClassificationDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/modelnet40/defaults_cls.yaml", + } \ No newline at end of file diff --git a/LAVIS-main/lavis/datasets/builders/object3d_qa_builder.py b/LAVIS-main/lavis/datasets/builders/object3d_qa_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..50ef545f8907300f5f4ed1ad27c66697bc8e5747 --- /dev/null +++ b/LAVIS-main/lavis/datasets/builders/object3d_qa_builder.py @@ -0,0 +1,19 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.builders.object3d_caption_builder import ObjaverseCaptionBuilder +from lavis.datasets.datasets.object3d_qa_datasets import ObjaverseQADataset + +@registry.register_builder("objaverse_mm_qa") +class ObjaverseQABuilder(ObjaverseCaptionBuilder): + train_dataset_cls = ObjaverseQADataset + eval_dataset_cls = ObjaverseQADataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/objaverse/defaults_mm_qa.yaml", + } \ No newline at end of file diff --git a/LAVIS-main/lavis/datasets/builders/retrieval_builder.py b/LAVIS-main/lavis/datasets/builders/retrieval_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..98ca3bdf572fe007ea1bd97d75aefcb8ae02fe3d --- /dev/null +++ b/LAVIS-main/lavis/datasets/builders/retrieval_builder.py @@ -0,0 +1,48 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
+from lavis.datasets.datasets.retrieval_datasets import (
+    RetrievalDataset,
+    RetrievalEvalDataset,
+    VideoRetrievalDataset,
+    VideoRetrievalEvalDataset,
+)
+
+from lavis.common.registry import registry
+
+
+# Each builder below only binds dataset classes and a default config path;
+# the actual download/build logic lives entirely in BaseDatasetBuilder.
+@registry.register_builder("msrvtt_retrieval")
+class MSRVTTRetrievalBuilder(BaseDatasetBuilder):
+    # Video-text retrieval; eval split uses the *EvalDataset variant.
+    train_dataset_cls = VideoRetrievalDataset
+    eval_dataset_cls = VideoRetrievalEvalDataset
+
+    DATASET_CONFIG_DICT = {"default": "configs/datasets/msrvtt/defaults_ret.yaml"}
+
+
+@registry.register_builder("didemo_retrieval")
+class DiDeMoRetrievalBuilder(BaseDatasetBuilder):
+    train_dataset_cls = VideoRetrievalDataset
+    eval_dataset_cls = VideoRetrievalEvalDataset
+
+    DATASET_CONFIG_DICT = {"default": "configs/datasets/didemo/defaults_ret.yaml"}
+
+
+# Image-text retrieval builders share the generic RetrievalDataset pair.
+@registry.register_builder("coco_retrieval")
+class COCORetrievalBuilder(BaseDatasetBuilder):
+    train_dataset_cls = RetrievalDataset
+    eval_dataset_cls = RetrievalEvalDataset
+
+    DATASET_CONFIG_DICT = {"default": "configs/datasets/coco/defaults_ret.yaml"}
+
+
+@registry.register_builder("flickr30k")
+class Flickr30kBuilder(BaseDatasetBuilder):
+    train_dataset_cls = RetrievalDataset
+    eval_dataset_cls = RetrievalEvalDataset
+
+    DATASET_CONFIG_DICT = {"default": "configs/datasets/flickr30k/defaults.yaml"}
diff --git a/LAVIS-main/lavis/datasets/builders/text_to_image_generation_builder.py b/LAVIS-main/lavis/datasets/builders/text_to_image_generation_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..b93368a71f62b463b91f53ba407c767ed44e74cd
--- /dev/null
+++ b/LAVIS-main/lavis/datasets/builders/text_to_image_generation_builder.py
@@ -0,0 +1,39 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.datasets.datasets.subject_driven_t2i_dataset import ( + SubjectDrivenTextToImageDataset, +) +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder + + +@registry.register_builder("blip_diffusion_finetune") +class BlipDiffusionFinetuneBuilder(BaseDatasetBuilder): + train_dataset_cls = SubjectDrivenTextToImageDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/blip_diffusion_datasets/defaults.yaml" + } + + def _download_ann(self): + pass + + def build(self): + self.build_processors() + + build_info = self.config.build_info + + dataset = self.train_dataset_cls( + image_dir=build_info.images.storage, + subject_text=build_info.subject_text, + inp_image_processor=self.kw_processors["inp_vis_processor"], + tgt_image_processor=self.kw_processors["tgt_vis_processor"], + txt_processor=self.text_processors["eval"], + ) + + return {"train": dataset} diff --git a/LAVIS-main/lavis/datasets/builders/video_qa_builder.py b/LAVIS-main/lavis/datasets/builders/video_qa_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..28e76a59e238f6b18fb052561bd27bdc4186af36 --- /dev/null +++ b/LAVIS-main/lavis/datasets/builders/video_qa_builder.py @@ -0,0 +1,77 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.common.registry import registry +from lavis.common.utils import get_cache_path +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder +from lavis.datasets.datasets.video_vqa_datasets import VideoQADataset, VideoQAInstructDataset +from lavis.datasets.datasets.music_avqa import MusicAVQAInstructDataset, MusicAVQADataset + + +class VideoQABuilder(BaseDatasetBuilder): + train_dataset_cls = VideoQADataset + eval_dataset_cls = VideoQADataset + + def build(self): + datasets = super().build() + + ans2label = self.config.build_info.annotations.get("ans2label") + if ans2label is None: + raise ValueError("ans2label is not specified in build_info.") + + ans2label = get_cache_path(ans2label.storage) + + for split in datasets: + datasets[split]._build_class_labels(ans2label) + + return datasets + + +@registry.register_builder("msrvtt_qa") +class MSRVTTQABuilder(VideoQABuilder): + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msrvtt/defaults_qa.yaml", + } + + +@registry.register_builder("msvd_qa") +class MSVDQABuilder(VideoQABuilder): + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msvd/defaults_qa.yaml", + } + + +@registry.register_builder("msrvtt_qa_instruct") +class MSRVTTQAInstructBuilder(VideoQABuilder): + train_dataset_cls = VideoQAInstructDataset + eval_dataset_cls = VideoQAInstructDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msrvtt/defaults_qa_instruct.yaml", + } + + +@registry.register_builder("msvd_qa_instruct") +class MSVDQAInstructBuilder(VideoQABuilder): + train_dataset_cls = VideoQAInstructDataset + eval_dataset_cls = VideoQAInstructDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/msvd/defaults_qa_instruct.yaml", + } + +@registry.register_builder("musicavqa_mm") +class 
MusicAVQABuilder(MultiModalDatasetBuilder): + train_dataset_cls = MusicAVQADataset + eval_dataset_cls = MusicAVQADataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/music_avqa/defaults_mm_qa.yaml"} + +@registry.register_builder("musicavqa_mm_instruct") +class MusicAVQAInstructBuilder(MultiModalDatasetBuilder): + train_dataset_cls = MusicAVQAInstructDataset + eval_dataset_cls = MusicAVQAInstructDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/music_avqa/defaults_mm_qa_instruct.yaml"} \ No newline at end of file diff --git a/LAVIS-main/lavis/datasets/builders/vqa_builder.py b/LAVIS-main/lavis/datasets/builders/vqa_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..028fc434795793d612c646319f2e4f8b6394fd69 --- /dev/null +++ b/LAVIS-main/lavis/datasets/builders/vqa_builder.py @@ -0,0 +1,152 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder + +from lavis.common.registry import registry +from lavis.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset, AOKVQAInstructDataset +from lavis.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset, COCOVQAInstructDataset +from lavis.datasets.datasets.vg_vqa_datasets import VGVQADataset, VGVQAInstructDataset +from lavis.datasets.datasets.gqa_datasets import GQADataset, GQAEvalDataset, GQAInstructDataset +from lavis.datasets.datasets.iconqa_datasets import IconQADataset, IconQAEvalDataset, IconQAInstructDataset +from lavis.datasets.datasets.ocr_datasets import OCRVQADataset, OCRVQAInstructDataset +from lavis.datasets.datasets.vizwiz_vqa_datasets import VizWizEvalDataset + +@registry.register_builder("coco_vqa") +class COCOVQABuilder(BaseDatasetBuilder): + train_dataset_cls = 
COCOVQADataset + eval_dataset_cls = COCOVQAEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco/defaults_vqa.yaml", + "eval": "configs/datasets/coco/eval_vqa.yaml", + } + +@registry.register_builder("coco_vqa_instruct") +class COCOVQAInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = COCOVQAInstructDataset + eval_dataset_cls = COCOVQAEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco/defaults_vqa_instruct.yaml", + "eval": "configs/datasets/coco/eval_vqa.yaml", + } + +@registry.register_builder("vg_vqa") +class VGVQABuilder(BaseDatasetBuilder): + train_dataset_cls = VGVQADataset + DATASET_CONFIG_DICT = {"default": "configs/datasets/vg/defaults_vqa.yaml"} + +@registry.register_builder("vg_vqa_instruct") +class VGVQAInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = VGVQAInstructDataset + DATASET_CONFIG_DICT = {"default": "configs/datasets/vg/defaults_vqa_instruct.yaml"} + +@registry.register_builder("ok_vqa") +class OKVQABuilder(COCOVQABuilder): + DATASET_CONFIG_DICT = { + "default": "configs/datasets/okvqa/defaults.yaml", + } + +@registry.register_builder("ok_vqa_instruct") +class OKVQAInstructBuilder(COCOVQAInstructBuilder): + DATASET_CONFIG_DICT = { + "default": "configs/datasets/okvqa/defaults_instruct.yaml", + } + +@registry.register_builder("aok_vqa") +class AOKVQABuilder(BaseDatasetBuilder): + train_dataset_cls = AOKVQADataset + eval_dataset_cls = AOKVQAEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"} + +@registry.register_builder("aok_vqa_instruct") +class AOKVQAInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = AOKVQAInstructDataset + eval_dataset_cls = AOKVQAEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults_instruct.yaml"} + + +@registry.register_builder("gqa") +class GQABuilder(BaseDatasetBuilder): + train_dataset_cls = GQADataset + eval_dataset_cls = GQAEvalDataset + + DATASET_CONFIG_DICT = { + "default": 
"configs/datasets/gqa/defaults.yaml", + "balanced_val": "configs/datasets/gqa/balanced_val.yaml", + "balanced_testdev": "configs/datasets/gqa/balanced_testdev.yaml", + } + +@registry.register_builder("gqa_instruct") +class GQAInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = GQAInstructDataset + eval_dataset_cls = GQAEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/gqa/defaults_instruct.yaml", + "balanced_val": "configs/datasets/gqa/balanced_val_instruct.yaml", + "balanced_testdev": "configs/datasets/gqa/balanced_testdev_instruct.yaml", + } + +@registry.register_builder("iconqa") +class IconQABuilder(BaseDatasetBuilder): + train_dataset_cls = IconQADataset + eval_dataset_cls = IconQAEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/iconqa/defaults.yaml", + } + +@registry.register_builder("iconqa_instruct") +class IconQAInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = IconQAInstructDataset + eval_dataset_cls = IconQAEvalDataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/iconqa/defaults_instruct.yaml", + } + +@registry.register_builder("scienceqa") +class ScienceQABuilder(BaseDatasetBuilder): + train_dataset_cls = IconQADataset + eval_dataset_cls = IconQAEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/scienceqa/defaults.yaml"} + +@registry.register_builder("scienceqa_instruct") +class ScienceQAInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = IconQAInstructDataset + eval_dataset_cls = IconQAEvalDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/scienceqa/defaults_instruct.yaml"} + +@registry.register_builder("ocr_vqa") +class OCRVQABuilder(BaseDatasetBuilder): + train_dataset_cls = OCRVQADataset + eval_dataset_cls = OCRVQADataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/ocrvqa/defaults.yaml"} + +@registry.register_builder("ocr_vqa_instruct") +class OCRVQAInstructBuilder(BaseDatasetBuilder): + train_dataset_cls = OCRVQAInstructDataset 
+    eval_dataset_cls = OCRVQAInstructDataset
+
+    DATASET_CONFIG_DICT = {"default": "configs/datasets/ocrvqa/defaults_instruct.yaml"}
+
+
+@registry.register_builder("vizwiz_vqa")
+class VizWizVQABuilder(BaseDatasetBuilder):
+    # Evaluation-only benchmark: no train_dataset_cls is bound.
+    eval_dataset_cls = VizWizEvalDataset
+
+    DATASET_CONFIG_DICT = {"default": "configs/datasets/vizwiz/defaults.yaml"}
+
+
+
diff --git a/LAVIS-main/lavis/datasets/data_utils.py b/LAVIS-main/lavis/datasets/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..098c7e0c591208b2d2f5f25c83164f1996ab2014
--- /dev/null
+++ b/LAVIS-main/lavis/datasets/data_utils.py
@@ -0,0 +1,351 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import gzip
+import logging
+import os
+import random as rnd
+import tarfile
+import zipfile
+import cv2
+
+import decord
+import webdataset as wds
+import numpy as np
+import torch
+from torch.utils.data.dataset import IterableDataset, ChainDataset
+from decord import VideoReader
+from lavis.common.registry import registry
+from lavis.datasets.datasets.base_dataset import ConcatDataset
+from tqdm import tqdm
+
+decord.bridge.set_bridge("torch")
+MAX_INT = registry.get("MAX_INT")
+
+
+def load_video(video_path, n_frms=MAX_INT, height=-1, width=-1, sampling="uniform"):
+    """Decode up to n_frms frames from a video into a (C, T, H, W) float tensor.
+
+    sampling: "uniform" spreads indices over the whole clip; "headtail" draws
+    random indices from the first and second halves. Raises NotImplementedError
+    for any other strategy.
+    """
+    vr = VideoReader(uri=video_path, height=height, width=width)
+
+    vlen = len(vr)
+    start, end = 0, vlen
+
+    n_frms = min(n_frms, vlen)
+
+    if sampling == "uniform":
+        # BUG FIX: the float step of arange can overshoot and yield
+        # n_frms + 1 indices; clamp the index list to exactly n_frms.
+        indices = np.arange(start, end, vlen / n_frms).astype(int)[:n_frms]
+    elif sampling == "headtail":
+        indices_h = sorted(rnd.sample(range(vlen // 2), n_frms // 2))
+        # BUG FIX: sample the remainder (n_frms - n_frms // 2) from the tail so
+        # an odd n_frms still produces n_frms frames (was n_frms // 2 twice).
+        indices_t = sorted(rnd.sample(range(vlen // 2, vlen), n_frms - n_frms // 2))
+        indices = indices_h + indices_t
+    else:
+        raise NotImplementedError
+
+    # get_batch -> T, H, W, C
+    frms = vr.get_batch(indices).permute(3, 0, 1, 2).float()  # (C, T, H, W)
+
+    return frms
+
+
+def apply_to_sample(f, sample):
+    """Recursively apply f to every tensor inside sample (dicts/lists supported)."""
+    # Datasets may return None / empty samples for missing items; map those to {}.
+    if sample is None or len(sample) == 0:
+        return {}
+
+    def _apply(x):
+        if torch.is_tensor(x):
+            return f(x)
+        elif isinstance(x, dict):
+            return {key: _apply(value) for key, value in x.items()}
+        elif isinstance(x, list):
+            return [_apply(x) for x in x]
+        else:
+            return x
+
+    return _apply(sample)
+
+
+def move_to_cuda(sample):
+    def _move_to_cuda(tensor):
+        return tensor.cuda()
+
+    return apply_to_sample(_move_to_cuda, sample)
+
+
+def prepare_sample(samples, cuda_enabled=True):
+    if cuda_enabled:
+        samples = move_to_cuda(samples)
+
+    # TODO fp16 support
+
+    return samples
+
+
+def reorg_datasets_by_split(datasets):
+    """
+    Organizes datasets by split.
+
+    Args:
+        datasets: dict of torch.utils.data.Dataset objects by name.
+
+    Returns:
+        Dict of datasets by split {split_name: List[Datasets]}.
+    """
+    reorg_datasets = dict()
+
+    # reorganize by split
+    for _, dataset in datasets.items():
+        for split_name, dataset_split in dataset.items():
+            if split_name not in reorg_datasets:
+                reorg_datasets[split_name] = [dataset_split]
+            else:
+                reorg_datasets[split_name].append(dataset_split)
+
+    return reorg_datasets
+
+
+def concat_datasets(datasets):
+    """
+    Concatenates multiple datasets into a single dataset.
+
+    It supports map-style datasets and DataPipeline from WebDataset. Currently, does not support
+    generic IterableDataset because it requires creating separate samplers.
+
+    Now only supports concatenating training datasets and assuming validation and testing
+    have only a single dataset. This is because metrics should not be computed on the concatenated
+    datasets.
+
+    Args:
+        datasets: dict of torch.utils.data.Dataset objects by split.
+ + Returns: + Dict of concatenated datasets by split, "train" is the concatenation of multiple datasets, + "val" and "test" remain the same. + + If the input training datasets contain both map-style and DataPipeline datasets, returns + a tuple, where the first element is a concatenated map-style dataset and the second + element is a chained DataPipeline dataset. + + """ + # concatenate datasets in the same split + for split_name in datasets: + if split_name != "train": + assert ( + len(datasets[split_name]) == 1 + ), "Do not support multiple {} datasets.".format(split_name) + datasets[split_name] = datasets[split_name][0] + else: + iterable_datasets, map_datasets = [], [] + for dataset in datasets[split_name]: + if isinstance(dataset, wds.DataPipeline): + logging.info( + "Dataset {} is IterableDataset, can't be concatenated.".format( + dataset + ) + ) + iterable_datasets.append(dataset) + elif isinstance(dataset, IterableDataset): + raise NotImplementedError( + "Do not support concatenation of generic IterableDataset." + ) + else: + map_datasets.append(dataset) + + # if len(iterable_datasets) > 0: + # concatenate map-style datasets and iterable-style datasets separately + chained_datasets = ( + ChainDataset(iterable_datasets) if len(iterable_datasets) > 0 else None + ) + concat_datasets = ( + ConcatDataset(map_datasets) if len(map_datasets) > 0 else None + ) + + train_datasets = concat_datasets, chained_datasets + train_datasets = tuple([x for x in train_datasets if x is not None]) + train_datasets = ( + train_datasets[0] if len(train_datasets) == 1 else train_datasets + ) + + datasets[split_name] = train_datasets + + return datasets + + +def extract_archive(from_path, to_path=None, overwrite=False): + """Extract archive. + + Args: + from_path: the path of the archive. + to_path: the root path of the extracted files (directory of from_path) + overwrite: overwrite existing files (False) + + Returns: + List of paths to extracted files even if not overwritten. 
+
+    Examples:
+        >>> url = 'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz'
+        >>> from_path = './validation.tar.gz'
+        >>> to_path = './'
+        >>> torchtext.utils.download_from_url(url, from_path)
+        >>> torchtext.utils.extract_archive(from_path, to_path)
+        >>> ['.data/val.de', '.data/val.en']
+        >>> torchtext.utils.download_from_url(url, from_path)
+        >>> torchtext.utils.extract_archive(from_path, to_path)
+        >>> ['.data/val.de', '.data/val.en']
+
+    """
+
+    if to_path is None:
+        to_path = os.path.dirname(from_path)
+
+    if from_path.endswith((".tar.gz", ".tgz")):
+        logging.info("Opening tar file {} to {}.".format(from_path, to_path))
+        with tarfile.open(from_path, "r") as tar:
+            files = []
+            for file_ in tqdm(tar):
+                file_path = os.path.join(to_path, file_.name)
+                if file_.isfile():
+                    files.append(file_path)
+                if os.path.exists(file_path):
+                    logging.info("{} already extracted.".format(file_path))
+                    if not overwrite:
+                        continue
+                tar.extract(file_, to_path)
+            logging.info("Finished extracting tar file {}.".format(from_path))
+            return files
+
+    elif from_path.endswith(".zip"):
+        assert zipfile.is_zipfile(from_path), from_path
+        logging.info("Opening zip file {} to {}.".format(from_path, to_path))
+        with zipfile.ZipFile(from_path, "r") as zfile:
+            files = []
+            for file_ in tqdm(zfile.namelist()):
+                file_path = os.path.join(to_path, file_)
+                files.append(file_path)
+                if os.path.exists(file_path):
+                    logging.info("{} already extracted.".format(file_path))
+                    if not overwrite:
+                        continue
+                zfile.extract(file_, to_path)
+        files = [f for f in files if os.path.isfile(f)]
+        logging.info("Finished extracting zip file {}.".format(from_path))
+        return files
+
+    elif from_path.endswith(".gz"):
+        logging.info("Opening gz file {} to {}.".format(from_path, to_path))
+        default_block_size = 65536
+        filename = from_path[:-3]
+        files = [filename]
+        with gzip.open(from_path, "rb") as gzfile, open(filename, "wb") as d_file:
+            while True:
+                block = gzfile.read(default_block_size)
+                if not block:
+                    break
+                else:
+                    d_file.write(block)
+                    # BUG FIX: a second, duplicated d_file.write(block) was
+                    # removed here — it wrote every chunk twice, corrupting
+                    # the extracted file.
+        logging.info("Finished extracting gz file {}.".format(from_path))
+        return files
+
+    else:
+        raise NotImplementedError(
+            "We currently only support tar.gz, .tgz, .gz and zip achives."
+        )
+
+
+def save_frames_grid(img_array, out_path):
+    import torch
+    from PIL import Image
+    from torchvision.utils import make_grid
+
+    if len(img_array.shape) == 3:
+        img_array = img_array.unsqueeze(0)
+    elif len(img_array.shape) == 5:
+        b, t, c, h, w = img_array.shape
+        img_array = img_array.view(-1, c, h, w)
+    elif len(img_array.shape) == 4:
+        pass
+    else:
+        raise NotImplementedError(
+            "Supports only (b,t,c,h,w)-shaped inputs. First two dimensions can be ignored."
+        )
+
+    assert img_array.shape[1] == 3, "Exepcting input shape of (H, W, 3), i.e. RGB-only."
+
+    grid = make_grid(img_array)
+    ndarr = grid.permute(1, 2, 0).to("cpu", torch.uint8).numpy()
+
+    img = Image.fromarray(ndarr)
+
+    img.save(out_path)
+
+
+def uniform_frame_sampling(video_path, num_frames, target_height, target_width, start_time=None, end_time=None):
+    """Read ~num_frames evenly-strided frames from [start_time, end_time] with OpenCV."""
+    cap = cv2.VideoCapture(video_path)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    frame_rate = cap.get(cv2.CAP_PROP_FPS)
+
+    if start_time is None:
+        start_time = 0
+    if end_time is None:
+        end_time = total_frames / frame_rate
+
+    start_frame = int(start_time * frame_rate)
+    end_frame = int(end_time * frame_rate)
+    # BUG FIX: clamp the stride to >= 1; when the clip span has fewer frames
+    # than num_frames the old integer stride was 0 and range() raised ValueError.
+    frame_indices = list(range(start_frame, end_frame + 1, max(1, (end_frame - start_frame + 1) // num_frames)))
+
+    frames = []
+    for frame_index in frame_indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
+        ret, frame = cap.read()
+        if not ret:
+            break
+        frame = cv2.resize(frame, (target_width, target_height))
+        frames.append(frame)
+
+    cap.release()
+    # NOTE(review): returns a list of HxWx3 numpy frames, while
+    # head_tail_frame_sampling returns a stacked (C, T, H, W) tensor —
+    # confirm callers of load_clip handle both shapes.
+    return frames
+
+
+def head_tail_frame_sampling(video_path, num_frames, target_height, target_width, start_time=None, end_time=None):
+    cap = cv2.VideoCapture(video_path)
+    total_frames =
int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    frame_rate = cap.get(cv2.CAP_PROP_FPS)
+
+    if start_time is None:
+        start_time = 0
+    if end_time is None:
+        end_time = total_frames / frame_rate
+
+    start_frame = int(start_time * frame_rate)
+    end_frame = int(end_time * frame_rate)
+    # BUG FIX: guard num_frames <= 1 — the interior-point stride divides by
+    # (num_frames - 1), which raised ZeroDivisionError for num_frames == 1.
+    frame_indices = [start_frame] if num_frames <= 1 else [start_frame] + [start_frame + (end_frame - start_frame) // (num_frames - 1) * i for i in range(1, num_frames - 1)] + [end_frame]
+
+    frames = []
+    for frame_index in frame_indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
+        ret, frame = cap.read()
+        if not ret:
+            break
+        frame = cv2.resize(frame, (target_width, target_height))
+        frames.append(frame)
+
+    cap.release()
+    if len(frames) == 0:
+        return None
+    return torch.stack([torch.tensor(f).permute(2,0,1).float() for f in frames], dim=1)
+
+
+def load_clip(video_path, num_frames, target_height, target_width, start_time=None, end_time=None, sampling="headtail"):
+    # NOTE(review): the two strategies return different types — headtail yields a
+    # (C, T, H, W) tensor (or None), uniform yields a list of numpy frames.
+    # Confirm callers handle both before relying on "uniform".
+    if sampling == "headtail":
+        return head_tail_frame_sampling(video_path, num_frames, target_height, target_width, start_time, end_time)
+    elif sampling == "uniform":
+        return uniform_frame_sampling(video_path, num_frames, target_height, target_width, start_time, end_time)
+    else:
+        raise NotImplementedError
\ No newline at end of file
diff --git a/LAVIS-main/lavis/datasets/datasets/aok_vqa_datasets.py b/LAVIS-main/lavis/datasets/datasets/aok_vqa_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..53ad32e793c7f9c025274a11a2712404a205657e
--- /dev/null
+++ b/LAVIS-main/lavis/datasets/datasets/aok_vqa_datasets.py
@@ -0,0 +1,167 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from collections import OrderedDict
+import json
+import os
+import torch
+import random
+
+from PIL import Image
+
+from lavis.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset
+
+
+class __DisplMixin:
+    def displ_item(self, index):
+        # Debug helper: pair the raw annotation with the processed sample.
+        sample, ann = self.__getitem__(index), self.annotation[index]
+        return OrderedDict(
+            {
+                "file": ann["image"],
+                "question": ann["question"],
+                "question_id": ann["question_id"],
+                "direct_answers": "; ".join(ann["direct_answers"]),
+                "choices": "; ".join(ann["choices"]),
+                "correct_choice": ann["choices"][ann["correct_choice_idx"]],
+                "image": sample["image"],
+            }
+        )
+
+
+class AOKVQADataset(VQADataset, __DisplMixin):
+    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+
+    def __getitem__(self, index):
+        ann = self.annotation[index]
+
+        image_path = os.path.join(self.vis_root, ann["image"])
+        image = Image.open(image_path).convert("RGB")
+
+        image = self.vis_processor(image)
+        question = self.text_processor(ann["question"])
+
+        answer_key = "direct_answers"
+
+        # Weight each distinct answer by its frequency among the annotations.
+        answer_weight = {}
+        for answer in ann[answer_key]:
+            if answer in answer_weight:
+                answer_weight[answer] += 1 / len(ann[answer_key])
+            else:
+                answer_weight[answer] = 1 / len(ann[answer_key])
+
+        answers = list(answer_weight.keys())
+        weights = list(answer_weight.values())
+
+        return {
+            "image": image,
+            "text_input": question,
+            "answers": answers,
+            "weights": weights,
+        }
+
+class AOKVQAInstructDataset(AOKVQADataset):
+    def __getitem__(self, index):
+        data = super().__getitem__(index)
+        if data is not None:
+            # Pick one ground-truth answer as the instruction target.
+            data["text_output"] = random.choice(data['answers'])
+        return data
+
+    def collater(self, samples):
+        data = super().collater(samples)
+        # NOTE(review): assumes the parent collater emits an 'answer' key —
+        # verify against VQADataset.collater; a KeyError here means the
+        # parent contract changed.
+        data['text_output'] = data['answer']
+        return data
+
+
+class
class AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_paths (list): [annotation file, answer list,
            (optional) coco-format question file,
            (optional) coco-format annotation file]
        """
        self.vis_root = vis_root

        # Use context managers so file handles are closed deterministically
        # (the original ``json.load(open(...))`` leaked them).
        with open(ann_paths[0]) as f:
            self.annotation = json.load(f)

        answer_list_path = ann_paths[1]
        if os.path.exists(answer_list_path):
            with open(answer_list_path) as f:
                self.answer_list = json.load(f)
        else:
            self.answer_list = None

        # COCO-format files are optional; absent entries default to None.
        try:
            self.coco_fmt_qust_file = ann_paths[2]
            self.coco_fmt_anno_file = ann_paths[3]
        except IndexError:
            self.coco_fmt_qust_file = None
            self.coco_fmt_anno_file = None

        self.vis_processor = vis_processor
        self.text_processor = text_processor

        self._add_instance_ids()

    def collater(self, samples):
        """Stack images into one tensor; gather metadata into batch lists."""
        keys = (
            "image",
            "text_input",
            "question_id",
            "instance_id",
            "choices",
            "correct_choice_idx",
            "direct_answers",
        )
        batch = {k: [] for k in keys}
        for sample in samples:
            for k in keys:
                batch[k].append(sample[k])
        batch["image"] = torch.stack(batch["image"], dim=0)
        return batch

    def __getitem__(self, index):
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        image = self.vis_processor(Image.open(image_path).convert("RGB"))
        question = self.text_processor(ann["question"])

        # Eval annotations may omit labels (hidden test split) — default None.
        return {
            "image": image,
            "text_input": question,
            "question_id": ann["question_id"],
            "instance_id": ann["instance_id"],
            "choices": ann["choices"],
            "correct_choice_idx": ann.get("correct_choice_idx"),
            "direct_answers": ann.get("direct_answers"),
        }
+ setattr(self, f"existing_{modality}_annotation",getattr(self, f'get_existing_{modality}_annotations')()) + + def get_audio_path(self, ann): + raise NotImplementedError("Subclasses should implement this!") + + def is_empty_audio(self, ann): + path = self.get_audio_path(ann) + try: + waveform, sr = torchaudio.load(path) + + # Convert to mono if it's stereo + if waveform.shape[0] == 2: + waveform = torch.mean(waveform, dim=0) + + except torchaudio.TorchaudioException: + return True # Audio loading failed + + return waveform.nelement() == 0 + + def get_existing_audio_annotations(self): + return [f.split('_')[0] for f in os.listdir(self.audio_root)] + + def get_existing_video_annotations(self): + return os.listdir(self.video_root) + + def get_existing_images_annotations(self): + return os.listdir(self.vis_root) + + def get_video_path(self, ann): + return pathlib.Path(os.path.join(self.video_root, ann[self.sample_id_key])).resolve() + + def get_images_path(self, ann): + return pathlib.Path(os.path.join(self.vis_root, ann[self.sample_id_key])).resolve() + + def __len__(self): + return len(self.annotation) + + def __getitem__(self, index): + raise NotImplementedError("Subclasses should implement this!") + + def _build_templates(self, templates_path): + # use captions not templates + if templates_path is None: + self.templates = None + else: + with open(templates_path) as f: + self.templates = json.load(f) + +class AudioSetDataset(AudioCaptioningDataset): + def __init__(self, **kwargs): + self.dataset_name = 'audioset' + self.sample_id_key = 'YTID' + clean_ids = [l.strip() for l in open(kwargs['ann_paths'][-1]).readlines()] + df = pd.read_csv(kwargs['ann_paths'][-1]) + self.mid2label = {k: v for k, v in zip(df['mid'].tolist(), df['display_name'].tolist())} + annotation = [] + for ann_path in kwargs['ann_paths'][:-1]: + df = pd.read_csv(ann_path, comment='#', header=None,names=['YTID', 'start_seconds', 'end_seconds', 'positive_labels'], skiprows=3, quotechar='"', 
delimiter=',', skipinitialspace=True ) + annotation.extend([row.to_dict() for i,row in df.iterrows()]) + kwargs['ann_paths'] = [] + super().__init__(**kwargs) + self.annotation = annotation + self.sample_ids = set.intersection(*[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities]) + + self.annotation = [ann for ann in self.annotation if ann[self.sample_id_key] in self.sample_ids and ann[self.sample_id_key]] + self._add_instance_ids() + print(f"Loaded {len(self.annotation)} examples.") + + def get_audio_path(self, ann): + if 'end_seconds' not in ann: + ann['start_seconds'] = float(ann['start_time']) + del ann['start_time'] + ann['end_seconds'] = float(ann['start_seconds']) + 10.0 + return str(os.path.realpath(os.path.join(self.audio_root, ann[self.sample_id_key] + '_{:.1f}-{:.1f}.wav'.format(ann['start_seconds'], ann['end_seconds'])))).replace('all_audio/', '') + + + def __getitem__(self, index): + ann = copy.deepcopy(self.annotation[index]) + ann["sample_id"] = ann["YTID"] + objects = ann['positive_labels'].split(',') + objects = [self.mid2label[l] for l in objects] + ann['label'] = objects + if self.templates: + ann['captions'] = [random.choice(self.templates).format(obj) for obj in objects] + else: + ann['captions'] = [random.choice(objects)] + + for modality in self.modalities: + ann[f"{modality}_path"] = getattr(self, f"get_{modality}_path")(ann) + if isinstance(ann[f"{modality}_path"], list): + ann[f"{modality}_path"] = random.choice(ann[f"{modality}_path"]) + else: + ann[modality if 'image' not in modality else 'image'] = getattr(self, f"{'vis' if 'image' in modality else modality}_processor")(ann[f"{modality}_path"]) + + if isinstance(ann['captions'], list): + ann['text_input'] = self.text_processor(random.choice(ann['captions'])) + else: + ann['text_input'] = self.text_processor(ann['captions']) + + if ann["audio"].sum() == 0: + return None + + return ann + +class AudioSetInstructDataset(AudioSetDataset): + def 
class AudioSetEvalDataset(AudioSetDataset):
    """Evaluation split: captions are withheld, so ``text_input`` is removed
    from the sample dict."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        # ``is not None`` instead of ``!= None`` (identity check for None).
        if data is not None:
            del data["text_input"]
        return data
class AudioCapsEvalDataset(AudioCapsDataset):
    """Evaluation split: keeps one annotation per clip and drops the caption."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Deduplicate by youtube_id, keeping the first occurrence. Replaces
        # the original one-liner that abused ``set.add`` returning None
        # inside a list comprehension (hard to read, easy to break).
        seen = set()
        deduped = []
        for ann in self.annotation:
            if ann["youtube_id"] not in seen:
                seen.add(ann["youtube_id"])
                deduped.append(ann)
        self.annotation = deduped

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data["text_input"]
        return data
class ClothoV2InstructDataset(ClothoV2Dataset):
    """Instruction-tuning variant: the caption becomes ``text_output`` and
    ``text_input`` is replaced by an empty (processed) prompt."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        # ``is not None`` instead of ``!= None`` (identity check for None).
        if data is not None:
            data["text_output"] = data["text_input"]
            data["text_input"] = self.text_processor("")
        return data
class AudioLanguagePretrainDataset(BaseDataset, __DisplMixin):
    """WavCaps-style audio/text pretraining dataset.

    The last entry of ``ann_paths`` is a pre-merged JSON annotation list; the
    preceding entries are per-source WavCaps JSON files (consumed only by
    ``_load_json_file``, whose call is currently commented out).
    """

    def __init__(self, **kwargs):
        json_files = kwargs["ann_paths"][:-1]  # kept for _load_json_file
        blacklist = None
        # self._load_json_file(json_files, kwargs["audio_root"], blacklist)
        # Context manager so the annotation file handle is closed
        # (original ``json.load(open(...))`` leaked it).
        with open(kwargs["ann_paths"][-1]) as f:
            self.annotation = json.load(f)
        self.cached = kwargs.get("cached", False)
        self.cache_dir = kwargs.get("cached_dir", "")
        self.text_processor = kwargs.get("text_processor", None)
        self.audio_processor = kwargs["audio_processor"]
        self._add_instance_ids()

    # https://github.com/XinhaoMei/WavCaps/blob/c17ff4fe61a650a5d19fb7df8b85569c9ebc74e3/retrieval/data_handling/pretrain_dataset.py#L55
    def _load_json_file(self, files, audio_root, blacklist=None):
        """Merge per-source WavCaps JSON files into a flat annotation list,
        honoring an optional id blacklist and skipping missing audio files."""
        json_data = []
        audio_id = 0
        if blacklist is not None:
            with open(blacklist, "r") as f:
                blacklist = json.load(f)
        for file in files:
            with open(file, "r") as f:
                json_obj = json.load(f)
            if json_obj["num_captions_per_audio"] == 1:
                for item in tqdm(json_obj["data"]):
                    if "FreeSound" in file and blacklist is not None:
                        if item["id"] in blacklist["FreeSound"]:
                            continue
                    elif "AudioSet" in file and blacklist is not None:
                        if item["id"] in blacklist["AudioSet"]:
                            continue
                    if "AudioSet" in file:
                        audio_path = f"{audio_root}/AudioSet_SL_flac/{item['id'].split('.')[0]}.flac"
                    elif "BBC_Sound" in file:
                        audio_path = f"{audio_root}/BBC_Sound_Effects_flac/{item['id'].split('.')[0]}.flac"
                    elif "FreeSound" in file:
                        audio_path = f"{audio_root}/FreeSound_flac/{item['id'].split('.')[0]}.flac"
                    elif "SoundBible" in file:
                        audio_path = f"{audio_root}/SoundBible_flac/{item['id'].split('.')[0]}.flac"
                    else:
                        # BUGFIX: an unrecognized source previously left
                        # ``audio_path`` unbound (UnboundLocalError on the
                        # first item) or stale from a prior iteration.
                        continue
                    if not os.path.exists(audio_path):
                        # print(f'Skipped {audio_path}')
                        continue
                    json_data.append(
                        {
                            "audio": item["audio"],
                            "caption": item["caption"],
                            "id": item["id"],
                            "duration": item["duration"],
                            "audio_path": audio_path,
                        }
                    )
                    audio_id += 1
            else:
                for item in json_obj["data"]:
                    for i in range(1, json_obj["num_captions_per_audio"] + 1):
                        json_data.append(
                            {
                                "audio": item["audio"],
                                "caption": item[f"caption_{i}"],
                                "id": item["id"],
                                "duration": item["duration"],
                            }
                        )
                        audio_id += 1
        return json_data

    def __len__(self):
        return len(self.annotation)

    def __getitem__(self, index):
        ann = self.annotation[index]

        # Use cached audio representations when available, otherwise run the
        # audio processor on the raw file.
        if self.cached:
            audio = torch.load(
                os.path.join(self.cache_dir, f"{ann['id']}.pt"),
                map_location=torch.device("cpu"),
            )
        else:
            audio = self.audio_processor(ann["audio_path"])

        if audio.sum() == 0:
            return None  # silent/failed clip; collater drops None samples

        return {
            "audio": audio,
            "text_input": self.text_processor(ann["caption"]),
            "sample_id": ann["id"],
            "instance_id": ann["instance_id"],
        }

    def _build_templates(self, templates_path):
        # Captions are used directly; no templates for this dataset.
        self.templates = None
class ESC50(BaseDataset, __DisplMixin):
    """ESC-50 environmental-sound classification dataset."""

    def __init__(self, **kwargs):
        self.modalities = kwargs["modalities"]
        super().__init__(kwargs["vis_processor"], kwargs["text_processor"], kwargs["vis_root"], kwargs["ann_paths"])
        for modality in self.modalities:
            setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
            setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
            setattr(self, f"existing_{modality}_annotation", getattr(self, f"get_existing_{modality}_annotations")())
        # Class names with underscores replaced for natural-language prompts.
        self.classnames = [c.replace("_", " ") for c in set(ann["category"] for ann in self.annotation)]

    def get_audio_path(self, ann):
        return os.path.join(self.audio_root, ann["sample_id"])

    def is_empty_audio(self, ann):
        """Return True when the clip fails to load or has zero samples."""
        path = self.get_audio_path(ann)
        try:
            waveform, sr = torchaudio.load(path)
            if waveform.shape[0] == 2:  # down-mix stereo to mono
                waveform = torch.mean(waveform, dim=0)
        except Exception:
            # BUGFIX: ``torchaudio.TorchaudioException`` does not exist; the
            # original except clause raised AttributeError instead of
            # reporting a load failure.
            return True
        return waveform.nelement() == 0

    def get_existing_audio_annotations(self):
        return os.listdir(self.audio_root)

    def get_existing_video_annotations(self):
        return os.listdir(self.video_root)

    def get_existing_images_annotations(self):
        return os.listdir(self.vis_root)

    def get_video_path(self, ann):
        return pathlib.Path(os.path.join(self.video_root, ann[self.sample_id_key])).resolve()

    def get_images_path(self, ann):
        return pathlib.Path(os.path.join(self.vis_root, ann[self.sample_id_key])).resolve()

    def __len__(self):
        return len(self.annotation)

    def __getitem__(self, index):
        ann = copy.deepcopy(self.annotation[index])
        ann["sample_id"] = ann["filename"]
        ann["label"] = ann["category"].replace("_", " ")
        for modality in self.modalities:
            path = getattr(self, f"get_{modality}_path")(ann)
            if isinstance(path, list):
                path = random.choice(path)
            ann[f"{modality}_path"] = path
            # BUGFIX: the processor originally ran only in the ``else``
            # branch, so list-valued paths were never processed.
            key = "image" if "image" in modality else modality
            ann[key] = getattr(self, f"{'vis' if 'image' in modality else modality}_processor")(path)

        # Silent/failed clips are skipped; the collater drops None samples.
        if ann["audio"].sum() == 0:
            return None

        return ann
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" +import copy +import os +from lavis.datasets.datasets.audio_captioning_datasets import AudioCapsDataset +from lavis.datasets.datasets.base_dataset import BaseDataset +import torch +import random +from collections import Counter + +class AudioCapsQADataset(AudioCapsDataset): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.add_binary = kwargs.get('add_binary', False) + self.binary_templates = ["do you hear {}?", "is this {}?", "does the audio contain {}?"] + + def __getitem__(self, index): + ann = copy.deepcopy(self.annotation[index]) + for modality in self.modalities: + if modality == 'audio' and self.cached: + ann[f"{modality}_path"] = getattr(self, f"get_cached_{modality}_path")(ann) + ann["audio"] = torch.load(ann[f"{modality}_path"]) + else: + ann[f"{modality}_path"] = getattr(self, f"get_{modality}_path")(ann) + if isinstance(ann[f"{modality}_path"], list): + ann[f"{modality}_path"] = random.choice(ann[f"{modality}_path"]) + ann[modality if 'image' not in modality else 'image'] = getattr(self, f"{'vis' if 'image' in modality else modality}_processor")(ann[f"{modality}_path"]) + + if ann["audio"].sum() == 0: + return None + if self.add_binary and random.randint(0,10) < 3: + yes_answer = random.randint(0,10)<5 + if not yes_answer: + caption_index = random.choice(list(set(range(len(self.annotation))).difference(set([index])))) + caption = self.annotation[caption_index]['caption'] + else: + caption = ann['caption'] + + question = random.choice(self.binary_templates).format(caption) + answer = 'yes' if yes_answer else 'no' + return { + "text_input": self.text_processor(question), + "instance_id": ann["instance_id"], + "text_output":answer, + "answer":answer, + "caption": ann['caption'], + "audio": ann['audio'], + "audio_id": ann['youtube_id'], + "question_id": ann['youtube_id'], + } + + return 
class ClothoQADataset(BaseDataset):
    """ClothoAQA audio question answering.

    The annotation CSV lists each question three times (one row per annotator
    answer); consecutive triples are merged into one annotation carrying a
    three-answer list. Assumes rows arrive grouped in triples — TODO confirm.
    """

    def __init__(self, **kwargs):
        super().__init__(kwargs["vis_processor"], kwargs["text_processor"], kwargs["vis_root"], kwargs["ann_paths"])

        self.non_binary_cls = kwargs.get("non_bin", False)  # drop yes/no questions
        self.unanimous = kwargs.get("unanimous", False)  # keep only 3/3 agreement

        annotation = []
        # Stop two rows before the end so a trailing partial group cannot
        # raise IndexError (the original indexed i+1/i+2 unguarded).
        for i in range(0, len(self.annotation) - 2, 3):
            new_ann = self.annotation[i]
            new_ann["question"] = new_ann.pop("QuestionText")
            new_ann["answer"] = [self.annotation[i + off]["answer"] for off in range(3)]
            if self.unanimous and Counter(new_ann["answer"])[new_ann["answer"][0]] != 3:
                continue
            if self.non_binary_cls and ("yes" in new_ann["answer"] or "no" in new_ann["answer"]):
                continue
            new_ann["question_id"] = new_ann["instance_id"]
            annotation.append(new_ann)

        self.modalities = kwargs["modalities"]
        for modality in self.modalities:
            setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
            setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
        self.annotation = annotation

    def __getitem__(self, index):
        ann = copy.deepcopy(self.annotation[index])
        audio_path = os.path.join(self.audio_root, ann["file_name"])
        ann["audio"] = self.audio_processor(audio_path)

        # Silent/failed clips are skipped; the collater drops None samples.
        if ann["audio"].sum() == 0:
            return None

        question = self.text_processor(ann["question"])
        return {
            "text_input": question,
            "question": question,
            "instance_id": ann["instance_id"],
            "text_output": random.choice(ann["answer"]),
            "answer": ann["answer"],
            "answers": ann["answer"],
            "audio": ann["audio"],
            "question_id": ann["instance_id"],
        }

    def _build_templates(self, template):
        # QA uses explicit questions, not caption templates.
        return None
class AVSDDialDataset(DialogueDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        ann = self.annotation[index]

        vname = ann["image_id"]
        video = self.vis_processor(self.vis_root, vname)
        dialogue = self.text_processor(ann)

        # "image_id" is kept to stay compatible with the COCO evaluation format
        return {
            "video_fts": video["video_fts"],
            "video_token_type_ids": video["token_type_ids"],
            "input_ids": dialogue["input_ids"],
            "token_type_ids": dialogue["token_type_ids"],
            "labels": dialogue["labels"],
            "image_id": ann["image_id"],
            "instance_id": ann["instance_id"],
        }

    def collater(self, samples):
        """Pad text and video features, then concatenate video-first token
        types, attention masks, and labels for dialogue training."""
        input_ids, token_type_ids, labels, video_fts, video_token_type_ids = (
            [],
            [],
            [],
            [],
            [],
        )
        for s in samples:
            input_ids.append(s["input_ids"])
            token_type_ids.append(s["token_type_ids"])
            labels.append(s["labels"])
            video_fts.append(s["video_fts"])
            video_token_type_ids.append(s["video_token_type_ids"])

        input_ids = self.text_processor.padding(input_ids)
        labels = self.text_processor.padding(labels, -1)  # -1 = ignored index
        video_fts = self.vis_processor.padding(video_fts)

        token_type_ids = self.text_processor.padding(token_type_ids)
        video_token_type_ids = self.text_processor.padding(video_token_type_ids)
        token_type_ids = torch.cat([video_token_type_ids, token_type_ids], dim=1)

        attn_mask = self.text_processor.get_attention_mask(input_ids)
        video_mask = self.vis_processor.get_attention_mask(video_fts)
        attn_mask = torch.cat([video_mask, attn_mask], dim=1)

        # Video positions carry no text labels; mark them ignored (-1).
        video_labels = torch.ones((video_fts.size(0), video_fts.size(1))).long() * -1
        labels = torch.cat([video_labels, labels], dim=1)

        # Build the batch in a fresh dict instead of rebinding the ``samples``
        # parameter (the original shadowed its own argument).
        return {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "labels": labels,
            "video_fts": video_fts,
            "attn_mask": attn_mask,
        }
coco/images/) + ann_root (string): directory to store the annotation file + split (string): val or test + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + def __getitem__(self, index): + + ann = self.annotation[index] + + vname = ann["image_id"] + + video = self.vis_processor(self.vis_root, vname) + + dialogue = self.text_processor(ann) + + # "image_id" is kept to stay compatible with the COCO evaluation format + return { + "video_fts": video["video_fts"], + "video_token_type_ids": video["token_type_ids"], + "input_ids": dialogue["input_ids"], + "token_type_ids": dialogue["token_type_ids"], + "labels": dialogue["labels"], + "image_id": ann["image_id"], + "instance_id": ann["instance_id"], + } + + def collater(self, samples): + + input_ids, token_type_ids, labels, video_fts, video_token_type_ids = ( + [], + [], + [], + [], + [], + ) + + for i in samples: + input_ids.append(i["input_ids"]) + token_type_ids.append(i["token_type_ids"]) + labels.append(i["labels"]) + video_fts.append(i["video_fts"]) + video_token_type_ids.append(i["video_token_type_ids"]) + + input_ids = self.text_processor.padding(input_ids) + + labels = self.text_processor.padding( + labels, -1 + ) # ignore token indice -1 by default + video_fts = self.vis_processor.padding(video_fts) + + token_type_ids = self.text_processor.padding(token_type_ids) + video_token_type_ids = self.text_processor.padding(video_token_type_ids) + token_type_ids = torch.cat([video_token_type_ids, token_type_ids], dim=1) + + attn_mask = self.text_processor.get_attention_mask(input_ids) + video_mask = self.vis_processor.get_attention_mask(video_fts) + attn_mask = torch.cat([video_mask, attn_mask], dim=1) + + video_labels = ( + torch.ones((video_fts.size(0), video_fts.size(1))).long() * -1 + ) # ignore token indice -1 by default + labels = torch.cat([video_labels, labels], dim=1) + + samples = {} + samples["input_ids"] = input_ids + samples["token_type_ids"] = token_type_ids + samples["labels"] = 
labels + samples["video_fts"] = video_fts + samples["attn_mask"] = attn_mask + + return samples + + +class AVSDDialInstructEvalDataset(DialogueDataset): + def __init__(self, **kwargs): + super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths']) + + self.modalities = kwargs['modalities'] + + for modality in self.modalities: + if 'image' in modality: + setattr(self, f"existing_{modality}_annotation",getattr(self, f'get_existing_{modality}_annotations')()) + continue + setattr(self, f"{modality}_root", kwargs[f"{modality}_root"]) + setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"]) + setattr(self, f"existing_{modality}_annotation",getattr(self, f'get_existing_{modality}_annotations')()) + self.sample_ids = set.intersection(*[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities]) + self.annotation = [ann for ann in self.annotation if ann['image_id'] in self.sample_ids] + if 'test' in kwargs['ann_paths'][0]: + self.annotation = [ann for ann in self.annotation if ann['answer'] == '__UNDISCLOSED__'] + + def get_existing_audio_annotations(self): + return [f.split('.')[0] for f in os.listdir(self.audio_root)] + + def get_existing_video_annotations(self): + return [f.split('.')[0] for f in os.listdir(self.video_root)] + + def get_audio_path(self, sample_key): + return os.path.join(self.audio_root, sample_key) + '.mp4' + + def get_video_path(self, sample_key): + return os.path.join(self.video_root, sample_key) + '.mp4' + + def __getitem__(self, index): + ann = copy.deepcopy(self.annotation[index]) + for modality in self.modalities: + ann[f"{modality}_path"] = getattr(self, f"get_{modality}_path")(ann['image_id']) + + if type(ann[f"{modality}_path"]) == list: + ann[f"{modality}_path"] = random.choice(ann[f"{modality}_path"]) + if 'image' in modality: + ann['image'] = self.vis_processor(Image.open(ann[f"images_path"])) + else: + ann[modality] = getattr(self, 
class BaseDataset(Dataset):
    def __init__(
        self, vis_processor=None, text_processor=None, vis_root=None, ann_paths=None
    ):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_paths (list|None): annotation files; csv/tsv, jsonl, and json
            (list- or dict-shaped) files are supported.
        """
        self.vis_root = vis_root

        self.annotation = []
        # BUGFIX: ``ann_paths=[]`` was a shared mutable default argument;
        # use None and normalize here.
        for ann_path in (ann_paths or []):
            if any(ext in ann_path for ext in ["csv", "tsv"]):
                df = pd.read_csv(ann_path)
                self.annotation.extend(df.to_dict(orient="records"))

            elif "jsonl" in ann_path:
                with open(ann_path, "r") as f:
                    self.annotation.extend(json.loads(line) for line in f)

            else:
                with open(ann_path, "r") as f:
                    loaded = json.load(f)
                if isinstance(loaded, list):
                    self.annotation.extend(loaded)
                elif isinstance(loaded, dict):
                    # Dict-shaped files are flattened to records keyed by id.
                    self.annotation.extend(
                        {"sample_id": k, **v}
                        if isinstance(v, dict)
                        else {"sample_id": k, "data": v}
                        for k, v in loaded.items()
                    )

        self.vis_processor = vis_processor
        self.text_processor = text_processor

        self._add_instance_ids()

    def __len__(self):
        return len(self.annotation)

    def collater(self, samples):
        """Collate a batch, dropping None samples; torch.Tensor values are
        stacked, everything else is gathered into plain lists."""
        samples = [s for s in samples if s is not None]
        if not samples:
            return {}
        collated_dict = {}
        # Use the keys of the first sample as the reference schema.
        for k in samples[0].keys():
            values = [sample[k] for sample in samples]
            collated_dict[k] = (
                torch.stack(values, dim=0) if isinstance(values[0], torch.Tensor) else values
            )
        return collated_dict

    def set_processors(self, vis_processor, text_processor):
        self.vis_processor = vis_processor
        self.text_processor = text_processor

    def _add_instance_ids(self, key="instance_id"):
        # Stable string ids by position; used for eval bookkeeping.
        for idx, ann in enumerate(self.annotation):
            ann[key] = str(idx)
samples: + all_keys.update(s) + + shared_keys = all_keys + for s in samples: + shared_keys = shared_keys & set(s.keys()) + + samples_shared_keys = [] + for s in samples: + samples_shared_keys.append({k: s[k] for k in s.keys() if k in shared_keys}) + + return self.datasets[0].collater(samples_shared_keys) diff --git a/LAVIS-main/lavis/datasets/datasets/capfilt_dataset.py b/LAVIS-main/lavis/datasets/datasets/capfilt_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..37bc984f41b45d7e8170383611c8bf57fd171e7b --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/capfilt_dataset.py @@ -0,0 +1,58 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +import json + +from PIL import Image +from PIL import ImageFile + +from lavis.datasets.datasets.caption_datasets import CaptionDataset, CaptionEvalDataset, __DisplMixin +from lavis.datasets.datasets.base_dataset import BaseDataset + +class CapFiltCaptionDataset(BaseDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + self.img_ids = {} + n = 0 + for ann in self.annotation: + ann["image_id"] = ''.join(ann['image'].split('.')[:-1]) + img_id = ann["image_id"] + if img_id not in self.img_ids.keys(): + self.img_ids[img_id] = n + n += 1 + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(ann["image"]) + try: + image = Image.open(image_path).convert("RGB") + except: + return None # image does not exist + + image = self.vis_processor(image) + caption = self.text_processor(ann["caption"]) + + return { + "image": image, + "text_input": caption, + "image_id": ann["image_id"] + } + +class CapFiltCaptionInstructDataset(CapFiltCaptionDataset): + def __getitem__(self, index): + data = super().__getitem__(index) + if data != None: + data['text_output'] = data["text_input"] + data['text_input'] = self.text_processor("") + return data \ No newline at end of file diff --git a/LAVIS-main/lavis/datasets/datasets/caption_datasets.py b/LAVIS-main/lavis/datasets/datasets/caption_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..973a0d682b2250cce2f3fbb10c322455d43611b3 --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/caption_datasets.py @@ -0,0 +1,94 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +from collections import OrderedDict + +from lavis.datasets.datasets.base_dataset import BaseDataset +from PIL import Image + + +class __DisplMixin: + def displ_item(self, index): + sample, ann = self.__getitem__(index), self.annotation[index] + + return OrderedDict( + { + "file": ann["image"], + "caption": ann["caption"], + "image": sample["image"], + } + ) + + +class CaptionDataset(BaseDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + self.img_ids = {} + n = 0 + for ann in self.annotation: + img_id = ann["image_id"] + if img_id not in self.img_ids.keys(): + self.img_ids[img_id] = n + n += 1 + + def __getitem__(self, index): + + # TODO this assumes image input, not general enough + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + try: + image = Image.open(image_path).convert("RGB") + except: + return None # image does not exist + + image = self.vis_processor(image) + caption = self.text_processor(ann["caption"]) + + return { + "image": image, + "text_input": caption, + "image_id": ann["image_id"] + } + +class CaptionEvalDataset(BaseDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + split (string): val or test + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + def __getitem__(self, index): + + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + + return { + "image": image, + "image_id": ann["image_id"], + "instance_id": ann["instance_id"], + } + +class CaptionInstructDataset(CaptionDataset): + def __getitem__(self, index): + data = super().__getitem__(index) + if data != None: + data['text_output'] = data["text_input"] + data['text_input'] = self.text_processor("") + return data \ No newline at end of file diff --git a/LAVIS-main/lavis/datasets/datasets/coco_caption_datasets.py b/LAVIS-main/lavis/datasets/datasets/coco_caption_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..c5497f82762654268c339255b38e840c199732ca --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/coco_caption_datasets.py @@ -0,0 +1,72 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +import json + +from PIL import Image +from PIL import ImageFile + +ImageFile.LOAD_TRUNCATED_IMAGES = True + +from lavis.datasets.datasets.caption_datasets import CaptionDataset, CaptionInstructDataset, CaptionEvalDataset + +COCOCapDataset = CaptionDataset +COCOCapInstructDataset = CaptionInstructDataset + + +class COCOCapEvalDataset(CaptionEvalDataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + split (string): val or test + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + + img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1] + + return { + "image": image, + "image_id": img_id, + "instance_id": ann["instance_id"], + } + + + +class NoCapsEvalDataset(CaptionEvalDataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + split (string): val or test + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + + img_id = ann["img_id"] + + return { + "image": image, + "image_id": img_id, + "instance_id": ann["instance_id"], + } \ No newline at end of file diff --git a/LAVIS-main/lavis/datasets/datasets/coco_vqa_datasets.py b/LAVIS-main/lavis/datasets/datasets/coco_vqa_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..b3b3837138f960386645ce4f0fd83bb447f8a586 --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/coco_vqa_datasets.py @@ -0,0 +1,121 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +import json +import random +from PIL import Image + +from lavis.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset + +from collections import OrderedDict + + +class __DisplMixin: + def displ_item(self, index): + sample, ann = self.__getitem__(index), self.annotation[index] + + return OrderedDict( + { + "file": ann["image"], + "question": ann["question"], + "question_id": ann["question_id"], + "answers": "; ".join(ann["answer"]), + "image": sample["image"], + } + ) + + +class COCOVQADataset(VQADataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + + answer_weight = {} + for answer in ann["answer"]: + if answer in answer_weight.keys(): + answer_weight[answer] += 1 / len(ann["answer"]) + else: + answer_weight[answer] = 1 / len(ann["answer"]) + + answers = list(answer_weight.keys()) + weights = list(answer_weight.values()) + + return { + "image": image, + "text_input": question, + "answers": answers, + "weights": weights, + } + + +class COCOVQAInstructDataset(COCOVQADataset): + def __getitem__(self, index): + data = super().__getitem__(index) + if data != None: + data['text_output'] = random.choice(data["answers"]) + return data + + def collater(self, samples): + data = super().collater(samples) + data['text_output'] = data['answer'] + return data + + + +class COCOVQAEvalDataset(VQAEvalDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root 
directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + + self.vis_root = vis_root + + self.annotation = json.load(open(ann_paths[0])) + + answer_list_path = ann_paths[1] + if os.path.exists(answer_list_path): + self.answer_list = json.load(open(answer_list_path)) + else: + self.answer_list = None + + try: + self.coco_fmt_qust_file = ann_paths[2] + self.coco_fmt_anno_file = ann_paths[3] + except IndexError: + self.coco_fmt_qust_file = None + self.coco_fmt_anno_file = None + + self.vis_processor = vis_processor + self.text_processor = text_processor + + self._add_instance_ids() + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + + return { + "image": image, + "text_input": question, + "question_id": ann["question_id"], + "instance_id": ann["instance_id"], + } diff --git a/LAVIS-main/lavis/datasets/datasets/dataloader_utils.py b/LAVIS-main/lavis/datasets/datasets/dataloader_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3decb6f5d1f647b8d1bdfb513e08f95fc3bc3f6e --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/dataloader_utils.py @@ -0,0 +1,164 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import time +import random +import torch +from lavis.datasets.data_utils import move_to_cuda +from torch.utils.data import DataLoader + + +class MultiIterLoader: + """ + A simple wrapper for iterating over multiple iterators. + + Args: + loaders (List[Loader]): List of Iterator loaders. + ratios (List[float]): List of ratios to sample from each loader. If None, all loaders are sampled uniformly. 
+ """ + + def __init__(self, loaders, ratios=None): + # assert all loaders has __next__ method + for loader in loaders: + assert hasattr( + loader, "__next__" + ), "Loader {} has no __next__ method.".format(loader) + if ratios is None: + ratios = [1.0] * len(loaders) + else: + assert len(ratios) == len(loaders) + ratios = [float(ratio) / sum(ratios) for ratio in ratios] + + self.loaders = loaders + self.ratios = ratios + + def __next__(self): + # random sample from each loader by ratio + loader_idx = random.choices(range(len(self.loaders)), self.ratios, k=1)[0] + return next(self.loaders[loader_idx]) + + +class PrefetchLoader(object): + """ + Modified from https://github.com/ChenRocks/UNITER. + + overlap compute and cuda data transfer + (copied and then modified from nvidia apex) + """ + + def __init__(self, loader): + self.loader = loader + self.stream = torch.cuda.Stream() + + def __iter__(self): + loader_it = iter(self.loader) + self.preload(loader_it) + batch = self.next(loader_it) + while batch is not None: + is_tuple = isinstance(batch, tuple) + if is_tuple: + task, batch = batch + + if is_tuple: + yield task, batch + else: + yield batch + batch = self.next(loader_it) + + def __len__(self): + return len(self.loader) + + def preload(self, it): + try: + self.batch = next(it) + except StopIteration: + self.batch = None + return + # if record_stream() doesn't work, another option is to make sure + # device inputs are created on the main stream. 
+ # self.next_input_gpu = torch.empty_like(self.next_input, + # device='cuda') + # self.next_target_gpu = torch.empty_like(self.next_target, + # device='cuda') + # Need to make sure the memory allocated for next_* is not still in use + # by the main stream at the time we start copying to next_*: + # self.stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(self.stream): + self.batch = move_to_cuda(self.batch) + # more code for the alternative if record_stream() doesn't work: + # copy_ will record the use of the pinned source tensor in this + # side stream. + # self.next_input_gpu.copy_(self.next_input, non_blocking=True) + # self.next_target_gpu.copy_(self.next_target, non_blocking=True) + # self.next_input = self.next_input_gpu + # self.next_target = self.next_target_gpu + + def next(self, it): + torch.cuda.current_stream().wait_stream(self.stream) + batch = self.batch + if batch is not None and batch is not {}: + record_cuda_stream(batch) + self.preload(it) + return batch + + def __next__(self, it): + return self.next(it) + + def __getattr__(self, name): + method = self.loader.__getattribute__(name) + return method + + +def record_cuda_stream(batch): + if isinstance(batch, torch.Tensor): + batch.record_stream(torch.cuda.current_stream()) + elif isinstance(batch, list) or isinstance(batch, tuple): + for t in batch: + record_cuda_stream(t) + elif isinstance(batch, dict): + for t in batch.values(): + record_cuda_stream(t) + else: + pass + + +class IterLoader: + """ + A wrapper to convert DataLoader as an infinite iterator. 
+ + Modified from: + https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/iter_based_runner.py + """ + + def __init__(self, dataloader: DataLoader, use_distributed: bool = False): + self._dataloader = dataloader + self.iter_loader = iter(self._dataloader) + self._use_distributed = use_distributed + self._epoch = 0 + + @property + def epoch(self) -> int: + return self._epoch + + def __next__(self): + try: + data = next(self.iter_loader) + except StopIteration: + self._epoch += 1 + if hasattr(self._dataloader.sampler, "set_epoch") and self._use_distributed: + self._dataloader.sampler.set_epoch(self._epoch) + time.sleep(2) # Prevent possible deadlock during epoch transition + self.iter_loader = iter(self._dataloader) + data = next(self.iter_loader) + + return data + + def __iter__(self): + return self + + def __len__(self): + return len(self._dataloader) diff --git a/LAVIS-main/lavis/datasets/datasets/dialogue_datasets.py b/LAVIS-main/lavis/datasets/datasets/dialogue_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..7596da65f42812d185d91c8c7bcf7776e8362444 --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/dialogue_datasets.py @@ -0,0 +1,141 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +from collections import OrderedDict + +from PIL import Image + +from lavis.datasets.datasets.base_dataset import BaseDataset + +import json +import copy + + +class __DisplMixin: + def displ_item(self, index): + sample, ann = self.__getitem__(index), self.annotation[index] + + return OrderedDict( + { + "file": ann["image"], + "dialogue": ann["dialogue"], + "image": sample["image"], + } + ) + + +class DialogueDataset(BaseDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + + self.vis_root = vis_root + + self.annotation = [] + for ann_path in ann_paths: + dialogs = json.load(open(ann_path, "r"))["dialogs"] + for dialog in dialogs: + all_turns = dialog["dialog"] + dialogue_context = [] + for turn in all_turns: + dialog_instance = copy.deepcopy(dialog) + question = turn["question"] + answer = turn["answer"] + + dialog_instance["dialog"] = copy.deepcopy(dialogue_context) + dialog_instance["question"] = question + dialog_instance["answer"] = answer + self.annotation.append(dialog_instance) + dialogue_context.append(turn) + + self.vis_processor = vis_processor + self.text_processor = text_processor + + self._add_instance_ids() + + self.img_ids = {} + n = 0 + for ann in self.annotation: + img_id = ann["image_id"] + if img_id not in self.img_ids.keys(): + self.img_ids[img_id] = n + n += 1 + + def __getitem__(self, index): + + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + caption = self.text_processor(ann["caption"]) + + return { + "image": image, + "text_input": caption, + "image_id": 
self.img_ids[ann["image_id"]], + } + + +class DialogueEvalDataset(BaseDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + split (string): val or test + """ + + self.vis_root = vis_root + + self.annotation = [] + for ann_path in ann_paths: + dialogs = json.load(open(ann_path, "r"))["dialogs"] + for dialog in dialogs: + all_turns = dialog["dialog"] + dialogue_context = all_turns[:-1] + last_turn = all_turns[-1] + + question = last_turn["question"] + answer = last_turn["answer"] + + dialog["dialog"] = dialogue_context + dialog["question"] = question + dialog["answer"] = answer + + self.annotation.append(dialog) + + self.vis_processor = vis_processor + self.text_processor = text_processor + + self._add_instance_ids() + + self.img_ids = {} + n = 0 + for ann in self.annotation: + img_id = ann["image_id"] + if img_id not in self.img_ids.keys(): + self.img_ids[img_id] = n + n += 1 + + def __getitem__(self, index): + + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + + return { + "image": image, + "image_id": ann["image_id"], + "instance_id": ann["instance_id"], + } diff --git a/LAVIS-main/lavis/datasets/datasets/discriminatory_reasoning_datasets.py b/LAVIS-main/lavis/datasets/datasets/discriminatory_reasoning_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..d00be18b0f7a84faf687dd3acb3c58239323d596 --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/discriminatory_reasoning_datasets.py @@ -0,0 +1,165 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +from collections import OrderedDict +from PIL import Image +import copy + +from lavis.datasets.datasets.base_dataset import BaseDataset +from lavis.common.utils import is_serializable + + +class __DisplMixin: + def displ_item(self, index): + sample, ann = self.__getitem__(index), self.annotation[index] + + serializable_sample_keys = [k for k,v in sample.items() if is_serializable(v)] + serializable_ann_keys = [k for k,v in ann.items() if is_serializable(v)] + display = {k:sample[k] for k in serializable_sample_keys} + display.update({k:ann[k] for k in serializable_ann_keys}) + + return OrderedDict( + display + ) + + + +class DisCRnDataset(BaseDataset, __DisplMixin): + def __init__(self, **kwargs): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + pc_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths']) + + self.ds_name = kwargs['dataset_name'] + self.modalities = [str(m) for m in kwargs['modalities']] + ## from lavis convention, sometimes "image" modality is denoted as images + if "images" in self.modalities: + self.modalities[self.modalities.index("images")] = "image" + self.npoints = 8192 + self.sample_points_num = self.npoints + self.annotation = self.annotation + self.view = kwargs.get('view', 2) + self.classnames = copy.deepcopy(self.modalities) + self.classnames = kwargs.get('classnames', ["first", "second"]) + self.total = kwargs.get('total', 'all') + self.ground_truth = kwargs.get('ground_truth', False) + self.shuffle_modalities = kwargs.get('shuffle_modalities', False) + self.balance_labels = kwargs.get('balance_labels', True) + self.raw = kwargs.get('raw', False) + + if self.total != 
'all': + self.annotation = self.annotation[:self.total] + + for modality in self.modalities: + if "image" not in modality: + setattr(self, f"{modality}_root", kwargs[f"{modality}_root"]) + setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"]) + setattr(self, f"existing_{modality}_annotation",getattr(self, f'get_existing_{modality}_annotations')()) + + self.sample_ids = set.intersection(*[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities]) + self.annotation = [ann for ann in self.annotation if ann['sample_ids'][0] in self.sample_ids and ann['sample_ids'][1] in self.sample_ids] + self._add_instance_ids() + + def get_existing_image_annotations(self): + if self.ds_name == 'objaverse': + return [f.split('_')[0] for f in os.listdir(os.path.join(self.vis_root, f'compressed_imgs_view{self.view}/Cap3D_imgs_view{self.view}/'))] + + def get_image_path(self, ann, entity_index): + if self.ds_name == 'objaverse': + # data downloaded from: https://huggingface.co/datasets/tiange/Cap3D/tree/main/RenderedImage_zips + return os.path.join(self.vis_root, f'compressed_imgs_view{self.view}/Cap3D_imgs_view{self.view}/', ann['sample_ids'][entity_index]+f'_{self.view}.jpeg') + + def get_existing_audio_annotations(self): + return [f.split('_')[0] for f in os.listdir(self.audio_root)] + + def get_audio_path(self, ann, entity_index): + if self.ds_name == 'audiocaps': + return str(os.path.join(self.audio_root, ann['sample_ids'][entity_index] + '_{}.flac'.format(int(ann['start_seconds'][entity_index])))) + + def get_video_path(self, ann, entity_index): + if self.ds_name == 'audiocaps': + return str(os.path.realpath(os.path.join(self.video_root,ann['sample_ids'][entity_index] + '_{}.mp4'.format(int(ann['start_seconds'][entity_index]))))) + + def get_existing_video_annotations(self): + return [f.split('_')[0] for f in os.listdir(self.video_root)] + + def get_existing_pc_annotations(self): + if self.ds_name == 'objaverse': + return 
os.listdir(self.pc_root) + + def get_pc_path(self, ann, entity_index): + if self.ds_name == 'objaverse': + return os.path.join(self.pc_root, ann['sample_ids'][entity_index], '{}_{}.npz'.format(ann['sample_ids'][entity_index], self.npoints)) + + def __getitem__(self, index): + ann = copy.deepcopy(self.annotation[index]) + N = 2 # number of inputs + ann["question_id"] = ann["instance_id"] + ann[f"modalities"] = copy.deepcopy(self.modalities) + for i,modality in enumerate(self.modalities): + if ann[f'captions_pred_{modality}'] == None or ann[f'captions_pred_{modality}'][i]== None: + return None + if len(self.modalities) == 1: # both modalities of the same type. + ann[f"modalities"] = [self.modalities[0]] * N + + if self.balance_labels: + if (index%2 and ann["label"] == 1) or (not index%2 and ann['label'] == 0): + ann["label"] = 1- ann["label"] + ann["properties"] = [ann['properties'][1],ann['properties'][0]] + ann["captions"] = [ann['captions'][1],ann['captions'][0]] + if self.shuffle_modalities: + ann['modalities'] = [ann['modalities'][1],ann['modalities'][0]] # if we comment this out, we can have batch size > 1. Maintaining for reproducibility. 
+ for modality in self.modalities: + ann[f'captions_pred_{modality}'] = [ann[f'captions_pred_{modality}'][1], ann[f'captions_pred_{modality}'][0]] + + ## baseline captions + ann["baseline_captions"] = [c for c in ann["captions"]] if self.ground_truth else [ann[f'captions_pred_{ann["modalities"][0]}'][0], ann[f'captions_pred_{ann["modalities"][1]}'][1]] + # ann["baseline_captions"] = [c.replace('..', '.') for c in ann["baseline_captions"]] + ann["baseline_captions"] = [c.strip() if c!=None else "" for c in ann["baseline_captions"]] + ## text input + ann["text_input"] = self.text_processor(f'{ann["question"].replace("which entity", "which of the two options").replace("which object", "which of the two options").replace("which image", "which of the two options").replace("which audio", "which of the two options").replace("audio", "object").replace("image", "object")}?'.replace('??', '?')) + # ann["text_input"] = self.text_processor(f'{ann["question"]}?'.replace('??', '?')) + ## answers + first_answers = [ann['modalities'][0], "the first option.", "the first", "left one", "(a) left", "(a) left one", "(a)", 'a.', 'A.', "a)", "(A)", 'Input A', 'Entity 1', 'Object 1','Entity A', 'Object A', 'left', 'first', '1st', 'input 1', '1','a', 'input a', "the first", "the left one"] + second_answers = [ann['modalities'][1], "the second option.", "the second.", "second option", "the second option", "second option.", "right one","(b) right", "(b) right one" , "(b)", "b)", 'Input B', 'right', 'second', '2nd', 'input 2', '2', 'b', 'input b', 'Object 2','Entity B', 'Object B', "the second", "the right one", "the second one"] + if ann["label"] == 0: + ann["answers"] = first_answers + else: + ann["answers"] = second_answers + if 'pc' in ann["answers"]: + ann["answers"].extend(['3d', '3d model', 'model', 'rendering', 'a 3d', 'a 3d model']) + if 'image' in ann["answers"]: + ann["answers"].extend(['photo', 'picture']) + if 'audio' in ann["answers"]: + ann["answers"].append('sound') + ## label 
+ ann["label"] = self.classnames[ann["label"]] + ann['answer'] = ann["answers"] # for vqa task compatibility + + ## get data + for i,modality in enumerate(ann["modalities"]): + path = getattr(self, f"get_{modality}_path")(ann, i) + if 'image' in modality: + path = Image.open(path).convert("RGB") + if self.raw: + ann[modality] = path + continue + try: + ann[modality] = getattr(self, f"{'vis' if 'image' in modality else modality}_processor")(path) + except: + return None + + ann["discrn"] = True # signify to model, this is a discrn task + + return ann + + def __len__(self): + return len(self.annotation) \ No newline at end of file diff --git a/LAVIS-main/lavis/datasets/datasets/gqa_datasets.py b/LAVIS-main/lavis/datasets/datasets/gqa_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..c1913a7054198095de43b67482890f6031393a02 --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/gqa_datasets.py @@ -0,0 +1,114 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +import json +import random + +from PIL import Image + +from lavis.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset + +from collections import OrderedDict + + +class __DisplMixin: + def displ_item(self, index): + sample, ann = self.__getitem__(index), self.annotation[index] + + return OrderedDict( + { + "file": ann["image"], + "question": ann["question"], + "question_id": ann["question_id"], + "answers": "; ".join(ann["answer"]), + "image": sample["image"], + } + ) + + +class GQADataset(VQADataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + + answers = [ann["answer"]] + weights = [1] + + return { + "image": image, + "text_input": question, + "answers": answers, + "weights": weights, + } + +class GQAInstructDataset(GQADataset): + def __getitem__(self, index): + data = super().__getitem__(index) + if data != None: + data['text_output'] = random.choice(data["answers"]) + return data + + def collater(self, samples): + data = super().collater(samples) + data['text_output'] = data['answer'] + return data + + +class GQAEvalDataset(VQAEvalDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. 
gqa/images/) + ann_root (string): directory to store the annotation file + """ + + self.vis_root = vis_root + + self.annotation = json.load(open(ann_paths[0])) + + ## TODO: support inference method == 'ranking' + answer_list_path = ann_paths[1] if len(ann_paths) > 1 else '' + if os.path.exists(answer_list_path): + self.answer_list = json.load(open(answer_list_path)) + else: + self.answer_list = None + + self.vis_processor = vis_processor + self.text_processor = text_processor + + self._add_instance_ids() + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + + if "answer" in ann: + # answer is a string + answer = ann["answer"] + else: + answer = None + + return { + "image": image, + "text_input": question, + "answer": answer, + "question_id": ann["question_id"], + "instance_id": ann["instance_id"], + } diff --git a/LAVIS-main/lavis/datasets/datasets/iconqa_datasets.py b/LAVIS-main/lavis/datasets/datasets/iconqa_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..02936854a03a1b9b5a8f358617ac3af7e00e7a3a --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/iconqa_datasets.py @@ -0,0 +1,143 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from collections import OrderedDict +import json +import os +import torch +import pathlib +import random + +from PIL import Image + +from lavis.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset + + +class __DisplMixin: + def displ_item(self, index): + sample, ann = self.__getitem__(index), self.annotation[index] + return OrderedDict( + { + "file": ann["image"], + "question": ann["question"], + "question_id": ann["question_id"], + "direct_answers": "; ".join(ann["direct_answers"]), + "choices": "; ".join(ann["choices"]), + "correct_choice": ann["choices"][ann["correct_choice_idx"]], + "image": sample["image"], + } + ) + + +class IconQADataset(VQADataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + self.vis_processor = vis_processor + self.text_processor = text_processor + + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = pathlib.Path(os.path.join(self.vis_root, ann["image"])).resolve() + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + + answers = [ann['choices'][ann['answer']]] + + return { + "image": image, + "text_input": question, + "direct_answers": answers, + "weights": [1], + } + +class IconQAInstructDataset(IconQADataset): + def __getitem__(self, index): + data = super().__getitem__(index) + if data != None: + data['text_output'] = random.choice(data["direct_answers"]) + return data + + def collater(self, samples): + data = super().collatter(samples) + data['text_output'] = data['answer'] + return data + + +class 
class IconQAEvalDataset(VQAEvalDataset, __DisplMixin):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        """

        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

        self.vis_processor = vis_processor
        self.text_processor = text_processor

    def collater(self, samples):
        """Stack images and gather per-sample fields into batch lists."""
        (
            image_list,
            question_list,
            question_id_list,
            instance_id_list,
            choices_list,
            correct_choice_idx_list,
            direct_answers_list,
        ) = ([], [], [], [], [], [], [])

        for sample in samples:
            image_list.append(sample["image"])
            question_list.append(sample["text_input"])
            question_id_list.append(sample["question_id"])
            instance_id_list.append(sample["instance_id"])
            choices_list.append(sample["choices"])
            correct_choice_idx_list.append(sample["correct_choice_idx"])
            direct_answers_list.append(sample["direct_answers"])

        return {
            "image": torch.stack(image_list, dim=0),
            "text_input": question_list,
            # BUGFIX: question_id was collected above but never returned,
            # so downstream VQA evaluation could not associate predictions
            # with questions.
            "question_id": question_id_list,
            "instance_id": instance_id_list,
            "choices": choices_list,
            "correct_choice_idx": correct_choice_idx_list,
            "direct_answers": direct_answers_list,
        }

    def __getitem__(self, index):
        ann = self.annotation[index]

        image_path = pathlib.Path(os.path.join(self.vis_root, ann["image"])).resolve()

        # Ground-truth answer text, via the answer's index into the choices.
        answers = [ann['choices'][ann['answer']]]

        image = Image.open(image_path).convert("RGB")

        image = self.vis_processor(image)
        question = self.text_processor(ann["question"])

        choices = ann["choices"]
        correct_choice_idx = ann["answer"]

        return {
            "image": image,
            "text_input": question,
            "instance_id": ann["instance_id"],
            "choices": choices,
            "correct_choice_idx": correct_choice_idx,
            "direct_answers": answers,
            # IconQA has no native question ids; reuse the instance id.
            "question_id": ann["instance_id"]
        }
class __DisplMixin:
    def displ_item(self, index):
        """Return a compact, human-readable view of one sample (debug aid)."""
        sample, ann = self.__getitem__(index), self.annotation[index]

        return OrderedDict(
            {
                "file": os.path.basename(ann["image"]),
                "caption": ann["caption"],
                "image": sample["image"],
            }
        )


class ImageTextPairDataset(BaseDataset, __DisplMixin):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):

        # TODO this assumes image input, not general enough
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        try:
            image = Image.open(image_path).convert("RGB")
        except Exception:
            # FIX: was a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit. Unreadable images are skipped
            # best-effort; the collater is expected to drop None samples.
            return None

        image = self.vis_processor(image)
        caption = self.text_processor(ann["caption"])

        return {"image": image, "text_input": caption}


class ImageTextPairInstructDataset(ImageTextPairDataset):
    """Instruction-tuning variant: the caption becomes the generation target
    (``text_output``) and the instruction input is left empty."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data
class ImageFolderDataset(BaseDataset):
    """Classification dataset backed by a torchvision ``ImageFolder`` tree.

    vis_root (string): root directory laid out one sub-directory per class.
    classnames (list): optional human-readable names indexed by label id.
    """

    def __init__(self, vis_processor, vis_root, classnames=None, **kwargs):
        super().__init__(vis_processor=vis_processor, vis_root=vis_root)

        self.inner_dataset = datasets.ImageFolder(vis_root)

        # Mirror ImageFolder's (path, label) pairs into the annotation format
        # used throughout the codebase.
        self.annotation = [
            {"image": path, "label": label, "image_id": path}
            for path, label in self.inner_dataset.imgs
        ]

        # FIX: default used to be a shared mutable list (classnames=[]);
        # passing [] explicitly still behaves identically.
        self.classnames = [] if classnames is None else classnames

        self._add_instance_ids()

    def __len__(self):
        return len(self.inner_dataset)

    def __getitem__(self, index):
        ann = self.annotation[index]

        img_fn = ann["image"]
        image_path = os.path.join(self.vis_root, img_fn)
        image = Image.open(image_path).convert("RGB")

        image = self.vis_processor(image)

        return {
            "image": image,
            "label": ann["label"],
            "image_id": ann["image_id"],
            "instance_id": ann["instance_id"],
        }

    def displ_item(self, index):
        """Human-readable view of one sample (debug aid)."""
        sample, ann = self.__getitem__(index), self.annotation[index]

        return OrderedDict(
            {
                "file": ann["image"],
                "label": self.classnames[ann["label"]],
                "image": sample["image"],
            }
        )
class LaionDataset(BaseDataset):
    """Streaming LAION webdataset (tar shards) yielding image/caption pairs."""

    def __init__(self, vis_processor, text_processor, location):
        super().__init__(vis_processor=vis_processor, text_processor=text_processor)

        # Infinite resampled shard pipeline; corrupt records are skipped with
        # a warning instead of aborting the epoch.
        self.inner_dataset = wds.DataPipeline(
            wds.ResampledShards(location),
            wds.tarfile_to_samples(handler=wds.warn_and_continue),
            wds.shuffle(1000, handler=wds.warn_and_continue),
            wds.decode("pilrgb", handler=wds.warn_and_continue),
            wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
            wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
            wds.map(self.to_dict, handler=wds.warn_and_continue),
        )

    def to_dict(self, sample):
        """Convert a decoded (image, metadata) tuple into a training sample."""
        # FIX: use isinstance() instead of the ``type(x) == list`` comparison.
        if isinstance(sample[1], list):
            # Some shards carry a ranked caption list; sample from the top two.
            caption = random.choice(sample[1][:2])
        else:
            caption = sample[1]["caption"]

        return {
            "image": sample[0],
            "text_input": self.text_processor(caption),
        }


class LaionInstructDataset(LaionDataset):
    """Instruction variant: caption becomes ``text_output``, empty instruction."""

    def to_dict(self, sample):
        data = super().to_dict(sample)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data
class LLaVA150kInstructDataset(BaseDataset):
    """LLaVA-Instruct-150k visual instruction data (single-turn QA pairs)."""

    def __init__(self, vis_processor, text_processor, ann_paths, vis_root):
        super().__init__(vis_processor=vis_processor, text_processor=text_processor, ann_paths=ann_paths, vis_root=vis_root)
        self.inner_dataset = self.annotation
        self.location = vis_root

    def __len__(self):
        return len(self.inner_dataset)

    def __getitem__(self, index):

        example = self.inner_dataset[index]
        # BUGFIX: this was ``.replace('', '')`` — a no-op (the placeholder
        # string was evidently lost). LLaVA conversations embed the literal
        # "<image>" token in the first human turn; strip it so the raw token
        # does not leak into the text prompt.
        text_input = example['conversations'][0]['value'].replace('<image>', '').strip()
        text_output = example['conversations'][1]['value']
        image_id = example['image']
        image_path = os.path.join(self.location, image_id)
        image = Image.open(image_path).convert("RGB")
        image = self.vis_processor(image)
        return {
            "image": image,
            "instance_id": image_id,
            "text_input": self.text_processor(text_input),
            "text_output": self.text_processor(text_output),
            "image_path": image_path
        }
class MultimodalClassificationDataset(BaseDataset):
    """Abstract base for classification over multimodal inputs.

    Subclasses must implement ``_build_class_labels`` and populate
    ``class_labels`` with a label-name -> integer-id mapping.
    """

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

        # Filled in by the concrete subclass via _build_class_labels().
        self.class_labels = None

    @abstractmethod
    def _build_class_labels(self):
        """Construct the dataset-specific label mapping."""
        pass
import torch
import copy
import os
import random
import json
import ast
import re
from PIL import Image
from lavis.datasets.datasets.base_dataset import BaseDataset

class MusicAVQADataset(BaseDataset):
    """MUSIC-AVQA audio-visual question answering.

    kwargs must contain vis_processor/text_processor/vis_root/ann_paths plus a
    ``modalities`` list; every non-image modality additionally needs matching
    ``<modality>_root`` and ``<modality>_processor`` entries.
    """

    def __init__(self, **kwargs):
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths'])

        self.modalities = kwargs['modalities']

        for modality in self.modalities:
            if 'image' in modality:
                # Images use vis_root/vis_processor from the base class; only
                # the on-disk existence index is collected.
                setattr(self, f"existing_{modality}_annotation", getattr(self, f'get_existing_{modality}_annotations')())
                continue
            setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
            setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
            setattr(self, f"existing_{modality}_annotation", getattr(self, f'get_existing_{modality}_annotations')())

        # Keep only samples for which every requested modality exists on disk.
        self.sample_ids = set.intersection(*[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities])
        self.annotation = [ann for ann in self.annotation if ann['video_id'] in self.sample_ids]

    def get_existing_audio_annotations(self):
        return [f.split('.')[0] for f in os.listdir(self.audio_root)]

    def get_existing_video_annotations(self):
        return [f.split('.')[0] for f in os.listdir(self.video_root)]

    def get_audio_path(self, ann):
        # NOTE(review): audio is read out of the .mp4 container rather than the
        # commented .flac alternative — confirm this is intended.
        # return os.path.join(self.audio_root, f'{ann["video_id"]}.flac')
        return os.path.join(self.audio_root, f'{ann["video_id"]}.mp4')

    def get_video_path(self, ann):
        return os.path.join(self.video_root, f'{ann["video_id"]}.mp4')

    def __getitem__(self, index):
        ann = copy.deepcopy(self.annotation[index])
        for modality in self.modalities:
            ann[f"{modality}_path"] = getattr(self, f"get_{modality}_path")(ann)
            if isinstance(ann[f"{modality}_path"], list):
                ann[f"{modality}_path"] = random.choice(ann[f"{modality}_path"])
            if 'image' in modality:
                # BUGFIX: use the modality's own path key instead of the
                # hard-coded 'images_path', which raises KeyError when the
                # modality is named 'image'.
                ann['image'] = self.vis_processor(Image.open(ann[f"{modality}_path"]))
            else:
                ann[modality] = getattr(self, f"{modality}_processor")(ann[f"{modality}_path"]).to(torch.float32)

        ann["sample_id"] = ann["video_id"]
        # BUGFIX: this was question_content.replace('', '{}'), a no-op (the
        # angle-bracket placeholder token was lost). MUSIC-AVQA templates mark
        # fill-in slots with <...> tokens that are substituted from
        # ``templ_values``; rewrite each token to '{}' for str.format().
        template = re.sub(r'<[^>]*>', '{}', ann['question_content'])
        question = template.format(*ast.literal_eval(ann['templ_values']))
        ann['text_input'] = self.text_processor(question)
        # 'anser' is the (misspelled) key actually used in the official
        # MUSIC-AVQA annotation files.
        ann['answers'] = ann['anser']
        return ann


class MusicAVQAInstructDataset(MusicAVQADataset):
    """Instruction variant: exposes the answer under the keys expected by the
    QA training/eval tasks."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['answer'] = data["answers"]  # needed to use gqa task
            data['text_output'] = data["answers"]
        return data
class __DisplMixin:
    def displ_item(self, index):
        # Compact, human-readable view of one NLVR sample (debug aid).
        sample, ann = self.__getitem__(index), self.annotation[index]

        return OrderedDict(
            {
                "file_L": ann["images"][0],
                "file_R": ann["images"][1],
                "sentence": ann["sentence"],
                "label": ann["label"],
                "image": [sample["image0"], sample["image1"]],
            }
        )


class NLVRDataset(MultimodalClassificationDataset, __DisplMixin):
    """NLVR2-style binary classification over an image pair and a sentence."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

        self.class_labels = self._build_class_labels()

    def _build_class_labels(self):
        # Annotation labels are the strings "True"/"False".
        return {"False": 0, "True": 1}

    @staticmethod
    def _flip(samples):
        """Training-time augmentation: randomly swap the two images.

        If the sentence mentions "left"/"right", the words are swapped together
        with the images so the statement stays truthful.
        """
        sentence = samples["text_input"]
        image0, image1 = samples["image0"], samples["image1"]

        if "left" not in sentence and "right" not in sentence:
            # Order-insensitive sentence: images can be swapped freely.
            if random.random() < 0.5:
                image0, image1 = image1, image0
        else:
            if random.random() < 0.5:
                # Exchange "left" and "right" via a temporary token so the
                # second replace does not clobber the first.
                sentence = sentence.replace("left", "[TEMP_TOKEN]")
                sentence = sentence.replace("right", "left")
                sentence = sentence.replace("[TEMP_TOKEN]", "right")

                image0, image1 = image1, image0

        samples["text_input"] = sentence
        samples["image0"] = image0
        samples["image1"] = image1

        return samples

    def __getitem__(self, index):
        ann = self.annotation[index]

        image0_path = os.path.join(self.vis_root, ann["images"][0])
        image0 = Image.open(image0_path).convert("RGB")
        image0 = self.vis_processor(image0)

        image1_path = os.path.join(self.vis_root, ann["images"][1])
        image1 = Image.open(image1_path).convert("RGB")
        image1 = self.vis_processor(image1)

        sentence = self.text_processor(ann["sentence"])
        label = self.class_labels[ann["label"]]

        # Samples are augmented by _flip at training time (identity in eval).
        return self._flip(
            {
                "image0": image0,
                "image1": image1,
                "text_input": sentence,
                "label": label,
                # "image_id": ann["image_id"],
                "instance_id": ann["instance_id"],
            }
        )


class NLVREvalDataset(NLVRDataset):
    @staticmethod
    def _flip(samples):
        # Evaluation: no augmentation, return the sample unchanged.
        return samples
modality in self.modalities: + if 'image' in modality: + setattr(self, f"existing_{modality}_annotation",getattr(self, f'get_existing_{modality}_annotations')()) + continue + setattr(self, f"{modality}_root", kwargs[f"{modality}_root"]) + setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"]) + setattr(self, f"existing_{modality}_annotation",getattr(self, f'get_existing_{modality}_annotations')()) + self.sample_ids = set.intersection(*[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities]) + self.annotation = [ann for ann in self.annotation if ann['sample_id'] in self.sample_ids] + + def get_existing_depth_annotations(self): + return os.listdir(self.depth_root) + + def get_existing_images_annotations(self): + return os.listdir(self.vis_root) + + def get_existing_pc_annotations(self): + raise NotImplementedError("Subclasses should implement this!") + + def get_pc_path(self, sample_key): + raise NotImplementedError("Subclasses should implement this!") + + def get_images_path(self, sample_key): + raise NotImplementedError("Subclasses should implement this!") + + def get_depth_path(self, sample_key): + raise NotImplementedError("Subclasses should implement this!") + + def __getitem__(self, index): + ann = copy.deepcopy(self.annotation[index]) + ann['captions'] = ann['data'] + del ann['data'] + + for modality in self.modalities: + ann[f"{modality}_path"] = getattr(self, f"get_{modality}_path")(ann['sample_id']) + if type(ann[f"{modality}_path"]) == list: # select from image views + ann[f"{modality}_path"] = random.choice(ann[f"{modality}_path"]) + if 'image' in modality: + ann['image'] = self.vis_processor(Image.open(ann[f"images_path"])) + else: + ann[modality] = getattr(self, f"{modality}_processor")(ann[f"{modality}_path"]).to(torch.float32) + return ann + + def __len__(self): + return len(self.annotation) + + def _build_templates(self, templates_path): + # use captions not templates + if templates_path is None: + 
class ObjaverseCaptionDataset(Object3dCaptionDataset, __DisplMixin):
    """Objaverse captioning: Cap3D captions paired with point clouds and
    pre-rendered views."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_existing_images_annotations(self):
        # Sample ids are recovered from the filenames of view 0.
        view_dir = os.path.join(self.vis_root, f'compressed_imgs_view{0}/Cap3D_imgs_view{0}/')
        return [fname.split('_')[0] for fname in os.listdir(view_dir)]

    def get_existing_pc_annotations(self):
        annotated = set(ann['sample_id'] for ann in self.annotation)
        return list(set(os.listdir(self.pc_root)).intersection(annotated))

    def get_pc_path(self, sample_key):
        return os.path.join(self.pc_root, sample_key, '{}_{}.npz'.format(sample_key, self.npoints))

    def get_images_path(self, sample_key):
        # data downloaded from: https://huggingface.co/datasets/tiange/Cap3D/tree/main/RenderedImage_zips
        return [
            os.path.join(self.vis_root, f'compressed_imgs_view{i}/Cap3D_imgs_view{i}/', sample_key + f'_{i}.jpeg')
            for i in range(8)
        ]

    def __getitem__(self, index):
        ann = super().__getitem__(index)
        ann['text_input'] = self.text_processor(random.choice(ann['captions']))
        return ann


class ObjaverseCaptionInstructDataset(ObjaverseCaptionDataset):
    """Instruction variant: caption becomes text_output, empty instruction."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data


class ObjaverseCaptionEvalDataset(ObjaverseCaptionDataset):
    """Eval variant: strip text_input so generation is unconditioned."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data["text_input"]
        return data


class ShapenetCaptionDataset(Object3dCaptionDataset, __DisplMixin):
    """ShapeNet captioning; annotations may carry a comma-separated object
    string instead of a caption list, expanded via templates when available."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_existing_pc_annotations(self):
        return list({fname.replace('.npy', '') for fname in os.listdir(self.pc_root)})

    def get_pc_path(self, sample_key):
        return os.path.join(self.pc_root, sample_key + '.npy')

    def get_images_path(self, sample_key):
        sample_dir = os.path.join(self.vis_root, sample_key)
        return [os.path.join(self.vis_root, sample_key, img_path) for img_path in os.listdir(sample_dir)]

    def __getitem__(self, index):
        ann = super().__getitem__(index)
        if not isinstance(ann['captions'], list):
            # 'captions' is a comma-separated object-name string.
            ann['objects'] = ann['captions']
            if self.templates:
                ann['captions'] = [random.choice(self.templates).format(obj) for obj in ann['objects'].split(',')]
            else:
                ann['captions'] = [random.choice(ann['objects'].split(','))]
        ann['text_input'] = self.text_processor(random.choice(ann['captions']))
        return ann


class ShapenetCaptionInstructDataset(ShapenetCaptionDataset):
    """Instruction variant: caption becomes text_output, empty instruction."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data


class ShapenetCaptionEvalDataset(ShapenetCaptionDataset):
    """Eval variant: strip text_input so generation is unconditioned."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data["text_input"]
        return data
# Adapted from https://github.com/salesforce/ULIP/blob/48d8d00b1cdb2aee79005817a202816f1c521911/models/pointnext/PointNeXt/openpoints/dataset/modelnet/modelnet40_normal_resampled_loader.py

import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
import torch
import copy
import random
import pickle
from PIL import Image
from lavis.processors.ulip_processors import farthest_point_sample, pc_normalize
from lavis.datasets.datasets.base_dataset import BaseDataset


class __DisplMixin:
    def displ_item(self, index):
        # Compact, human-readable view of one sample (debug aid).
        sample, ann = self.__getitem__(index), self.annotation[index]

        return OrderedDict(
            {
                "file": ann["image"],
                "caption": ann["caption"],
                "image": sample["image"],
                "pc": sample["pc"],
            }
        )


class ModelNetClassificationDataset(BaseDataset, __DisplMixin):
    """
    Dataset for ModelNet Classification.

    ann_paths[0]: class-name list file; ann_paths[-1]: shape-id list file;
    ann_paths[1]: pickled (points, labels) cache used when uniform sampling.
    """
    def __init__(self, **kwargs):
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], [])

        self.modalities = kwargs['modalities']
        # Setting dataset specific properties
        self.npoints = 8192
        self.use_normals = False
        self.num_category = 40
        self.process_data = True
        self.uniform = True
        self.generate_from_raw_data = False
        ann_paths = kwargs['ann_paths']

        assert 'pc_root' in kwargs, "Point cloud root needs to be provided to retrieve labels."
        self.pc_root = kwargs["pc_root"]

        # Fetching class names and IDs.
        # FIX: close the annotation files (previously left to the GC).
        with open(ann_paths[0]) as f:
            self.classnames = [line.rstrip() for line in f]
        self.classes = dict(zip(self.classnames, range(len(self.classnames))))
        with open(ann_paths[-1]) as f:
            self.shape_ids = [line.rstrip() for line in f]
        # Shape ids look like "<class_name>_<number>"; recover the class name.
        self.shape_names = ['_'.join(x.split('_')[0:-1]) for x in self.shape_ids]

        # Setting data paths
        self.datapath = [(self.shape_names[i], os.path.join(self.pc_root, self.shape_names[i], self.shape_ids[i]) + '.txt') for i
                         in range(len(self.shape_ids))]

        # Saving path settings
        self.save_path = ann_paths[1] if self.uniform else ann_paths[0].replace('_fps', '')

        # Processing or loading data
        self._prepare_data()

    def _prepare_data(self):
        """Load the pickled point cache, generating it from raw data first
        when it is missing and generation is enabled."""
        if self.process_data:
            if not os.path.exists(self.save_path):
                if self.generate_from_raw_data:
                    print('Processing data %s (only running in the first time)...' % self.save_path)
                    # BUGFIX: this branch previously only printed and never
                    # invoked _process_raw_data(), leaving list_of_points /
                    # list_of_labels unset and crashing later.
                    self._process_raw_data()
                else:
                    # NOTE(review): the cache file was just found missing, so
                    # this load will raise FileNotFoundError — confirm whether
                    # a download step is expected to have run before this.
                    print('Load processed data from %s...' % self.save_path)
                    with open(self.save_path, 'rb') as f:
                        self.list_of_points, self.list_of_labels = pickle.load(f)
            else:
                print('Load processed data from %s...' % self.save_path)
                with open(self.save_path, 'rb') as f:
                    self.list_of_points, self.list_of_labels = pickle.load(f)

    def _process_raw_data(self):
        """Read raw .txt point files, sample npoints per shape, pickle result."""
        self.list_of_points = [None] * len(self.datapath)
        self.list_of_labels = [None] * len(self.datapath)
        for index in tqdm(range(len(self.datapath)), total=len(self.datapath)):
            fn = self.datapath[index]
            cls = self.classes[self.datapath[index][0]]
            cls = np.array([cls]).astype(np.int32)
            point_set = np.loadtxt(fn[1], delimiter=',').astype(np.float32)

            if self.uniform:
                point_set = farthest_point_sample(point_set, self.npoints)
                print("uniformly sampled out {} points".format(self.npoints))
            else:
                point_set = point_set[0:self.npoints, :]

            self.list_of_points[index] = point_set
            self.list_of_labels[index] = cls

        with open(self.save_path, 'wb') as f:
            pickle.dump([self.list_of_points, self.list_of_labels], f)

    def __len__(self):
        return len(self.list_of_labels)

    def _get_item(self, index):
        """Return (normalized point set, integer label) for one shape."""
        if self.process_data:
            point_set, label = self.list_of_points[index], self.list_of_labels[index]
        else:
            fn = self.datapath[index]
            cls = self.classes[self.datapath[index][0]]
            label = np.array([cls]).astype(np.int32)
            point_set = np.loadtxt(fn[1], delimiter=',').astype(np.float32)

            # Uniform sampling or trimming
            if self.uniform:
                point_set = farthest_point_sample(point_set, self.npoints)
            else:
                point_set = point_set[0:self.npoints, :]
                # NOTE(review): after the slice above, shape[0] <= npoints, so
                # this resample can never trigger — dead code kept for parity
                # with the upstream ULIP loader.
                if self.npoints < point_set.shape[0]:
                    point_set = farthest_point_sample(point_set, self.npoints)

        point_set[:, 0:3] = pc_normalize(point_set[:, 0:3])
        if not self.use_normals:
            point_set = point_set[:, 0:3]

        return point_set, label[0]

    def __getitem__(self, index):
        points, label = self._get_item(index)
        label_name = self.classnames[int(label)]

        data = {
            "instance_id": index,
            "sample_key": index,
            "image_id": index,
            "label": label_name
        }

        if 'pc' in self.modalities:
            # Randomly permute the points before converting to a tensor.
            pt_idxs = np.arange(0, points.shape[0])
            np.random.shuffle(pt_idxs)
            current_points = points[pt_idxs].copy()
            current_points = torch.from_numpy(current_points).float()
            data['pc'] = current_points
        if any([k in self.modalities for k in ['images', 'image']]):
            # Rendered views are stored as "<index>.jpeg" under vis_root.
            img = Image.open(os.path.join(self.vis_root, f"{index}.jpeg"))
            data['image'] = self.vis_processor(img)

        return data
f'compressed_imgs_view{i}/Cap3D_imgs_view{i}/', sample_key+f'_{i}.jpeg') for i in range(8)] + + def __getitem__(self, index): + ann = copy.deepcopy(self.annotation[index]) + for modality in self.modalities: + ann[f"{modality}_path"] = getattr(self, f"get_{modality}_path")(ann['sample_id']) + if type(ann[f"{modality}_path"]) == list: + ann[f"{modality}_path"] = random.choice(ann[f"{modality}_path"]) + if 'image' in modality: + ann['image'] = self.vis_processor(Image.open(ann[f"image_path"])) + else: + ann[modality] = getattr(self, f"{modality}_processor")(ann[f"{modality}_path"]).to(torch.float32) + + if self.add_binary and random.randint(0,10) < 3: + yes_answer = random.randint(0,10)<5 + if not yes_answer: + caption_index = random.choice(list(set(range(len(self.annotation))).difference(set([index])))) + caption = self.annotation[caption_index]['caption'] + else: + caption = ann['caption'] + + question = random.choice(self.binary_templates).format(caption) + answer = 'yes' if yes_answer else 'no' + ann['text_input'] = self.text_processor(question) + ann['text_output'] = answer + + else: + ann['text_input'] = self.text_processor(ann['question']) + ann['text_output'] = ann['answer'] + + ann['answers'] = [ann['text_output']] + ann['question_id'] = ann['instance_id'] + return ann \ No newline at end of file diff --git a/LAVIS-main/lavis/datasets/datasets/ocr_datasets.py b/LAVIS-main/lavis/datasets/datasets/ocr_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..713ddbe106d10bab6c7876dd5beb72a2b129bf7a --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/ocr_datasets.py @@ -0,0 +1,69 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
class OCRVQADataset(VQADataset):
    """OCR-VQA dataset.

    Each source record holds parallel `questions`/`answers` lists for one image;
    __init__ flattens them into one annotation per QA pair, keeping only pairs
    whose image file exists under `vis_root`.
    """

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        count_id = 0
        annotations = []
        for ann in self.annotation:
            for q, a in zip(ann['questions'], ann['answers']):
                # Check image existence *before* paying for the deepcopy.
                image_path = os.path.join(self.vis_root, ann['sample_id'] + '.jpg')
                if not os.path.exists(image_path):
                    continue
                new_ann = copy.deepcopy(ann)  # dead `new_ann = {}` pre-assignment removed
                new_ann['questions'] = q
                new_ann['answers'] = a
                new_ann['instance_id'] = count_id
                new_ann['sample_id'] = ann['sample_id']
                count_id += 1
                annotations.append(new_ann)
        self.annotation = annotations

    def __getitem__(self, index):
        ann = self.annotation[index]
        image_path = os.path.join(self.vis_root, ann['sample_id'] + '.jpg')
        try:
            image = Image.open(image_path).convert("RGB")
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # propagate; unreadable images are skipped by returning None.
            return None
        image = self.vis_processor(image)
        question = self.text_processor(ann["questions"])

        answers = [ann["answers"]]
        # TODO this should be configured better
        weights = [1.]

        return {
            "image": image,
            "text_input": question,
            "answers": answers,
            "weights": weights,
            "question_id": ann["sample_id"],
        }


class OCRVQAInstructDataset(OCRVQADataset):
    """Instruction-tuning variant: adds `text_output` drawn from the answers."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = random.choice(data["answers"])
        return data

    def collater(self, samples):
        data = super().collater(samples)
        # assumes the parent collater exposes the batched answers under the key
        # "answer" — TODO confirm against VQADataset.collater
        data['text_output'] = data['answer']
        return data
coco/images/) + ann_root (string): directory to store the annotation file + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + self.img_ids = {} + n = 0 + for ann in self.annotation: + img_id = ann["image_id"] + if img_id not in self.img_ids.keys(): + self.img_ids[img_id] = n + n += 1 + + def __getitem__(self, index): + + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + caption = self.text_processor(ann["caption"]) + + return { + "image": image, + "text_input": caption, + "image_id": self.img_ids[ann["image_id"]], + "instance_id": ann["instance_id"], + } + + +class RetrievalEvalDataset(BaseDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + split (string): val or test + """ + + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + self.text = [] + self.image = [] + self.txt2img = {} + self.img2txt = {} + + txt_id = 0 + for img_id, ann in enumerate(self.annotation): + self.image.append(ann["image"]) + self.img2txt[img_id] = [] + for i, caption in enumerate(ann["caption"]): + self.text.append(self.text_processor(caption)) + self.img2txt[img_id].append(txt_id) + self.txt2img[txt_id] = img_id + txt_id += 1 + + def __getitem__(self, index): + + image_path = os.path.join(self.vis_root, self.annotation[index]["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + + return {"image": image, "index": index} + + +class VideoRetrievalDataset(BaseDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of videos. 
+ ann_root (string): directory to store the annotation file + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + self.img_ids = {} + n = 0 + for ann in self.annotation: + img_id = ann["video"] + if img_id not in self.img_ids.keys(): + self.img_ids[img_id] = n + n += 1 + + def __getitem__(self, index): + + ann = self.annotation[index] + + vpath = os.path.join(self.vis_root, ann["video"]) + + video = self.vis_processor(vpath) + caption = self.text_processor(ann["caption"]) + + # return image, caption, self.img_ids[ann['image_id']] + return { + "video": video, + "text_input": caption, + "image_id": self.img_ids[ann["video"]], + } + + +class VideoRetrievalEvalDataset(BaseDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of videos. + ann_root (string): directory to store the annotation file + split (string): val or test + """ + + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + self.text = [] + self.image = [] + self.txt2img = {} + self.img2txt = {} + + txt_id = 0 + for img_id, ann in enumerate(self.annotation): + self.image.append(ann["video"]) + self.img2txt[img_id] = [] + for i, caption in enumerate(ann["caption"]): + self.text.append(self.text_processor(caption)) + self.img2txt[img_id].append(txt_id) + self.txt2img[txt_id] = img_id + txt_id += 1 + + def __getitem__(self, index): + ann = self.annotation[index] + + vpath = os.path.join(self.vis_root, ann["video"]) + video = self.vis_processor(vpath) + + return {"video": video, "index": index} diff --git a/LAVIS-main/lavis/datasets/datasets/snli_ve_datasets.py b/LAVIS-main/lavis/datasets/datasets/snli_ve_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..68d32801d0875f378cdb6863a89a315d4361b160 --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/snli_ve_datasets.py @@ -0,0 +1,70 @@ +""" + Copyright (c) 2022, salesforce.com, inc. 
class SNLIVisualEntialmentInstructDataset(SNLIVisualEntialmentDataset, __DisplMixin):
    """Instruction-tuning variant of SNLI-VE.

    Maps the parent's integer labels ({contradiction: 0, neutral: 1,
    entailment: 2}) onto the index-aligned answers ['no', 'maybe', 'yes'].
    """

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        self.classnames = ['no', 'maybe', 'yes']

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            # BUG FIX: the original passed the template through unformatted, so
            # the literal "{}" placeholder reached the model; fill it with the
            # hypothesis sentence produced by the parent.
            data["prompt"] = self.text_processor(
                "based on the given the image is {} true?".format(data["text_input"])
            )
            data["answer"] = self.classnames[data["label"]]
            data["label"] = self.classnames[data["label"]]
            data["question_id"] = data["instance_id"]
        return data
class TextCapsCapDataset(CaptionDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        """
        # Bypass CaptionDataset.__init__: the TextCaps json nests the records.
        BaseDataset.__init__(self, vis_processor, text_processor, vis_root, ann_paths)
        # NOTE(review): annotation[3] is assumed to hold the TextCaps 'data'
        # payload — confirm against the annotation file layout.
        self.annotation = self.annotation[3]['data']
        self.img_ids = {}
        n = 0
        for ann in self.annotation:
            img_id = ann["image_id"]
            if img_id not in self.img_ids:
                self.img_ids[img_id] = n
                n += 1
            # Normalize the record to the CaptionDataset schema.
            ann["image"] = ann["image_id"] + '.jpg'
            ann["caption"] = ann.pop("caption_str")


class TextCapsCapInstructDataset(TextCapsCapDataset):
    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            # Caption becomes the generation target; the input prompt is empty.
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data


class TextCapsCapEvalDataset(CaptionEvalDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        """
        BaseDataset.__init__(self, vis_processor, text_processor, vis_root, ann_paths)
        self.annotation = self.annotation[3]['data']
        # only keep annotations with captions
        self.annotation = [ann for ann in self.annotation if "caption_str" in ann]

        self.img_ids = {}
        n = 0
        for ann in self.annotation:
            img_id = ann["image_id"]
            if img_id not in self.img_ids:
                self.img_ids[img_id] = n
                n += 1
            ann["image"] = ann["image_id"] + '.jpg'
            ann["caption"] = ann.pop("caption_str")
        self._add_instance_ids()
class VALORCaptionDataset(BaseDataset):
    """VALOR captioning over paired video/audio (and optionally image) media.

    Keeps only annotations whose media files exist for *every* requested
    modality, de-duplicated by video_id.
    """

    def __init__(self, **kwargs):
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths'])

        self.modalities = kwargs['modalities']

        for modality in self.modalities:
            if 'image' in modality:
                # Images reuse vis_root/vis_processor set up by BaseDataset.
                setattr(self, f"existing_{modality}_annotation", getattr(self, f'get_existing_{modality}_annotations')())
                continue
            setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
            setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
            setattr(self, f"existing_{modality}_annotation", getattr(self, f'get_existing_{modality}_annotations')())

        self.sample_ids = set.intersection(*[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities])
        # NOTE(review): on-disk ids appear to use "0" where annotations use
        # "000" — verify this replace() rule against the actual file naming.
        self.annotation = [ann for ann in self.annotation if ann['video_id'].replace('000', '0') in self.sample_ids]
        # De-duplicate by video_id, keeping the first occurrence (rewritten from
        # the opaque `not seen.add(...)` side-effect trick).
        deduped, seen = [], set()
        for ann in self.annotation:
            if ann["video_id"] not in seen:
                seen.add(ann["video_id"])
                deduped.append(ann)
        self.annotation = deduped

    def __len__(self):
        return len(self.annotation)

    def get_existing_audio_annotations(self):
        # Filenames without their final extension.
        return ['.'.join(f.split('.')[:-1]) for f in os.listdir(self.audio_root)]

    def get_existing_video_annotations(self):
        return ['.'.join(f.split('.')[:-1]) for f in os.listdir(self.video_root)]

    def get_audio_path(self, ann):
        return os.path.join(self.audio_root, f'{ann["video_id"].replace("000", "0")}.mp4')

    def get_video_path(self, ann):
        return os.path.join(self.video_root, f'{ann["video_id"].replace("000", "0")}.mp4')

    def __getitem__(self, index):
        ann = copy.deepcopy(self.annotation[index])
        ann["sample_id"] = ann["video_id"]
        ann["text_input"] = self.text_processor(ann['desc'])
        for modality in self.modalities:
            ann[f"{modality}_path"] = getattr(self, f"get_{modality}_path")(ann)
            if isinstance(ann[f"{modality}_path"], list):
                ann[f"{modality}_path"] = random.choice(ann[f"{modality}_path"])
            if 'image' in modality:
                # BUG FIX: read the path under the key it was stored at
                # (f"{modality}_path") rather than the hard-coded "images_path",
                # which raised KeyError for any other image-like modality name.
                ann['image'] = self.vis_processor(Image.open(ann[f"{modality}_path"]))
            else:
                ann[modality] = getattr(self, f"{modality}_processor")(ann[f"{modality}_path"]).to(torch.float32)

        ann["caption"] = ann["text_input"]
        ann["image_id"] = ann["video_id"]

        return ann


class VALORCaptionEvalDataset(VALORCaptionDataset):
    """Eval variant: strips ground-truth text so generation is unconditioned."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data['text_input']
            del data['caption']
        return data


class VALORCaptionInstuctDataset(VALORCaptionDataset):
    """Instruction variant: caption becomes text_output; text_input is empty."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data
class VATEXCaptionDataset(BaseDataset):
    """VATEX captioning over video (and optionally audio) clips."""

    def __init__(self, **kwargs):
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths'])

        self.modalities = kwargs['modalities']

        for modality in self.modalities:
            if 'image' in modality:
                # Images reuse vis_root/vis_processor set up by BaseDataset.
                setattr(self, f"existing_{modality}_annotation", getattr(self, f'get_existing_{modality}_annotations')())
                continue
            setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
            setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
            setattr(self, f"existing_{modality}_annotation", getattr(self, f'get_existing_{modality}_annotations')())

        self.sample_ids = set.intersection(*[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities])
        # De-duplicate by video id, keeping the first caption per clip
        # (rewritten from the opaque `not seen.add(...)` side-effect trick).
        deduped, seen = [], set()
        for x in self.annotation:
            if x["video"] not in seen:
                seen.add(x["video"])
                deduped.append(x)
        self.annotation = deduped

    def __len__(self):
        return len(self.annotation)

    def get_existing_audio_annotations(self):
        # Filenames without their final extension.
        return ['.'.join(f.split('.')[:-1]) for f in os.listdir(self.audio_root)]

    def get_existing_video_annotations(self):
        return ['.'.join(f.split('.')[:-1]) for f in os.listdir(self.video_root)]

    def get_audio_path(self, ann):
        return os.path.join(self.audio_root, f'{ann["video"]}')

    def get_video_path(self, ann):
        return os.path.join(self.video_root, f'{ann["video"]}')

    def __getitem__(self, index):
        ann = copy.deepcopy(self.annotation[index])
        # Seed *_path keys; they are overwritten with full paths in the loop below.
        ann["video_path"] = ann["video"]
        ann["audio_path"] = ann["video"]
        ann["sample_id"] = ann["video"]
        # NOTE(review): caption is NOT passed through text_processor here,
        # unlike sibling datasets — confirm whether that is intentional.
        ann['text_input'] = ann["caption"]
        ann["image_id"] = ann["video"]

        for modality in self.modalities:
            ann[f"{modality}_path"] = getattr(self, f"get_{modality}_path")(ann)
            if isinstance(ann[f"{modality}_path"], list):
                ann[f"{modality}_path"] = random.choice(ann[f"{modality}_path"])
            if 'image' in modality:
                # BUG FIX: use the key the path was stored under instead of the
                # hard-coded "images_path" (KeyError for modality "image").
                ann['image'] = self.vis_processor(Image.open(ann[f"{modality}_path"]))
            else:
                ann[modality] = getattr(self, f"{modality}_processor")(ann[f"{modality}_path"]).to(torch.float32)

        return ann


class VATEXCaptionEvalDataset(VATEXCaptionDataset):
    """Eval variant: strips the ground-truth caption."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            del data['text_input']
        return data


class VATEXCaptionInstuctDataset(VATEXCaptionDataset):
    """Instruction variant: caption becomes text_output; text_input is empty."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +import random + +from PIL import Image + +from lavis.datasets.datasets.vqa_datasets import VQADataset + + +class VGVQADataset(VQADataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + + answers = [ann["answer"]] + # TODO this should be configured better + weights = [1.] + + return { + "image": image, + "text_input": question, + "answers": answers, + "weights": weights, + } + + +class VGVQAInstructDataset(VGVQADataset): + def __getitem__(self, index): + data = super().__getitem__(index) + if data != None: + data['text_output'] = random.choice(data["answers"]) + return data + def collater(self, samples): + data = super().collater(samples) + data['text_output'] = data['answer'] + return data + diff --git a/LAVIS-main/lavis/datasets/datasets/video_caption_datasets.py b/LAVIS-main/lavis/datasets/datasets/video_caption_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..3feed795b33e199459b74b7fab689470ec23a34a --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/video_caption_datasets.py @@ -0,0 +1,177 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
class VideoCaptionDataset(CaptionDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        ann = self.annotation[index]

        vname = ann["video"]
        video_path = os.path.join(self.vis_root, vname)

        try:
            video = self.vis_processor(video_path)
        except Exception:
            # Narrowed from a bare `except:`; unreadable clips are skipped.
            print(f"Could not load {video_path}")
            return None
        if video is None:  # processor may also signal failure by returning None
            return None

        caption = self.text_processor(ann["caption"])

        # "image_id" is kept to stay compatible with the COCO evaluation format
        return {
            "video": video,
            "text_input": caption,
            "image_id": self.img_ids[ann["image_id"]],
        }


class VideoCaptionEvalDataset(BaseDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        # Keep exactly one annotation per video for inference.
        # BUG FIX: the original tested membership on x["video"] but recorded
        # x["image_id"] in `seen`, so duplicate videos were never removed.
        seen = set()
        deduped = []
        for x in self.annotation:
            if x["video"] not in seen:
                seen.add(x["video"])
                deduped.append(x)
        self.annotation = deduped

    def __len__(self):
        return len(self.annotation)

    def __getitem__(self, index):
        ann = self.annotation[index]

        vname = ann["video"]
        video_path = os.path.join(self.vis_root, vname)

        try:
            video = self.vis_processor(video_path)
        except Exception:
            # Narrowed from a bare `except:`; unreadable clips are skipped.
            print(f"Could not load {video_path}")
            return None

        return {
            "video": video,
            "image_id": ann["image_id"],
            "instance_id": ann["instance_id"],
        }


class VideoCaptionInstructDataset(VideoCaptionDataset):
    """Instruction variant: caption becomes text_output; text_input is empty."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        if data is not None:
            data['text_output'] = data["text_input"]
            data['text_input'] = self.text_processor("")
        return data
+ """ + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + def __getitem__(self, index): + + ann = self.annotation[index] + + vname = ann["video_path"] + video_path = os.path.join(self.vis_root, vname) + try: + video = self.vis_processor(video_path, start_sec=math.floor(ann['ts'][0]), end_sec=math.ceil(ann['ts'][1])) + except: + return None + + + caption = ann["caption"] if 'caption' in ann else ann["query"] + + image_id = ann['youtube_id'] if 'youtube_id' in ann else ann["video_id"] if "video_id" in ann else vname + + # "image_id" is kept to stay compatible with the COCO evaluation format + return { + "video": video, + "text_input": self.text_processor(caption), + "image_id": image_id, + "instance_id": ann['instance_id'], + } + +class ClipCaptionInstructDataset(ClipCaptionDataset): + def __getitem__(self, index): + data = super().__getitem__(index) + if data != None: + data['text_output'] = data["text_input"] + data['text_input'] = self.text_processor("") + return data + +class ClipCaptionEvalDataset(ClipCaptionDataset): + def __getitem__(self, index): + data = super().__getitem__(index) + if data != None: + del data["text_input"] + return data + + +class WebVideoCaptionDataset(BaseDataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + def _get_video(self, index): + """ + If video does not exist, loop to the next one. 
+ """ + max_retries = 3 + for _ in range(max_retries): + ann = self.annotation[index] + video_path = os.path.join(self.vis_root, f"{ann['videoid']}.mp4") + try: + video = self.vis_processor(video_path) + return video, video_path, ann + except: + index = (index + 1) % len(self.annotation) # Safely loop back to start of annotations + return None + + def __getitem__(self, index): + video, video_path, ann = self._get_video(index) + caption = self.text_processor(ann["name"]) + + # "image_id" is kept for compatibility with the COCO evaluation format + return { + "video": video, + "text_input": caption, + "image_id": ann["videoid"], + "instance_id": ann["instance_id"], + } + +class WebVideoCaptionInstructDataset(WebVideoCaptionDataset): + def __getitem__(self, index): + data = super().__getitem__(index) + if data != None: + data['text_output'] = data["text_input"] + data['text_input'] = self.text_processor("") + return data diff --git a/LAVIS-main/lavis/datasets/datasets/video_vqa_datasets.py b/LAVIS-main/lavis/datasets/datasets/video_vqa_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..63e92bcabfe2c4a1423acabad9988309a33dfdcc --- /dev/null +++ b/LAVIS-main/lavis/datasets/datasets/video_vqa_datasets.py @@ -0,0 +1,84 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
class VideoQADataset(MultimodalClassificationDataset, __DisplMixin):
    """Video question answering with a closed answer vocabulary."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def _build_class_labels(self, ans_path):
        # answer-string -> integer label mapping loaded from json
        with open(ans_path) as f:
            self.class_labels = json.load(f)

    def _get_answer_label(self, answer):
        # Unknown answers map to one extra "out of vocabulary" index.
        return self.class_labels.get(answer, len(self.class_labels))

    def __getitem__(self, index):
        assert (
            self.class_labels
        ), f"class_labels of {__class__.__name__} is not built yet."

        ann = self.annotation[index]

        frames = self.vis_processor(os.path.join(self.vis_root, ann["video"]))
        question = self.text_processor(ann["question"])

        return {
            "video": frames,
            "text_input": question,
            "answers": self._get_answer_label(ann["answer"]),
            "question_id": ann["question_id"],
            "instance_id": ann["instance_id"],
        }


class VideoQAInstructDataset(VideoQADataset):
    """Open-ended variant: emits the raw answer string instead of a class id."""

    def __getitem__(self, index):
        ann = self.annotation[index]

        frames = self.vis_processor(os.path.join(self.vis_root, ann["video"]))
        question = self.text_processor(ann["question"])

        return {
            "video": frames,
            "text_input": question,
            "answer": ann["answer"],
            "text_output": ann["answer"],
            "question_id": ann["question_id"],
            "instance_id": ann["instance_id"],
            ## add weight to use with vqa eval script
            "weight": [1.],
        }
class ViolinVideoEntailmentInstructDataset(ViolinVideoEntailmentDataset):
    """Instruction variant of Violin entailment: yes/no question about a statement."""

    def __getitem__(self, index):
        data = super().__getitem__(index)
        templates = ["is it true that {}?", "is the satement {} contained in the video?", "is the statement {} entailed in the video?"]
        if data is not None:
            # BUG FIX: the parent stores the *integer* label (wrong=0/correct=1),
            # so the original comparison `data['label'] == 'correct'` was always
            # False and every target became "no". Compare against the mapped id.
            data['text_output'] = "yes" if data['label'] == self.class_labels['correct'] else 'no'
            data['text_input'] = random.choice(templates).format(data["sentence"])
        return data
class VisDialDataset(DialogueDataset, __DisplMixin):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        """

        self.vis_root = vis_root
        self.annotation = []
        for ann_path in ann_paths:
            data = json.load(open(ann_path, "r"))['data']
            dialogs = data['dialogs']
            answers = data['answers']
            questions = data['questions']

            for dialog in dialogs:
                # Resolve answer/question indices into plain-text turns.
                all_turns = [
                    {
                        "answer": answers[d["answer"]],
                        "question": questions[d["question"]],
                    }
                    for d in dialog['dialog']
                ]
                # Expand each dialog into one sample per turn: turns before
                # turn i form the context; turn i supplies question/answer.
                for i in range(len(all_turns)):
                    dialogue_context = ' '.join(
                        [f" q: {t['question']} a: {t['answer']}" for t in all_turns[:i]]
                    ).strip()
                    last_turn = all_turns[i]

                    # BUGFIX: append a per-turn copy. The original mutated
                    # and appended the *same* ``dialog`` dict on every
                    # iteration, so all samples of a dialog ended up
                    # aliasing the final turn's context/question/answer.
                    turn_ann = copy.copy(dialog)
                    turn_ann["dialog"] = dialogue_context
                    turn_ann["question"] = last_turn["question"]
                    turn_ann["answer"] = last_turn["answer"]
                    self.annotation.append(turn_ann)

        # (The original assigned the processors and called
        # _add_instance_ids() twice; once is sufficient.)
        self.vis_processor = vis_processor
        self.text_processor = text_processor

        self._add_instance_ids()

        # Map raw image ids to contiguous integer indices.
        self.img_ids = {}
        n = 0
        for ann in self.annotation:
            img_id = ann["image_id"]
            if img_id not in self.img_ids.keys():
                self.img_ids[img_id] = n
                n += 1

    def __getitem__(self, index):

        ann = self.annotation[index]

        image_path = os.path.join(
            self.vis_root,
            "VisualDialog_train2018",
            'VisualDialog_train2018_' + str(ann["image_id"]).zfill(12) + '.jpg',
        )
        image = Image.open(image_path).convert("RGB")

        image = self.vis_processor(image)
        return {
            "image": image,
            "dialog": self.text_processor(ann["dialog"]),
            "text_input": self.text_processor(ann["question"]),
            "image_id": self.img_ids[ann["image_id"]],
            "answer": ann["answer"]
        }
class VizWizEvalDataset(VQAEvalDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        ann = self.annotation[index]

        # Images live in a split-specific sibling of vis_root; the split
        # ('val' or 'test') is inferred from the image filename.
        split = 'val' if 'val' in ann["image"] else 'test'
        image_path = os.path.join(self.vis_root.replace('images', split), ann["image"])

        image = self.vis_processor(Image.open(image_path).convert("RGB"))
        question = self.text_processor(ann["question"])

        if "answers" not in ann:
            # Test split: no ground truth is available.
            return {
                "image": image,
                "question_id": ann["image"],
                "instance_id": ann["instance_id"],
                "text_input": question,
            }

        # Val split: deduplicate the annotator answers and weight each
        # unique answer by the fraction of annotators who gave it.
        raw_answers = [item['answer'] for item in ann["answers"]]
        counts = Counter(raw_answers)
        unique_answers = list(set(raw_answers))
        weights = [counts[a] / len(raw_answers) for a in unique_answers]

        return {
            "image": image,
            "text_input": question,
            "instance_id": ann["instance_id"],
            "question_id": ann["instance_id"],
            "weights": weights,
            "answer": unique_answers
        }
class VlepVideoDataset(BaseDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        # Keep only annotations whose video file is actually on disk.
        existing_videos = [f.replace('.mp4', '') for f in os.listdir(self.vis_root)]
        self.annotation = [ann for ann in self.annotation if ann['vid_name'] in existing_videos]


    def __getitem__(self, index):
        ann = self.annotation[index]

        vname = ann['vid_name'] + '.mp4'
        video_path = os.path.join(self.vis_root, vname)

        # Clip to the annotated span, rounded outward to whole seconds.
        try:
            video = self.vis_processor(video_path, start_sec=math.floor(ann['ts'][0]), end_sec=math.ceil(ann['ts'][1]))
        except Exception:
            # BUGFIX: the original bare ``except:`` also swallowed
            # KeyboardInterrupt/SystemExit; best-effort skipping should
            # only cover real decode errors.
            return None

        # BUGFIX: the caption was run through text_processor twice (once
        # here and again in the returned dict); process it only once.
        caption = self.text_processor(ann['events'][ann['answer']])

        # "image_id" is kept to stay compatible with the COCO evaluation format
        return {
            "video": video,
            "text_input": caption,
            "image_id": vname,
            "example_id": ann['example_id'],
            "instance_id": ann["instance_id"]
        }
class VQAInstructDataset(VQADataset):
    """VQADataset variant that mirrors the answer list into ``text_output``
    for instruction-tuned training."""

    def collater(self, samples):
        batch = super().collater(samples)
        # BUGFIX: the parent collater returns None when every sample in the
        # batch was filtered out (failed loads); propagate None instead of
        # raising TypeError on ``None['text_output']``.
        if batch is not None:
            batch['text_output'] = batch['answer']
        return batch
class VSRClassificationDataset(MultimodalClassificationDataset):
    """Visual Spatial Reasoning (VSR) binary classification dataset.

    Each sample pairs an image with a spatial-relation caption and an
    integer label indicating whether the caption holds for the image.
    """

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        self.class_labels = self._build_class_labels()
        # Index order mirrors the label mapping: 0 -> "no", 1 -> "yes".
        self.classnames = ['no', 'yes']

    def _build_class_labels(self):
        """Map class names to integer label ids."""
        return {"no": 0, "yes": 1}

    def __getitem__(self, index):
        record = self.annotation[index]

        path = os.path.join(self.vis_root, record["image"])
        processed_image = self.vis_processor(Image.open(path).convert("RGB"))

        # The image id is the filename without its extension.
        return {
            "image": processed_image,
            "image_id": record["image"].split('.')[0],
            "text_input": record['caption'],
            "label": record["label"],
            "instance_id": record["instance_id"],
        }
class VSRCaptionInstructDataset(VSRCaptionDataset):
    """Captioning variant for instruction tuning: the caption becomes the
    target (``text_output``) and the input prompt is left empty."""

    def __getitem__(self, index):
        sample = super().__getitem__(index)
        if sample is None:
            return None
        # Move the caption into the output slot; feed an empty prompt.
        sample['text_output'] = sample["text_input"]
        sample['text_input'] = self.text_processor("")
        return sample
class YT8MDialDataset(BaseDataset):
    """YouTube-8M video dialogue dataset with configurable modalities.

    Each modality listed in ``kwargs['modalities']`` (other than image)
    needs a ``<modality>_root`` and ``<modality>_processor`` kwarg plus a
    matching ``get_existing_<modality>_annotations`` method. Annotations
    are kept only for clips present in *every* requested modality.
    """

    def __init__(self, **kwargs):
        super().__init__(kwargs['vis_processor'], kwargs['text_processor'], kwargs['vis_root'], kwargs['ann_paths'])

        self.modalities = kwargs['modalities']

        for modality in self.modalities:
            if 'image' in modality:
                # Image modality skips the root/processor kwargs (the base
                # class vis_processor is used in __getitem__).
                # NOTE(review): this calls get_existing_<modality>_annotations
                # (and later get_<modality>_path), neither of which is defined
                # here for images — confirm a subclass provides them.
                setattr(self, f"existing_{modality}_annotation",getattr(self, f'get_existing_{modality}_annotations')())
                continue
            setattr(self, f"{modality}_root", kwargs[f"{modality}_root"])
            setattr(self, f"{modality}_processor", kwargs[f"{modality}_processor"])
            setattr(self, f"existing_{modality}_annotation",getattr(self, f'get_existing_{modality}_annotations')())
        # Keep only clips available on disk in all requested modalities.
        self.sample_ids = set.intersection(*[set(getattr(self, f"existing_{modality}_annotation")) for modality in self.modalities])
        self.annotation = [ann for ann in self.annotation if ann['youtube_id'] in self.sample_ids]

    def get_existing_audio_annotations(self):
        # Audio files are named "<youtube_id>_<start>_<end>.flac".
        return [f.split('_')[0] for f in os.listdir(self.audio_root)]

    def get_existing_video_annotations(self):
        # Video files are named "<youtube_id>_<start>_<end>.mp4".
        return [f.split('_')[0] for f in os.listdir(self.video_root)]

    def get_audio_path(self, ann):
        # Path layout matches get_existing_audio_annotations above.
        return os.path.join(self.audio_root, f'{ann["youtube_id"]}_{ann["start_sec"]}_{ann["end_sec"]}.flac')

    def get_video_path(self, ann):
        # Path layout matches get_existing_video_annotations above.
        return os.path.join(self.video_root, f'{ann["youtube_id"]}_{ann["start_sec"]}_{ann["end_sec"]}.mp4')

    def __getitem__(self, index):
        # Deep-copy so per-sample mutations never leak into self.annotation.
        ann = copy.deepcopy(self.annotation[index])
        for modality in self.modalities:
            ann[f"{modality}_path"] = getattr(self, f"get_{modality}_path")(ann)
            # A path getter may return several candidates; sample one.
            if type(ann[f"{modality}_path"]) == list:
                ann[f"{modality}_path"] = random.choice(ann[f"{modality}_path"])
            if 'video' in modality:
                try:
                    ann['video'] = getattr(self, f"video_processor")(ann[f"video_path"], start_sec=ann['start_sec'], end_sec=ann['end_sec']).to(torch.float32)
                except:
                    # Video decoding failed; return None for this sample
                    # (presumably filtered downstream — confirm the collater).
                    return None
            elif 'image' in modality:
                ann['image'] = self.vis_processor(Image.open(ann[f"images_path"]))
            else:
                ann[modality] = getattr(self, f"{modality}_processor")(ann[f"{modality}_path"]).to(torch.float32)
        ann["sample_id"] = ann["youtube_id"]
        # Dialogue context is the model input; the response is the target.
        ann['text_output'] = self.text_processor(ann['response'])
        ann['text_input'] = self.text_processor(ann['context'])
        ann["question_id"] = index
        ann['captions'] = ann['response']
        return ann
import json
from tqdm import tqdm

train_file = './train.jsonl'
test_file = './test.jsonl'


def _load_jsonl(path):
    """Read a .jsonl file into a list of dicts (one JSON object per line)."""
    # Use a context manager so the file handle is closed (the original
    # left both input files open).
    with open(path) as f:
        return [json.loads(line.strip()) for line in f]


def _add_lavis_fields(records):
    """Annotate each record in place with the fields LAVIS expects:
    ``video_path`` (derived from the video id) and a float ``ts`` span."""
    for d in tqdm(records):
        d['video_path'] = d['video_id'] + '.mp4'
        d['ts'] = [float(d['start']), float(d['end'])]


train_data = _load_jsonl(train_file)
test_data = _load_jsonl(test_file)

# The train/test conversion is identical; share one helper instead of the
# original duplicated loops.
_add_lavis_fields(train_data)
_add_lavis_fields(test_data)

with open('train_lavis.json', 'w') as f:
    json.dump(train_data, f)
with open('test_lavis.json', 'w') as f:
    json.dump(test_data, f)
import os
from pathlib import Path

from omegaconf import OmegaConf

from lavis.common.utils import (
    cleanup_dir,
    download_and_extract_archive,
    get_abs_path,
    get_cache_path,
)


DATA_URL = {
    "train": "http://images.cocodataset.org/zips/train2014.zip",  # md5: 0da8c0bd3d6becc4dcb32757491aca88
    "val": "http://images.cocodataset.org/zips/val2014.zip",  # md5: a3d79f5ed8d289b7a7554ce06a5782b3
    "test": "http://images.cocodataset.org/zips/test2014.zip",  # md5: 04127eef689ceac55e3a572c2c92f264
    "test2015": "http://images.cocodataset.org/zips/test2015.zip",  # md5: 04127eef689ceac55e3a572c2c92f264
}


def download_datasets(root, url):
    """Download ``url`` into ``root`` and extract it into the module-level
    ``storage_dir`` (bound in ``__main__`` before this is called)."""
    download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir)


if __name__ == "__main__":

    config_path = get_abs_path("configs/datasets/coco/defaults_cap.yaml")

    # Resolve the storage location from the dataset config.
    storage_dir = OmegaConf.load(
        config_path
    ).datasets.coco_caption.build_info.images.storage

    download_dir = Path(get_cache_path(storage_dir)).parent / "download"
    storage_dir = Path(get_cache_path(storage_dir))

    if storage_dir.exists():
        print(f"Dataset already exists at {storage_dir}. Aborting.")
        exit(0)

    try:
        for split, url in DATA_URL.items():
            print("Downloading {} to {}".format(url, split))
            download_datasets(download_dir, url)
    except Exception:
        # BUGFIX: exit non-zero on failure. The original printed the error
        # and fell through to the final cleanup, ending with exit code 0
        # (download_flickr.py already exits 1 in this situation).
        cleanup_dir(download_dir)
        print("Failed to download or extracting datasets. Aborting.")
        exit(1)

    cleanup_dir(download_dir)
import os
from pathlib import Path

from omegaconf import OmegaConf

from lavis.common.utils import (
    cleanup_dir,
    download_and_extract_archive,
    get_abs_path,
    get_cache_path,
)

DATA_URL = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/didemo_videos.tar.gz"


def download_datasets(root, url):
    """
    Download the DiDeMo video archive and expand it in the folder
    provided as parameter.  (The original docstring mentioned
    "Imagenet-R" — a copy-paste leftover.)
    """
    download_and_extract_archive(url=url, download_root=root)


def move_files(download_path, storage_path):
    """
    Move files from download_path to storage_path (created if needed).
    """
    print("Moving to {}".format(storage_path))

    os.makedirs(storage_path, exist_ok=True)

    for file_name in os.listdir(download_path):
        os.rename(
            os.path.join(download_path, file_name),
            os.path.join(storage_path, file_name),
        )


if __name__ == "__main__":

    config_path = get_abs_path("configs/datasets/didemo/defaults_ret.yaml")

    storage_dir = OmegaConf.load(
        config_path
    ).datasets.didemo_retrieval.build_info.videos.storage

    download_dir = Path(get_cache_path(storage_dir)).parent / "download"
    storage_dir = Path(get_cache_path(storage_dir))

    if storage_dir.exists():
        print(f"Dataset already exists at {storage_dir}. Aborting.")
        exit(0)

    try:
        print("Downloading {} to {}".format(DATA_URL, download_dir))
        download_datasets(download_dir, DATA_URL)
    except Exception:
        # BUGFIX: exit on failure. The original deleted download_dir here
        # and then still fell through to move_files() on the now-missing
        # directory, crashing with a misleading error.
        cleanup_dir(download_dir)
        print("Failed to download or extracting datasets. Aborting.")
        exit(1)

    move_files(download_dir / "videos", storage_dir)
    cleanup_dir(download_dir)
+ """ +) + + +def move_directory(src_dir, dst_dir): + """ + Move files from download_path to storage_path + """ + print("Moving to {}".format(dst_dir)) + + os.makedirs(dst_dir, exist_ok=True) + + for file_name in os.listdir(src_dir): + os.rename( + os.path.join(src_dir, file_name), + os.path.join(dst_dir, file_name), + ) + + +if __name__ == "__main__": + + config_path = get_abs_path("configs/datasets/flickr30k/defaults.yaml") + + storage_dir = OmegaConf.load( + config_path + ).datasets.flickr30k.build_info.images.storage + + storage_dir = Path(get_cache_path(storage_dir)) + download_dir = storage_dir.parent / "download" + + if storage_dir.exists(): + print(f"Dataset already exists at {storage_dir}. Aborting.") + exit(0) + + os.makedirs(download_dir) + + try: + print("Downloading {} to {}".format(DATA_URL, download_dir)) + od.download(DATA_URL, download_dir) + except Exception as e: + print(e) + # remove download dir if failed + cleanup_dir(download_dir) + exit(1) + + move_directory( + download_dir / "flickr-image-dataset" / "flickr30k_images" / "flickr30k_images", + storage_dir / "flickr30k-images", + ) + + cleanup_dir(download_dir) diff --git a/LAVIS-main/lavis/datasets/download_scripts/download_gqa.py b/LAVIS-main/lavis/datasets/download_scripts/download_gqa.py new file mode 100644 index 0000000000000000000000000000000000000000..0bce71408c9f8d8973ef8f7fa9419d328127978e --- /dev/null +++ b/LAVIS-main/lavis/datasets/download_scripts/download_gqa.py @@ -0,0 +1,51 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
import os
from pathlib import Path

from omegaconf import OmegaConf

from lavis.common.utils import (
    cleanup_dir,
    download_and_extract_archive,
    get_abs_path,
    get_cache_path,
)


DATA_URL = "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip"


def download_datasets(root, url):
    """Download the GQA image archive into ``root`` and extract it beside
    the module-level ``storage_dir`` (bound in ``__main__``)."""
    download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir.parent)


if __name__ == "__main__":

    config_path = get_abs_path("configs/datasets/gqa/defaults.yaml")

    storage_dir = OmegaConf.load(
        config_path
    ).datasets.gqa.build_info.images.storage

    download_dir = Path(get_cache_path(storage_dir)).parent / "download"
    storage_dir = Path(get_cache_path(storage_dir))

    if storage_dir.exists():
        print(f"Dataset already exists at {storage_dir}. Aborting.")
        exit(0)

    try:
        print("Downloading {}".format(DATA_URL))
        download_datasets(download_dir, DATA_URL)
    except Exception:
        # BUGFIX: exit non-zero on failure instead of falling through to
        # the final cleanup with exit code 0 (matches download_flickr.py).
        cleanup_dir(download_dir)
        print("Failed to download or extracting datasets. Aborting.")
        exit(1)

    cleanup_dir(download_dir)
import os
import json
from tqdm import tqdm
import shutil
import subprocess


image_dir = './all_images'
os.makedirs(image_dir, exist_ok=True)
for split in ['train', 'test', 'val']:
    print(f"Processing split {split}...")
    split_dir = f'{os.path.abspath(image_dir)}/{split}/choose_txt'
    annotations = []
    for sample_id in tqdm(os.listdir(split_dir)):
        sample_dir = os.path.join(split_dir, sample_id)
        if not os.path.isdir(sample_dir):
            continue
        # Close the annotation file promptly (the original leaked handles).
        with open(os.path.join(sample_dir, 'data.json'), "r") as ann_file:
            ann = json.load(ann_file)
        ann['instance_id'] = sample_id
        ann['image_id'] = f'{split}_{sample_id}'
        ann['image'] = f'{split}_{sample_id}.png'
        # BUGFIX: use os.symlink instead of shelling out to `ln -s`; the
        # shell form silently breaks on paths containing spaces or shell
        # metacharacters. Skip links that already exist on re-runs.
        link_path = os.path.join(image_dir, ann["image"])
        if not os.path.islink(link_path):
            os.symlink(os.path.join(sample_dir, 'image.png'), link_path)

        annotations.append(ann)
    with open(f'annotations_{split}.json', 'w') as out:
        out.write(json.dumps(annotations))
Right-click the Download button and copy the link address +# e.g. +# DATA_URL = { +# "train": "https://download1602.mediafire.com/xxxxxxxxxxxx/x3rrbe4hwp04e6w/train_val_videos.zip", +# "test": "https://download2390.mediafire.com/xxxxxxxxxxxx/czh8sezbo9s4692/test_videos.zip", +# } +# 3. Paste the link address to DATA_URL + +DATA_URL = { + "train": "https://download2295.mediafire.com/4bb7p74xrbgg/x3rrbe4hwp04e6w/train_val_videos.zip", + "test": "https://download2390.mediafire.com/79hfq3592lqg/czh8sezbo9s4692/test_videos.zip", +} + + +def download_datasets(root, url): + """ + Download the Imagenet-R dataset archives and expand them + in the folder provided as parameter + """ + download_and_extract_archive(url=url, download_root=root) + + +def merge_datasets(download_path, storage_path): + """ + Merge datasets in download_path to storage_path + """ + + # Merge train and test datasets + train_path = os.path.join(download_path, "TrainValVideo") + test_path = os.path.join(download_path, "TestVideo") + train_test_path = storage_path + + print("Merging to {}".format(train_test_path)) + + os.makedirs(train_test_path, exist_ok=True) + + for file_name in os.listdir(train_path): + os.rename( + os.path.join(train_path, file_name), + os.path.join(train_test_path, file_name), + ) + + for file_name in os.listdir(test_path): + os.rename( + os.path.join(test_path, file_name), + os.path.join(train_test_path, file_name), + ) + + +if __name__ == "__main__": + + config_path = get_abs_path("configs/datasets/msrvtt/defaults_cap.yaml") + + storage_dir = OmegaConf.load( + config_path + ).datasets.msrvtt_cap.build_info.videos.storage + + download_dir = Path(get_cache_path(storage_dir)).parent / "download" + storage_dir = Path(get_cache_path(storage_dir)) + + if storage_dir.exists(): + print(f"Dataset already exists at {storage_dir}. 
Aborting.") + exit(0) + + try: + for k, v in DATA_URL.items(): + print("Downloading {} to {}".format(v, k)) + download_datasets(download_dir, v) + except Exception as e: + # remove download dir if failed + cleanup_dir(download_dir) + print("Failed to download or extracting datasets. Aborting.") + + try: + merge_datasets(download_dir, storage_dir) + except Exception as e: + # remove storage dir if failed + cleanup_dir(download_dir) + cleanup_dir(storage_dir) + print("Failed to merging datasets. Aborting.") + + cleanup_dir(download_dir) diff --git a/LAVIS-main/lavis/datasets/download_scripts/download_msvd.py b/LAVIS-main/lavis/datasets/download_scripts/download_msvd.py new file mode 100644 index 0000000000000000000000000000000000000000..c4bf5467f3af7acdde7f7a25a38d28c599525771 --- /dev/null +++ b/LAVIS-main/lavis/datasets/download_scripts/download_msvd.py @@ -0,0 +1,67 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +from pathlib import Path + +from omegaconf import OmegaConf + +from lavis.common.utils import ( + cleanup_dir, + download_and_extract_archive, + get_abs_path, + get_cache_path, +) + + +DATA_URL = "https://www.cs.utexas.edu/users/ml/clamp/videoDescription/YouTubeClips.tar" + + +def download_datasets(root, url): + download_and_extract_archive(url=url, download_root=root) + + +def move_files(download_path, storage_path): + """ + Move files from download_path to storage_path + """ + print("Moving to {}".format(storage_path)) + + os.makedirs(storage_path, exist_ok=True) + + for file_name in os.listdir(download_path): + os.rename( + os.path.join(download_path, file_name), + os.path.join(storage_path, file_name), + ) + + +if __name__ == "__main__": + + config_path = get_abs_path("configs/datasets/msvd/defaults_cap.yaml") + + storage_dir = OmegaConf.load( + config_path + 
).datasets.msvd_cap.build_info.videos.storage + + download_dir = Path(get_cache_path(storage_dir)).parent / "download" + storage_dir = Path(get_cache_path(storage_dir)) + + if storage_dir.exists(): + print(f"Dataset already exists at {storage_dir}. Aborting.") + exit(0) + + try: + print("Downloading {}".format(DATA_URL)) + download_datasets(download_dir, DATA_URL) + except Exception as e: + # remove download dir if failed + cleanup_dir(download_dir) + print("Failed to download or extracting datasets. Aborting.") + + move_files(download_dir / "YouTubeClips", storage_dir) + cleanup_dir(download_dir) diff --git a/LAVIS-main/lavis/datasets/download_scripts/download_nocaps.py b/LAVIS-main/lavis/datasets/download_scripts/download_nocaps.py new file mode 100644 index 0000000000000000000000000000000000000000..ab56a7c10d958e6debb3968ca1c4def3da3beb0a --- /dev/null +++ b/LAVIS-main/lavis/datasets/download_scripts/download_nocaps.py @@ -0,0 +1,134 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import json +import logging +import os +import time +from multiprocessing import Pool + +import numpy as np +import requests +import tqdm +from lavis.common.utils import cleanup_dir, get_abs_path, get_cache_path +from omegaconf import OmegaConf + +header_mzl = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36", + # "User-Agent": "Googlebot-Image/1.0", # Pretend to be googlebot + # "X-Forwarded-For": "64.18.15.200", +} + +header_gbot = { + "User-Agent": "Googlebot-Image/1.0", # Pretend to be googlebot +} + +headers = [header_mzl, header_gbot] + +# Setup +logging.basicConfig(filename="download_nocaps.log", filemode="w", level=logging.INFO) +requests.packages.urllib3.disable_warnings( + requests.packages.urllib3.exceptions.InsecureRequestWarning +) + + +def download_file(url, filename): + max_retries = 20 + cur_retries = 0 + + header = headers[0] + + while cur_retries < max_retries: + try: + r = requests.get(url, headers=header, timeout=10) + with open(filename, "wb") as f: + f.write(r.content) + + break + except Exception as e: + logging.info(" ".join(repr(e).splitlines())) + logging.error(url) + cur_retries += 1 + + # random sample a header from headers + header = headers[np.random.randint(0, len(headers))] + + time.sleep(3 + cur_retries * 2) + + +def download_image_from_url_val(url): + basename = os.path.basename(url) + filename = os.path.join(storage_dir, "val", basename) + + download_file(url, filename) + + +def download_image_from_url_test(url): + basename = os.path.basename(url) + filename = os.path.join(storage_dir, "test", basename) + + download_file(url, filename) + + +if __name__ == "__main__": + os.makedirs("tmp", exist_ok=True) + + # storage dir + config_path = get_abs_path("configs/datasets/nocaps/defaults.yaml") + + 
storage_dir = OmegaConf.load(config_path).datasets.nocaps.build_info.images.storage + storage_dir = get_cache_path(storage_dir) + # make sure the storage dir exists + os.makedirs(storage_dir, exist_ok=True) + print("Storage dir:", storage_dir) + + # make sure the storage dir for val and test exists + os.makedirs(os.path.join(storage_dir, "val"), exist_ok=True) + os.makedirs(os.path.join(storage_dir, "test"), exist_ok=True) + + # download annotations + val_url = "https://nocaps.s3.amazonaws.com/nocaps_val_4500_captions.json" + tst_url = "https://s3.amazonaws.com/nocaps/nocaps_test_image_info.json" + + print("Downloading validation annotations from %s" % val_url) + download_file(val_url, "tmp/nocaps_val_ann.json") + print("Downloading testing annotations from %s" % tst_url) + download_file(tst_url, "tmp/nocaps_tst_ann.json") + + # open annotations + val_ann = json.load(open("tmp/nocaps_val_ann.json")) + tst_ann = json.load(open("tmp/nocaps_tst_ann.json")) + + # collect image urls + val_info = val_ann["images"] + tst_info = tst_ann["images"] + + val_urls = [info["coco_url"] for info in val_info] + tst_urls = [info["coco_url"] for info in tst_info] + + # setup multiprocessing + # large n_procs possibly causes server to reject requests + n_procs = 16 + + with Pool(n_procs) as pool: + print("Downloading validation images...") + list( + tqdm.tqdm( + pool.imap(download_image_from_url_val, val_urls), total=len(val_urls) + ) + ) + + with Pool(n_procs) as pool: + print("Downloading test images...") + list( + tqdm.tqdm( + pool.imap(download_image_from_url_test, tst_urls), total=len(tst_urls) + ) + ) + + # clean tmp + cleanup_dir("tmp") diff --git a/LAVIS-main/lavis/datasets/download_scripts/download_sbu.py b/LAVIS-main/lavis/datasets/download_scripts/download_sbu.py new file mode 100644 index 0000000000000000000000000000000000000000..9ffbf43c670d471f7eb160bcb8a9b6bd887aaf65 --- /dev/null +++ b/LAVIS-main/lavis/datasets/download_scripts/download_sbu.py @@ -0,0 +1,82 @@ +""" + 
Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import io +import os +import pathlib +import urllib +import tqdm + +from concurrent.futures import ThreadPoolExecutor + +from lavis.common.utils import get_abs_path, get_cache_path +from lavis.datasets.builders import load_dataset +from omegaconf import OmegaConf +from PIL import Image + +# DATA_URL = {"train": "http://www.cs.rice.edu/~vo9/sbucaptions/sbu_images.tar"} + +USER_AGENT = ( + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1" +) + + +def fetch_single_image(image_url, timeout=None, retries=0): + for _ in range(retries + 1): + try: + request = urllib.request.Request( + image_url, + data=None, + headers={"user-agent": USER_AGENT}, + ) + with urllib.request.urlopen(request, timeout=timeout) as req: + image = Image.open(io.BytesIO(req.read())) + break + except Exception: + image = None + return image + + +def download_and_save_image(ann, save_dir, timeout=None, retries=0): + image = fetch_single_image(ann["url"], timeout=timeout, retries=retries) + + if image is not None: + image_path = os.path.join(save_dir, ann["image"]) + print(image_path) + image.save(image_path) + + +if __name__ == "__main__": + + config_path = get_abs_path("configs/datasets/sbu_caption/defaults.yaml") + + storage_dir = OmegaConf.load( + config_path + ).datasets.sbu_caption.build_info.images.storage + + storage_dir = pathlib.Path(get_cache_path(storage_dir)) + + if storage_dir.exists(): + print(f"Dataset already exists at {storage_dir}. 
Aborting.") + exit(0) + + storage_dir.mkdir(parents=True, exist_ok=True) + + num_threads = 20 + dset = load_dataset("sbu_caption")["train"].annotation + + print("Downloading dataset...") + # multiprocessing + with ThreadPoolExecutor(max_workers=num_threads) as executor: + for ann in tqdm.tqdm(dset): + executor.submit( + download_and_save_image, + ann, + storage_dir, + timeout=30, + retries=10, + ) diff --git a/LAVIS-main/lavis/datasets/download_scripts/download_vg.py b/LAVIS-main/lavis/datasets/download_scripts/download_vg.py new file mode 100644 index 0000000000000000000000000000000000000000..7fbb7828f035f2cc9b32471129f0d2ec0f916f8e --- /dev/null +++ b/LAVIS-main/lavis/datasets/download_scripts/download_vg.py @@ -0,0 +1,55 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +from pathlib import Path + +from omegaconf import OmegaConf + +from lavis.common.utils import ( + cleanup_dir, + download_and_extract_archive, + get_abs_path, + get_cache_path, +) + + +DATA_URL = { + "train": "https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip", + "train2": "https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip", +} + + +def download_datasets(root, url): + download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir) + + +if __name__ == "__main__": + + config_path = get_abs_path("configs/datasets/vg/defaults_caption.yaml") + + storage_dir = OmegaConf.load( + config_path + ).datasets.vg_caption.build_info.images.storage + + download_dir = Path(get_cache_path(storage_dir)).parent / "download" + storage_dir = Path(get_cache_path(storage_dir)) + + if storage_dir.exists(): + print(f"Dataset already exists at {storage_dir}. 
Aborting.") + exit(0) + + try: + for k, v in DATA_URL.items(): + print("Downloading {} to {}".format(v, k)) + download_datasets(download_dir, v) + except Exception as e: + # remove download dir if failed + cleanup_dir(download_dir) + print("Failed to download or extracting datasets. Aborting.") + + cleanup_dir(download_dir) diff --git a/LAVIS-main/lavis/datasets/download_scripts/download_violin.py b/LAVIS-main/lavis/datasets/download_scripts/download_violin.py new file mode 100644 index 0000000000000000000000000000000000000000..45b850965bbaa671caef955ea60c7d1f302ecc37 --- /dev/null +++ b/LAVIS-main/lavis/datasets/download_scripts/download_violin.py @@ -0,0 +1,19 @@ +""" + Copyright (c) 2023, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import json +import os + +json_path = './violin_annotation.json' + +## convert annotations +all_json = json.load(open(json_path)) +train_data = [v for v in all_json.values() if 'split' in v and v['split'] == 'train'] +test_data = [v for v in all_json.values() if 'split' in v and v['split'] == 'test'] + +json.dump(train_data, open('train.json', 'w')) +json.dump(test_data, open('test.json', 'w')) \ No newline at end of file