yuccaaa commited on
Commit
48cce71
·
verified ·
1 Parent(s): 5c8f92e

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/trainer_state.json +0 -0
  2. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_loss.png +0 -0
  3. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_runtime.png +0 -0
  4. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_samples_per_second.png +0 -0
  5. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_steps_per_second.png +0 -0
  6. LAVIS-main/lavis/configs/datasets/gqa/balanced_testdev_instruct.yaml +46 -0
  7. LAVIS-main/lavis/configs/datasets/gqa/balanced_val.yaml +30 -0
  8. LAVIS-main/lavis/configs/datasets/gqa/balanced_val_instruct.yaml +47 -0
  9. LAVIS-main/lavis/configs/datasets/gqa/defaults.yaml +36 -0
  10. LAVIS-main/lavis/configs/datasets/gqa/defaults_instruct.yaml +55 -0
  11. LAVIS-main/lavis/configs/datasets/iconqa/defaults.yaml +52 -0
  12. LAVIS-main/lavis/configs/datasets/iconqa/defaults_instruct.yaml +55 -0
  13. LAVIS-main/lavis/configs/datasets/imagenet/defaults.yaml +15 -0
  14. LAVIS-main/lavis/configs/datasets/laion/defaults_2B_multi.yaml +13 -0
  15. LAVIS-main/lavis/configs/datasets/laion/defaults_400M.yaml +20 -0
  16. LAVIS-main/lavis/configs/datasets/laion/defaults_400M_instruct.yaml +31 -0
  17. LAVIS-main/lavis/configs/datasets/llava150k/defaults_dial.yaml +32 -0
  18. LAVIS-main/lavis/configs/datasets/modelnet40/defaults_cls.yaml +55 -0
  19. LAVIS-main/lavis/configs/datasets/msrvtt/defaults_cap.yaml +24 -0
  20. LAVIS-main/lavis/configs/datasets/msrvtt/defaults_cap_instruct.yaml +48 -0
  21. LAVIS-main/lavis/configs/datasets/msrvtt/defaults_qa.yaml +27 -0
  22. LAVIS-main/lavis/configs/datasets/msrvtt/defaults_qa_instruct.yaml +51 -0
  23. LAVIS-main/lavis/configs/datasets/msrvtt/defaults_ret.yaml +24 -0
  24. LAVIS-main/lavis/configs/datasets/msvd/defaults_cap.yaml +24 -0
  25. LAVIS-main/lavis/configs/datasets/msvd/defaults_cap_instruct.yaml +50 -0
  26. LAVIS-main/lavis/configs/datasets/msvd/defaults_qa.yaml +29 -0
  27. LAVIS-main/lavis/configs/datasets/msvd/defaults_qa_instruct.yaml +53 -0
  28. LAVIS-main/lavis/configs/datasets/music_avqa/defaults_mm_qa.yaml +66 -0
  29. LAVIS-main/lavis/configs/datasets/music_avqa/defaults_mm_qa_instruct.yaml +69 -0
  30. LAVIS-main/lavis/configs/datasets/nlvr/defaults.yaml +24 -0
  31. LAVIS-main/lavis/configs/datasets/nocaps/defaults.yaml +22 -0
  32. LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_cap.yaml +54 -0
  33. LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_cap_instruct.yaml +55 -0
  34. LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_qa.yaml +55 -0
  35. LAVIS-main/lavis/configs/datasets/ocrvqa/defaults.yaml +33 -0
  36. LAVIS-main/lavis/configs/datasets/ocrvqa/defaults_instruct.yaml +35 -0
  37. LAVIS-main/lavis/configs/datasets/okvqa/defaults.yaml +37 -0
  38. LAVIS-main/lavis/configs/datasets/okvqa/defaults_instruct.yaml +53 -0
  39. LAVIS-main/lavis/configs/datasets/sbu_caption/defaults.yaml +22 -0
  40. LAVIS-main/lavis/configs/datasets/sbu_caption/defaults_instruct.yaml +38 -0
  41. LAVIS-main/lavis/configs/datasets/scienceqa/defaults.yaml +51 -0
  42. LAVIS-main/lavis/configs/datasets/scienceqa/defaults_instruct.yaml +54 -0
  43. LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE +25 -0
  44. LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md +22 -0
  45. LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_12m.ipynb +227 -0
  46. LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_3m.ipynb +227 -0
  47. LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py +232 -0
  48. LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py +229 -0
  49. LAVIS-main/lavis/models/__init__.py +270 -0
  50. LAVIS-main/lavis/models/albef_models/__init__.py +202 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_loss.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_runtime.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_samples_per_second.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_steps_per_second.png ADDED
LAVIS-main/lavis/configs/datasets/gqa/balanced_testdev_instruct.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ gqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ task: qa
23
+ modality: image
24
+ eval:
25
+ name: blip_question
26
+
27
+ build_info:
28
+ # Be careful not to append minus sign (-) before split to avoid itemizing
29
+ annotations:
30
+ train:
31
+ url:
32
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
33
+ storage:
34
+ - gqa/annotations/train_balanced_questions.json
35
+ val:
36
+ url:
37
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
38
+ storage:
39
+ - gqa/annotations/testdev_balanced_questions.json
40
+ test:
41
+ url:
42
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
43
+ storage:
44
+ - gqa/annotations/test_balanced_questions.json
45
+ images:
46
+ storage: /export/share/datasets/vision/GQA/images #gqa/images/
LAVIS-main/lavis/configs/datasets/gqa/balanced_val.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ gqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url:
16
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
17
+ storage:
18
+ - gqa/annotations/train_balanced_questions.json
19
+ val:
20
+ url:
21
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json
22
+ storage:
23
+ - gqa/annotations/val_balanced_questions.json
24
+ test:
25
+ url:
26
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
27
+ storage:
28
+ - gqa/annotations/test_balanced_questions.json
29
+ images:
30
+ storage: gqa/images/
LAVIS-main/lavis/configs/datasets/gqa/balanced_val_instruct.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ gqa_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ task: qa
23
+ modality: image
24
+ eval:
25
+ name: blip_question
26
+
27
+
28
+ build_info:
29
+ # Be careful not to append minus sign (-) before split to avoid itemizing
30
+ annotations:
31
+ train:
32
+ url:
33
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
34
+ storage:
35
+ - gqa/annotations/train_balanced_questions.json
36
+ val:
37
+ url:
38
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json
39
+ storage:
40
+ - gqa/annotations/val_balanced_questions.json
41
+ test:
42
+ url:
43
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
44
+ storage:
45
+ - gqa/annotations/test_balanced_questions.json
46
+ images:
47
+ storage: /export/share/datasets/vision/GQA/images #gqa/images/
LAVIS-main/lavis/configs/datasets/gqa/defaults.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ gqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url:
16
+ - /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json
17
+ - /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json
18
+ storage:
19
+ - gqa/annotations/train_all_questions_0.json
20
+ - gqa/annotations/val_all_questions.json
21
+ val:
22
+ url:
23
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
24
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
25
+ storage:
26
+ - aokvqa/annotations/aokvqa_v1p0_val.json
27
+ - aokvqa/annotations/large_vocab_train_lavis.json
28
+ test:
29
+ url:
30
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
31
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
32
+ storage:
33
+ - aokvqa/annotations/aokvqa_v1p0_test.json
34
+ - aokvqa/annotations/large_vocab_train_lavis.json
35
+ images:
36
+ storage: gqa/images/
LAVIS-main/lavis/configs/datasets/gqa/defaults_instruct.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ gqa_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ task: qa
23
+ modality: image
24
+ eval:
25
+ name: blip_question
26
+
27
+
28
+ build_info:
29
+ # Be careful not to append minus sign (-) before split to avoid itemizing
30
+ annotations:
31
+ train:
32
+ url:
33
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/gqa/train_all_questions_0.json
34
+ # - /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json
35
+ # - /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json
36
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/gqa/val_all_questions.json
37
+ storage:
38
+ - gqa/annotations/train_all_questions_0.json
39
+ - gqa/annotations/val_all_questions.json
40
+ val:
41
+ url:
42
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
43
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
44
+ storage:
45
+ - aokvqa/annotations/aokvqa_v1p0_val.json
46
+ - aokvqa/annotations/large_vocab_train_lavis.json
47
+ test:
48
+ url:
49
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
50
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
51
+ storage:
52
+ - aokvqa/annotations/aokvqa_v1p0_test.json
53
+ - aokvqa/annotations/large_vocab_train_lavis.json
54
+ images:
55
+ storage: /export/share/datasets/vision/GQA/images #gqa/images/
LAVIS-main/lavis/configs/datasets/iconqa/defaults.yaml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ iconqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+
20
+ text_processor:
21
+ train:
22
+ name: blip_question
23
+ eval:
24
+ name: blip_question
25
+
26
+ build_info:
27
+ # Be careful not to append minus sign (-) before split to avoid itemizing
28
+ annotations:
29
+ train:
30
+ url:
31
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_train.json
32
+ # - /export/share/datasets/vision_language/iconqa/annotations_train.json
33
+ storage:
34
+ - iconqa/annotations/train.json
35
+ # - /export/share/datasets/vision_language/iconqa/annotations_train.json
36
+ val:
37
+ url:
38
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_val.json
39
+ # - /export/share/datasets/vision_language/iconqa/annotations_val.json
40
+ storage:
41
+ - iconqa/annotations/val.json
42
+ # - /export/share/datasets/vision_language/iconqa/annotations_val.json
43
+ test:
44
+ url:
45
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_test.json
46
+ # - /export/share/datasets/vision_language/iconqa/annotations_test.json
47
+ storage:
48
+ - iconqa/annotations/test.json
49
+ # - /export/share/datasets/vision_language/iconqa/annotations_test.json
50
+ images:
51
+ storage: /export/share/datasets/vision_language/iconqa/all_images/
52
+
LAVIS-main/lavis/configs/datasets/iconqa/defaults_instruct.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ iconqa_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+
20
+ text_processor:
21
+ train:
22
+ name: blip_instruction
23
+ modality: image
24
+ task: qa
25
+ eval:
26
+ name: blip_question
27
+
28
+ build_info:
29
+ # Be careful not to append minus sign (-) before split to avoid itemizing
30
+ annotations:
31
+ train:
32
+ url:
33
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_train.json
34
+ # - /export/share/datasets/vision_language/iconqa/annotations_train.json
35
+ storage:
36
+ - iconqa/annotations/train.json
37
+ # - /export/share/datasets/vision_language/iconqa/annotations_train.json
38
+ # val:
39
+ # url:
40
+ # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_val.json
41
+ # # - /export/share/datasets/vision_language/iconqa/annotations_val.json
42
+ # storage:
43
+ # - iconqa/annotations/val.json
44
+ # # - /export/share/datasets/vision_language/iconqa/annotations_val.json
45
+ # test:
46
+ # url:
47
+ # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_test.json
48
+ # # - /export/share/datasets/vision_language/iconqa/annotations_test.json
49
+ # storage:
50
+ # - iconqa/annotations/test.json
51
+ # # - /export/share/datasets/vision_language/iconqa/annotations_test.json
52
+
53
+ images:
54
+ storage: /export/share/datasets/vision_language/iconqa/all_images/
55
+
LAVIS-main/lavis/configs/datasets/imagenet/defaults.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ imagenet:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ splits: ["val"]
14
+ images:
15
+ storage: /export/share/datasets/vision/imagenet
LAVIS-main/lavis/configs/datasets/laion/defaults_2B_multi.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ laion2B_multi:
8
+
9
+ data_type: images
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
LAVIS-main/lavis/configs/datasets/laion/defaults_400M.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ laion400M:
8
+
9
+ data_type: images
10
+
11
+ text_processor:
12
+ train:
13
+ name: blip_caption
14
+ eval:
15
+ name: blip_caption
16
+
17
+ build_info:
18
+ # Be careful not to append minus sign (-) before split to avoid itemizing
19
+ storage: /export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{00..15}_shard{000000..000118}.tar
20
+ # storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
LAVIS-main/lavis/configs/datasets/laion/defaults_400M_instruct.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ laion400M_instruct:
8
+
9
+ data_type: images
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+
20
+ text_processor:
21
+ train:
22
+ name: blip_instruction
23
+ modality: image
24
+ task: caption
25
+ eval:
26
+ name: blip_caption
27
+
28
+ build_info:
29
+ # Be careful not to append minus sign (-) before split to avoid itemizing
30
+ storage: /export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{00..15}_shard{000000..000118}.tar
31
+ # storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
LAVIS-main/lavis/configs/datasets/llava150k/defaults_dial.yaml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ llava150k_dialogue_instruct: #394276 train examples
8
+
9
+ data_type: images
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: "blip_caption"
22
+
23
+ build_info:
24
+ annotations:
25
+ train:
26
+ url:
27
+ - https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json
28
+ storage:
29
+ - LLaVA-Instruct-150K/annotations/lava_instruct_150k.json
30
+ # Be careful not to append minus sign (-) before split to avoid itemizing
31
+ images:
32
+ storage: /export/share/datasets/vision/coco/images/train2017
LAVIS-main/lavis/configs/datasets/modelnet40/defaults_cls.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ modelnet40_cls: # name of the dataset builder
8
+ data_type: [pc, images]
9
+
10
+ vis_processor:
11
+ train:
12
+ name: "clip_image_train"
13
+ image_size: 224
14
+ eval:
15
+ name: "clip_image_eval"
16
+ image_size: 224
17
+
18
+ pc_processor:
19
+ train:
20
+ name: "ulip_pc"
21
+ eval:
22
+ name: "ulip_pc"
23
+ text_processor:
24
+ train:
25
+ name: "blip_caption"
26
+ eval:
27
+ name: "blip_caption"
28
+
29
+ build_info:
30
+ # Be careful not to append minus sign (-) before split to avoid itemizing
31
+ annotations:
32
+ train:
33
+ url:
34
+ - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_shape_names.txt
35
+ - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
36
+ - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_train.txt
37
+ storage:
38
+ - modelnet40_normal_resampled/modelnet40_shape_names.txt
39
+ - modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
40
+ - /modelnet40_normal_resampled/modelnet40_train.txt
41
+ val:
42
+ url:
43
+ - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_shape_names.txt
44
+ - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
45
+ - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_test.txt
46
+ storage:
47
+ - modelnet40_normal_resampled/modelnet40_shape_names.txt
48
+ - modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
49
+ - modelnet40_normal_resampled/modelnet40_test.txt
50
+
51
+ pc:
52
+ storage: /export/home/ULIP/data/modelnet40_normal_resampled
53
+
54
+ images:
55
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet40_pc_img
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_cap.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msrvtt_cap: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json
16
+ storage: msrvtt/annotations/cap_train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json
19
+ storage: msrvtt/annotations/cap_val.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json
22
+ storage: msrvtt/annotations/cap_test.json
23
+ videos:
24
+ storage: msrvtt/videos
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_cap_instruct.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msrvtt_caption_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: True
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: True
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_instruction
30
+ task: caption
31
+ modality: video
32
+ eval:
33
+ name: blip_caption
34
+
35
+ build_info:
36
+ # Be careful not to append minus sign (-) before split to avoid itemizing
37
+ annotations:
38
+ train:
39
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json
40
+ storage: msrvtt/annotations/cap_train.json
41
+ # val:
42
+ # url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json
43
+ # storage: msrvtt/annotations/cap_val.json
44
+ # test:
45
+ # url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json
46
+ # storage: msrvtt/annotations/cap_test.json
47
+ videos:
48
+ storage: msrvtt/videos
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_qa.yaml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msrvtt_qa: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
16
+ storage: msrvtt/annotations/qa_train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
19
+ storage: msrvtt/annotations/qa_val.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
22
+ storage: msrvtt/annotations/qa_test.json
23
+ ans2label:
24
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
25
+ storage: msrvtt/annotations/qa_ans2label.json
26
+ videos:
27
+ storage: msrvtt/videos
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_qa_instruct.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msrvtt_qa_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: True
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: True
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_instruction
30
+ task: qa
31
+ modality: video
32
+ eval:
33
+ name: blip_question
34
+
35
+ build_info:
36
+ # Be careful not to append minus sign (-) before split to avoid itemizing
37
+ annotations:
38
+ train:
39
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
40
+ storage: msrvtt/annotations/qa_train.json
41
+ # val:
42
+ # url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
43
+ # storage: msrvtt/annotations/qa_val.json
44
+ # test:
45
+ # url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
46
+ # storage: msrvtt/annotations/qa_test.json
47
+ ans2label:
48
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
49
+ storage: msrvtt/annotations/qa_ans2label.json
50
+ videos:
51
+ storage: msrvtt/videos
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_ret.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msrvtt_retrieval: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json
16
+ storage: msrvtt/annotations/retrieval_train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json
19
+ storage: msrvtt/annotations/retrieval_val.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json
22
+ storage: msrvtt/annotations/retrieval_test.json
23
+ videos:
24
+ storage: msrvtt/videos
LAVIS-main/lavis/configs/datasets/msvd/defaults_cap.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msvd_cap: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json
16
+ storage: msvd/annotations/cap_train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json
19
+ storage: msvd/annotations/cap_val.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json
22
+ storage: msvd/annotations/cap_test.json
23
+ videos:
24
+ storage: msvd/videos
LAVIS-main/lavis/configs/datasets/msvd/defaults_cap_instruct.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msvd_caption_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: True
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: True
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_instruction
30
+ task: caption
31
+ modality: video
32
+ eval:
33
+ name: blip_caption
34
+
35
+
36
+ build_info:
37
+ # Be careful not to append minus sign (-) before split to avoid itemizing
38
+ annotations:
39
+ train:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json
41
+ storage: msvd/annotations/cap_train.json
42
+ val:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json
44
+ storage: msvd/annotations/cap_val.json
45
+ test:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json
47
+ storage: msvd/annotations/cap_test.json
48
+ videos:
49
+ # storage: msvd/videos
50
+ storage: /export/share/datasets/vision_language/msvd/videos
LAVIS-main/lavis/configs/datasets/msvd/defaults_qa.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msvd_qa: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
16
+ storage: msvd/annotations/qa_train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
19
+ storage: msvd/annotations/qa_val.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
22
+ storage: msvd/annotations/qa_test.json
23
+ ans2label:
24
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
25
+ storage: msvd/annotations/qa_ans2label.json
26
+ videos:
27
+ storage: msvd/videos
28
+
29
+ instance_id_key: question_id
LAVIS-main/lavis/configs/datasets/msvd/defaults_qa_instruct.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msvd_qa_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: True
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: True
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_instruction
30
+ task: qa
31
+ modality: video
32
+ eval:
33
+ name: blip_question
34
+
35
+ build_info:
36
+ # Be careful not to append minus sign (-) before split to avoid itemizing
37
+ annotations:
38
+ train:
39
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
40
+ storage: msvd/annotations/qa_train.json
41
+ val:
42
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
43
+ storage: msvd/annotations/qa_val.json
44
+ test:
45
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
46
+ storage: msvd/annotations/qa_test.json
47
+ ans2label:
48
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
49
+ storage: msvd/annotations/qa_ans2label.json
50
+ videos:
51
+ storage: /export/share/datasets/vision_language/msvd/videos
52
+
53
+ instance_id_key: question_id
LAVIS-main/lavis/configs/datasets/music_avqa/defaults_mm_qa.yaml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ datasets:
6
+ musicavqa_mm: # name of the dataset builder
7
+ data_type: [video, audio]
8
+
9
+ video_processor:
10
+ train:
11
+ name: alpro_video_train
12
+ n_frms: 4
13
+ image_size: 224
14
+ min_scale: 0.9
15
+ max_scale: 1.0
16
+ full_video: True
17
+ eval:
18
+ name: alpro_video_eval
19
+ n_frms: 4
20
+ image_size: 224
21
+ min_scale: 0.9
22
+ max_scale: 1.0
23
+ full_video: True
24
+
25
+ text_processor:
26
+ train:
27
+ name: blip_question
28
+ eval:
29
+ name: blip_question
30
+
31
+ audio_processor:
32
+ train:
33
+ name: beats_audio
34
+ sampling_rate: 16000
35
+ eval:
36
+ name: beats_audio
37
+ sampling_rate: 16000
38
+ is_eval: False
39
+
40
+ build_info:
41
+ # Be careful not to append minus sign (-) before split to avoid itemizing
42
+ annotations:
43
+ val:
44
+ url:
45
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-val.json
46
+ # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json
47
+ storage:
48
+ - /musicavqa/annotations/avqa-val.json
49
+ # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json
50
+
51
+ test:
52
+ url:
53
+ # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-test.json
54
+ - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json
55
+ storage:
56
+ # - /musicavqa/annotations/avqa-test.json
57
+ - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json
58
+
59
+ templates: null
60
+
61
+ audio:
62
+ storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real
63
+
64
+ video:
65
+ storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real
66
+
LAVIS-main/lavis/configs/datasets/music_avqa/defaults_mm_qa_instruct.yaml ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ musicavqa_mm_instruct: # name of the dataset builder
8
+ data_type: [video, audio]
9
+
10
+ video_processor:
11
+ train:
12
+ name: alpro_video_train
13
+ n_frms: 4
14
+ image_size: 224
15
+ min_scale: 0.9
16
+ max_scale: 1.0
17
+ full_video: True
18
+ eval:
19
+ name: alpro_video_eval
20
+ n_frms: 4
21
+ image_size: 224
22
+ min_scale: 0.9
23
+ max_scale: 1.0
24
+ full_video: True
25
+
26
+ text_processor:
27
+ train:
28
+ name: blip_instruction
29
+ task: qa
30
+ modality: video
31
+ eval:
32
+ name: blip_question
33
+
34
+ audio_processor:
35
+ train:
36
+ name: beats_audio
37
+ sampling_rate: 16000
38
+ eval:
39
+ name: beats_audio
40
+ sampling_rate: 16000
41
+ is_eval: False
42
+
43
+ build_info:
44
+ # Be careful not to append minus sign (-) before split to avoid itemizing
45
+ annotations:
46
+ val:
47
+ url:
48
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-val.json
49
+ # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json
50
+ storage:
51
+ - /musicavqa/annotations/avqa-val.json
52
+ # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json
53
+
54
+ test:
55
+ url:
56
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-test.json
57
+ # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json
58
+ storage:
59
+ - /musicavqa/annotations/avqa-test.json
60
+ # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json
61
+
62
+ templates: null
63
+
64
+ audio:
65
+ storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real
66
+
67
+ video:
68
+ storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real
69
+
LAVIS-main/lavis/configs/datasets/nlvr/defaults.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ nlvr:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json
16
+ storage: nlvr/annotations/train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
19
+ storage: nlvr/annotations/dev.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
22
+ storage: nlvr/annotations/test.json
23
+ images:
24
+ storage: /export/share/datasets/vision/NLVR2/
LAVIS-main/lavis/configs/datasets/nocaps/defaults.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ nocaps: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ val:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json
16
+ storage: nocaps/annotations/nocaps_val.json
17
+ test:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json
19
+ storage: nocaps/annotations/nocaps_test.json
20
+ images:
21
+ storage: nocaps/images
22
+ # storage: /export/share/datasets/vision/nocaps/
LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_cap.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ objaverse_mm_caption: # 651576 train examples
8
+ vis_processor:
9
+ train:
10
+ name: "clip_image_train"
11
+ image_size: 224
12
+ eval:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ pc_processor:
16
+ train:
17
+ name: "ulip_pc"
18
+ eval:
19
+ name: "ulip_pc"
20
+
21
+ text_processor:
22
+ train:
23
+ name: "blip_caption"
24
+ eval:
25
+ name: "blip_caption"
26
+
27
+ data_type: [pc, images] # [images|pc]
28
+
29
+ build_info:
30
+ # Be careful not to append minus sign (-) before split to avoid itemizing
31
+ annotations:
32
+ train:
33
+ url:
34
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_train.csv
35
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
36
+ storage:
37
+ - objaverse/annotations/train.csv
38
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
39
+
40
+ val:
41
+ url:
42
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_val.csv
43
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
44
+ storage:
45
+ - objaverse/annotations/val.csv
46
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
47
+
48
+ templates: null
49
+
50
+ pc:
51
+ storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
52
+
53
+ images:
54
+ storage: /export/einstein-vision/3d_vision/objaverse_captions/images/
LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_cap_instruct.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ objaverse_mm_caption_instruct: # 651576 train examples
8
+ vis_processor:
9
+ train:
10
+ name: "clip_image_train"
11
+ image_size: 224
12
+ eval:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ pc_processor:
16
+ train:
17
+ name: "ulip_pc"
18
+ eval:
19
+ name: "ulip_pc"
20
+ text_processor:
21
+ train:
22
+ name: "blip_instruction"
23
+ modality: pc
24
+ task: caption
25
+ eval:
26
+ name: "blip_caption"
27
+
28
+ data_type: [pc, images] # [images|pc]
29
+
30
+ build_info:
31
+ # Be careful not to append minus sign (-) before split to avoid itemizing
32
+ annotations:
33
+ train:
34
+ url:
35
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_train.csv
36
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
37
+ storage:
38
+ - objaverse/annotations/train.csv
39
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
40
+
41
+ # val:
42
+ # url:
43
+ # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_val.csv
44
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
45
+ # storage:
46
+ # # - objaverse/annotations/val.csv
47
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
48
+
49
+ templates: null
50
+
51
+ pc:
52
+ storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
53
+
54
+ images:
55
+ storage: /export/einstein-vision/3d_vision/objaverse_captions/images/
LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_qa.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ objaverse_mm_qa: # 250070
8
+ vis_processor:
9
+ train:
10
+ name: "clip_image_train"
11
+ image_size: 224
12
+ eval:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ pc_processor:
16
+ train:
17
+ name: "ulip_pc"
18
+ eval:
19
+ name: "ulip_pc"
20
+ text_processor:
21
+ train:
22
+ name: "blip_instruction"
23
+ modality: pc
24
+ task: qa
25
+ eval:
26
+ name: "blip_question"
27
+
28
+
29
+ data_type: pc # [images|pc]
30
+
31
+ build_info:
32
+ kwargs:
33
+ add_binary: True
34
+ remove_model_answer: True
35
+ # Be careful not to append minus sign (-) before split to avoid itemizing
36
+ annotations:
37
+ train:
38
+ url:
39
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/CAP3DQA_final.csv
40
+ # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final.csv
41
+ storage:
42
+ - objaverse_qa/annotations/train.csv
43
+ # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final.csv
44
+ # val:
45
+ # url:
46
+ # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/CAP3DQA_final_val.csv
47
+ # # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final_val.csv
48
+ # storage:
49
+ # - objaverse_qa/annotations/val.csv
50
+ # # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final_val.csv
51
+
52
+ templates: null
53
+
54
+ pc:
55
+ storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
LAVIS-main/lavis/configs/datasets/ocrvqa/defaults.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ ocr_vqa: # 1002146 train examples
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+
16
+ text_processor:
17
+ train:
18
+ name: "blip_question"
19
+ eval:
20
+ name: blip_question
21
+
22
+ build_info:
23
+ # Be careful not to append minus sign (-) before split to avoid itemizing
24
+ annotations:
25
+ train:
26
+ url:
27
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/ocrvqa/ocrvqa.json
28
+ # - /export/video-language-dataset/ocrvqa/ocrvqa.json
29
+ storage:
30
+ - ocrvqa/annotations/ocrvqa.json
31
+ # - /export/video-language-dataset/ocrvqa/ocrvqa.json
32
+ images:
33
+ storage: /export/video-language-dataset/ocrvqa/images/
LAVIS-main/lavis/configs/datasets/ocrvqa/defaults_instruct.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ ocr_vqa_instruct: # 1002146 train examples
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+
16
+ text_processor:
17
+ train:
18
+ name: blip_instruction
19
+ modality: image
20
+ task: qa
21
+ eval:
22
+ name: blip_question
23
+
24
+ build_info:
25
+ # Be careful not to append minus sign (-) before split to avoid itemizing
26
+ annotations:
27
+ train:
28
+ url:
29
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/ocrvqa/ocrvqa.json
30
+ # - /export/video-language-dataset/ocrvqa/ocrvqa.json
31
+ storage:
32
+ - ocrvqa/annotations/ocrvqa.json
33
+ # - /export/video-language-dataset/ocrvqa/ocrvqa.json
34
+ images:
35
+ storage: /export/video-language-dataset/ocrvqa/images/
LAVIS-main/lavis/configs/datasets/okvqa/defaults.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ ok_vqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url:
16
+ # TODO make this order insensitive
17
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
18
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
19
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
20
+ storage:
21
+ - okvqa/annotations/okvqa_train.json
22
+ # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
23
+ # - okvqa/annotations/mscoco_train2014_annotations.json
24
+ test:
25
+ url:
26
+ # TODO make this order insensitive
27
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
28
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
29
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
30
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
31
+ storage:
32
+ - okvqa/annotations/vqa_val_eval.json
33
+ - okvqa/annotations/answer_list.json
34
+ - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
35
+ - okvqa/annotations/mscoco_val2014_annotations.json
36
+ images:
37
+ storage: coco/images/
LAVIS-main/lavis/configs/datasets/okvqa/defaults_instruct.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ ok_vqa_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ modality: image
23
+ task: qa
24
+ eval:
25
+ name: blip_question
26
+
27
+ build_info:
28
+ # Be careful not to append minus sign (-) before split to avoid itemizing
29
+ annotations:
30
+ train:
31
+ url:
32
+ # TODO make this order insensitive
33
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
34
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
35
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
36
+ storage:
37
+ - okvqa/annotations/okvqa_train.json
38
+ # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
39
+ # - okvqa/annotations/mscoco_train2014_annotations.json
40
+ # test:
41
+ # url:
42
+ # # TODO make this order insensitive
43
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
44
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
45
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
46
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
47
+ # storage:
48
+ # - okvqa/annotations/vqa_val_eval.json
49
+ # - okvqa/annotations/answer_list.json
50
+ # - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
51
+ # - okvqa/annotations/mscoco_val2014_annotations.json
52
+ images:
53
+ storage: /export/share/datasets/vision/coco/images
LAVIS-main/lavis/configs/datasets/sbu_caption/defaults.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ sbu_caption:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url:
16
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json
17
+ # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json
18
+ storage:
19
+ - sbu_captions/annotations/sbu.json
20
+ images:
21
+ storage: sbu_captions/images
22
+ # storage: /export/share/datasets/vision_language/sbu_resize
LAVIS-main/lavis/configs/datasets/sbu_caption/defaults_instruct.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ sbu_caption_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ modality: image
23
+ task: caption
24
+ eval:
25
+ name: blip_caption
26
+
27
+ build_info:
28
+ # Be careful not to append minus sign (-) before split to avoid itemizing
29
+ annotations:
30
+ train:
31
+ url:
32
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json
33
+ # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json
34
+ storage:
35
+ - sbu_captions/annotations/sbu.json
36
+ images:
37
+ storage: sbu_captions/images
38
+ # storage: /export/share/datasets/vision_language/sbu_resize
LAVIS-main/lavis/configs/datasets/scienceqa/defaults.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ scienceqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_question
22
+ eval:
23
+ name: blip_question
24
+
25
+ build_info:
26
+ # Be careful not to append minus sign (-) before split to avoid itemizing
27
+ train:
28
+ url:
29
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_train.json
30
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json
31
+ storage:
32
+ - scienceqa/annotations/problems_train.json
33
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json
34
+ val:
35
+ url:
36
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_val.json
37
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json
38
+ storage:
39
+ - scienceqa/annotations/problems_val.json
40
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json
41
+ test:
42
+ url:
43
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_test.json
44
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json
45
+ storage:
46
+ - scienceqa/annotations/problems_test.json
47
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json
48
+
49
+ images:
50
+ storage: /export/video-language-dataset/ScienceQA/data/scienceqa/images/
51
+
LAVIS-main/lavis/configs/datasets/scienceqa/defaults_instruct.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ scienceqa_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ modality: image
23
+ task: qa
24
+ eval:
25
+ name: blip_question
26
+
27
+ build_info:
28
+ # Be careful not to append minus sign (-) before split to avoid itemizing
29
+ annotations:
30
+ train:
31
+ url:
32
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_train.json
33
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json
34
+ storage:
35
+ - scienceqa/annotations/problems_train.json
36
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json
37
+ val:
38
+ url:
39
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_val.json
40
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json
41
+ storage:
42
+ - scienceqa/annotations/problems_val.json
43
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json
44
+ test:
45
+ url:
46
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_test.json
47
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json
48
+ storage:
49
+ - scienceqa/annotations/problems_test.json
50
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json
51
+
52
+ images:
53
+ storage: /export/video-language-dataset/ScienceQA/data/scienceqa/images/
54
+
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright 2022 Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven Hoi. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ MIT License
6
+
7
+ Copyright (c) 2019 Igor Brigadir
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ -->
7
+
8
+ # Download Conceptual Captions Data
9
+
10
+ Place data from: https://ai.google.com/research/ConceptualCaptions/download in this folder
11
+
12
+ `Train_GCC-training.tsv / cc3m.tsv` Training Split (3,318,333)
13
+
14
+ run `download_data_cc3m.py` or `download_data_cc12m.py`.
15
+
16
+ Images will be in the default LAVIS cache folders. You can stop and resume; the settings for splitting downloads into chunks/threads are not optimal, but they maxed out my connection, so I kept them as is.
17
+
18
+ Note: A previous version of this script used a different file naming scheme, this changed and if you are resuming a previously started download, you will get duplicates.
19
+
20
+ A bunch of them will fail to download and return web pages instead. These will need to be cleaned up later. See `downloaded_validation_report.tsv` after the download finishes for HTTP errors. Around 8% of images are gone, based on validation-set results. Setting the user agent might fix some additional errors, though it is unclear whether any sites reject requests based on it.
21
+
22
+ It should take about a day or two to download the training data, keep an eye on disk space.
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_12m.ipynb ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 15,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import json\n",
11
+ "\n",
12
+ "import pandas as pd\n",
13
+ "from tqdm import tqdm\n",
14
+ "from lavis.common.utils import get_abs_path, get_cache_path"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "cc12m = pd.read_csv(\"downloaded_cc12m_report.tsv.gz\", compression=\"gzip\", sep=\"\\t\", names=[\"caption\", \"path\", \"dataset\", \"mimetype\", \"size\", \"status\", \"url\"])"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 7,
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "data": {
33
+ "text/plain": [
34
+ "caption a very typical bus station\n",
35
+ "path /export/home/.cache/lavis/conceptual_caption/i...\n",
36
+ "dataset cc3m\n",
37
+ "mimetype image/jpeg\n",
38
+ "size 36078\n",
39
+ "status 200\n",
40
+ "url http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...\n",
41
+ "Name: 0, dtype: object"
42
+ ]
43
+ },
44
+ "execution_count": 7,
45
+ "metadata": {},
46
+ "output_type": "execute_result"
47
+ }
48
+ ],
49
+ "source": [
50
+ "cc12m.iloc[0]"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 3,
56
+ "metadata": {},
57
+ "outputs": [
58
+ {
59
+ "data": {
60
+ "text/plain": [
61
+ "3318333"
62
+ ]
63
+ },
64
+ "execution_count": 3,
65
+ "metadata": {},
66
+ "output_type": "execute_result"
67
+ }
68
+ ],
69
+ "source": [
70
+ "len(cc12m)"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 21,
76
+ "metadata": {},
77
+ "outputs": [
78
+ {
79
+ "name": "stderr",
80
+ "output_type": "stream",
81
+ "text": [
82
+ "100%|██████████| 3130587/3130587 [17:28<00:00, 2986.08it/s]"
83
+ ]
84
+ },
85
+ {
86
+ "name": "stdout",
87
+ "output_type": "stream",
88
+ "text": [
89
+ "Found 2759017 valid records\n"
90
+ ]
91
+ },
92
+ {
93
+ "name": "stderr",
94
+ "output_type": "stream",
95
+ "text": [
96
+ "\n"
97
+ ]
98
+ }
99
+ ],
100
+ "source": [
101
+ "cnt = 0\n",
102
+ "\n",
103
+ "valid_records = []\n",
104
+ "\n",
105
+ "for i, path in tqdm(enumerate(cc12m.path.unique()), total=len(cc12m.path.unique())):\n",
106
+ " path = str(path)\n",
107
+ " if os.path.exists(path):\n",
108
+ " record = cc12m.iloc[i]\n",
109
+ " valid_records.append({\"image\": record[\"path\"], \"caption\": record[\"caption\"]})\n",
110
+ "\n",
111
+ " cnt += 1\n",
112
+ "\n",
113
+ "print(\"Found {} valid records\".format(cnt))"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 22,
119
+ "metadata": {},
120
+ "outputs": [
121
+ {
122
+ "data": {
123
+ "text/plain": [
124
+ "2759017"
125
+ ]
126
+ },
127
+ "execution_count": 22,
128
+ "metadata": {},
129
+ "output_type": "execute_result"
130
+ }
131
+ ],
132
+ "source": [
133
+ "len(valid_records)"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 24,
139
+ "metadata": {},
140
+ "outputs": [
141
+ {
142
+ "data": {
143
+ "text/plain": [
144
+ "{'image': '/export/home/.cache/lavis/conceptual_caption/images/1_3239086386.jpg',\n",
145
+ " 'caption': 'sierra looked stunning in this top and this skirt while performing with person at their former university'}"
146
+ ]
147
+ },
148
+ "execution_count": 24,
149
+ "metadata": {},
150
+ "output_type": "execute_result"
151
+ }
152
+ ],
153
+ "source": [
154
+ "valid_records[1]"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": 28,
160
+ "metadata": {},
161
+ "outputs": [
162
+ {
163
+ "name": "stdout",
164
+ "output_type": "stream",
165
+ "text": [
166
+ "/export/home/.cache/lavis/conceptual_caption/annotations/cc3m.json already exists\n"
167
+ ]
168
+ },
169
+ {
170
+ "ename": "",
171
+ "evalue": "",
172
+ "output_type": "error",
173
+ "traceback": [
174
+ "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
175
+ ]
176
+ }
177
+ ],
178
+ "source": [
179
+ "from omegaconf import OmegaConf\n",
180
+ "\n",
181
+ "\n",
182
+ "config_path = get_abs_path(\"configs/datasets/conceptual_caption/defaults_12m.yaml\")\n",
183
+ "\n",
184
+ "ann_path = OmegaConf.load(\n",
185
+ " config_path\n",
186
+ ").datasets.conceptual_caption_12m.build_info.annotations.train.storage[0]\n",
187
+ "\n",
188
+ "ann_path = get_cache_path(ann_path)\n",
189
+ "\n",
190
+ "if os.path.exists(ann_path):\n",
191
+ " # abort\n",
192
+ " print(\"{} already exists\".format(ann_path))\n",
193
+ "else:\n",
194
+ " # Save the valid records to a json file\n",
195
+ " with open(ann_path, \"w\") as f:\n",
196
+ " f.write(json.dumps(valid_records))"
197
+ ]
198
+ }
199
+ ],
200
+ "metadata": {
201
+ "kernelspec": {
202
+ "display_name": "Python 3.8.10 ('base')",
203
+ "language": "python",
204
+ "name": "python3"
205
+ },
206
+ "language_info": {
207
+ "codemirror_mode": {
208
+ "name": "ipython",
209
+ "version": 3
210
+ },
211
+ "file_extension": ".py",
212
+ "mimetype": "text/x-python",
213
+ "name": "python",
214
+ "nbconvert_exporter": "python",
215
+ "pygments_lexer": "ipython3",
216
+ "version": "3.8.10"
217
+ },
218
+ "orig_nbformat": 4,
219
+ "vscode": {
220
+ "interpreter": {
221
+ "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
222
+ }
223
+ }
224
+ },
225
+ "nbformat": 4,
226
+ "nbformat_minor": 2
227
+ }
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_3m.ipynb ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 15,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import json\n",
11
+ "\n",
12
+ "import pandas as pd\n",
13
+ "from tqdm import tqdm\n",
14
+ "from lavis.common.utils import get_abs_path, get_cache_path"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "cc3m = pd.read_csv(\"downloaded_cc3m_report.tsv.gz\", compression=\"gzip\", sep=\"\\t\", names=[\"caption\", \"path\", \"dataset\", \"mimetype\", \"size\", \"status\", \"url\"])"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 7,
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "data": {
33
+ "text/plain": [
34
+ "caption a very typical bus station\n",
35
+ "path /export/home/.cache/lavis/conceptual_caption/i...\n",
36
+ "dataset cc3m\n",
37
+ "mimetype image/jpeg\n",
38
+ "size 36078\n",
39
+ "status 200\n",
40
+ "url http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...\n",
41
+ "Name: 0, dtype: object"
42
+ ]
43
+ },
44
+ "execution_count": 7,
45
+ "metadata": {},
46
+ "output_type": "execute_result"
47
+ }
48
+ ],
49
+ "source": [
50
+ "cc3m.iloc[0]"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 3,
56
+ "metadata": {},
57
+ "outputs": [
58
+ {
59
+ "data": {
60
+ "text/plain": [
61
+ "3318333"
62
+ ]
63
+ },
64
+ "execution_count": 3,
65
+ "metadata": {},
66
+ "output_type": "execute_result"
67
+ }
68
+ ],
69
+ "source": [
70
+ "len(cc3m)"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 21,
76
+ "metadata": {},
77
+ "outputs": [
78
+ {
79
+ "name": "stderr",
80
+ "output_type": "stream",
81
+ "text": [
82
+ "100%|██████████| 3130587/3130587 [17:28<00:00, 2986.08it/s]"
83
+ ]
84
+ },
85
+ {
86
+ "name": "stdout",
87
+ "output_type": "stream",
88
+ "text": [
89
+ "Found 2759017 valid records\n"
90
+ ]
91
+ },
92
+ {
93
+ "name": "stderr",
94
+ "output_type": "stream",
95
+ "text": [
96
+ "\n"
97
+ ]
98
+ }
99
+ ],
100
+ "source": [
101
+ "cnt = 0\n",
102
+ "\n",
103
+ "valid_records = []\n",
104
+ "\n",
105
+ "for i, path in tqdm(enumerate(cc3m.path.unique()), total=len(cc3m.path.unique())):\n",
106
+ " path = str(path)\n",
107
+ " if os.path.exists(path):\n",
108
+ " record = cc3m.iloc[i]\n",
109
+ " valid_records.append({\"image\": record[\"path\"], \"caption\": record[\"caption\"]})\n",
110
+ "\n",
111
+ " cnt += 1\n",
112
+ "\n",
113
+ "print(\"Found {} valid records\".format(cnt))"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 22,
119
+ "metadata": {},
120
+ "outputs": [
121
+ {
122
+ "data": {
123
+ "text/plain": [
124
+ "2759017"
125
+ ]
126
+ },
127
+ "execution_count": 22,
128
+ "metadata": {},
129
+ "output_type": "execute_result"
130
+ }
131
+ ],
132
+ "source": [
133
+ "len(valid_records)"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 24,
139
+ "metadata": {},
140
+ "outputs": [
141
+ {
142
+ "data": {
143
+ "text/plain": [
144
+ "{'image': '/export/home/.cache/lavis/conceptual_caption/images/1_3239086386.jpg',\n",
145
+ " 'caption': 'sierra looked stunning in this top and this skirt while performing with person at their former university'}"
146
+ ]
147
+ },
148
+ "execution_count": 24,
149
+ "metadata": {},
150
+ "output_type": "execute_result"
151
+ }
152
+ ],
153
+ "source": [
154
+ "valid_records[1]"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": 28,
160
+ "metadata": {},
161
+ "outputs": [
162
+ {
163
+ "name": "stdout",
164
+ "output_type": "stream",
165
+ "text": [
166
+ "/export/home/.cache/lavis/conceptual_caption/annotations/cc3m.json already exists\n"
167
+ ]
168
+ },
169
+ {
170
+ "ename": "",
171
+ "evalue": "",
172
+ "output_type": "error",
173
+ "traceback": [
174
+ "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
175
+ ]
176
+ }
177
+ ],
178
+ "source": [
179
+ "from omegaconf import OmegaConf\n",
180
+ "\n",
181
+ "\n",
182
+ "config_path = get_abs_path(\"configs/datasets/conceptual_caption/defaults_3m.yaml\")\n",
183
+ "\n",
184
+ "ann_path = OmegaConf.load(\n",
185
+ " config_path\n",
186
+ ").datasets.conceptual_caption_3m.build_info.annotations.train.storage[0]\n",
187
+ "\n",
188
+ "ann_path = get_cache_path(ann_path)\n",
189
+ "\n",
190
+ "if os.path.exists(ann_path):\n",
191
+ " # abort\n",
192
+ " print(\"{} already exists\".format(ann_path))\n",
193
+ "else:\n",
194
+ " # Save the valid records to a json file\n",
195
+ " with open(ann_path, \"w\") as f:\n",
196
+ " f.write(json.dumps(valid_records))"
197
+ ]
198
+ }
199
+ ],
200
+ "metadata": {
201
+ "kernelspec": {
202
+ "display_name": "Python 3.8.10 ('base')",
203
+ "language": "python",
204
+ "name": "python3"
205
+ },
206
+ "language_info": {
207
+ "codemirror_mode": {
208
+ "name": "ipython",
209
+ "version": 3
210
+ },
211
+ "file_extension": ".py",
212
+ "mimetype": "text/x-python",
213
+ "name": "python",
214
+ "nbconvert_exporter": "python",
215
+ "pygments_lexer": "ipython3",
216
+ "version": "3.8.10"
217
+ },
218
+ "orig_nbformat": 4,
219
+ "vscode": {
220
+ "interpreter": {
221
+ "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
222
+ }
223
+ }
224
+ },
225
+ "nbformat": 4,
226
+ "nbformat_minor": 2
227
+ }
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+
8
+ import time
9
+ from PIL import Image
10
+ from lavis.common.utils import get_abs_path, get_cache_path
11
+ from multiprocessing import Pool
12
+ from omegaconf import OmegaConf
13
+ from pathlib import Path
14
+ from torchvision.transforms import functional as TF
15
+ from tqdm import tqdm
16
+ import glob
17
+ import io
18
+ import json
19
+ import magic # pip install python-magic
20
+ import numpy as np
21
+ import os
22
+ import pandas as pd
23
+ import requests
24
+ import shelve
25
+ import zlib
26
+
27
+ headers = {
28
+ #'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
29
+ "User-Agent": "Googlebot-Image/1.0", # Pretend to be googlebot
30
+ "X-Forwarded-For": "64.18.15.200",
31
+ }
32
+
33
+
34
+ def _df_split_apply(tup_arg):
35
+ split_ind, subset, func = tup_arg
36
+ r = subset.apply(func, axis=1)
37
+ return (split_ind, r)
38
+
39
+
40
+ def df_multiprocess(df, processes, chunk_size, func, dataset_name):
41
+ print("Generating parts...")
42
+ with shelve.open(
43
+ "%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
44
+ ) as results:
45
+
46
+ pbar = tqdm(total=len(df), position=0)
47
+ # Resume:
48
+ finished_chunks = set([int(k) for k in results.keys()])
49
+ pbar.desc = "Resuming"
50
+ for k in results.keys():
51
+ pbar.update(len(results[str(k)][1]))
52
+
53
+ pool_data = (
54
+ (index, df[i : i + chunk_size], func)
55
+ for index, i in enumerate(range(0, len(df), chunk_size))
56
+ if index not in finished_chunks
57
+ )
58
+ print(
59
+ int(len(df) / chunk_size),
60
+ "parts.",
61
+ chunk_size,
62
+ "per part.",
63
+ "Using",
64
+ processes,
65
+ "processes",
66
+ )
67
+
68
+ pbar.desc = "Downloading"
69
+ with Pool(processes) as pool:
70
+ for i, result in enumerate(
71
+ pool.imap_unordered(_df_split_apply, pool_data, 2)
72
+ ):
73
+ results[str(result[0])] = result
74
+ pbar.update(len(result[1]))
75
+ pbar.close()
76
+
77
+ print("Finished Downloading.")
78
+ return
79
+
80
+
81
+ # Unique name based on url
82
# Unique, deterministic image path based on row index + CRC32 of the URL.
def _file_name(row):
    """Return the on-disk path for ``row``'s image.

    Uses the module-level ``storage_dir``; the CRC32 of the URL makes the
    name stable across runs so resuming finds already-downloaded files.
    """
    name = (
        "%s/%s_%s"
        % (
            storage_dir,
            row.name,
            (zlib.crc32(row["url"].encode("utf-8")) & 0xFFFFFFFF),
        )
        + ".jpg"
    )
    return name


# For checking mimetypes separately without download
def check_mimetype(row):
    """Fill ``mimetype``/``size`` for an already-downloaded file, if present."""
    if os.path.isfile(str(row["file"])):
        row["mimetype"] = magic.from_file(row["file"], mime=True)
        row["size"] = os.stat(row["file"]).st_size
    return row


# Don't download image, just check with a HEAD request; can't resume.
# Can use this instead of download_image to get HTTP status codes.
def check_download(row):
    """HEAD-check ``row['url']`` without downloading the body.

    Sets ``status`` (408 on any request failure) and ``headers``; on a
    successful response also records the target ``file`` path.
    """
    fname = _file_name(row)
    try:
        # not all sites will support HEAD
        response = requests.head(
            row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
        )
        row["status"] = response.status_code
        row["headers"] = dict(response.headers)
    except requests.RequestException:
        # log errors later; record any request failure as a 408 timeout
        row["status"] = 408
        return row
    if response.ok:
        row["file"] = fname
    return row
122
+
123
+
124
def resize_img(req):
    """Decode an image from a file-like object and resize it.

    A single int ``size`` makes torchvision resize the shorter edge to the
    module-level ``resize_size`` while preserving aspect ratio.
    """
    image = Image.open(req).convert("RGB")
    image = TF.resize(image, size=resize_size)  # , interpolation=Image.LANCZOS)
    return image


def download_image(row):
    """Download, resize and save the image for ``row``; annotate the row.

    On success sets ``status``/``file``/``mimetype``/``size``. Files that
    already exist on disk are skipped so interrupted runs resume cheaply.
    Any network or decode failure is recorded as status 408 and triaged
    later from the report TSV.
    """
    fname = _file_name(row)
    # Skip already-downloaded files; failed rows are retried on a later run.
    if os.path.isfile(fname):
        row["status"] = 200
        row["file"] = fname
        row["mimetype"] = magic.from_file(row["file"], mime=True)
        row["size"] = os.stat(row["file"]).st_size
        return row

    try:
        # smaller timeout skips slow hosts, at the cost of some failed downloads
        response = requests.get(
            row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
        )
        row["status"] = response.status_code
    except Exception:
        # log errors later; record any request failure as a 408 timeout
        row["status"] = 408
        return row

    if response.ok:
        try:
            # some sites respond with gzip transport encoding
            response.raw.decode_content = True
            img = resize_img(io.BytesIO(response.content))
            img.save(fname)

            row["mimetype"] = magic.from_file(fname, mime=True)
            row["size"] = os.stat(fname).st_size
        except Exception:
            # covers decode failures and mid-download timeouts
            row["status"] = 408

    row["file"] = fname
    return row
172
+
173
+
174
def open_tsv(fname, folder):
    """Load a CC12M TSV (columns: url, caption) and tag rows with ``folder``.

    Returns a DataFrame with columns ``url``, ``caption``, ``folder``.
    """
    print("Opening %s Data File..." % fname)
    df = pd.read_csv(fname, sep="\t", names=["url", "caption"])
    df["folder"] = folder
    print("Processing", len(df), " Images:")
    return df


def df_from_shelve(chunk_size, func, dataset_name):
    """Reassemble the per-chunk results shelve into one DataFrame.

    Chunks are concatenated in chunk-index order so the report is stable.
    """
    print("Generating Dataframe from results...")
    with shelve.open(
        "%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
    ) as results:
        keylist = sorted(int(k) for k in results.keys())
        df = pd.concat([results[str(k)][1] for k in keylist], sort=True)
    return df
192
+
193
+
194
# ---- Script configuration & entry point (CC12M) -----------------------------

# Shorter edge of saved images; a single int preserves aspect ratio.
resize_size = 384

config_path = get_abs_path("configs/datasets/conceptual_caption/defaults_12m.yaml")

# Resolve the image storage directory from the LAVIS dataset config.
storage_dir = OmegaConf.load(
    config_path
).datasets.conceptual_caption_12m.build_info.images.storage
storage_dir = Path(get_cache_path(storage_dir))

os.makedirs(storage_dir, exist_ok=True)

# number of processes in the pool can be larger than cores (downloads are I/O bound)
num_processes = 96
# chunk_size is how many images per chunk per process - changing this resets progress when restarting.
images_per_part = 100

data_name = "cc12m"

df = open_tsv("cc12m.tsv", data_name)
df_multiprocess(
    df=df,
    processes=num_processes,
    chunk_size=images_per_part,
    func=download_image,
    dataset_name=data_name,
)
df = df_from_shelve(
    chunk_size=images_per_part, func=download_image, dataset_name=data_name
)
# Per-row report (caption, path, dataset, mimetype, size, status, url) used
# by the create_annotation notebooks to build the final annotation JSON.
df.to_csv(
    "downloaded_%s_report.tsv.gz" % data_name,
    compression="gzip",
    sep="\t",
    header=False,
    index=False,
)
print("Saved.")
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+
8
+ import glob
9
+ from pathlib import Path
10
+ import time
11
+ from omegaconf import OmegaConf
12
+ import pandas as pd
13
+ import numpy as np
14
+ import requests
15
+ import zlib
16
+ import os
17
+ import io
18
+ import shelve
19
+ from lavis.common.utils import get_abs_path, get_cache_path
20
+ import magic # pip install python-magic
21
+ import json
22
+ from multiprocessing import Pool
23
+ from tqdm import tqdm
24
+ from PIL import Image
25
+ from torchvision.transforms import functional as TF
26
+
27
+ headers = {
28
+ #'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
29
+ "User-Agent": "Googlebot-Image/1.0", # Pretend to be googlebot
30
+ "X-Forwarded-For": "64.18.15.200",
31
+ }
32
+
33
+
34
+ def _df_split_apply(tup_arg):
35
+ split_ind, subset, func = tup_arg
36
+ r = subset.apply(func, axis=1)
37
+ return (split_ind, r)
38
+
39
+
40
+ def df_multiprocess(df, processes, chunk_size, func, dataset_name):
41
+ print("Generating parts...")
42
+ with shelve.open(
43
+ "%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
44
+ ) as results:
45
+
46
+ pbar = tqdm(total=len(df), position=0)
47
+ # Resume:
48
+ finished_chunks = set([int(k) for k in results.keys()])
49
+ pbar.desc = "Resuming"
50
+ for k in results.keys():
51
+ pbar.update(len(results[str(k)][1]))
52
+
53
+ pool_data = (
54
+ (index, df[i : i + chunk_size], func)
55
+ for index, i in enumerate(range(0, len(df), chunk_size))
56
+ if index not in finished_chunks
57
+ )
58
+ print(
59
+ int(len(df) / chunk_size),
60
+ "parts.",
61
+ chunk_size,
62
+ "per part.",
63
+ "Using",
64
+ processes,
65
+ "processes",
66
+ )
67
+
68
+ pbar.desc = "Downloading"
69
+ with Pool(processes) as pool:
70
+ for i, result in enumerate(
71
+ pool.imap_unordered(_df_split_apply, pool_data, 2)
72
+ ):
73
+ results[str(result[0])] = result
74
+ pbar.update(len(result[1]))
75
+ pbar.close()
76
+
77
+ print("Finished Downloading.")
78
+ return
79
+
80
+
81
+ # Unique name based on url
82
# Unique, deterministic image path based on row index + CRC32 of the URL.
def _file_name(row):
    """Return the on-disk path for ``row``'s image.

    Uses the module-level ``storage_dir``; the CRC32 of the URL makes the
    name stable across runs so resuming finds already-downloaded files.
    """
    name = (
        "%s/%s_%s"
        % (
            storage_dir,
            row.name,
            (zlib.crc32(row["url"].encode("utf-8")) & 0xFFFFFFFF),
        )
        + ".jpg"
    )
    return name


# For checking mimetypes separately without download
def check_mimetype(row):
    """Fill ``mimetype``/``size`` for an already-downloaded file, if present."""
    if os.path.isfile(str(row["file"])):
        row["mimetype"] = magic.from_file(row["file"], mime=True)
        row["size"] = os.stat(row["file"]).st_size
    return row


# Don't download image, just check with a HEAD request; can't resume.
# Can use this instead of download_image to get HTTP status codes.
def check_download(row):
    """HEAD-check ``row['url']`` without downloading the body.

    Sets ``status`` (408 on any request failure) and ``headers``; on a
    successful response also records the target ``file`` path.
    """
    fname = _file_name(row)
    try:
        # not all sites will support HEAD
        response = requests.head(
            row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
        )
        row["status"] = response.status_code
        row["headers"] = dict(response.headers)
    except requests.RequestException:
        # log errors later; record any request failure as a 408 timeout
        row["status"] = 408
        return row
    if response.ok:
        row["file"] = fname
    return row
122
+
123
+
124
def resize_img(req):
    """Decode an image from a file-like object and resize it.

    A single int ``size`` makes torchvision resize the shorter edge to the
    module-level ``resize_size`` while preserving aspect ratio.
    """
    image = Image.open(req).convert("RGB")
    image = TF.resize(image, size=resize_size)  # , interpolation=Image.LANCZOS)
    return image


def download_image(row):
    """Download, resize and save the image for ``row``; annotate the row.

    On success sets ``status``/``file``/``mimetype``/``size``. Files that
    already exist on disk are skipped so interrupted runs resume cheaply.
    Any network or decode failure is recorded as status 408 and triaged
    later from the report TSV.
    """
    fname = _file_name(row)
    # Skip already-downloaded files; failed rows are retried on a later run.
    if os.path.isfile(fname):
        row["status"] = 200
        row["file"] = fname
        row["mimetype"] = magic.from_file(row["file"], mime=True)
        row["size"] = os.stat(row["file"]).st_size
        return row

    try:
        # smaller timeout skips slow hosts, at the cost of some failed downloads
        response = requests.get(
            row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
        )
        row["status"] = response.status_code
    except Exception:
        # log errors later; record any request failure as a 408 timeout
        row["status"] = 408
        return row

    if response.ok:
        try:
            # some sites respond with gzip transport encoding
            response.raw.decode_content = True
            img = resize_img(io.BytesIO(response.content))
            img.save(fname)

            row["mimetype"] = magic.from_file(fname, mime=True)
            row["size"] = os.stat(fname).st_size
        except Exception:
            # covers decode failures and mid-download timeouts
            row["status"] = 408

    row["file"] = fname
    return row
172
+
173
+
174
def open_tsv(fname, folder):
    """Load a CC3M TSV (columns: caption, url) and tag rows with ``folder``.

    NOTE: column order is caption-first here, unlike the CC12M script.
    Returns a DataFrame with columns ``caption``, ``url``, ``folder``.
    """
    print("Opening %s Data File..." % fname)
    df = pd.read_csv(fname, sep="\t", names=["caption", "url"])
    df["folder"] = folder
    print("Processing", len(df), " Images:")
    return df


def df_from_shelve(chunk_size, func, dataset_name):
    """Reassemble the per-chunk results shelve into one DataFrame.

    Chunks are concatenated in chunk-index order so the report is stable.
    """
    print("Generating Dataframe from results...")
    with shelve.open(
        "%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
    ) as results:
        keylist = sorted(int(k) for k in results.keys())
        df = pd.concat([results[str(k)][1] for k in keylist], sort=True)
    return df
192
+
193
+
194
# ---- Script configuration & entry point (CC3M) ------------------------------

# Shorter edge of saved images; a single int preserves aspect ratio.
resize_size = 384

config_path = get_abs_path("configs/datasets/conceptual_caption/defaults_3m.yaml")

# Resolve the image storage directory from the LAVIS dataset config.
storage_dir = OmegaConf.load(
    config_path
).datasets.conceptual_caption_3m.build_info.images.storage
storage_dir = Path(get_cache_path(storage_dir))

os.makedirs(storage_dir, exist_ok=True)

# number of processes in the pool can be larger than cores (downloads are I/O bound)
num_processes = 32
# chunk_size is how many images per chunk per process - changing this resets progress when restarting.
images_per_part = 100

data_name = "cc3m"

df = open_tsv("Train_GCC-training.tsv", data_name)
df_multiprocess(
    df=df,
    processes=num_processes,
    chunk_size=images_per_part,
    func=download_image,
    dataset_name=data_name,
)
df = df_from_shelve(
    chunk_size=images_per_part, func=download_image, dataset_name=data_name
)
# Per-row report (caption, path, dataset, mimetype, size, status, url) used
# by the create_annotation notebooks to build the final annotation JSON.
df.to_csv(
    "downloaded_%s_report.tsv.gz" % data_name,
    compression="gzip",
    sep="\t",
    header=False,
    index=False,
)
print("Saved.")
LAVIS-main/lavis/models/__init__.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+
8
+ import logging
9
+ import torch
10
+ from omegaconf import OmegaConf
11
+ from lavis.common.registry import registry
12
+
13
+ from lavis.models.base_model import BaseModel
14
+
15
+ from lavis.models.albef_models.albef_classification import AlbefClassification
16
+ from lavis.models.albef_models.albef_feature_extractor import AlbefFeatureExtractor
17
+ from lavis.models.albef_models.albef_nlvr import AlbefNLVR
18
+ from lavis.models.albef_models.albef_pretrain import AlbefPretrain
19
+ from lavis.models.albef_models.albef_retrieval import AlbefRetrieval
20
+ from lavis.models.albef_models.albef_vqa import AlbefVQA
21
+ from lavis.models.alpro_models.alpro_qa import AlproQA
22
+ from lavis.models.alpro_models.alpro_retrieval import AlproRetrieval
23
+
24
+ from lavis.models.blip_models.blip import BlipBase
25
+ from lavis.models.blip_models.blip_caption import BlipCaption
26
+ from lavis.models.blip_models.blip_classification import BlipClassification
27
+ from lavis.models.blip_models.blip_feature_extractor import BlipFeatureExtractor
28
+ from lavis.models.blip_models.blip_image_text_matching import BlipITM
29
+ from lavis.models.blip_models.blip_nlvr import BlipNLVR
30
+ from lavis.models.blip_models.blip_pretrain import BlipPretrain
31
+ from lavis.models.blip_models.blip_retrieval import BlipRetrieval
32
+ from lavis.models.blip_models.blip_vqa import BlipVQA
33
+
34
+ from lavis.models.blip2_models.blip2 import Blip2Base
35
+ from lavis.models.blip2_models.blip2_opt import Blip2OPT
36
+ from lavis.models.blip2_models.blip2_t5 import Blip2T5
37
+ from lavis.models.blip2_models.blip2_qformer import Blip2Qformer
38
+ from lavis.models.blip2_models.blip2_image_text_matching import Blip2ITM
39
+
40
+ from lavis.models.blip2_models.blip2_t5_instruct import Blip2T5Instruct
41
+ from lavis.models.blip2_models.blip2_vicuna_instruct import Blip2VicunaInstruct
42
+ from lavis.models.blip2_models.blip2_vicuna_xinstruct import Blip2VicunaXInstruct
43
+
44
+ from lavis.models.blip_diffusion_models.blip_diffusion import BlipDiffusion
45
+
46
+ from lavis.models.pnp_vqa_models.pnp_vqa import PNPVQA
47
+ from lavis.models.pnp_vqa_models.pnp_unifiedqav2_fid import PNPUnifiedQAv2FiD
48
+ from lavis.models.img2prompt_models.img2prompt_vqa import Img2PromptVQA
49
+ from lavis.models.med import XBertLMHeadDecoder
50
+ from lavis.models.vit import VisionTransformerEncoder
51
+ from lavis.models.clip_models.model import CLIP
52
+
53
+ from lavis.models.gpt_models.gpt_dialogue import GPTDialogue
54
+
55
+ from lavis.processors.base_processor import BaseProcessor
56
+
57
+
58
# Public API of lavis.models: the model classes imported above plus the
# loader helper defined below. Star-imports expose exactly these names.
__all__ = [
    "load_model",
    "AlbefClassification",
    "AlbefFeatureExtractor",
    "AlbefNLVR",
    "AlbefVQA",
    "AlbefPretrain",
    "AlbefRetrieval",
    "AlproQA",
    "AlproRetrieval",
    "BaseModel",
    "BlipBase",
    "BlipFeatureExtractor",
    "BlipCaption",
    "BlipClassification",
    "BlipDiffusion",
    "BlipITM",
    "BlipNLVR",
    "BlipPretrain",
    "BlipRetrieval",
    "BlipVQA",
    "Blip2Qformer",
    "Blip2Base",
    "Blip2ITM",
    "Blip2OPT",
    "Blip2T5",
    "Blip2T5Instruct",
    "Blip2VicunaInstruct",
    "Blip2VicunaXInstruct",
    "PNPVQA",
    "Img2PromptVQA",
    "PNPUnifiedQAv2FiD",
    "CLIP",
    "VisionTransformerEncoder",
    "XBertLMHeadDecoder",
    "GPTDialogue",
]
95
+
96
+
97
def load_model(name, model_type, is_eval=False, device="cpu", checkpoint=None):
    """
    Load supported models.

    To list all available models and types in registry:
    >>> from lavis.models import model_zoo
    >>> print(model_zoo)

    Args:
        name (str): name of the model.
        model_type (str): type of the model.
        is_eval (bool): whether the model is in eval mode. Default: False.
        device (str): device to use. Default: "cpu".
        checkpoint (str): path or to checkpoint. Default: None.
            Note that expecting the checkpoint to have the same keys in state_dict as the model.

    Returns:
        model (torch.nn.Module): model.
    """

    model = registry.get_model_class(name).from_pretrained(model_type=model_type)

    if checkpoint is not None:
        model.load_checkpoint(checkpoint)

    if is_eval:
        model.eval()

    # Half-precision weights are unsafe on CPU; promote to float32. Accept
    # both the string "cpu" and torch.device("cpu"), for consistency with
    # load_model_and_preprocess below.
    if device == "cpu" or device == torch.device("cpu"):
        model = model.float()

    return model.to(device)
129
+
130
+
131
def load_preprocess(config):
    """
    Build visual and text preprocessors from a preprocessor config.

    Any section missing from the config falls back to BaseProcessor, which
    performs no preprocessing at all.

    Args:
        config (dict): preprocessor configs, with optional "vis_processor"
            and "text_processor" sections, each with optional "train" and
            "eval" entries.

    Returns:
        vis_processors (dict): preprocessors for visual inputs.
        txt_processors (dict): preprocessors for text inputs.

        Key is "train" or "eval" for processors used in training and evaluation respectively.
    """

    def _build_proc_from_cfg(cfg):
        # Missing config -> no-op processor.
        if cfg is None:
            return BaseProcessor()
        return registry.get_processor_class(cfg.name).from_config(cfg)

    def _split_section(section):
        # A missing section yields (None, None) -> two BaseProcessors.
        if section is None:
            return None, None
        return section.get("train"), section.get("eval")

    vis_train_cfg, vis_eval_cfg = _split_section(config.get("vis_processor"))
    txt_train_cfg, txt_eval_cfg = _split_section(config.get("text_processor"))

    vis_processors = {
        "train": _build_proc_from_cfg(vis_train_cfg),
        "eval": _build_proc_from_cfg(vis_eval_cfg),
    }
    txt_processors = {
        "train": _build_proc_from_cfg(txt_train_cfg),
        "eval": _build_proc_from_cfg(txt_eval_cfg),
    }

    return vis_processors, txt_processors
181
+
182
+
183
def load_model_and_preprocess(name, model_type, is_eval=False, device="cpu"):
    """
    Load a model together with the preprocessors it expects.

    List all available models and types in registry:
    >>> from lavis.models import model_zoo
    >>> print(model_zoo)

    Args:
        name (str): name of the model.
        model_type (str): type of the model.
        is_eval (bool): whether the model is in eval mode. Default: False.
        device (str): device to use. Default: "cpu".

    Returns:
        model (torch.nn.Module): model.
        vis_processors (dict): preprocessors for visual inputs.
        txt_processors (dict): preprocessors for text inputs.
    """
    model_cls = registry.get_model_class(name)

    # Instantiate the requested pretrained variant.
    model = model_cls.from_pretrained(model_type=model_type)

    if is_eval:
        model.eval()

    # Build the preprocessors declared in the model's default config.
    cfg = OmegaConf.load(model_cls.default_config_path(model_type))
    if cfg is None:
        vis_processors, txt_processors = None, None
        logging.info(
            f"""No default preprocess for model {name} ({model_type}).
                This can happen if the model is not finetuned on downstream datasets,
                or it is not intended for direct use without finetuning.
            """
        )
    else:
        vis_processors, txt_processors = load_preprocess(cfg.preprocess)

    # Half-precision weights are not supported on CPU; cast to float32.
    if device == "cpu" or device == torch.device("cpu"):
        model = model.float()

    return model.to(device), vis_processors, txt_processors
229
+
230
+
231
class ModelZoo:
    """
    String representation of all registered model architectures and the
    pretrained config types each one provides.

    >>> from lavis.models import model_zoo
    >>> # list all available models
    >>> print(model_zoo)
    >>> # show total number of models
    >>> print(len(model_zoo))
    """

    def __init__(self) -> None:
        self.model_zoo = {
            arch: list(model_cls.PRETRAINED_MODEL_CONFIG_DICT.keys())
            for arch, model_cls in registry.mapping["model_name_mapping"].items()
        }

    def __str__(self) -> str:
        rule = "=" * 50
        heading = f"{'Architectures':<30} {'Types'}"
        rows = [
            f"{name:<30} {', '.join(types)}"
            for name, types in self.model_zoo.items()
        ]
        return "\n".join([rule, heading, rule]) + "\n" + "\n".join(rows)

    def __iter__(self):
        return iter(self.model_zoo.items())

    def __len__(self):
        return sum(len(types) for types in self.model_zoo.values())
268
+
269
+
270
+ model_zoo = ModelZoo()
LAVIS-main/lavis/models/albef_models/__init__.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+
8
+ import datetime
9
+ import logging
10
+ import os
11
+ import time
12
+
13
+ import lavis.common.dist_utils as dist_utils
14
+ import torch
15
+ import torch.distributed as dist
16
+ import torch.nn.functional as F
17
+ from lavis.common.dist_utils import download_cached_file
18
+ from lavis.common.logger import MetricLogger
19
+ from lavis.common.utils import is_url
20
+ from lavis.models.base_model import BaseModel
21
+ from lavis.models.vit import interpolate_pos_embed
22
+ from transformers import BertTokenizer
23
+
24
+
25
class AlbefBase(BaseModel):
    """Shared functionality for ALBEF model variants: tokenizer construction
    and checkpoint loading with position-embedding interpolation."""

    @classmethod
    def init_tokenizer(cls):
        """Return the BERT tokenizer used by all ALBEF variants."""
        return BertTokenizer.from_pretrained("bert-base-uncased")

    def load_from_pretrained(self, url_or_filename, rename_text_keys=True):
        """Load weights from a checkpoint URL or local file.

        Interpolates visual position embeddings to the current encoder's
        grid, optionally strips the "bert." prefix from text-encoder keys,
        drops entries whose shapes do not match this model, then loads
        non-strictly.

        Args:
            url_or_filename (str): http(s) URL or local path to checkpoint.
            rename_text_keys (bool): strip "bert." from key names so HF BERT
                checkpoints line up with the text encoder. Default: True.

        Returns:
            NamedTuple with ``missing_keys`` / ``unexpected_keys`` from
            ``load_state_dict``.

        Raises:
            RuntimeError: if ``url_or_filename`` is neither a URL nor a file.
        """
        if is_url(url_or_filename):
            cached_file = download_cached_file(
                url_or_filename, check_hash=False, progress=True
            )
            checkpoint = torch.load(cached_file, map_location="cpu")
        elif os.path.isfile(url_or_filename):
            checkpoint = torch.load(url_or_filename, map_location="cpu")
        else:
            raise RuntimeError("checkpoint url or path is invalid")

        if "model" in checkpoint:
            state_dict = checkpoint["model"]
        else:
            state_dict = checkpoint

        # Resize checkpoint position embeddings to this model's patch grid.
        state_dict["visual_encoder.pos_embed"] = interpolate_pos_embed(
            state_dict["visual_encoder.pos_embed"], self.visual_encoder
        )
        if (
            "visual_encoder_m.pos_embed" in self.state_dict().keys()
            and "visual_encoder_m.pos_embed" in state_dict
        ):
            state_dict["visual_encoder_m.pos_embed"] = interpolate_pos_embed(
                state_dict["visual_encoder_m.pos_embed"], self.visual_encoder_m
            )

        if rename_text_keys:
            for key in list(state_dict.keys()):
                if "bert" in key:
                    new_key = key.replace("bert.", "")
                    # Guard: when the key contains "bert" but not "bert.",
                    # new_key == key and the original set-then-delete would
                    # silently drop the entry. Only move genuinely renamed keys.
                    if new_key != key:
                        state_dict[new_key] = state_dict[key]
                        del state_dict[key]

        # self.state_dict() rebuilds the (large) dict on every call; fetch it
        # once instead of twice per key.
        own_state = self.state_dict()
        for key in list(state_dict.keys()):
            if key in own_state and state_dict[key].shape != own_state[key].shape:
                del state_dict[key]

        msg = self.load_state_dict(state_dict, strict=False)

        logging.info("Missing keys {}".format(msg.missing_keys))
        logging.info("load checkpoint from %s" % url_or_filename)
        return msg
74
+
75
+
76
def compute_sim_matrix(model, data_loader, **kwargs):
    """Compute image-to-text and text-to-image similarity matrices for
    retrieval evaluation.

    A fast dual-encoder pass ranks all pairs by cosine similarity; only the
    top-``k_test`` candidates per query are then re-scored with the more
    expensive cross-attention ITM head. Rows are sharded across distributed
    ranks and merged with an all-reduce.

    Args:
        model: ALBEF/BLIP-style model exposing ``tokenizer``,
            ``text_encoder``, ``visual_encoder``, ``text_proj``,
            ``vision_proj`` and ``itm_head``.
        data_loader: loader over an eval dataset providing ``.text`` and
            ``.image`` collections.
        **kwargs: must contain ``k_test`` (int), the re-ranking depth.

    Returns:
        tuple of numpy arrays ``(score_i2t, score_t2i)`` with shapes
        (num_images, num_texts) and (num_texts, num_images).
    """
    k_test = kwargs.pop("k_test")

    metric_logger = MetricLogger(delimiter="  ")
    header = "Evaluation:"

    logging.info("Computing features for evaluation...")
    start_time = time.time()

    # --- encode all texts in batches of text_bs -------------------------
    texts = data_loader.dataset.text
    num_text = len(texts)
    text_bs = 256
    text_ids = []
    text_embeds = []
    text_atts = []
    for i in range(0, num_text, text_bs):
        text = texts[i : min(num_text, i + text_bs)]
        text_input = model.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=35,
            return_tensors="pt",
        ).to(model.device)
        text_output = model.text_encoder.forward_text(text_input)
        # Project the [CLS] embedding and L2-normalize for cosine similarity.
        text_embed = F.normalize(
            model.text_proj(text_output.last_hidden_state[:, 0, :])
        )
        text_embeds.append(text_embed)
        text_ids.append(text_input.input_ids)
        text_atts.append(text_input.attention_mask)

    text_embeds = torch.cat(text_embeds, dim=0)
    text_ids = torch.cat(text_ids, dim=0)
    text_atts = torch.cat(text_atts, dim=0)
    # Some tokenizers mark matching-mode inputs with a dedicated first token.
    if hasattr(model.tokenizer, "enc_token_id"):
        text_ids[:, 0] = model.tokenizer.enc_token_id

    # --- encode all images ---------------------------------------------
    image_feats = []
    image_embeds = []
    for samples in data_loader:
        image = samples["image"]

        image = image.to(model.device)
        image_feat = model.visual_encoder.forward_features(image)
        image_embed = model.vision_proj(image_feat[:, 0, :])
        image_embed = F.normalize(image_embed, dim=-1)

        # Full patch features are large: keep them on CPU and move only the
        # top-k candidates back to the device during re-ranking.
        image_feats.append(image_feat.cpu())
        image_embeds.append(image_embed)

    image_feats = torch.cat(image_feats, dim=0)
    image_embeds = torch.cat(image_embeds, dim=0)

    # Dual-encoder similarity used to pre-select re-ranking candidates.
    sims_matrix = image_embeds @ text_embeds.t()
    # -100.0 sentinel: entries outside a query's top-k stay strongly negative
    # so they never outrank re-scored candidates.
    score_matrix_i2t = torch.full(
        (len(data_loader.dataset.image), len(texts)), -100.0
    ).to(model.device)

    # Shard rows of the similarity matrix across distributed workers.
    num_tasks = dist_utils.get_world_size()
    rank = dist_utils.get_rank()
    step = sims_matrix.size(0) // num_tasks + 1
    start = rank * step
    end = min(sims_matrix.size(0), start + step)

    # --- image -> text re-ranking --------------------------------------
    for i, sims in enumerate(
        metric_logger.log_every(sims_matrix[start:end], 50, header)
    ):
        # topk_sim, topk_idx = sims.topk(k=config["k_test"], dim=0)
        topk_sim, topk_idx = sims.topk(k=k_test, dim=0)

        encoder_output = image_feats[start + i].repeat(k_test, 1, 1).to(model.device)
        encoder_att = torch.ones(encoder_output.size()[:-1], dtype=torch.long).to(
            model.device
        )
        output = model.text_encoder(
            text_ids[topk_idx],
            attention_mask=text_atts[topk_idx],
            encoder_hidden_states=encoder_output,
            encoder_attention_mask=encoder_att,
            return_dict=True,
        )
        # ITM logit for the "match" class (index 1).
        score = model.itm_head(output.last_hidden_state[:, 0, :])[:, 1]
        score_matrix_i2t[start + i, topk_idx] = score + topk_sim

    # --- text -> image re-ranking (same scheme on the transpose) -------
    sims_matrix = sims_matrix.t()
    score_matrix_t2i = torch.full(
        (len(texts), len(data_loader.dataset.image)), -100.0
    ).to(model.device)

    step = sims_matrix.size(0) // num_tasks + 1
    start = rank * step
    end = min(sims_matrix.size(0), start + step)

    for i, sims in enumerate(
        metric_logger.log_every(sims_matrix[start:end], 50, header)
    ):

        topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
        encoder_output = image_feats[topk_idx.cpu()].to(model.device)
        encoder_att = torch.ones(encoder_output.size()[:-1], dtype=torch.long).to(
            model.device
        )
        output = model.text_encoder(
            text_ids[start + i].repeat(k_test, 1),
            attention_mask=text_atts[start + i].repeat(k_test, 1),
            encoder_hidden_states=encoder_output,
            encoder_attention_mask=encoder_att,
            return_dict=True,
        )
        score = model.itm_head(output.last_hidden_state[:, 0, :])[:, 1]
        score_matrix_t2i[start + i, topk_idx] = score + topk_sim

    # Each rank filled a disjoint row range while the rest hold the -100
    # sentinel, so a SUM all-reduce assembles the full matrices.
    # NOTE(review): SUM also adds (num_tasks - 1) * -100 to every entry;
    # within-row ordering is preserved, but absolute scores are shifted.
    if dist_utils.is_dist_avail_and_initialized():
        dist.barrier()
        torch.distributed.all_reduce(
            score_matrix_i2t, op=torch.distributed.ReduceOp.SUM
        )
        torch.distributed.all_reduce(
            score_matrix_t2i, op=torch.distributed.ReduceOp.SUM
        )

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    logging.info("Evaluation time {}".format(total_time_str))

    return score_matrix_i2t.cpu().numpy(), score_matrix_t2i.cpu().numpy()