Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/trainer_state.json +0 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_loss.png +0 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_runtime.png +0 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_samples_per_second.png +0 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_steps_per_second.png +0 -0
- LAVIS-main/lavis/configs/datasets/gqa/balanced_testdev_instruct.yaml +46 -0
- LAVIS-main/lavis/configs/datasets/gqa/balanced_val.yaml +30 -0
- LAVIS-main/lavis/configs/datasets/gqa/balanced_val_instruct.yaml +47 -0
- LAVIS-main/lavis/configs/datasets/gqa/defaults.yaml +36 -0
- LAVIS-main/lavis/configs/datasets/gqa/defaults_instruct.yaml +55 -0
- LAVIS-main/lavis/configs/datasets/iconqa/defaults.yaml +52 -0
- LAVIS-main/lavis/configs/datasets/iconqa/defaults_instruct.yaml +55 -0
- LAVIS-main/lavis/configs/datasets/imagenet/defaults.yaml +15 -0
- LAVIS-main/lavis/configs/datasets/laion/defaults_2B_multi.yaml +13 -0
- LAVIS-main/lavis/configs/datasets/laion/defaults_400M.yaml +20 -0
- LAVIS-main/lavis/configs/datasets/laion/defaults_400M_instruct.yaml +31 -0
- LAVIS-main/lavis/configs/datasets/llava150k/defaults_dial.yaml +32 -0
- LAVIS-main/lavis/configs/datasets/modelnet40/defaults_cls.yaml +55 -0
- LAVIS-main/lavis/configs/datasets/msrvtt/defaults_cap.yaml +24 -0
- LAVIS-main/lavis/configs/datasets/msrvtt/defaults_cap_instruct.yaml +48 -0
- LAVIS-main/lavis/configs/datasets/msrvtt/defaults_qa.yaml +27 -0
- LAVIS-main/lavis/configs/datasets/msrvtt/defaults_qa_instruct.yaml +51 -0
- LAVIS-main/lavis/configs/datasets/msrvtt/defaults_ret.yaml +24 -0
- LAVIS-main/lavis/configs/datasets/msvd/defaults_cap.yaml +24 -0
- LAVIS-main/lavis/configs/datasets/msvd/defaults_cap_instruct.yaml +50 -0
- LAVIS-main/lavis/configs/datasets/msvd/defaults_qa.yaml +29 -0
- LAVIS-main/lavis/configs/datasets/msvd/defaults_qa_instruct.yaml +53 -0
- LAVIS-main/lavis/configs/datasets/music_avqa/defaults_mm_qa.yaml +66 -0
- LAVIS-main/lavis/configs/datasets/music_avqa/defaults_mm_qa_instruct.yaml +69 -0
- LAVIS-main/lavis/configs/datasets/nlvr/defaults.yaml +24 -0
- LAVIS-main/lavis/configs/datasets/nocaps/defaults.yaml +22 -0
- LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_cap.yaml +54 -0
- LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_cap_instruct.yaml +55 -0
- LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_qa.yaml +55 -0
- LAVIS-main/lavis/configs/datasets/ocrvqa/defaults.yaml +33 -0
- LAVIS-main/lavis/configs/datasets/ocrvqa/defaults_instruct.yaml +35 -0
- LAVIS-main/lavis/configs/datasets/okvqa/defaults.yaml +37 -0
- LAVIS-main/lavis/configs/datasets/okvqa/defaults_instruct.yaml +53 -0
- LAVIS-main/lavis/configs/datasets/sbu_caption/defaults.yaml +22 -0
- LAVIS-main/lavis/configs/datasets/sbu_caption/defaults_instruct.yaml +38 -0
- LAVIS-main/lavis/configs/datasets/scienceqa/defaults.yaml +51 -0
- LAVIS-main/lavis/configs/datasets/scienceqa/defaults_instruct.yaml +54 -0
- LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE +25 -0
- LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md +22 -0
- LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_12m.ipynb +227 -0
- LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_3m.ipynb +227 -0
- LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py +232 -0
- LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py +229 -0
- LAVIS-main/lavis/models/__init__.py +270 -0
- LAVIS-main/lavis/models/albef_models/__init__.py +202 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_loss.png
ADDED
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_runtime.png
ADDED
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_samples_per_second.png
ADDED
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_steps_per_second.png
ADDED
|
LAVIS-main/lavis/configs/datasets/gqa/balanced_testdev_instruct.yaml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
gqa:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: blip_instruction
|
| 22 |
+
task: qa
|
| 23 |
+
modality: image
|
| 24 |
+
eval:
|
| 25 |
+
name: blip_question
|
| 26 |
+
|
| 27 |
+
build_info:
|
| 28 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 29 |
+
annotations:
|
| 30 |
+
train:
|
| 31 |
+
url:
|
| 32 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
|
| 33 |
+
storage:
|
| 34 |
+
- gqa/annotations/train_balanced_questions.json
|
| 35 |
+
val:
|
| 36 |
+
url:
|
| 37 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
|
| 38 |
+
storage:
|
| 39 |
+
- gqa/annotations/testdev_balanced_questions.json
|
| 40 |
+
test:
|
| 41 |
+
url:
|
| 42 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
|
| 43 |
+
storage:
|
| 44 |
+
- gqa/annotations/test_balanced_questions.json
|
| 45 |
+
images:
|
| 46 |
+
storage: /export/share/datasets/vision/GQA/images #gqa/images/
|
LAVIS-main/lavis/configs/datasets/gqa/balanced_val.yaml
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
gqa:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
annotations:
|
| 14 |
+
train:
|
| 15 |
+
url:
|
| 16 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
|
| 17 |
+
storage:
|
| 18 |
+
- gqa/annotations/train_balanced_questions.json
|
| 19 |
+
val:
|
| 20 |
+
url:
|
| 21 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json
|
| 22 |
+
storage:
|
| 23 |
+
- gqa/annotations/val_balanced_questions.json
|
| 24 |
+
test:
|
| 25 |
+
url:
|
| 26 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
|
| 27 |
+
storage:
|
| 28 |
+
- gqa/annotations/test_balanced_questions.json
|
| 29 |
+
images:
|
| 30 |
+
storage: gqa/images/
|
LAVIS-main/lavis/configs/datasets/gqa/balanced_val_instruct.yaml
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
gqa_instruct:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: blip_instruction
|
| 22 |
+
task: qa
|
| 23 |
+
modality: image
|
| 24 |
+
eval:
|
| 25 |
+
name: blip_question
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
build_info:
|
| 29 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 30 |
+
annotations:
|
| 31 |
+
train:
|
| 32 |
+
url:
|
| 33 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
|
| 34 |
+
storage:
|
| 35 |
+
- gqa/annotations/train_balanced_questions.json
|
| 36 |
+
val:
|
| 37 |
+
url:
|
| 38 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json
|
| 39 |
+
storage:
|
| 40 |
+
- gqa/annotations/val_balanced_questions.json
|
| 41 |
+
test:
|
| 42 |
+
url:
|
| 43 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
|
| 44 |
+
storage:
|
| 45 |
+
- gqa/annotations/test_balanced_questions.json
|
| 46 |
+
images:
|
| 47 |
+
storage: /export/share/datasets/vision/GQA/images #gqa/images/
|
LAVIS-main/lavis/configs/datasets/gqa/defaults.yaml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
gqa:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
annotations:
|
| 14 |
+
train:
|
| 15 |
+
url:
|
| 16 |
+
- /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json
|
| 17 |
+
- /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json
|
| 18 |
+
storage:
|
| 19 |
+
- gqa/annotations/train_all_questions_0.json
|
| 20 |
+
- gqa/annotations/val_all_questions.json
|
| 21 |
+
val:
|
| 22 |
+
url:
|
| 23 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
|
| 24 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
|
| 25 |
+
storage:
|
| 26 |
+
- aokvqa/annotations/aokvqa_v1p0_val.json
|
| 27 |
+
- aokvqa/annotations/large_vocab_train_lavis.json
|
| 28 |
+
test:
|
| 29 |
+
url:
|
| 30 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
|
| 31 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
|
| 32 |
+
storage:
|
| 33 |
+
- aokvqa/annotations/aokvqa_v1p0_test.json
|
| 34 |
+
- aokvqa/annotations/large_vocab_train_lavis.json
|
| 35 |
+
images:
|
| 36 |
+
storage: gqa/images/
|
LAVIS-main/lavis/configs/datasets/gqa/defaults_instruct.yaml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
gqa_instruct:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: blip_instruction
|
| 22 |
+
task: qa
|
| 23 |
+
modality: image
|
| 24 |
+
eval:
|
| 25 |
+
name: blip_question
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
build_info:
|
| 29 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 30 |
+
annotations:
|
| 31 |
+
train:
|
| 32 |
+
url:
|
| 33 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/gqa/train_all_questions_0.json
|
| 34 |
+
# - /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json
|
| 35 |
+
# - /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json
|
| 36 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/gqa/val_all_questions.json
|
| 37 |
+
storage:
|
| 38 |
+
- gqa/annotations/train_all_questions_0.json
|
| 39 |
+
- gqa/annotations/val_all_questions.json
|
| 40 |
+
val:
|
| 41 |
+
url:
|
| 42 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
|
| 43 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
|
| 44 |
+
storage:
|
| 45 |
+
- aokvqa/annotations/aokvqa_v1p0_val.json
|
| 46 |
+
- aokvqa/annotations/large_vocab_train_lavis.json
|
| 47 |
+
test:
|
| 48 |
+
url:
|
| 49 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
|
| 50 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
|
| 51 |
+
storage:
|
| 52 |
+
- aokvqa/annotations/aokvqa_v1p0_test.json
|
| 53 |
+
- aokvqa/annotations/large_vocab_train_lavis.json
|
| 54 |
+
images:
|
| 55 |
+
storage: /export/share/datasets/vision/GQA/images #gqa/images/
|
LAVIS-main/lavis/configs/datasets/iconqa/defaults.yaml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
iconqa:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
text_processor:
|
| 21 |
+
train:
|
| 22 |
+
name: blip_question
|
| 23 |
+
eval:
|
| 24 |
+
name: blip_question
|
| 25 |
+
|
| 26 |
+
build_info:
|
| 27 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 28 |
+
annotations:
|
| 29 |
+
train:
|
| 30 |
+
url:
|
| 31 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_train.json
|
| 32 |
+
# - /export/share/datasets/vision_language/iconqa/annotations_train.json
|
| 33 |
+
storage:
|
| 34 |
+
- iconqa/annotations/train.json
|
| 35 |
+
# - /export/share/datasets/vision_language/iconqa/annotations_train.json
|
| 36 |
+
val:
|
| 37 |
+
url:
|
| 38 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_val.json
|
| 39 |
+
# - /export/share/datasets/vision_language/iconqa/annotations_val.json
|
| 40 |
+
storage:
|
| 41 |
+
- iconqa/annotations/val.json
|
| 42 |
+
# - /export/share/datasets/vision_language/iconqa/annotations_val.json
|
| 43 |
+
test:
|
| 44 |
+
url:
|
| 45 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_test.json
|
| 46 |
+
# - /export/share/datasets/vision_language/iconqa/annotations_test.json
|
| 47 |
+
storage:
|
| 48 |
+
- iconqa/annotations/test.json
|
| 49 |
+
# - /export/share/datasets/vision_language/iconqa/annotations_test.json
|
| 50 |
+
images:
|
| 51 |
+
storage: /export/share/datasets/vision_language/iconqa/all_images/
|
| 52 |
+
|
LAVIS-main/lavis/configs/datasets/iconqa/defaults_instruct.yaml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
iconqa_instruct:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
text_processor:
|
| 21 |
+
train:
|
| 22 |
+
name: blip_instruction
|
| 23 |
+
modality: image
|
| 24 |
+
task: qa
|
| 25 |
+
eval:
|
| 26 |
+
name: blip_question
|
| 27 |
+
|
| 28 |
+
build_info:
|
| 29 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 30 |
+
annotations:
|
| 31 |
+
train:
|
| 32 |
+
url:
|
| 33 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_train.json
|
| 34 |
+
# - /export/share/datasets/vision_language/iconqa/annotations_train.json
|
| 35 |
+
storage:
|
| 36 |
+
- iconqa/annotations/train.json
|
| 37 |
+
# - /export/share/datasets/vision_language/iconqa/annotations_train.json
|
| 38 |
+
# val:
|
| 39 |
+
# url:
|
| 40 |
+
# - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_val.json
|
| 41 |
+
# # - /export/share/datasets/vision_language/iconqa/annotations_val.json
|
| 42 |
+
# storage:
|
| 43 |
+
# - iconqa/annotations/val.json
|
| 44 |
+
# # - /export/share/datasets/vision_language/iconqa/annotations_val.json
|
| 45 |
+
# test:
|
| 46 |
+
# url:
|
| 47 |
+
# - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_test.json
|
| 48 |
+
# # - /export/share/datasets/vision_language/iconqa/annotations_test.json
|
| 49 |
+
# storage:
|
| 50 |
+
# - iconqa/annotations/test.json
|
| 51 |
+
# # - /export/share/datasets/vision_language/iconqa/annotations_test.json
|
| 52 |
+
|
| 53 |
+
images:
|
| 54 |
+
storage: /export/share/datasets/vision_language/iconqa/all_images/
|
| 55 |
+
|
LAVIS-main/lavis/configs/datasets/imagenet/defaults.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
imagenet:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
splits: ["val"]
|
| 14 |
+
images:
|
| 15 |
+
storage: /export/share/datasets/vision/imagenet
|
LAVIS-main/lavis/configs/datasets/laion/defaults_2B_multi.yaml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
laion2B_multi:
|
| 8 |
+
|
| 9 |
+
data_type: images
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
|
LAVIS-main/lavis/configs/datasets/laion/defaults_400M.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
laion400M:
|
| 8 |
+
|
| 9 |
+
data_type: images
|
| 10 |
+
|
| 11 |
+
text_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: blip_caption
|
| 14 |
+
eval:
|
| 15 |
+
name: blip_caption
|
| 16 |
+
|
| 17 |
+
build_info:
|
| 18 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 19 |
+
storage: /export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{00..15}_shard{000000..000118}.tar
|
| 20 |
+
# storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
|
LAVIS-main/lavis/configs/datasets/laion/defaults_400M_instruct.yaml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
laion400M_instruct:
|
| 8 |
+
|
| 9 |
+
data_type: images
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
text_processor:
|
| 21 |
+
train:
|
| 22 |
+
name: blip_instruction
|
| 23 |
+
modality: image
|
| 24 |
+
task: caption
|
| 25 |
+
eval:
|
| 26 |
+
name: blip_caption
|
| 27 |
+
|
| 28 |
+
build_info:
|
| 29 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 30 |
+
storage: /export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{00..15}_shard{000000..000118}.tar
|
| 31 |
+
# storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
|
LAVIS-main/lavis/configs/datasets/llava150k/defaults_dial.yaml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
llava150k_dialogue_instruct: #394276 train examples
|
| 8 |
+
|
| 9 |
+
data_type: images
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: "blip_caption"
|
| 22 |
+
|
| 23 |
+
build_info:
|
| 24 |
+
annotations:
|
| 25 |
+
train:
|
| 26 |
+
url:
|
| 27 |
+
- https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json
|
| 28 |
+
storage:
|
| 29 |
+
- LLaVA-Instruct-150K/annotations/lava_instruct_150k.json
|
| 30 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 31 |
+
images:
|
| 32 |
+
storage: /export/share/datasets/vision/coco/images/train2017
|
LAVIS-main/lavis/configs/datasets/modelnet40/defaults_cls.yaml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
modelnet40_cls: # name of the dataset builder
|
| 8 |
+
data_type: [pc, images]
|
| 9 |
+
|
| 10 |
+
vis_processor:
|
| 11 |
+
train:
|
| 12 |
+
name: "clip_image_train"
|
| 13 |
+
image_size: 224
|
| 14 |
+
eval:
|
| 15 |
+
name: "clip_image_eval"
|
| 16 |
+
image_size: 224
|
| 17 |
+
|
| 18 |
+
pc_processor:
|
| 19 |
+
train:
|
| 20 |
+
name: "ulip_pc"
|
| 21 |
+
eval:
|
| 22 |
+
name: "ulip_pc"
|
| 23 |
+
text_processor:
|
| 24 |
+
train:
|
| 25 |
+
name: "blip_caption"
|
| 26 |
+
eval:
|
| 27 |
+
name: "blip_caption"
|
| 28 |
+
|
| 29 |
+
build_info:
|
| 30 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 31 |
+
annotations:
|
| 32 |
+
train:
|
| 33 |
+
url:
|
| 34 |
+
- https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_shape_names.txt
|
| 35 |
+
- https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
|
| 36 |
+
- https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_train.txt
|
| 37 |
+
storage:
|
| 38 |
+
- modelnet40_normal_resampled/modelnet40_shape_names.txt
|
| 39 |
+
- modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
|
| 40 |
+
- /modelnet40_normal_resampled/modelnet40_train.txt
|
| 41 |
+
val:
|
| 42 |
+
url:
|
| 43 |
+
- https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_shape_names.txt
|
| 44 |
+
- https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
|
| 45 |
+
- https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_test.txt
|
| 46 |
+
storage:
|
| 47 |
+
- modelnet40_normal_resampled/modelnet40_shape_names.txt
|
| 48 |
+
- modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
|
| 49 |
+
- modelnet40_normal_resampled/modelnet40_test.txt
|
| 50 |
+
|
| 51 |
+
pc:
|
| 52 |
+
storage: /export/home/ULIP/data/modelnet40_normal_resampled
|
| 53 |
+
|
| 54 |
+
images:
|
| 55 |
+
storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet40_pc_img
|
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_cap.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
msrvtt_cap: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
annotations:
|
| 14 |
+
train:
|
| 15 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json
|
| 16 |
+
storage: msrvtt/annotations/cap_train.json
|
| 17 |
+
val:
|
| 18 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json
|
| 19 |
+
storage: msrvtt/annotations/cap_val.json
|
| 20 |
+
test:
|
| 21 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json
|
| 22 |
+
storage: msrvtt/annotations/cap_test.json
|
| 23 |
+
videos:
|
| 24 |
+
storage: msrvtt/videos
|
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_cap_instruct.yaml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
msrvtt_caption_instruct: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 4
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
full_video: True
|
| 19 |
+
eval:
|
| 20 |
+
name: alpro_video_eval
|
| 21 |
+
n_frms: 4
|
| 22 |
+
image_size: 224
|
| 23 |
+
min_scale: 0.9
|
| 24 |
+
max_scale: 1.0
|
| 25 |
+
full_video: True
|
| 26 |
+
|
| 27 |
+
text_processor:
|
| 28 |
+
train:
|
| 29 |
+
name: blip_instruction
|
| 30 |
+
task: caption
|
| 31 |
+
modality: video
|
| 32 |
+
eval:
|
| 33 |
+
name: blip_caption
|
| 34 |
+
|
| 35 |
+
build_info:
|
| 36 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 37 |
+
annotations:
|
| 38 |
+
train:
|
| 39 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json
|
| 40 |
+
storage: msrvtt/annotations/cap_train.json
|
| 41 |
+
# val:
|
| 42 |
+
# url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json
|
| 43 |
+
# storage: msrvtt/annotations/cap_val.json
|
| 44 |
+
# test:
|
| 45 |
+
# url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json
|
| 46 |
+
# storage: msrvtt/annotations/cap_test.json
|
| 47 |
+
videos:
|
| 48 |
+
storage: msrvtt/videos
|
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_qa.yaml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
msrvtt_qa: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
annotations:
|
| 14 |
+
train:
|
| 15 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
|
| 16 |
+
storage: msrvtt/annotations/qa_train.json
|
| 17 |
+
val:
|
| 18 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
|
| 19 |
+
storage: msrvtt/annotations/qa_val.json
|
| 20 |
+
test:
|
| 21 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
|
| 22 |
+
storage: msrvtt/annotations/qa_test.json
|
| 23 |
+
ans2label:
|
| 24 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
|
| 25 |
+
storage: msrvtt/annotations/qa_ans2label.json
|
| 26 |
+
videos:
|
| 27 |
+
storage: msrvtt/videos
|
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_qa_instruct.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
msrvtt_qa_instruct: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 4
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
full_video: True
|
| 19 |
+
eval:
|
| 20 |
+
name: alpro_video_eval
|
| 21 |
+
n_frms: 4
|
| 22 |
+
image_size: 224
|
| 23 |
+
min_scale: 0.9
|
| 24 |
+
max_scale: 1.0
|
| 25 |
+
full_video: True
|
| 26 |
+
|
| 27 |
+
text_processor:
|
| 28 |
+
train:
|
| 29 |
+
name: blip_instruction
|
| 30 |
+
task: qa
|
| 31 |
+
modality: video
|
| 32 |
+
eval:
|
| 33 |
+
name: blip_question
|
| 34 |
+
|
| 35 |
+
build_info:
|
| 36 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 37 |
+
annotations:
|
| 38 |
+
train:
|
| 39 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
|
| 40 |
+
storage: msrvtt/annotations/qa_train.json
|
| 41 |
+
# val:
|
| 42 |
+
# url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
|
| 43 |
+
# storage: msrvtt/annotations/qa_val.json
|
| 44 |
+
# test:
|
| 45 |
+
# url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
|
| 46 |
+
# storage: msrvtt/annotations/qa_test.json
|
| 47 |
+
ans2label:
|
| 48 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
|
| 49 |
+
storage: msrvtt/annotations/qa_ans2label.json
|
| 50 |
+
videos:
|
| 51 |
+
storage: msrvtt/videos
|
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_ret.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
msrvtt_retrieval: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
annotations:
|
| 14 |
+
train:
|
| 15 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json
|
| 16 |
+
storage: msrvtt/annotations/retrieval_train.json
|
| 17 |
+
val:
|
| 18 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json
|
| 19 |
+
storage: msrvtt/annotations/retrieval_val.json
|
| 20 |
+
test:
|
| 21 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json
|
| 22 |
+
storage: msrvtt/annotations/retrieval_test.json
|
| 23 |
+
videos:
|
| 24 |
+
storage: msrvtt/videos
|
LAVIS-main/lavis/configs/datasets/msvd/defaults_cap.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
msvd_cap: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
annotations:
|
| 14 |
+
train:
|
| 15 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json
|
| 16 |
+
storage: msvd/annotations/cap_train.json
|
| 17 |
+
val:
|
| 18 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json
|
| 19 |
+
storage: msvd/annotations/cap_val.json
|
| 20 |
+
test:
|
| 21 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json
|
| 22 |
+
storage: msvd/annotations/cap_test.json
|
| 23 |
+
videos:
|
| 24 |
+
storage: msvd/videos
|
LAVIS-main/lavis/configs/datasets/msvd/defaults_cap_instruct.yaml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
msvd_caption_instruct: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 4
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
full_video: True
|
| 19 |
+
eval:
|
| 20 |
+
name: alpro_video_eval
|
| 21 |
+
n_frms: 4
|
| 22 |
+
image_size: 224
|
| 23 |
+
min_scale: 0.9
|
| 24 |
+
max_scale: 1.0
|
| 25 |
+
full_video: True
|
| 26 |
+
|
| 27 |
+
text_processor:
|
| 28 |
+
train:
|
| 29 |
+
name: blip_instruction
|
| 30 |
+
task: caption
|
| 31 |
+
modality: video
|
| 32 |
+
eval:
|
| 33 |
+
name: blip_caption
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
build_info:
|
| 37 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 38 |
+
annotations:
|
| 39 |
+
train:
|
| 40 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json
|
| 41 |
+
storage: msvd/annotations/cap_train.json
|
| 42 |
+
val:
|
| 43 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json
|
| 44 |
+
storage: msvd/annotations/cap_val.json
|
| 45 |
+
test:
|
| 46 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json
|
| 47 |
+
storage: msvd/annotations/cap_test.json
|
| 48 |
+
videos:
|
| 49 |
+
# storage: msvd/videos
|
| 50 |
+
storage: /export/share/datasets/vision_language/msvd/videos
|
LAVIS-main/lavis/configs/datasets/msvd/defaults_qa.yaml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
msvd_qa: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
annotations:
|
| 14 |
+
train:
|
| 15 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
|
| 16 |
+
storage: msvd/annotations/qa_train.json
|
| 17 |
+
val:
|
| 18 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
|
| 19 |
+
storage: msvd/annotations/qa_val.json
|
| 20 |
+
test:
|
| 21 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
|
| 22 |
+
storage: msvd/annotations/qa_test.json
|
| 23 |
+
ans2label:
|
| 24 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
|
| 25 |
+
storage: msvd/annotations/qa_ans2label.json
|
| 26 |
+
videos:
|
| 27 |
+
storage: msvd/videos
|
| 28 |
+
|
| 29 |
+
instance_id_key: question_id
|
LAVIS-main/lavis/configs/datasets/msvd/defaults_qa_instruct.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
msvd_qa_instruct: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: videos # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: alpro_video_train
|
| 14 |
+
n_frms: 4
|
| 15 |
+
image_size: 224
|
| 16 |
+
min_scale: 0.9
|
| 17 |
+
max_scale: 1.0
|
| 18 |
+
full_video: True
|
| 19 |
+
eval:
|
| 20 |
+
name: alpro_video_eval
|
| 21 |
+
n_frms: 4
|
| 22 |
+
image_size: 224
|
| 23 |
+
min_scale: 0.9
|
| 24 |
+
max_scale: 1.0
|
| 25 |
+
full_video: True
|
| 26 |
+
|
| 27 |
+
text_processor:
|
| 28 |
+
train:
|
| 29 |
+
name: blip_instruction
|
| 30 |
+
task: qa
|
| 31 |
+
modality: video
|
| 32 |
+
eval:
|
| 33 |
+
name: blip_question
|
| 34 |
+
|
| 35 |
+
build_info:
|
| 36 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 37 |
+
annotations:
|
| 38 |
+
train:
|
| 39 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
|
| 40 |
+
storage: msvd/annotations/qa_train.json
|
| 41 |
+
val:
|
| 42 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
|
| 43 |
+
storage: msvd/annotations/qa_val.json
|
| 44 |
+
test:
|
| 45 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
|
| 46 |
+
storage: msvd/annotations/qa_test.json
|
| 47 |
+
ans2label:
|
| 48 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
|
| 49 |
+
storage: msvd/annotations/qa_ans2label.json
|
| 50 |
+
videos:
|
| 51 |
+
storage: /export/share/datasets/vision_language/msvd/videos
|
| 52 |
+
|
| 53 |
+
instance_id_key: question_id
|
LAVIS-main/lavis/configs/datasets/music_avqa/defaults_mm_qa.yaml
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
datasets:
|
| 6 |
+
musicavqa_mm: # name of the dataset builder
|
| 7 |
+
data_type: [video, audio]
|
| 8 |
+
|
| 9 |
+
video_processor:
|
| 10 |
+
train:
|
| 11 |
+
name: alpro_video_train
|
| 12 |
+
n_frms: 4
|
| 13 |
+
image_size: 224
|
| 14 |
+
min_scale: 0.9
|
| 15 |
+
max_scale: 1.0
|
| 16 |
+
full_video: True
|
| 17 |
+
eval:
|
| 18 |
+
name: alpro_video_eval
|
| 19 |
+
n_frms: 4
|
| 20 |
+
image_size: 224
|
| 21 |
+
min_scale: 0.9
|
| 22 |
+
max_scale: 1.0
|
| 23 |
+
full_video: True
|
| 24 |
+
|
| 25 |
+
text_processor:
|
| 26 |
+
train:
|
| 27 |
+
name: blip_question
|
| 28 |
+
eval:
|
| 29 |
+
name: blip_question
|
| 30 |
+
|
| 31 |
+
audio_processor:
|
| 32 |
+
train:
|
| 33 |
+
name: beats_audio
|
| 34 |
+
sampling_rate: 16000
|
| 35 |
+
eval:
|
| 36 |
+
name: beats_audio
|
| 37 |
+
sampling_rate: 16000
|
| 38 |
+
is_eval: False
|
| 39 |
+
|
| 40 |
+
build_info:
|
| 41 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 42 |
+
annotations:
|
| 43 |
+
val:
|
| 44 |
+
url:
|
| 45 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-val.json
|
| 46 |
+
# - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json
|
| 47 |
+
storage:
|
| 48 |
+
- /musicavqa/annotations/avqa-val.json
|
| 49 |
+
# - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json
|
| 50 |
+
|
| 51 |
+
test:
|
| 52 |
+
url:
|
| 53 |
+
# - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-test.json
|
| 54 |
+
- /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json
|
| 55 |
+
storage:
|
| 56 |
+
# - /musicavqa/annotations/avqa-test.json
|
| 57 |
+
- /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json
|
| 58 |
+
|
| 59 |
+
templates: null
|
| 60 |
+
|
| 61 |
+
audio:
|
| 62 |
+
storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real
|
| 63 |
+
|
| 64 |
+
video:
|
| 65 |
+
storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real
|
| 66 |
+
|
LAVIS-main/lavis/configs/datasets/music_avqa/defaults_mm_qa_instruct.yaml
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
musicavqa_mm_instruct: # name of the dataset builder
|
| 8 |
+
data_type: [video, audio]
|
| 9 |
+
|
| 10 |
+
video_processor:
|
| 11 |
+
train:
|
| 12 |
+
name: alpro_video_train
|
| 13 |
+
n_frms: 4
|
| 14 |
+
image_size: 224
|
| 15 |
+
min_scale: 0.9
|
| 16 |
+
max_scale: 1.0
|
| 17 |
+
full_video: True
|
| 18 |
+
eval:
|
| 19 |
+
name: alpro_video_eval
|
| 20 |
+
n_frms: 4
|
| 21 |
+
image_size: 224
|
| 22 |
+
min_scale: 0.9
|
| 23 |
+
max_scale: 1.0
|
| 24 |
+
full_video: True
|
| 25 |
+
|
| 26 |
+
text_processor:
|
| 27 |
+
train:
|
| 28 |
+
name: blip_instruction
|
| 29 |
+
task: qa
|
| 30 |
+
modality: video
|
| 31 |
+
eval:
|
| 32 |
+
name: blip_question
|
| 33 |
+
|
| 34 |
+
audio_processor:
|
| 35 |
+
train:
|
| 36 |
+
name: beats_audio
|
| 37 |
+
sampling_rate: 16000
|
| 38 |
+
eval:
|
| 39 |
+
name: beats_audio
|
| 40 |
+
sampling_rate: 16000
|
| 41 |
+
is_eval: False
|
| 42 |
+
|
| 43 |
+
build_info:
|
| 44 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 45 |
+
annotations:
|
| 46 |
+
val:
|
| 47 |
+
url:
|
| 48 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-val.json
|
| 49 |
+
# - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json
|
| 50 |
+
storage:
|
| 51 |
+
- /musicavqa/annotations/avqa-val.json
|
| 52 |
+
# - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json
|
| 53 |
+
|
| 54 |
+
test:
|
| 55 |
+
url:
|
| 56 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-test.json
|
| 57 |
+
# - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json
|
| 58 |
+
storage:
|
| 59 |
+
- /musicavqa/annotations/avqa-test.json
|
| 60 |
+
# - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json
|
| 61 |
+
|
| 62 |
+
templates: null
|
| 63 |
+
|
| 64 |
+
audio:
|
| 65 |
+
storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real
|
| 66 |
+
|
| 67 |
+
video:
|
| 68 |
+
storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real
|
| 69 |
+
|
LAVIS-main/lavis/configs/datasets/nlvr/defaults.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
nlvr:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
annotations:
|
| 14 |
+
train:
|
| 15 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json
|
| 16 |
+
storage: nlvr/annotations/train.json
|
| 17 |
+
val:
|
| 18 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
|
| 19 |
+
storage: nlvr/annotations/dev.json
|
| 20 |
+
test:
|
| 21 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
|
| 22 |
+
storage: nlvr/annotations/test.json
|
| 23 |
+
images:
|
| 24 |
+
storage: /export/share/datasets/vision/NLVR2/
|
LAVIS-main/lavis/configs/datasets/nocaps/defaults.yaml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
nocaps: # name of the dataset builder
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
annotations:
|
| 14 |
+
val:
|
| 15 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json
|
| 16 |
+
storage: nocaps/annotations/nocaps_val.json
|
| 17 |
+
test:
|
| 18 |
+
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json
|
| 19 |
+
storage: nocaps/annotations/nocaps_test.json
|
| 20 |
+
images:
|
| 21 |
+
storage: nocaps/images
|
| 22 |
+
# storage: /export/share/datasets/vision/nocaps/
|
LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_cap.yaml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
objaverse_mm_caption: # 651576 train examples
|
| 8 |
+
vis_processor:
|
| 9 |
+
train:
|
| 10 |
+
name: "clip_image_train"
|
| 11 |
+
image_size: 224
|
| 12 |
+
eval:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
pc_processor:
|
| 16 |
+
train:
|
| 17 |
+
name: "ulip_pc"
|
| 18 |
+
eval:
|
| 19 |
+
name: "ulip_pc"
|
| 20 |
+
|
| 21 |
+
text_processor:
|
| 22 |
+
train:
|
| 23 |
+
name: "blip_caption"
|
| 24 |
+
eval:
|
| 25 |
+
name: "blip_caption"
|
| 26 |
+
|
| 27 |
+
data_type: [pc, images] # [images|pc]
|
| 28 |
+
|
| 29 |
+
build_info:
|
| 30 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 31 |
+
annotations:
|
| 32 |
+
train:
|
| 33 |
+
url:
|
| 34 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_train.csv
|
| 35 |
+
# - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
|
| 36 |
+
storage:
|
| 37 |
+
- objaverse/annotations/train.csv
|
| 38 |
+
# - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
|
| 39 |
+
|
| 40 |
+
val:
|
| 41 |
+
url:
|
| 42 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_val.csv
|
| 43 |
+
# - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
|
| 44 |
+
storage:
|
| 45 |
+
- objaverse/annotations/val.csv
|
| 46 |
+
# - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
|
| 47 |
+
|
| 48 |
+
templates: null
|
| 49 |
+
|
| 50 |
+
pc:
|
| 51 |
+
storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
|
| 52 |
+
|
| 53 |
+
images:
|
| 54 |
+
storage: /export/einstein-vision/3d_vision/objaverse_captions/images/
|
LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_cap_instruct.yaml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
objaverse_mm_caption_instruct: # 651576 train examples
|
| 8 |
+
vis_processor:
|
| 9 |
+
train:
|
| 10 |
+
name: "clip_image_train"
|
| 11 |
+
image_size: 224
|
| 12 |
+
eval:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
pc_processor:
|
| 16 |
+
train:
|
| 17 |
+
name: "ulip_pc"
|
| 18 |
+
eval:
|
| 19 |
+
name: "ulip_pc"
|
| 20 |
+
text_processor:
|
| 21 |
+
train:
|
| 22 |
+
name: "blip_instruction"
|
| 23 |
+
modality: pc
|
| 24 |
+
task: caption
|
| 25 |
+
eval:
|
| 26 |
+
name: "blip_caption"
|
| 27 |
+
|
| 28 |
+
data_type: [pc, images] # [images|pc]
|
| 29 |
+
|
| 30 |
+
build_info:
|
| 31 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 32 |
+
annotations:
|
| 33 |
+
train:
|
| 34 |
+
url:
|
| 35 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_train.csv
|
| 36 |
+
# - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
|
| 37 |
+
storage:
|
| 38 |
+
- objaverse/annotations/train.csv
|
| 39 |
+
# - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
|
| 40 |
+
|
| 41 |
+
# val:
|
| 42 |
+
# url:
|
| 43 |
+
# # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_val.csv
|
| 44 |
+
# - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
|
| 45 |
+
# storage:
|
| 46 |
+
# # - objaverse/annotations/val.csv
|
| 47 |
+
# - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
|
| 48 |
+
|
| 49 |
+
templates: null
|
| 50 |
+
|
| 51 |
+
pc:
|
| 52 |
+
storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
|
| 53 |
+
|
| 54 |
+
images:
|
| 55 |
+
storage: /export/einstein-vision/3d_vision/objaverse_captions/images/
|
LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_qa.yaml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2023, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
objaverse_mm_qa: # 250070
|
| 8 |
+
vis_processor:
|
| 9 |
+
train:
|
| 10 |
+
name: "clip_image_train"
|
| 11 |
+
image_size: 224
|
| 12 |
+
eval:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
pc_processor:
|
| 16 |
+
train:
|
| 17 |
+
name: "ulip_pc"
|
| 18 |
+
eval:
|
| 19 |
+
name: "ulip_pc"
|
| 20 |
+
text_processor:
|
| 21 |
+
train:
|
| 22 |
+
name: "blip_instruction"
|
| 23 |
+
modality: pc
|
| 24 |
+
task: qa
|
| 25 |
+
eval:
|
| 26 |
+
name: "blip_question"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
data_type: pc # [images|pc]
|
| 30 |
+
|
| 31 |
+
build_info:
|
| 32 |
+
kwargs:
|
| 33 |
+
add_binary: True
|
| 34 |
+
remove_model_answer: True
|
| 35 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 36 |
+
annotations:
|
| 37 |
+
train:
|
| 38 |
+
url:
|
| 39 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/CAP3DQA_final.csv
|
| 40 |
+
# - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final.csv
|
| 41 |
+
storage:
|
| 42 |
+
- objaverse_qa/annotations/train.csv
|
| 43 |
+
# - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final.csv
|
| 44 |
+
# val:
|
| 45 |
+
# url:
|
| 46 |
+
# - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/CAP3DQA_final_val.csv
|
| 47 |
+
# # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final_val.csv
|
| 48 |
+
# storage:
|
| 49 |
+
# - objaverse_qa/annotations/val.csv
|
| 50 |
+
# # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final_val.csv
|
| 51 |
+
|
| 52 |
+
templates: null
|
| 53 |
+
|
| 54 |
+
pc:
|
| 55 |
+
storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
|
LAVIS-main/lavis/configs/datasets/ocrvqa/defaults.yaml
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
ocr_vqa: # 1002146 train examples
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
|
| 16 |
+
text_processor:
|
| 17 |
+
train:
|
| 18 |
+
name: "blip_question"
|
| 19 |
+
eval:
|
| 20 |
+
name: blip_question
|
| 21 |
+
|
| 22 |
+
build_info:
|
| 23 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 24 |
+
annotations:
|
| 25 |
+
train:
|
| 26 |
+
url:
|
| 27 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/ocrvqa/ocrvqa.json
|
| 28 |
+
# - /export/video-language-dataset/ocrvqa/ocrvqa.json
|
| 29 |
+
storage:
|
| 30 |
+
- ocrvqa/annotations/ocrvqa.json
|
| 31 |
+
# - /export/video-language-dataset/ocrvqa/ocrvqa.json
|
| 32 |
+
images:
|
| 33 |
+
storage: /export/video-language-dataset/ocrvqa/images/
|
LAVIS-main/lavis/configs/datasets/ocrvqa/defaults_instruct.yaml
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
ocr_vqa_instruct: # 1002146 train examples
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
|
| 16 |
+
text_processor:
|
| 17 |
+
train:
|
| 18 |
+
name: blip_instruction
|
| 19 |
+
modality: image
|
| 20 |
+
task: qa
|
| 21 |
+
eval:
|
| 22 |
+
name: blip_question
|
| 23 |
+
|
| 24 |
+
build_info:
|
| 25 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 26 |
+
annotations:
|
| 27 |
+
train:
|
| 28 |
+
url:
|
| 29 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/ocrvqa/ocrvqa.json
|
| 30 |
+
# - /export/video-language-dataset/ocrvqa/ocrvqa.json
|
| 31 |
+
storage:
|
| 32 |
+
- ocrvqa/annotations/ocrvqa.json
|
| 33 |
+
# - /export/video-language-dataset/ocrvqa/ocrvqa.json
|
| 34 |
+
images:
|
| 35 |
+
storage: /export/video-language-dataset/ocrvqa/images/
|
LAVIS-main/lavis/configs/datasets/okvqa/defaults.yaml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
ok_vqa:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
annotations:
|
| 14 |
+
train:
|
| 15 |
+
url:
|
| 16 |
+
# TODO make this order insensitive
|
| 17 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
|
| 18 |
+
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
|
| 19 |
+
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
|
| 20 |
+
storage:
|
| 21 |
+
- okvqa/annotations/okvqa_train.json
|
| 22 |
+
# - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
|
| 23 |
+
# - okvqa/annotations/mscoco_train2014_annotations.json
|
| 24 |
+
test:
|
| 25 |
+
url:
|
| 26 |
+
# TODO make this order insensitive
|
| 27 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
|
| 28 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
|
| 29 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
|
| 30 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
|
| 31 |
+
storage:
|
| 32 |
+
- okvqa/annotations/vqa_val_eval.json
|
| 33 |
+
- okvqa/annotations/answer_list.json
|
| 34 |
+
- okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
|
| 35 |
+
- okvqa/annotations/mscoco_val2014_annotations.json
|
| 36 |
+
images:
|
| 37 |
+
storage: coco/images/
|
LAVIS-main/lavis/configs/datasets/okvqa/defaults_instruct.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
ok_vqa_instruct:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: blip_instruction
|
| 22 |
+
modality: image
|
| 23 |
+
task: qa
|
| 24 |
+
eval:
|
| 25 |
+
name: blip_question
|
| 26 |
+
|
| 27 |
+
build_info:
|
| 28 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 29 |
+
annotations:
|
| 30 |
+
train:
|
| 31 |
+
url:
|
| 32 |
+
# TODO make this order insensitive
|
| 33 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
|
| 34 |
+
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
|
| 35 |
+
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
|
| 36 |
+
storage:
|
| 37 |
+
- okvqa/annotations/okvqa_train.json
|
| 38 |
+
# - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
|
| 39 |
+
# - okvqa/annotations/mscoco_train2014_annotations.json
|
| 40 |
+
# test:
|
| 41 |
+
# url:
|
| 42 |
+
# # TODO make this order insensitive
|
| 43 |
+
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
|
| 44 |
+
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
|
| 45 |
+
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
|
| 46 |
+
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
|
| 47 |
+
# storage:
|
| 48 |
+
# - okvqa/annotations/vqa_val_eval.json
|
| 49 |
+
# - okvqa/annotations/answer_list.json
|
| 50 |
+
# - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
|
| 51 |
+
# - okvqa/annotations/mscoco_val2014_annotations.json
|
| 52 |
+
images:
|
| 53 |
+
storage: /export/share/datasets/vision/coco/images
|
LAVIS-main/lavis/configs/datasets/sbu_caption/defaults.yaml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
sbu_caption:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
build_info:
|
| 12 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 13 |
+
annotations:
|
| 14 |
+
train:
|
| 15 |
+
url:
|
| 16 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json
|
| 17 |
+
# - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json
|
| 18 |
+
storage:
|
| 19 |
+
- sbu_captions/annotations/sbu.json
|
| 20 |
+
images:
|
| 21 |
+
storage: sbu_captions/images
|
| 22 |
+
# storage: /export/share/datasets/vision_language/sbu_resize
|
LAVIS-main/lavis/configs/datasets/sbu_caption/defaults_instruct.yaml
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
sbu_caption_instruct:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: blip_instruction
|
| 22 |
+
modality: image
|
| 23 |
+
task: caption
|
| 24 |
+
eval:
|
| 25 |
+
name: blip_caption
|
| 26 |
+
|
| 27 |
+
build_info:
|
| 28 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 29 |
+
annotations:
|
| 30 |
+
train:
|
| 31 |
+
url:
|
| 32 |
+
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json
|
| 33 |
+
# - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json
|
| 34 |
+
storage:
|
| 35 |
+
- sbu_captions/annotations/sbu.json
|
| 36 |
+
images:
|
| 37 |
+
storage: sbu_captions/images
|
| 38 |
+
# storage: /export/share/datasets/vision_language/sbu_resize
|
LAVIS-main/lavis/configs/datasets/scienceqa/defaults.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
scienceqa:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: blip_question
|
| 22 |
+
eval:
|
| 23 |
+
name: blip_question
|
| 24 |
+
|
| 25 |
+
build_info:
|
| 26 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 27 |
+
train:
|
| 28 |
+
url:
|
| 29 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_train.json
|
| 30 |
+
# - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json
|
| 31 |
+
storage:
|
| 32 |
+
- scienceqa/annotations/problems_train.json
|
| 33 |
+
# - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json
|
| 34 |
+
val:
|
| 35 |
+
url:
|
| 36 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_val.json
|
| 37 |
+
# - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json
|
| 38 |
+
storage:
|
| 39 |
+
- scienceqa/annotations/problems_val.json
|
| 40 |
+
# - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json
|
| 41 |
+
test:
|
| 42 |
+
url:
|
| 43 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_test.json
|
| 44 |
+
# - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json
|
| 45 |
+
storage:
|
| 46 |
+
- scienceqa/annotations/problems_test.json
|
| 47 |
+
# - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json
|
| 48 |
+
|
| 49 |
+
images:
|
| 50 |
+
storage: /export/video-language-dataset/ScienceQA/data/scienceqa/images/
|
| 51 |
+
|
LAVIS-main/lavis/configs/datasets/scienceqa/defaults_instruct.yaml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, salesforce.com, inc.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 4 |
+
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 5 |
+
|
| 6 |
+
datasets:
|
| 7 |
+
scienceqa_instruct:
|
| 8 |
+
# data_dir: ${env.data_dir}/datasets
|
| 9 |
+
data_type: images # [images|videos|features]
|
| 10 |
+
|
| 11 |
+
vis_processor:
|
| 12 |
+
train:
|
| 13 |
+
name: "clip_image_train"
|
| 14 |
+
image_size: 224
|
| 15 |
+
eval:
|
| 16 |
+
name: "clip_image_eval"
|
| 17 |
+
image_size: 224
|
| 18 |
+
|
| 19 |
+
text_processor:
|
| 20 |
+
train:
|
| 21 |
+
name: blip_instruction
|
| 22 |
+
modality: image
|
| 23 |
+
task: qa
|
| 24 |
+
eval:
|
| 25 |
+
name: blip_question
|
| 26 |
+
|
| 27 |
+
build_info:
|
| 28 |
+
# Be careful not to append minus sign (-) before split to avoid itemizing
|
| 29 |
+
annotations:
|
| 30 |
+
train:
|
| 31 |
+
url:
|
| 32 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_train.json
|
| 33 |
+
# - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json
|
| 34 |
+
storage:
|
| 35 |
+
- scienceqa/annotations/problems_train.json
|
| 36 |
+
# - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json
|
| 37 |
+
val:
|
| 38 |
+
url:
|
| 39 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_val.json
|
| 40 |
+
# - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json
|
| 41 |
+
storage:
|
| 42 |
+
- scienceqa/annotations/problems_val.json
|
| 43 |
+
# - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json
|
| 44 |
+
test:
|
| 45 |
+
url:
|
| 46 |
+
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_test.json
|
| 47 |
+
# - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json
|
| 48 |
+
storage:
|
| 49 |
+
- scienceqa/annotations/problems_test.json
|
| 50 |
+
# - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json
|
| 51 |
+
|
| 52 |
+
images:
|
| 53 |
+
storage: /export/video-language-dataset/ScienceQA/data/scienceqa/images/
|
| 54 |
+
|
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Copyright 2022 Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven Hoi. All rights reserved.
|
| 2 |
+
// Use of this source code is governed by a BSD-style
|
| 3 |
+
// license that can be found in the LICENSE file.
|
| 4 |
+
|
| 5 |
+
MIT License
|
| 6 |
+
|
| 7 |
+
Copyright (c) 2019 Igor Brigadir
|
| 8 |
+
|
| 9 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 10 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 11 |
+
in the Software without restriction, including without limitation the rights
|
| 12 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 13 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 14 |
+
furnished to do so, subject to the following conditions:
|
| 15 |
+
|
| 16 |
+
The above copyright notice and this permission notice shall be included in all
|
| 17 |
+
copies or substantial portions of the Software.
|
| 18 |
+
|
| 19 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 20 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 21 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 22 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 23 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 24 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 25 |
+
SOFTWARE.
|
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!--
|
| 2 |
+
Copyright (c) 2022, salesforce.com, inc.
|
| 3 |
+
All rights reserved.
|
| 4 |
+
SPDX-License-Identifier: BSD-3-Clause
|
| 5 |
+
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 6 |
+
-->
|
| 7 |
+
|
| 8 |
+
# Download Conceptual Captions Data
|
| 9 |
+
|
| 10 |
+
Place data from: https://ai.google.com/research/ConceptualCaptions/download in this folder
|
| 11 |
+
|
| 12 |
+
`Train_GCC-training.tsv / cc3m.tsv` Training Split (3,318,333)
|
| 13 |
+
|
| 14 |
+
run `download_data_cc3m.py` or `download_data_cc12m.py`.
|
| 15 |
+
|
| 16 |
+
Images will be in default LAVIS cache folders. You can stop and resume, the settings for splitting downloads into chunks / threads are not optimal, but it maxed out my connection so i kept them as is.
|
| 17 |
+
|
| 18 |
+
Note: A previous version of this script used a different file naming scheme, this changed and if you are resuming a previously started download, you will get duplicates.
|
| 19 |
+
|
| 20 |
+
A bunch of them will fail to download, and return web pages instead. These will need to be cleaned up later. See `downloaded_validation_report.tsv` after it downloads for HTTP errors. Around 8% of images are gone, based on validation set results. Setting the user agent could fix some errors too maybe - not sure if any requests are rejected by sites based on this.
|
| 21 |
+
|
| 22 |
+
It should take about a day or two to download the training data, keep an eye on disk space.
|
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_12m.ipynb
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 15,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"import os\n",
|
| 10 |
+
"import json\n",
|
| 11 |
+
"\n",
|
| 12 |
+
"import pandas as pd\n",
|
| 13 |
+
"from tqdm import tqdm\n",
|
| 14 |
+
"from lavis.common.utils import get_abs_path, get_cache_path"
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": 2,
|
| 20 |
+
"metadata": {},
|
| 21 |
+
"outputs": [],
|
| 22 |
+
"source": [
|
| 23 |
+
"cc12m = pd.read_csv(\"downloaded_cc12m_report.tsv.gz\", compression=\"gzip\", sep=\"\\t\", names=[\"caption\", \"path\", \"dataset\", \"mimetype\", \"size\", \"status\", \"url\"])"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "code",
|
| 28 |
+
"execution_count": 7,
|
| 29 |
+
"metadata": {},
|
| 30 |
+
"outputs": [
|
| 31 |
+
{
|
| 32 |
+
"data": {
|
| 33 |
+
"text/plain": [
|
| 34 |
+
"caption a very typical bus station\n",
|
| 35 |
+
"path /export/home/.cache/lavis/conceptual_caption/i...\n",
|
| 36 |
+
"dataset cc3m\n",
|
| 37 |
+
"mimetype image/jpeg\n",
|
| 38 |
+
"size 36078\n",
|
| 39 |
+
"status 200\n",
|
| 40 |
+
"url http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...\n",
|
| 41 |
+
"Name: 0, dtype: object"
|
| 42 |
+
]
|
| 43 |
+
},
|
| 44 |
+
"execution_count": 7,
|
| 45 |
+
"metadata": {},
|
| 46 |
+
"output_type": "execute_result"
|
| 47 |
+
}
|
| 48 |
+
],
|
| 49 |
+
"source": [
|
| 50 |
+
"cc12m.iloc[0]"
|
| 51 |
+
]
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"cell_type": "code",
|
| 55 |
+
"execution_count": 3,
|
| 56 |
+
"metadata": {},
|
| 57 |
+
"outputs": [
|
| 58 |
+
{
|
| 59 |
+
"data": {
|
| 60 |
+
"text/plain": [
|
| 61 |
+
"3318333"
|
| 62 |
+
]
|
| 63 |
+
},
|
| 64 |
+
"execution_count": 3,
|
| 65 |
+
"metadata": {},
|
| 66 |
+
"output_type": "execute_result"
|
| 67 |
+
}
|
| 68 |
+
],
|
| 69 |
+
"source": [
|
| 70 |
+
"len(cc12m)"
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"cell_type": "code",
|
| 75 |
+
"execution_count": 21,
|
| 76 |
+
"metadata": {},
|
| 77 |
+
"outputs": [
|
| 78 |
+
{
|
| 79 |
+
"name": "stderr",
|
| 80 |
+
"output_type": "stream",
|
| 81 |
+
"text": [
|
| 82 |
+
"100%|██████████| 3130587/3130587 [17:28<00:00, 2986.08it/s]"
|
| 83 |
+
]
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"name": "stdout",
|
| 87 |
+
"output_type": "stream",
|
| 88 |
+
"text": [
|
| 89 |
+
"Found 2759017 valid records\n"
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"name": "stderr",
|
| 94 |
+
"output_type": "stream",
|
| 95 |
+
"text": [
|
| 96 |
+
"\n"
|
| 97 |
+
]
|
| 98 |
+
}
|
| 99 |
+
],
|
| 100 |
+
"source": [
|
| 101 |
+
"cnt = 0\n",
|
| 102 |
+
"\n",
|
| 103 |
+
"valid_records = []\n",
|
| 104 |
+
"\n",
|
| 105 |
+
"for i, path in tqdm(enumerate(cc12m.path.unique()), total=len(cc12m.path.unique())):\n",
|
| 106 |
+
" path = str(path)\n",
|
| 107 |
+
" if os.path.exists(path):\n",
|
| 108 |
+
" record = cc12m.iloc[i]\n",
|
| 109 |
+
" valid_records.append({\"image\": record[\"path\"], \"caption\": record[\"caption\"]})\n",
|
| 110 |
+
"\n",
|
| 111 |
+
" cnt += 1\n",
|
| 112 |
+
"\n",
|
| 113 |
+
"print(\"Found {} valid records\".format(cnt))"
|
| 114 |
+
]
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"cell_type": "code",
|
| 118 |
+
"execution_count": 22,
|
| 119 |
+
"metadata": {},
|
| 120 |
+
"outputs": [
|
| 121 |
+
{
|
| 122 |
+
"data": {
|
| 123 |
+
"text/plain": [
|
| 124 |
+
"2759017"
|
| 125 |
+
]
|
| 126 |
+
},
|
| 127 |
+
"execution_count": 22,
|
| 128 |
+
"metadata": {},
|
| 129 |
+
"output_type": "execute_result"
|
| 130 |
+
}
|
| 131 |
+
],
|
| 132 |
+
"source": [
|
| 133 |
+
"len(valid_records)"
|
| 134 |
+
]
|
| 135 |
+
},
|
| 136 |
+
{
|
| 137 |
+
"cell_type": "code",
|
| 138 |
+
"execution_count": 24,
|
| 139 |
+
"metadata": {},
|
| 140 |
+
"outputs": [
|
| 141 |
+
{
|
| 142 |
+
"data": {
|
| 143 |
+
"text/plain": [
|
| 144 |
+
"{'image': '/export/home/.cache/lavis/conceptual_caption/images/1_3239086386.jpg',\n",
|
| 145 |
+
" 'caption': 'sierra looked stunning in this top and this skirt while performing with person at their former university'}"
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
"execution_count": 24,
|
| 149 |
+
"metadata": {},
|
| 150 |
+
"output_type": "execute_result"
|
| 151 |
+
}
|
| 152 |
+
],
|
| 153 |
+
"source": [
|
| 154 |
+
"valid_records[1]"
|
| 155 |
+
]
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"cell_type": "code",
|
| 159 |
+
"execution_count": 28,
|
| 160 |
+
"metadata": {},
|
| 161 |
+
"outputs": [
|
| 162 |
+
{
|
| 163 |
+
"name": "stdout",
|
| 164 |
+
"output_type": "stream",
|
| 165 |
+
"text": [
|
| 166 |
+
"/export/home/.cache/lavis/conceptual_caption/annotations/cc3m.json already exists\n"
|
| 167 |
+
]
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
"ename": "",
|
| 171 |
+
"evalue": "",
|
| 172 |
+
"output_type": "error",
|
| 173 |
+
"traceback": [
|
| 174 |
+
"\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
| 175 |
+
]
|
| 176 |
+
}
|
| 177 |
+
],
|
| 178 |
+
"source": [
|
| 179 |
+
"from omegaconf import OmegaConf\n",
|
| 180 |
+
"\n",
|
| 181 |
+
"\n",
|
| 182 |
+
"config_path = get_abs_path(\"configs/datasets/conceptual_caption/defaults_12m.yaml\")\n",
|
| 183 |
+
"\n",
|
| 184 |
+
"ann_path = OmegaConf.load(\n",
|
| 185 |
+
" config_path\n",
|
| 186 |
+
").datasets.conceptual_caption_12m.build_info.annotations.train.storage[0]\n",
|
| 187 |
+
"\n",
|
| 188 |
+
"ann_path = get_cache_path(ann_path)\n",
|
| 189 |
+
"\n",
|
| 190 |
+
"if os.path.exists(ann_path):\n",
|
| 191 |
+
" # abort\n",
|
| 192 |
+
" print(\"{} already exists\".format(ann_path))\n",
|
| 193 |
+
"else:\n",
|
| 194 |
+
" # Save the valid records to a json file\n",
|
| 195 |
+
" with open(ann_path, \"w\") as f:\n",
|
| 196 |
+
" f.write(json.dumps(valid_records))"
|
| 197 |
+
]
|
| 198 |
+
}
|
| 199 |
+
],
|
| 200 |
+
"metadata": {
|
| 201 |
+
"kernelspec": {
|
| 202 |
+
"display_name": "Python 3.8.10 ('base')",
|
| 203 |
+
"language": "python",
|
| 204 |
+
"name": "python3"
|
| 205 |
+
},
|
| 206 |
+
"language_info": {
|
| 207 |
+
"codemirror_mode": {
|
| 208 |
+
"name": "ipython",
|
| 209 |
+
"version": 3
|
| 210 |
+
},
|
| 211 |
+
"file_extension": ".py",
|
| 212 |
+
"mimetype": "text/x-python",
|
| 213 |
+
"name": "python",
|
| 214 |
+
"nbconvert_exporter": "python",
|
| 215 |
+
"pygments_lexer": "ipython3",
|
| 216 |
+
"version": "3.8.10"
|
| 217 |
+
},
|
| 218 |
+
"orig_nbformat": 4,
|
| 219 |
+
"vscode": {
|
| 220 |
+
"interpreter": {
|
| 221 |
+
"hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
|
| 222 |
+
}
|
| 223 |
+
}
|
| 224 |
+
},
|
| 225 |
+
"nbformat": 4,
|
| 226 |
+
"nbformat_minor": 2
|
| 227 |
+
}
|
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_3m.ipynb
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 15,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"import os\n",
|
| 10 |
+
"import json\n",
|
| 11 |
+
"\n",
|
| 12 |
+
"import pandas as pd\n",
|
| 13 |
+
"from tqdm import tqdm\n",
|
| 14 |
+
"from lavis.common.utils import get_abs_path, get_cache_path"
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": 2,
|
| 20 |
+
"metadata": {},
|
| 21 |
+
"outputs": [],
|
| 22 |
+
"source": [
|
| 23 |
+
"cc3m = pd.read_csv(\"downloaded_cc3m_report.tsv.gz\", compression=\"gzip\", sep=\"\\t\", names=[\"caption\", \"path\", \"dataset\", \"mimetype\", \"size\", \"status\", \"url\"])"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "code",
|
| 28 |
+
"execution_count": 7,
|
| 29 |
+
"metadata": {},
|
| 30 |
+
"outputs": [
|
| 31 |
+
{
|
| 32 |
+
"data": {
|
| 33 |
+
"text/plain": [
|
| 34 |
+
"caption a very typical bus station\n",
|
| 35 |
+
"path /export/home/.cache/lavis/conceptual_caption/i...\n",
|
| 36 |
+
"dataset cc3m\n",
|
| 37 |
+
"mimetype image/jpeg\n",
|
| 38 |
+
"size 36078\n",
|
| 39 |
+
"status 200\n",
|
| 40 |
+
"url http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...\n",
|
| 41 |
+
"Name: 0, dtype: object"
|
| 42 |
+
]
|
| 43 |
+
},
|
| 44 |
+
"execution_count": 7,
|
| 45 |
+
"metadata": {},
|
| 46 |
+
"output_type": "execute_result"
|
| 47 |
+
}
|
| 48 |
+
],
|
| 49 |
+
"source": [
|
| 50 |
+
"cc3m.iloc[0]"
|
| 51 |
+
]
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"cell_type": "code",
|
| 55 |
+
"execution_count": 3,
|
| 56 |
+
"metadata": {},
|
| 57 |
+
"outputs": [
|
| 58 |
+
{
|
| 59 |
+
"data": {
|
| 60 |
+
"text/plain": [
|
| 61 |
+
"3318333"
|
| 62 |
+
]
|
| 63 |
+
},
|
| 64 |
+
"execution_count": 3,
|
| 65 |
+
"metadata": {},
|
| 66 |
+
"output_type": "execute_result"
|
| 67 |
+
}
|
| 68 |
+
],
|
| 69 |
+
"source": [
|
| 70 |
+
"len(cc3m)"
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"cell_type": "code",
|
| 75 |
+
"execution_count": 21,
|
| 76 |
+
"metadata": {},
|
| 77 |
+
"outputs": [
|
| 78 |
+
{
|
| 79 |
+
"name": "stderr",
|
| 80 |
+
"output_type": "stream",
|
| 81 |
+
"text": [
|
| 82 |
+
"100%|██████████| 3130587/3130587 [17:28<00:00, 2986.08it/s]"
|
| 83 |
+
]
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"name": "stdout",
|
| 87 |
+
"output_type": "stream",
|
| 88 |
+
"text": [
|
| 89 |
+
"Found 2759017 valid records\n"
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"name": "stderr",
|
| 94 |
+
"output_type": "stream",
|
| 95 |
+
"text": [
|
| 96 |
+
"\n"
|
| 97 |
+
]
|
| 98 |
+
}
|
| 99 |
+
],
|
| 100 |
+
"source": [
|
| 101 |
+
"cnt = 0\n",
|
| 102 |
+
"\n",
|
| 103 |
+
"valid_records = []\n",
|
| 104 |
+
"\n",
|
| 105 |
+
"for i, path in tqdm(enumerate(cc3m.path.unique()), total=len(cc3m.path.unique())):\n",
|
| 106 |
+
" path = str(path)\n",
|
| 107 |
+
" if os.path.exists(path):\n",
|
| 108 |
+
" record = cc3m.iloc[i]\n",
|
| 109 |
+
" valid_records.append({\"image\": record[\"path\"], \"caption\": record[\"caption\"]})\n",
|
| 110 |
+
"\n",
|
| 111 |
+
" cnt += 1\n",
|
| 112 |
+
"\n",
|
| 113 |
+
"print(\"Found {} valid records\".format(cnt))"
|
| 114 |
+
]
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"cell_type": "code",
|
| 118 |
+
"execution_count": 22,
|
| 119 |
+
"metadata": {},
|
| 120 |
+
"outputs": [
|
| 121 |
+
{
|
| 122 |
+
"data": {
|
| 123 |
+
"text/plain": [
|
| 124 |
+
"2759017"
|
| 125 |
+
]
|
| 126 |
+
},
|
| 127 |
+
"execution_count": 22,
|
| 128 |
+
"metadata": {},
|
| 129 |
+
"output_type": "execute_result"
|
| 130 |
+
}
|
| 131 |
+
],
|
| 132 |
+
"source": [
|
| 133 |
+
"len(valid_records)"
|
| 134 |
+
]
|
| 135 |
+
},
|
| 136 |
+
{
|
| 137 |
+
"cell_type": "code",
|
| 138 |
+
"execution_count": 24,
|
| 139 |
+
"metadata": {},
|
| 140 |
+
"outputs": [
|
| 141 |
+
{
|
| 142 |
+
"data": {
|
| 143 |
+
"text/plain": [
|
| 144 |
+
"{'image': '/export/home/.cache/lavis/conceptual_caption/images/1_3239086386.jpg',\n",
|
| 145 |
+
" 'caption': 'sierra looked stunning in this top and this skirt while performing with person at their former university'}"
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
"execution_count": 24,
|
| 149 |
+
"metadata": {},
|
| 150 |
+
"output_type": "execute_result"
|
| 151 |
+
}
|
| 152 |
+
],
|
| 153 |
+
"source": [
|
| 154 |
+
"valid_records[1]"
|
| 155 |
+
]
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"cell_type": "code",
|
| 159 |
+
"execution_count": 28,
|
| 160 |
+
"metadata": {},
|
| 161 |
+
"outputs": [
|
| 162 |
+
{
|
| 163 |
+
"name": "stdout",
|
| 164 |
+
"output_type": "stream",
|
| 165 |
+
"text": [
|
| 166 |
+
"/export/home/.cache/lavis/conceptual_caption/annotations/cc3m.json already exists\n"
|
| 167 |
+
]
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
"ename": "",
|
| 171 |
+
"evalue": "",
|
| 172 |
+
"output_type": "error",
|
| 173 |
+
"traceback": [
|
| 174 |
+
"\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
| 175 |
+
]
|
| 176 |
+
}
|
| 177 |
+
],
|
| 178 |
+
"source": [
|
| 179 |
+
"from omegaconf import OmegaConf\n",
|
| 180 |
+
"\n",
|
| 181 |
+
"\n",
|
| 182 |
+
"config_path = get_abs_path(\"configs/datasets/conceptual_caption/defaults_3m.yaml\")\n",
|
| 183 |
+
"\n",
|
| 184 |
+
"ann_path = OmegaConf.load(\n",
|
| 185 |
+
" config_path\n",
|
| 186 |
+
").datasets.conceptual_caption_3m.build_info.annotations.train.storage[0]\n",
|
| 187 |
+
"\n",
|
| 188 |
+
"ann_path = get_cache_path(ann_path)\n",
|
| 189 |
+
"\n",
|
| 190 |
+
"if os.path.exists(ann_path):\n",
|
| 191 |
+
" # abort\n",
|
| 192 |
+
" print(\"{} already exists\".format(ann_path))\n",
|
| 193 |
+
"else:\n",
|
| 194 |
+
" # Save the valid records to a json file\n",
|
| 195 |
+
" with open(ann_path, \"w\") as f:\n",
|
| 196 |
+
" f.write(json.dumps(valid_records))"
|
| 197 |
+
]
|
| 198 |
+
}
|
| 199 |
+
],
|
| 200 |
+
"metadata": {
|
| 201 |
+
"kernelspec": {
|
| 202 |
+
"display_name": "Python 3.8.10 ('base')",
|
| 203 |
+
"language": "python",
|
| 204 |
+
"name": "python3"
|
| 205 |
+
},
|
| 206 |
+
"language_info": {
|
| 207 |
+
"codemirror_mode": {
|
| 208 |
+
"name": "ipython",
|
| 209 |
+
"version": 3
|
| 210 |
+
},
|
| 211 |
+
"file_extension": ".py",
|
| 212 |
+
"mimetype": "text/x-python",
|
| 213 |
+
"name": "python",
|
| 214 |
+
"nbconvert_exporter": "python",
|
| 215 |
+
"pygments_lexer": "ipython3",
|
| 216 |
+
"version": "3.8.10"
|
| 217 |
+
},
|
| 218 |
+
"orig_nbformat": 4,
|
| 219 |
+
"vscode": {
|
| 220 |
+
"interpreter": {
|
| 221 |
+
"hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
|
| 222 |
+
}
|
| 223 |
+
}
|
| 224 |
+
},
|
| 225 |
+
"nbformat": 4,
|
| 226 |
+
"nbformat_minor": 2
|
| 227 |
+
}
|
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Copyright (c) 2022, salesforce.com, inc.
|
| 3 |
+
All rights reserved.
|
| 4 |
+
SPDX-License-Identifier: BSD-3-Clause
|
| 5 |
+
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import time
|
| 9 |
+
from PIL import Image
|
| 10 |
+
from lavis.common.utils import get_abs_path, get_cache_path
|
| 11 |
+
from multiprocessing import Pool
|
| 12 |
+
from omegaconf import OmegaConf
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from torchvision.transforms import functional as TF
|
| 15 |
+
from tqdm import tqdm
|
| 16 |
+
import glob
|
| 17 |
+
import io
|
| 18 |
+
import json
|
| 19 |
+
import magic # pip install python-magic
|
| 20 |
+
import numpy as np
|
| 21 |
+
import os
|
| 22 |
+
import pandas as pd
|
| 23 |
+
import requests
|
| 24 |
+
import shelve
|
| 25 |
+
import zlib
|
| 26 |
+
|
| 27 |
+
headers = {
|
| 28 |
+
#'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
| 29 |
+
"User-Agent": "Googlebot-Image/1.0", # Pretend to be googlebot
|
| 30 |
+
"X-Forwarded-For": "64.18.15.200",
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _df_split_apply(tup_arg):
|
| 35 |
+
split_ind, subset, func = tup_arg
|
| 36 |
+
r = subset.apply(func, axis=1)
|
| 37 |
+
return (split_ind, r)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def df_multiprocess(df, processes, chunk_size, func, dataset_name):
|
| 41 |
+
print("Generating parts...")
|
| 42 |
+
with shelve.open(
|
| 43 |
+
"%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
|
| 44 |
+
) as results:
|
| 45 |
+
|
| 46 |
+
pbar = tqdm(total=len(df), position=0)
|
| 47 |
+
# Resume:
|
| 48 |
+
finished_chunks = set([int(k) for k in results.keys()])
|
| 49 |
+
pbar.desc = "Resuming"
|
| 50 |
+
for k in results.keys():
|
| 51 |
+
pbar.update(len(results[str(k)][1]))
|
| 52 |
+
|
| 53 |
+
pool_data = (
|
| 54 |
+
(index, df[i : i + chunk_size], func)
|
| 55 |
+
for index, i in enumerate(range(0, len(df), chunk_size))
|
| 56 |
+
if index not in finished_chunks
|
| 57 |
+
)
|
| 58 |
+
print(
|
| 59 |
+
int(len(df) / chunk_size),
|
| 60 |
+
"parts.",
|
| 61 |
+
chunk_size,
|
| 62 |
+
"per part.",
|
| 63 |
+
"Using",
|
| 64 |
+
processes,
|
| 65 |
+
"processes",
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
pbar.desc = "Downloading"
|
| 69 |
+
with Pool(processes) as pool:
|
| 70 |
+
for i, result in enumerate(
|
| 71 |
+
pool.imap_unordered(_df_split_apply, pool_data, 2)
|
| 72 |
+
):
|
| 73 |
+
results[str(result[0])] = result
|
| 74 |
+
pbar.update(len(result[1]))
|
| 75 |
+
pbar.close()
|
| 76 |
+
|
| 77 |
+
print("Finished Downloading.")
|
| 78 |
+
return
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# Unique name based on url
|
| 82 |
+
def _file_name(row):
|
| 83 |
+
name = (
|
| 84 |
+
"%s/%s_%s"
|
| 85 |
+
% (
|
| 86 |
+
# row["folder"],
|
| 87 |
+
storage_dir,
|
| 88 |
+
row.name,
|
| 89 |
+
(zlib.crc32(row["url"].encode("utf-8")) & 0xFFFFFFFF),
|
| 90 |
+
)
|
| 91 |
+
+ ".jpg"
|
| 92 |
+
)
|
| 93 |
+
return name
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# For checking mimetypes separately without download
|
| 97 |
+
def check_mimetype(row):
|
| 98 |
+
if os.path.isfile(str(row["file"])):
|
| 99 |
+
row["mimetype"] = magic.from_file(row["file"], mime=True)
|
| 100 |
+
row["size"] = os.stat(row["file"]).st_size
|
| 101 |
+
return row
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# Don't download image, just check with a HEAD request, can't resume.
|
| 105 |
+
# Can use this instead of download_image to get HTTP status codes.
|
| 106 |
+
def check_download(row):
|
| 107 |
+
fname = _file_name(row)
|
| 108 |
+
try:
|
| 109 |
+
# not all sites will support HEAD
|
| 110 |
+
response = requests.head(
|
| 111 |
+
row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
|
| 112 |
+
)
|
| 113 |
+
row["status"] = response.status_code
|
| 114 |
+
row["headers"] = dict(response.headers)
|
| 115 |
+
except:
|
| 116 |
+
# log errors later, set error as 408 timeout
|
| 117 |
+
row["status"] = 408
|
| 118 |
+
return row
|
| 119 |
+
if response.ok:
|
| 120 |
+
row["file"] = fname
|
| 121 |
+
return row
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def resize_img(req):
|
| 125 |
+
image = Image.open(req).convert("RGB")
|
| 126 |
+
image = TF.resize(
|
| 127 |
+
# image, size=(resize_size, resize_size)
|
| 128 |
+
image,
|
| 129 |
+
size=resize_size,
|
| 130 |
+
) # , interpolation=Image.LANCZOS)
|
| 131 |
+
return image
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def download_image(row):
|
| 135 |
+
fname = _file_name(row)
|
| 136 |
+
# Skip Already downloaded, retry others later
|
| 137 |
+
if os.path.isfile(fname):
|
| 138 |
+
row["status"] = 200
|
| 139 |
+
row["file"] = fname
|
| 140 |
+
row["mimetype"] = magic.from_file(row["file"], mime=True)
|
| 141 |
+
row["size"] = os.stat(row["file"]).st_size
|
| 142 |
+
return row
|
| 143 |
+
|
| 144 |
+
try:
|
| 145 |
+
# use smaller timeout to skip errors, but can result in failed downloads
|
| 146 |
+
response = requests.get(
|
| 147 |
+
row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
|
| 148 |
+
)
|
| 149 |
+
row["status"] = response.status_code
|
| 150 |
+
# row['headers'] = dict(response.headers)
|
| 151 |
+
except Exception as e:
|
| 152 |
+
# log errors later, set error as 408 timeout
|
| 153 |
+
row["status"] = 408
|
| 154 |
+
return row
|
| 155 |
+
|
| 156 |
+
if response.ok:
|
| 157 |
+
try:
|
| 158 |
+
# some sites respond with gzip transport encoding
|
| 159 |
+
response.raw.decode_content = True
|
| 160 |
+
img = resize_img(io.BytesIO(response.content))
|
| 161 |
+
img.save(fname)
|
| 162 |
+
|
| 163 |
+
row["mimetype"] = magic.from_file(fname, mime=True)
|
| 164 |
+
row["size"] = os.stat(fname).st_size
|
| 165 |
+
|
| 166 |
+
except Exception as e:
|
| 167 |
+
# # This is if it times out during a download or decode
|
| 168 |
+
row["status"] = 408
|
| 169 |
+
|
| 170 |
+
row["file"] = fname
|
| 171 |
+
return row
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def open_tsv(fname, folder):
|
| 175 |
+
print("Opening %s Data File..." % fname)
|
| 176 |
+
df = pd.read_csv(
|
| 177 |
+
fname, sep="\t", names=["url", "caption"]
|
| 178 |
+
) # , usecols=range(1, 2))
|
| 179 |
+
df["folder"] = folder
|
| 180 |
+
print("Processing", len(df), " Images:")
|
| 181 |
+
return df
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def df_from_shelve(chunk_size, func, dataset_name):
|
| 185 |
+
print("Generating Dataframe from results...")
|
| 186 |
+
with shelve.open(
|
| 187 |
+
"%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
|
| 188 |
+
) as results:
|
| 189 |
+
keylist = sorted([int(k) for k in results.keys()])
|
| 190 |
+
df = pd.concat([results[str(k)][1] for k in keylist], sort=True)
|
| 191 |
+
return df
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
resize_size = 384
|
| 195 |
+
|
| 196 |
+
config_path = get_abs_path("configs/datasets/conceptual_caption/defaults_12m.yaml")
|
| 197 |
+
|
| 198 |
+
storage_dir = OmegaConf.load(
|
| 199 |
+
config_path
|
| 200 |
+
).datasets.conceptual_caption_12m.build_info.images.storage
|
| 201 |
+
storage_dir = Path(get_cache_path(storage_dir))
|
| 202 |
+
|
| 203 |
+
os.makedirs(storage_dir, exist_ok=True)
|
| 204 |
+
|
| 205 |
+
# number of processes in the pool can be larger than cores
|
| 206 |
+
num_processes = 96
|
| 207 |
+
# num_processes = 1
|
| 208 |
+
# chunk_size is how many images per chunk per process - changing this resets progress when restarting.
|
| 209 |
+
images_per_part = 100
|
| 210 |
+
|
| 211 |
+
data_name = "cc12m"
|
| 212 |
+
# os.makedirs(data_name, exist_ok=True)
|
| 213 |
+
|
| 214 |
+
df = open_tsv("cc12m.tsv", data_name)
|
| 215 |
+
df_multiprocess(
|
| 216 |
+
df=df,
|
| 217 |
+
processes=num_processes,
|
| 218 |
+
chunk_size=images_per_part,
|
| 219 |
+
func=download_image,
|
| 220 |
+
dataset_name=data_name,
|
| 221 |
+
)
|
| 222 |
+
df = df_from_shelve(
|
| 223 |
+
chunk_size=images_per_part, func=download_image, dataset_name=data_name
|
| 224 |
+
)
|
| 225 |
+
df.to_csv(
|
| 226 |
+
"downloaded_%s_report.tsv.gz" % data_name,
|
| 227 |
+
compression="gzip",
|
| 228 |
+
sep="\t",
|
| 229 |
+
header=False,
|
| 230 |
+
index=False,
|
| 231 |
+
)
|
| 232 |
+
print("Saved.")
|
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Copyright (c) 2022, salesforce.com, inc.
|
| 3 |
+
All rights reserved.
|
| 4 |
+
SPDX-License-Identifier: BSD-3-Clause
|
| 5 |
+
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import glob
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import time
|
| 11 |
+
from omegaconf import OmegaConf
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import numpy as np
|
| 14 |
+
import requests
|
| 15 |
+
import zlib
|
| 16 |
+
import os
|
| 17 |
+
import io
|
| 18 |
+
import shelve
|
| 19 |
+
from lavis.common.utils import get_abs_path, get_cache_path
|
| 20 |
+
import magic # pip install python-magic
|
| 21 |
+
import json
|
| 22 |
+
from multiprocessing import Pool
|
| 23 |
+
from tqdm import tqdm
|
| 24 |
+
from PIL import Image
|
| 25 |
+
from torchvision.transforms import functional as TF
|
| 26 |
+
|
| 27 |
+
headers = {
|
| 28 |
+
#'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
|
| 29 |
+
"User-Agent": "Googlebot-Image/1.0", # Pretend to be googlebot
|
| 30 |
+
"X-Forwarded-For": "64.18.15.200",
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _df_split_apply(tup_arg):
|
| 35 |
+
split_ind, subset, func = tup_arg
|
| 36 |
+
r = subset.apply(func, axis=1)
|
| 37 |
+
return (split_ind, r)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def df_multiprocess(df, processes, chunk_size, func, dataset_name):
|
| 41 |
+
print("Generating parts...")
|
| 42 |
+
with shelve.open(
|
| 43 |
+
"%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
|
| 44 |
+
) as results:
|
| 45 |
+
|
| 46 |
+
pbar = tqdm(total=len(df), position=0)
|
| 47 |
+
# Resume:
|
| 48 |
+
finished_chunks = set([int(k) for k in results.keys()])
|
| 49 |
+
pbar.desc = "Resuming"
|
| 50 |
+
for k in results.keys():
|
| 51 |
+
pbar.update(len(results[str(k)][1]))
|
| 52 |
+
|
| 53 |
+
pool_data = (
|
| 54 |
+
(index, df[i : i + chunk_size], func)
|
| 55 |
+
for index, i in enumerate(range(0, len(df), chunk_size))
|
| 56 |
+
if index not in finished_chunks
|
| 57 |
+
)
|
| 58 |
+
print(
|
| 59 |
+
int(len(df) / chunk_size),
|
| 60 |
+
"parts.",
|
| 61 |
+
chunk_size,
|
| 62 |
+
"per part.",
|
| 63 |
+
"Using",
|
| 64 |
+
processes,
|
| 65 |
+
"processes",
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
pbar.desc = "Downloading"
|
| 69 |
+
with Pool(processes) as pool:
|
| 70 |
+
for i, result in enumerate(
|
| 71 |
+
pool.imap_unordered(_df_split_apply, pool_data, 2)
|
| 72 |
+
):
|
| 73 |
+
results[str(result[0])] = result
|
| 74 |
+
pbar.update(len(result[1]))
|
| 75 |
+
pbar.close()
|
| 76 |
+
|
| 77 |
+
print("Finished Downloading.")
|
| 78 |
+
return
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# Unique name based on url
|
| 82 |
+
def _file_name(row):
|
| 83 |
+
name = (
|
| 84 |
+
"%s/%s_%s"
|
| 85 |
+
% (
|
| 86 |
+
# row["folder"],
|
| 87 |
+
storage_dir,
|
| 88 |
+
row.name,
|
| 89 |
+
(zlib.crc32(row["url"].encode("utf-8")) & 0xFFFFFFFF),
|
| 90 |
+
)
|
| 91 |
+
+ ".jpg"
|
| 92 |
+
)
|
| 93 |
+
return name
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# For checking mimetypes separately without download
|
| 97 |
+
def check_mimetype(row):
|
| 98 |
+
if os.path.isfile(str(row["file"])):
|
| 99 |
+
row["mimetype"] = magic.from_file(row["file"], mime=True)
|
| 100 |
+
row["size"] = os.stat(row["file"]).st_size
|
| 101 |
+
return row
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# Don't download image, just check with a HEAD request, can't resume.
|
| 105 |
+
# Can use this instead of download_image to get HTTP status codes.
|
| 106 |
+
def check_download(row):
|
| 107 |
+
fname = _file_name(row)
|
| 108 |
+
try:
|
| 109 |
+
# not all sites will support HEAD
|
| 110 |
+
response = requests.head(
|
| 111 |
+
row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
|
| 112 |
+
)
|
| 113 |
+
row["status"] = response.status_code
|
| 114 |
+
row["headers"] = dict(response.headers)
|
| 115 |
+
except:
|
| 116 |
+
# log errors later, set error as 408 timeout
|
| 117 |
+
row["status"] = 408
|
| 118 |
+
return row
|
| 119 |
+
if response.ok:
|
| 120 |
+
row["file"] = fname
|
| 121 |
+
return row
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def resize_img(req):
|
| 125 |
+
image = Image.open(req).convert("RGB")
|
| 126 |
+
image = TF.resize(
|
| 127 |
+
# image, size=(resize_size, resize_size)
|
| 128 |
+
image,
|
| 129 |
+
size=resize_size,
|
| 130 |
+
) # , interpolation=Image.LANCZOS)
|
| 131 |
+
return image
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def download_image(row):
|
| 135 |
+
fname = _file_name(row)
|
| 136 |
+
# Skip Already downloaded, retry others later
|
| 137 |
+
if os.path.isfile(fname):
|
| 138 |
+
row["status"] = 200
|
| 139 |
+
row["file"] = fname
|
| 140 |
+
row["mimetype"] = magic.from_file(row["file"], mime=True)
|
| 141 |
+
row["size"] = os.stat(row["file"]).st_size
|
| 142 |
+
return row
|
| 143 |
+
|
| 144 |
+
try:
|
| 145 |
+
# use smaller timeout to skip errors, but can result in failed downloads
|
| 146 |
+
response = requests.get(
|
| 147 |
+
row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
|
| 148 |
+
)
|
| 149 |
+
row["status"] = response.status_code
|
| 150 |
+
# row['headers'] = dict(response.headers)
|
| 151 |
+
except Exception as e:
|
| 152 |
+
# log errors later, set error as 408 timeout
|
| 153 |
+
row["status"] = 408
|
| 154 |
+
return row
|
| 155 |
+
|
| 156 |
+
if response.ok:
|
| 157 |
+
try:
|
| 158 |
+
# some sites respond with gzip transport encoding
|
| 159 |
+
response.raw.decode_content = True
|
| 160 |
+
img = resize_img(io.BytesIO(response.content))
|
| 161 |
+
img.save(fname)
|
| 162 |
+
|
| 163 |
+
row["mimetype"] = magic.from_file(fname, mime=True)
|
| 164 |
+
row["size"] = os.stat(fname).st_size
|
| 165 |
+
|
| 166 |
+
except Exception as e:
|
| 167 |
+
# # This is if it times out during a download or decode
|
| 168 |
+
row["status"] = 408
|
| 169 |
+
|
| 170 |
+
row["file"] = fname
|
| 171 |
+
return row
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def open_tsv(fname, folder):
|
| 175 |
+
print("Opening %s Data File..." % fname)
|
| 176 |
+
df = pd.read_csv(
|
| 177 |
+
fname, sep="\t", names=["caption", "url"]
|
| 178 |
+
) # , usecols=range(1, 2))
|
| 179 |
+
df["folder"] = folder
|
| 180 |
+
print("Processing", len(df), " Images:")
|
| 181 |
+
return df
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def df_from_shelve(chunk_size, func, dataset_name):
|
| 185 |
+
print("Generating Dataframe from results...")
|
| 186 |
+
with shelve.open(
|
| 187 |
+
"%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
|
| 188 |
+
) as results:
|
| 189 |
+
keylist = sorted([int(k) for k in results.keys()])
|
| 190 |
+
df = pd.concat([results[str(k)][1] for k in keylist], sort=True)
|
| 191 |
+
return df
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
resize_size = 384
|
| 195 |
+
|
| 196 |
+
config_path = get_abs_path("configs/datasets/conceptual_caption/defaults_3m.yaml")
|
| 197 |
+
|
| 198 |
+
storage_dir = OmegaConf.load(
|
| 199 |
+
config_path
|
| 200 |
+
).datasets.conceptual_caption_3m.build_info.images.storage
|
| 201 |
+
storage_dir = Path(get_cache_path(storage_dir))
|
| 202 |
+
|
| 203 |
+
os.makedirs(storage_dir, exist_ok=True)
|
| 204 |
+
|
| 205 |
+
# number of processes in the pool can be larger than cores
|
| 206 |
+
num_processes = 32
|
| 207 |
+
# chunk_size is how many images per chunk per process - changing this resets progress when restarting.
|
| 208 |
+
images_per_part = 100
|
| 209 |
+
|
| 210 |
+
data_name = "cc3m"
|
| 211 |
+
df = open_tsv("Train_GCC-training.tsv", data_name)
|
| 212 |
+
df_multiprocess(
|
| 213 |
+
df=df,
|
| 214 |
+
processes=num_processes,
|
| 215 |
+
chunk_size=images_per_part,
|
| 216 |
+
func=download_image,
|
| 217 |
+
dataset_name=data_name,
|
| 218 |
+
)
|
| 219 |
+
df = df_from_shelve(
|
| 220 |
+
chunk_size=images_per_part, func=download_image, dataset_name=data_name
|
| 221 |
+
)
|
| 222 |
+
df.to_csv(
|
| 223 |
+
"downloaded_%s_report.tsv.gz" % data_name,
|
| 224 |
+
compression="gzip",
|
| 225 |
+
sep="\t",
|
| 226 |
+
header=False,
|
| 227 |
+
index=False,
|
| 228 |
+
)
|
| 229 |
+
print("Saved.")
|
LAVIS-main/lavis/models/__init__.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Copyright (c) 2022, salesforce.com, inc.
|
| 3 |
+
All rights reserved.
|
| 4 |
+
SPDX-License-Identifier: BSD-3-Clause
|
| 5 |
+
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import torch
|
| 10 |
+
from omegaconf import OmegaConf
|
| 11 |
+
from lavis.common.registry import registry
|
| 12 |
+
|
| 13 |
+
from lavis.models.base_model import BaseModel
|
| 14 |
+
|
| 15 |
+
from lavis.models.albef_models.albef_classification import AlbefClassification
|
| 16 |
+
from lavis.models.albef_models.albef_feature_extractor import AlbefFeatureExtractor
|
| 17 |
+
from lavis.models.albef_models.albef_nlvr import AlbefNLVR
|
| 18 |
+
from lavis.models.albef_models.albef_pretrain import AlbefPretrain
|
| 19 |
+
from lavis.models.albef_models.albef_retrieval import AlbefRetrieval
|
| 20 |
+
from lavis.models.albef_models.albef_vqa import AlbefVQA
|
| 21 |
+
from lavis.models.alpro_models.alpro_qa import AlproQA
|
| 22 |
+
from lavis.models.alpro_models.alpro_retrieval import AlproRetrieval
|
| 23 |
+
|
| 24 |
+
from lavis.models.blip_models.blip import BlipBase
|
| 25 |
+
from lavis.models.blip_models.blip_caption import BlipCaption
|
| 26 |
+
from lavis.models.blip_models.blip_classification import BlipClassification
|
| 27 |
+
from lavis.models.blip_models.blip_feature_extractor import BlipFeatureExtractor
|
| 28 |
+
from lavis.models.blip_models.blip_image_text_matching import BlipITM
|
| 29 |
+
from lavis.models.blip_models.blip_nlvr import BlipNLVR
|
| 30 |
+
from lavis.models.blip_models.blip_pretrain import BlipPretrain
|
| 31 |
+
from lavis.models.blip_models.blip_retrieval import BlipRetrieval
|
| 32 |
+
from lavis.models.blip_models.blip_vqa import BlipVQA
|
| 33 |
+
|
| 34 |
+
from lavis.models.blip2_models.blip2 import Blip2Base
|
| 35 |
+
from lavis.models.blip2_models.blip2_opt import Blip2OPT
|
| 36 |
+
from lavis.models.blip2_models.blip2_t5 import Blip2T5
|
| 37 |
+
from lavis.models.blip2_models.blip2_qformer import Blip2Qformer
|
| 38 |
+
from lavis.models.blip2_models.blip2_image_text_matching import Blip2ITM
|
| 39 |
+
|
| 40 |
+
from lavis.models.blip2_models.blip2_t5_instruct import Blip2T5Instruct
|
| 41 |
+
from lavis.models.blip2_models.blip2_vicuna_instruct import Blip2VicunaInstruct
|
| 42 |
+
from lavis.models.blip2_models.blip2_vicuna_xinstruct import Blip2VicunaXInstruct
|
| 43 |
+
|
| 44 |
+
from lavis.models.blip_diffusion_models.blip_diffusion import BlipDiffusion
|
| 45 |
+
|
| 46 |
+
from lavis.models.pnp_vqa_models.pnp_vqa import PNPVQA
|
| 47 |
+
from lavis.models.pnp_vqa_models.pnp_unifiedqav2_fid import PNPUnifiedQAv2FiD
|
| 48 |
+
from lavis.models.img2prompt_models.img2prompt_vqa import Img2PromptVQA
|
| 49 |
+
from lavis.models.med import XBertLMHeadDecoder
|
| 50 |
+
from lavis.models.vit import VisionTransformerEncoder
|
| 51 |
+
from lavis.models.clip_models.model import CLIP
|
| 52 |
+
|
| 53 |
+
from lavis.models.gpt_models.gpt_dialogue import GPTDialogue
|
| 54 |
+
|
| 55 |
+
from lavis.processors.base_processor import BaseProcessor
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
__all__ = [
|
| 59 |
+
"load_model",
|
| 60 |
+
"AlbefClassification",
|
| 61 |
+
"AlbefFeatureExtractor",
|
| 62 |
+
"AlbefNLVR",
|
| 63 |
+
"AlbefVQA",
|
| 64 |
+
"AlbefPretrain",
|
| 65 |
+
"AlbefRetrieval",
|
| 66 |
+
"AlproQA",
|
| 67 |
+
"AlproRetrieval",
|
| 68 |
+
"BaseModel",
|
| 69 |
+
"BlipBase",
|
| 70 |
+
"BlipFeatureExtractor",
|
| 71 |
+
"BlipCaption",
|
| 72 |
+
"BlipClassification",
|
| 73 |
+
"BlipDiffusion",
|
| 74 |
+
"BlipITM",
|
| 75 |
+
"BlipNLVR",
|
| 76 |
+
"BlipPretrain",
|
| 77 |
+
"BlipRetrieval",
|
| 78 |
+
"BlipVQA",
|
| 79 |
+
"Blip2Qformer",
|
| 80 |
+
"Blip2Base",
|
| 81 |
+
"Blip2ITM",
|
| 82 |
+
"Blip2OPT",
|
| 83 |
+
"Blip2T5",
|
| 84 |
+
"Blip2T5Instruct",
|
| 85 |
+
"Blip2VicunaInstruct",
|
| 86 |
+
"Blip2VicunaXInstruct",
|
| 87 |
+
"PNPVQA",
|
| 88 |
+
"Img2PromptVQA",
|
| 89 |
+
"PNPUnifiedQAv2FiD",
|
| 90 |
+
"CLIP",
|
| 91 |
+
"VisionTransformerEncoder",
|
| 92 |
+
"XBertLMHeadDecoder",
|
| 93 |
+
"GPTDialogue",
|
| 94 |
+
]
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def load_model(name, model_type, is_eval=False, device="cpu", checkpoint=None):
|
| 98 |
+
"""
|
| 99 |
+
Load supported models.
|
| 100 |
+
|
| 101 |
+
To list all available models and types in registry:
|
| 102 |
+
>>> from lavis.models import model_zoo
|
| 103 |
+
>>> print(model_zoo)
|
| 104 |
+
|
| 105 |
+
Args:
|
| 106 |
+
name (str): name of the model.
|
| 107 |
+
model_type (str): type of the model.
|
| 108 |
+
is_eval (bool): whether the model is in eval mode. Default: False.
|
| 109 |
+
device (str): device to use. Default: "cpu".
|
| 110 |
+
checkpoint (str): path or to checkpoint. Default: None.
|
| 111 |
+
Note that expecting the checkpoint to have the same keys in state_dict as the model.
|
| 112 |
+
|
| 113 |
+
Returns:
|
| 114 |
+
model (torch.nn.Module): model.
|
| 115 |
+
"""
|
| 116 |
+
|
| 117 |
+
model = registry.get_model_class(name).from_pretrained(model_type=model_type)
|
| 118 |
+
|
| 119 |
+
if checkpoint is not None:
|
| 120 |
+
model.load_checkpoint(checkpoint)
|
| 121 |
+
|
| 122 |
+
if is_eval:
|
| 123 |
+
model.eval()
|
| 124 |
+
|
| 125 |
+
if device == "cpu":
|
| 126 |
+
model = model.float()
|
| 127 |
+
|
| 128 |
+
return model.to(device)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def load_preprocess(config):
|
| 132 |
+
"""
|
| 133 |
+
Load preprocessor configs and construct preprocessors.
|
| 134 |
+
|
| 135 |
+
If no preprocessor is specified, return BaseProcessor, which does not do any preprocessing.
|
| 136 |
+
|
| 137 |
+
Args:
|
| 138 |
+
config (dict): preprocessor configs.
|
| 139 |
+
|
| 140 |
+
Returns:
|
| 141 |
+
vis_processors (dict): preprocessors for visual inputs.
|
| 142 |
+
txt_processors (dict): preprocessors for text inputs.
|
| 143 |
+
|
| 144 |
+
Key is "train" or "eval" for processors used in training and evaluation respectively.
|
| 145 |
+
"""
|
| 146 |
+
|
| 147 |
+
def _build_proc_from_cfg(cfg):
|
| 148 |
+
return (
|
| 149 |
+
registry.get_processor_class(cfg.name).from_config(cfg)
|
| 150 |
+
if cfg is not None
|
| 151 |
+
else BaseProcessor()
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
vis_processors = dict()
|
| 155 |
+
txt_processors = dict()
|
| 156 |
+
|
| 157 |
+
vis_proc_cfg = config.get("vis_processor")
|
| 158 |
+
txt_proc_cfg = config.get("text_processor")
|
| 159 |
+
|
| 160 |
+
if vis_proc_cfg is not None:
|
| 161 |
+
vis_train_cfg = vis_proc_cfg.get("train")
|
| 162 |
+
vis_eval_cfg = vis_proc_cfg.get("eval")
|
| 163 |
+
else:
|
| 164 |
+
vis_train_cfg = None
|
| 165 |
+
vis_eval_cfg = None
|
| 166 |
+
|
| 167 |
+
vis_processors["train"] = _build_proc_from_cfg(vis_train_cfg)
|
| 168 |
+
vis_processors["eval"] = _build_proc_from_cfg(vis_eval_cfg)
|
| 169 |
+
|
| 170 |
+
if txt_proc_cfg is not None:
|
| 171 |
+
txt_train_cfg = txt_proc_cfg.get("train")
|
| 172 |
+
txt_eval_cfg = txt_proc_cfg.get("eval")
|
| 173 |
+
else:
|
| 174 |
+
txt_train_cfg = None
|
| 175 |
+
txt_eval_cfg = None
|
| 176 |
+
|
| 177 |
+
txt_processors["train"] = _build_proc_from_cfg(txt_train_cfg)
|
| 178 |
+
txt_processors["eval"] = _build_proc_from_cfg(txt_eval_cfg)
|
| 179 |
+
|
| 180 |
+
return vis_processors, txt_processors
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def load_model_and_preprocess(name, model_type, is_eval=False, device="cpu"):
|
| 184 |
+
"""
|
| 185 |
+
Load model and its related preprocessors.
|
| 186 |
+
|
| 187 |
+
List all available models and types in registry:
|
| 188 |
+
>>> from lavis.models import model_zoo
|
| 189 |
+
>>> print(model_zoo)
|
| 190 |
+
|
| 191 |
+
Args:
|
| 192 |
+
name (str): name of the model.
|
| 193 |
+
model_type (str): type of the model.
|
| 194 |
+
is_eval (bool): whether the model is in eval mode. Default: False.
|
| 195 |
+
device (str): device to use. Default: "cpu".
|
| 196 |
+
|
| 197 |
+
Returns:
|
| 198 |
+
model (torch.nn.Module): model.
|
| 199 |
+
vis_processors (dict): preprocessors for visual inputs.
|
| 200 |
+
txt_processors (dict): preprocessors for text inputs.
|
| 201 |
+
"""
|
| 202 |
+
model_cls = registry.get_model_class(name)
|
| 203 |
+
|
| 204 |
+
# load model
|
| 205 |
+
model = model_cls.from_pretrained(model_type=model_type)
|
| 206 |
+
|
| 207 |
+
if is_eval:
|
| 208 |
+
model.eval()
|
| 209 |
+
|
| 210 |
+
# load preprocess
|
| 211 |
+
cfg = OmegaConf.load(model_cls.default_config_path(model_type))
|
| 212 |
+
if cfg is not None:
|
| 213 |
+
preprocess_cfg = cfg.preprocess
|
| 214 |
+
|
| 215 |
+
vis_processors, txt_processors = load_preprocess(preprocess_cfg)
|
| 216 |
+
else:
|
| 217 |
+
vis_processors, txt_processors = None, None
|
| 218 |
+
logging.info(
|
| 219 |
+
f"""No default preprocess for model {name} ({model_type}).
|
| 220 |
+
This can happen if the model is not finetuned on downstream datasets,
|
| 221 |
+
or it is not intended for direct use without finetuning.
|
| 222 |
+
"""
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
if device == "cpu" or device == torch.device("cpu"):
|
| 226 |
+
model = model.float()
|
| 227 |
+
|
| 228 |
+
return model.to(device), vis_processors, txt_processors
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
class ModelZoo:
|
| 232 |
+
"""
|
| 233 |
+
A utility class to create string representation of available model architectures and types.
|
| 234 |
+
|
| 235 |
+
>>> from lavis.models import model_zoo
|
| 236 |
+
>>> # list all available models
|
| 237 |
+
>>> print(model_zoo)
|
| 238 |
+
>>> # show total number of models
|
| 239 |
+
>>> print(len(model_zoo))
|
| 240 |
+
"""
|
| 241 |
+
|
| 242 |
+
def __init__(self) -> None:
|
| 243 |
+
self.model_zoo = {
|
| 244 |
+
k: list(v.PRETRAINED_MODEL_CONFIG_DICT.keys())
|
| 245 |
+
for k, v in registry.mapping["model_name_mapping"].items()
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
def __str__(self) -> str:
|
| 249 |
+
return (
|
| 250 |
+
"=" * 50
|
| 251 |
+
+ "\n"
|
| 252 |
+
+ f"{'Architectures':<30} {'Types'}\n"
|
| 253 |
+
+ "=" * 50
|
| 254 |
+
+ "\n"
|
| 255 |
+
+ "\n".join(
|
| 256 |
+
[
|
| 257 |
+
f"{name:<30} {', '.join(types)}"
|
| 258 |
+
for name, types in self.model_zoo.items()
|
| 259 |
+
]
|
| 260 |
+
)
|
| 261 |
+
)
|
| 262 |
+
|
| 263 |
+
def __iter__(self):
|
| 264 |
+
return iter(self.model_zoo.items())
|
| 265 |
+
|
| 266 |
+
def __len__(self):
|
| 267 |
+
return sum([len(v) for v in self.model_zoo.values()])
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
model_zoo = ModelZoo()
|
LAVIS-main/lavis/models/albef_models/__init__.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Copyright (c) 2022, salesforce.com, inc.
|
| 3 |
+
All rights reserved.
|
| 4 |
+
SPDX-License-Identifier: BSD-3-Clause
|
| 5 |
+
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import datetime
|
| 9 |
+
import logging
|
| 10 |
+
import os
|
| 11 |
+
import time
|
| 12 |
+
|
| 13 |
+
import lavis.common.dist_utils as dist_utils
|
| 14 |
+
import torch
|
| 15 |
+
import torch.distributed as dist
|
| 16 |
+
import torch.nn.functional as F
|
| 17 |
+
from lavis.common.dist_utils import download_cached_file
|
| 18 |
+
from lavis.common.logger import MetricLogger
|
| 19 |
+
from lavis.common.utils import is_url
|
| 20 |
+
from lavis.models.base_model import BaseModel
|
| 21 |
+
from lavis.models.vit import interpolate_pos_embed
|
| 22 |
+
from transformers import BertTokenizer
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class AlbefBase(BaseModel):
    """Shared base for ALBEF models: tokenizer construction and loading of
    pre-trained checkpoints (with ViT positional-embedding interpolation and
    text-encoder key renaming)."""

    @classmethod
    def init_tokenizer(cls):
        """Return the uncased BERT tokenizer used by all ALBEF variants."""
        return BertTokenizer.from_pretrained("bert-base-uncased")

    def load_from_pretrained(self, url_or_filename, rename_text_keys=True):
        """Load a pre-trained checkpoint into this model.

        Args:
            url_or_filename (str): HTTP(S) URL or local path of the checkpoint.
            rename_text_keys (bool): if True, strip the ``bert.`` prefix from
                text-encoder keys so they match this model's parameter names.

        Returns:
            The named tuple returned by ``load_state_dict`` (``missing_keys``,
            ``unexpected_keys``).

        Raises:
            RuntimeError: if ``url_or_filename`` is neither a URL nor an
                existing file.
        """
        if is_url(url_or_filename):
            cached_file = download_cached_file(
                url_or_filename, check_hash=False, progress=True
            )
            checkpoint = torch.load(cached_file, map_location="cpu")
        elif os.path.isfile(url_or_filename):
            checkpoint = torch.load(url_or_filename, map_location="cpu")
        else:
            raise RuntimeError("checkpoint url or path is invalid")

        # Training checkpoints wrap the weights under a "model" key;
        # bare state dicts are used as-is.
        if "model" in checkpoint:
            state_dict = checkpoint["model"]
        else:
            state_dict = checkpoint

        # self.state_dict() rebuilds its dict on every call; fetch it once
        # instead of once per key in the loops below.
        own_state = self.state_dict()

        # Resize ViT positional embeddings in case the checkpoint was trained
        # at a different image resolution than this model.
        state_dict["visual_encoder.pos_embed"] = interpolate_pos_embed(
            state_dict["visual_encoder.pos_embed"], self.visual_encoder
        )
        if (
            "visual_encoder_m.pos_embed" in own_state
            and "visual_encoder_m.pos_embed" in state_dict
        ):
            state_dict["visual_encoder_m.pos_embed"] = interpolate_pos_embed(
                state_dict["visual_encoder_m.pos_embed"], self.visual_encoder_m
            )

        if rename_text_keys:
            for key in list(state_dict.keys()):
                if "bert" in key:
                    new_key = key.replace("bert.", "")
                    # Guard against keys that contain "bert" but not "bert."
                    # (e.g. "roberta..."): there new_key == key, and the
                    # original assign-then-delete silently dropped the entry.
                    if new_key != key:
                        state_dict[new_key] = state_dict.pop(key)

        # Drop checkpoint tensors whose shapes do not match this model so
        # load_state_dict(strict=False) does not raise on them.
        for key in own_state.keys():
            if key in state_dict and state_dict[key].shape != own_state[key].shape:
                del state_dict[key]

        msg = self.load_state_dict(state_dict, strict=False)

        logging.info("Missing keys {}".format(msg.missing_keys))
        logging.info("load checkpoint from %s" % url_or_filename)
        return msg
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def compute_sim_matrix(model, data_loader, **kwargs):
    """Compute image-to-text and text-to-image retrieval score matrices.

    Two-stage evaluation: (1) embed all texts and images and take their
    cosine-similarity (ITC) matrix; (2) for each query, re-score only the
    top-``k_test`` ITC candidates with the cross-attention ITM head. Work
    is sharded across distributed ranks by row, then summed with all_reduce.

    Args:
        model: retrieval model exposing tokenizer, text_encoder,
            visual_encoder, text_proj/vision_proj, itm_head and device.
        data_loader: loader whose dataset has ``.text`` and ``.image`` lists.
        **kwargs: must contain ``k_test`` (int), the re-ranking depth.

    Returns:
        Tuple of two numpy arrays: (score_i2t of shape [num_images,
        num_texts], score_t2i of shape [num_texts, num_images]).
    """
    k_test = kwargs.pop("k_test")

    metric_logger = MetricLogger(delimiter="  ")
    header = "Evaluation:"

    logging.info("Computing features for evaluation...")
    start_time = time.time()

    # ---- Stage 1a: encode all texts in batches of text_bs. ----
    texts = data_loader.dataset.text
    num_text = len(texts)
    text_bs = 256
    text_ids = []
    text_embeds = []
    text_atts = []
    for i in range(0, num_text, text_bs):
        text = texts[i : min(num_text, i + text_bs)]
        text_input = model.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=35,
            return_tensors="pt",
        ).to(model.device)
        text_output = model.text_encoder.forward_text(text_input)
        # CLS token projected to the shared embedding space, L2-normalized.
        text_embed = F.normalize(
            model.text_proj(text_output.last_hidden_state[:, 0, :])
        )
        text_embeds.append(text_embed)
        text_ids.append(text_input.input_ids)
        text_atts.append(text_input.attention_mask)

    text_embeds = torch.cat(text_embeds, dim=0)
    text_ids = torch.cat(text_ids, dim=0)
    text_atts = torch.cat(text_atts, dim=0)
    # BLIP-style tokenizers mark matching mode with a special first token.
    if hasattr(model.tokenizer, "enc_token_id"):
        text_ids[:, 0] = model.tokenizer.enc_token_id

    # ---- Stage 1b: encode all images. ----
    image_feats = []
    image_embeds = []
    for samples in data_loader:
        image = samples["image"]

        image = image.to(model.device)
        image_feat = model.visual_encoder.forward_features(image)
        image_embed = model.vision_proj(image_feat[:, 0, :])
        image_embed = F.normalize(image_embed, dim=-1)

        # Full patch features kept on CPU (large); only used for re-ranking.
        image_feats.append(image_feat.cpu())
        image_embeds.append(image_embed)

    image_feats = torch.cat(image_feats, dim=0)
    image_embeds = torch.cat(image_embeds, dim=0)

    # ITC similarity matrix: [num_images, num_texts].
    sims_matrix = image_embeds @ text_embeds.t()
    # -100 marks entries never re-scored (outside each row's top-k).
    score_matrix_i2t = torch.full(
        (len(data_loader.dataset.image), len(texts)), -100.0
    ).to(model.device)

    # ---- Stage 2a: image->text re-ranking, rows sharded across ranks. ----
    num_tasks = dist_utils.get_world_size()
    rank = dist_utils.get_rank()
    step = sims_matrix.size(0) // num_tasks + 1
    start = rank * step
    end = min(sims_matrix.size(0), start + step)

    for i, sims in enumerate(
        metric_logger.log_every(sims_matrix[start:end], 50, header)
    ):
        # topk_sim, topk_idx = sims.topk(k=config["k_test"], dim=0)
        topk_sim, topk_idx = sims.topk(k=k_test, dim=0)

        # One image's features repeated against its k_test candidate texts.
        encoder_output = image_feats[start + i].repeat(k_test, 1, 1).to(model.device)
        encoder_att = torch.ones(encoder_output.size()[:-1], dtype=torch.long).to(
            model.device
        )
        output = model.text_encoder(
            text_ids[topk_idx],
            attention_mask=text_atts[topk_idx],
            encoder_hidden_states=encoder_output,
            encoder_attention_mask=encoder_att,
            return_dict=True,
        )
        # ITM head logit for the "match" class, added on top of ITC score.
        score = model.itm_head(output.last_hidden_state[:, 0, :])[:, 1]
        score_matrix_i2t[start + i, topk_idx] = score + topk_sim

    # ---- Stage 2b: text->image re-ranking on the transposed matrix. ----
    sims_matrix = sims_matrix.t()
    score_matrix_t2i = torch.full(
        (len(texts), len(data_loader.dataset.image)), -100.0
    ).to(model.device)

    step = sims_matrix.size(0) // num_tasks + 1
    start = rank * step
    end = min(sims_matrix.size(0), start + step)

    for i, sims in enumerate(
        metric_logger.log_every(sims_matrix[start:end], 50, header)
    ):

        topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
        # Gather the k_test candidate images for this text query.
        encoder_output = image_feats[topk_idx.cpu()].to(model.device)
        encoder_att = torch.ones(encoder_output.size()[:-1], dtype=torch.long).to(
            model.device
        )
        output = model.text_encoder(
            text_ids[start + i].repeat(k_test, 1),
            attention_mask=text_atts[start + i].repeat(k_test, 1),
            encoder_hidden_states=encoder_output,
            encoder_attention_mask=encoder_att,
            return_dict=True,
        )
        score = model.itm_head(output.last_hidden_state[:, 0, :])[:, 1]
        score_matrix_t2i[start + i, topk_idx] = score + topk_sim

    # Merge the per-rank shards: each rank filled disjoint rows (others
    # stayed at -100), so SUM-reduce assembles the full matrices.
    if dist_utils.is_dist_avail_and_initialized():
        dist.barrier()
        torch.distributed.all_reduce(
            score_matrix_i2t, op=torch.distributed.ReduceOp.SUM
        )
        torch.distributed.all_reduce(
            score_matrix_t2i, op=torch.distributed.ReduceOp.SUM
        )

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    logging.info("Evaluation time {}".format(total_time_str))

    return score_matrix_i2t.cpu().numpy(), score_matrix_t2i.cpu().numpy()
|