yuccaaa commited on
Commit
48cce71
·
verified ·
1 Parent(s): 5c8f92e

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/trainer_state.json +0 -0
  2. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_loss.png +0 -0
  3. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_runtime.png +0 -0
  4. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_samples_per_second.png +0 -0
  5. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_steps_per_second.png +0 -0
  6. LAVIS-main/lavis/configs/datasets/gqa/balanced_testdev_instruct.yaml +46 -0
  7. LAVIS-main/lavis/configs/datasets/gqa/balanced_val.yaml +30 -0
  8. LAVIS-main/lavis/configs/datasets/gqa/balanced_val_instruct.yaml +47 -0
  9. LAVIS-main/lavis/configs/datasets/gqa/defaults.yaml +36 -0
  10. LAVIS-main/lavis/configs/datasets/gqa/defaults_instruct.yaml +55 -0
  11. LAVIS-main/lavis/configs/datasets/iconqa/defaults.yaml +52 -0
  12. LAVIS-main/lavis/configs/datasets/iconqa/defaults_instruct.yaml +55 -0
  13. LAVIS-main/lavis/configs/datasets/imagenet/defaults.yaml +15 -0
  14. LAVIS-main/lavis/configs/datasets/laion/defaults_2B_multi.yaml +13 -0
  15. LAVIS-main/lavis/configs/datasets/laion/defaults_400M.yaml +20 -0
  16. LAVIS-main/lavis/configs/datasets/laion/defaults_400M_instruct.yaml +31 -0
  17. LAVIS-main/lavis/configs/datasets/llava150k/defaults_dial.yaml +32 -0
  18. LAVIS-main/lavis/configs/datasets/modelnet40/defaults_cls.yaml +55 -0
  19. LAVIS-main/lavis/configs/datasets/msrvtt/defaults_cap.yaml +24 -0
  20. LAVIS-main/lavis/configs/datasets/msrvtt/defaults_cap_instruct.yaml +48 -0
  21. LAVIS-main/lavis/configs/datasets/msrvtt/defaults_qa.yaml +27 -0
  22. LAVIS-main/lavis/configs/datasets/msrvtt/defaults_qa_instruct.yaml +51 -0
  23. LAVIS-main/lavis/configs/datasets/msrvtt/defaults_ret.yaml +24 -0
  24. LAVIS-main/lavis/configs/datasets/msvd/defaults_cap.yaml +24 -0
  25. LAVIS-main/lavis/configs/datasets/msvd/defaults_cap_instruct.yaml +50 -0
  26. LAVIS-main/lavis/configs/datasets/msvd/defaults_qa.yaml +29 -0
  27. LAVIS-main/lavis/configs/datasets/msvd/defaults_qa_instruct.yaml +53 -0
  28. LAVIS-main/lavis/configs/datasets/music_avqa/defaults_mm_qa.yaml +66 -0
  29. LAVIS-main/lavis/configs/datasets/music_avqa/defaults_mm_qa_instruct.yaml +69 -0
  30. LAVIS-main/lavis/configs/datasets/nlvr/defaults.yaml +24 -0
  31. LAVIS-main/lavis/configs/datasets/nocaps/defaults.yaml +22 -0
  32. LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_cap.yaml +54 -0
  33. LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_cap_instruct.yaml +55 -0
  34. LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_qa.yaml +55 -0
  35. LAVIS-main/lavis/configs/datasets/ocrvqa/defaults.yaml +33 -0
  36. LAVIS-main/lavis/configs/datasets/ocrvqa/defaults_instruct.yaml +35 -0
  37. LAVIS-main/lavis/configs/datasets/okvqa/defaults.yaml +37 -0
  38. LAVIS-main/lavis/configs/datasets/okvqa/defaults_instruct.yaml +53 -0
  39. LAVIS-main/lavis/configs/datasets/sbu_caption/defaults.yaml +22 -0
  40. LAVIS-main/lavis/configs/datasets/sbu_caption/defaults_instruct.yaml +38 -0
  41. LAVIS-main/lavis/configs/datasets/scienceqa/defaults.yaml +51 -0
  42. LAVIS-main/lavis/configs/datasets/scienceqa/defaults_instruct.yaml +54 -0
  43. LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE +25 -0
  44. LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md +22 -0
  45. LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_12m.ipynb +227 -0
  46. LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_3m.ipynb +227 -0
  47. LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py +232 -0
  48. LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py +229 -0
  49. LAVIS-main/lavis/models/__init__.py +270 -0
  50. LAVIS-main/lavis/models/albef_models/__init__.py +202 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_loss.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_runtime.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_samples_per_second.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_steps_per_second.png ADDED
LAVIS-main/lavis/configs/datasets/gqa/balanced_testdev_instruct.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ gqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ task: qa
23
+ modality: image
24
+ eval:
25
+ name: blip_question
26
+
27
+ build_info:
28
+ # Be careful not to append minus sign (-) before split to avoid itemizing
29
+ annotations:
30
+ train:
31
+ url:
32
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
33
+ storage:
34
+ - gqa/annotations/train_balanced_questions.json
35
+ val:
36
+ url:
37
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
38
+ storage:
39
+ - gqa/annotations/testdev_balanced_questions.json
40
+ test:
41
+ url:
42
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
43
+ storage:
44
+ - gqa/annotations/test_balanced_questions.json
45
+ images:
46
+ storage: /export/share/datasets/vision/GQA/images #gqa/images/
LAVIS-main/lavis/configs/datasets/gqa/balanced_val.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ gqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url:
16
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
17
+ storage:
18
+ - gqa/annotations/train_balanced_questions.json
19
+ val:
20
+ url:
21
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json
22
+ storage:
23
+ - gqa/annotations/val_balanced_questions.json
24
+ test:
25
+ url:
26
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
27
+ storage:
28
+ - gqa/annotations/test_balanced_questions.json
29
+ images:
30
+ storage: gqa/images/
LAVIS-main/lavis/configs/datasets/gqa/balanced_val_instruct.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ gqa_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ task: qa
23
+ modality: image
24
+ eval:
25
+ name: blip_question
26
+
27
+
28
+ build_info:
29
+ # Be careful not to append minus sign (-) before split to avoid itemizing
30
+ annotations:
31
+ train:
32
+ url:
33
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
34
+ storage:
35
+ - gqa/annotations/train_balanced_questions.json
36
+ val:
37
+ url:
38
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json
39
+ storage:
40
+ - gqa/annotations/val_balanced_questions.json
41
+ test:
42
+ url:
43
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
44
+ storage:
45
+ - gqa/annotations/test_balanced_questions.json
46
+ images:
47
+ storage: /export/share/datasets/vision/GQA/images #gqa/images/
LAVIS-main/lavis/configs/datasets/gqa/defaults.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ gqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url:
16
+ - /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json
17
+ - /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json
18
+ storage:
19
+ - gqa/annotations/train_all_questions_0.json
20
+ - gqa/annotations/val_all_questions.json
21
+ val:
22
+ url:
23
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
24
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
25
+ storage:
26
+ - aokvqa/annotations/aokvqa_v1p0_val.json
27
+ - aokvqa/annotations/large_vocab_train_lavis.json
28
+ test:
29
+ url:
30
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
31
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
32
+ storage:
33
+ - aokvqa/annotations/aokvqa_v1p0_test.json
34
+ - aokvqa/annotations/large_vocab_train_lavis.json
35
+ images:
36
+ storage: gqa/images/
LAVIS-main/lavis/configs/datasets/gqa/defaults_instruct.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ gqa_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ task: qa
23
+ modality: image
24
+ eval:
25
+ name: blip_question
26
+
27
+
28
+ build_info:
29
+ # Be careful not to append minus sign (-) before split to avoid itemizing
30
+ annotations:
31
+ train:
32
+ url:
33
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/gqa/train_all_questions_0.json
34
+ # - /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json
35
+ # - /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json
36
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/gqa/val_all_questions.json
37
+ storage:
38
+ - gqa/annotations/train_all_questions_0.json
39
+ - gqa/annotations/val_all_questions.json
40
+ val:
41
+ url:
42
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
43
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
44
+ storage:
45
+ - aokvqa/annotations/aokvqa_v1p0_val.json
46
+ - aokvqa/annotations/large_vocab_train_lavis.json
47
+ test:
48
+ url:
49
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
50
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
51
+ storage:
52
+ - aokvqa/annotations/aokvqa_v1p0_test.json
53
+ - aokvqa/annotations/large_vocab_train_lavis.json
54
+ images:
55
+ storage: /export/share/datasets/vision/GQA/images #gqa/images/
LAVIS-main/lavis/configs/datasets/iconqa/defaults.yaml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ iconqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+
20
+ text_processor:
21
+ train:
22
+ name: blip_question
23
+ eval:
24
+ name: blip_question
25
+
26
+ build_info:
27
+ # Be careful not to append minus sign (-) before split to avoid itemizing
28
+ annotations:
29
+ train:
30
+ url:
31
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_train.json
32
+ # - /export/share/datasets/vision_language/iconqa/annotations_train.json
33
+ storage:
34
+ - iconqa/annotations/train.json
35
+ # - /export/share/datasets/vision_language/iconqa/annotations_train.json
36
+ val:
37
+ url:
38
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_val.json
39
+ # - /export/share/datasets/vision_language/iconqa/annotations_val.json
40
+ storage:
41
+ - iconqa/annotations/val.json
42
+ # - /export/share/datasets/vision_language/iconqa/annotations_val.json
43
+ test:
44
+ url:
45
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_test.json
46
+ # - /export/share/datasets/vision_language/iconqa/annotations_test.json
47
+ storage:
48
+ - iconqa/annotations/test.json
49
+ # - /export/share/datasets/vision_language/iconqa/annotations_test.json
50
+ images:
51
+ storage: /export/share/datasets/vision_language/iconqa/all_images/
52
+
LAVIS-main/lavis/configs/datasets/iconqa/defaults_instruct.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ iconqa_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+
20
+ text_processor:
21
+ train:
22
+ name: blip_instruction
23
+ modality: image
24
+ task: qa
25
+ eval:
26
+ name: blip_question
27
+
28
+ build_info:
29
+ # Be careful not to append minus sign (-) before split to avoid itemizing
30
+ annotations:
31
+ train:
32
+ url:
33
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_train.json
34
+ # - /export/share/datasets/vision_language/iconqa/annotations_train.json
35
+ storage:
36
+ - iconqa/annotations/train.json
37
+ # - /export/share/datasets/vision_language/iconqa/annotations_train.json
38
+ # val:
39
+ # url:
40
+ # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_val.json
41
+ # # - /export/share/datasets/vision_language/iconqa/annotations_val.json
42
+ # storage:
43
+ # - iconqa/annotations/val.json
44
+ # # - /export/share/datasets/vision_language/iconqa/annotations_val.json
45
+ # test:
46
+ # url:
47
+ # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_test.json
48
+ # # - /export/share/datasets/vision_language/iconqa/annotations_test.json
49
+ # storage:
50
+ # - iconqa/annotations/test.json
51
+ # # - /export/share/datasets/vision_language/iconqa/annotations_test.json
52
+
53
+ images:
54
+ storage: /export/share/datasets/vision_language/iconqa/all_images/
55
+
LAVIS-main/lavis/configs/datasets/imagenet/defaults.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ imagenet:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ splits: ["val"]
14
+ images:
15
+ storage: /export/share/datasets/vision/imagenet
LAVIS-main/lavis/configs/datasets/laion/defaults_2B_multi.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ laion2B_multi:
8
+
9
+ data_type: images
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
LAVIS-main/lavis/configs/datasets/laion/defaults_400M.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ laion400M:
8
+
9
+ data_type: images
10
+
11
+ text_processor:
12
+ train:
13
+ name: blip_caption
14
+ eval:
15
+ name: blip_caption
16
+
17
+ build_info:
18
+ # Be careful not to append minus sign (-) before split to avoid itemizing
19
+ storage: /export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{00..15}_shard{000000..000118}.tar
20
+ # storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
LAVIS-main/lavis/configs/datasets/laion/defaults_400M_instruct.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ laion400M_instruct:
8
+
9
+ data_type: images
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+
20
+ text_processor:
21
+ train:
22
+ name: blip_instruction
23
+ modality: image
24
+ task: caption
25
+ eval:
26
+ name: blip_caption
27
+
28
+ build_info:
29
+ # Be careful not to append minus sign (-) before split to avoid itemizing
30
+ storage: /export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{00..15}_shard{000000..000118}.tar
31
+ # storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
LAVIS-main/lavis/configs/datasets/llava150k/defaults_dial.yaml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ llava150k_dialogue_instruct: #394276 train examples
8
+
9
+ data_type: images
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: "blip_caption"
22
+
23
+ build_info:
24
+ annotations:
25
+ train:
26
+ url:
27
+ - https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json
28
+ storage:
29
+ - LLaVA-Instruct-150K/annotations/lava_instruct_150k.json
30
+ # Be careful not to append minus sign (-) before split to avoid itemizing
31
+ images:
32
+ storage: /export/share/datasets/vision/coco/images/train2017
LAVIS-main/lavis/configs/datasets/modelnet40/defaults_cls.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ modelnet40_cls: # name of the dataset builder
8
+ data_type: [pc, images]
9
+
10
+ vis_processor:
11
+ train:
12
+ name: "clip_image_train"
13
+ image_size: 224
14
+ eval:
15
+ name: "clip_image_eval"
16
+ image_size: 224
17
+
18
+ pc_processor:
19
+ train:
20
+ name: "ulip_pc"
21
+ eval:
22
+ name: "ulip_pc"
23
+ text_processor:
24
+ train:
25
+ name: "blip_caption"
26
+ eval:
27
+ name: "blip_caption"
28
+
29
+ build_info:
30
+ # Be careful not to append minus sign (-) before split to avoid itemizing
31
+ annotations:
32
+ train:
33
+ url:
34
+ - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_shape_names.txt
35
+ - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
36
+ - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_train.txt
37
+ storage:
38
+ - modelnet40_normal_resampled/modelnet40_shape_names.txt
39
+ - modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
40
+ - /modelnet40_normal_resampled/modelnet40_train.txt
41
+ val:
42
+ url:
43
+ - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_shape_names.txt
44
+ - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
45
+ - https://storage.googleapis.com/sfr-ulip-code-release-research/modelnet40_normal_resampled/modelnet40_test.txt
46
+ storage:
47
+ - modelnet40_normal_resampled/modelnet40_shape_names.txt
48
+ - modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
49
+ - modelnet40_normal_resampled/modelnet40_test.txt
50
+
51
+ pc:
52
+ storage: /export/home/ULIP/data/modelnet40_normal_resampled
53
+
54
+ images:
55
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet40_pc_img
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_cap.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msrvtt_cap: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json
16
+ storage: msrvtt/annotations/cap_train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json
19
+ storage: msrvtt/annotations/cap_val.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json
22
+ storage: msrvtt/annotations/cap_test.json
23
+ videos:
24
+ storage: msrvtt/videos
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_cap_instruct.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msrvtt_caption_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: True
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: True
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_instruction
30
+ task: caption
31
+ modality: video
32
+ eval:
33
+ name: blip_caption
34
+
35
+ build_info:
36
+ # Be careful not to append minus sign (-) before split to avoid itemizing
37
+ annotations:
38
+ train:
39
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json
40
+ storage: msrvtt/annotations/cap_train.json
41
+ # val:
42
+ # url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json
43
+ # storage: msrvtt/annotations/cap_val.json
44
+ # test:
45
+ # url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json
46
+ # storage: msrvtt/annotations/cap_test.json
47
+ videos:
48
+ storage: msrvtt/videos
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_qa.yaml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msrvtt_qa: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
16
+ storage: msrvtt/annotations/qa_train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
19
+ storage: msrvtt/annotations/qa_val.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
22
+ storage: msrvtt/annotations/qa_test.json
23
+ ans2label:
24
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
25
+ storage: msrvtt/annotations/qa_ans2label.json
26
+ videos:
27
+ storage: msrvtt/videos
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_qa_instruct.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msrvtt_qa_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: True
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: True
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_instruction
30
+ task: qa
31
+ modality: video
32
+ eval:
33
+ name: blip_question
34
+
35
+ build_info:
36
+ # Be careful not to append minus sign (-) before split to avoid itemizing
37
+ annotations:
38
+ train:
39
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
40
+ storage: msrvtt/annotations/qa_train.json
41
+ # val:
42
+ # url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
43
+ # storage: msrvtt/annotations/qa_val.json
44
+ # test:
45
+ # url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
46
+ # storage: msrvtt/annotations/qa_test.json
47
+ ans2label:
48
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
49
+ storage: msrvtt/annotations/qa_ans2label.json
50
+ videos:
51
+ storage: msrvtt/videos
LAVIS-main/lavis/configs/datasets/msrvtt/defaults_ret.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msrvtt_retrieval: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json
16
+ storage: msrvtt/annotations/retrieval_train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json
19
+ storage: msrvtt/annotations/retrieval_val.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json
22
+ storage: msrvtt/annotations/retrieval_test.json
23
+ videos:
24
+ storage: msrvtt/videos
LAVIS-main/lavis/configs/datasets/msvd/defaults_cap.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msvd_cap: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json
16
+ storage: msvd/annotations/cap_train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json
19
+ storage: msvd/annotations/cap_val.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json
22
+ storage: msvd/annotations/cap_test.json
23
+ videos:
24
+ storage: msvd/videos
LAVIS-main/lavis/configs/datasets/msvd/defaults_cap_instruct.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msvd_caption_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: True
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: True
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_instruction
30
+ task: caption
31
+ modality: video
32
+ eval:
33
+ name: blip_caption
34
+
35
+
36
+ build_info:
37
+ # Be careful not to append minus sign (-) before split to avoid itemizing
38
+ annotations:
39
+ train:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json
41
+ storage: msvd/annotations/cap_train.json
42
+ val:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json
44
+ storage: msvd/annotations/cap_val.json
45
+ test:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json
47
+ storage: msvd/annotations/cap_test.json
48
+ videos:
49
+ # storage: msvd/videos
50
+ storage: /export/share/datasets/vision_language/msvd/videos
LAVIS-main/lavis/configs/datasets/msvd/defaults_qa.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msvd_qa: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
16
+ storage: msvd/annotations/qa_train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
19
+ storage: msvd/annotations/qa_val.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
22
+ storage: msvd/annotations/qa_test.json
23
+ ans2label:
24
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
25
+ storage: msvd/annotations/qa_ans2label.json
26
+ videos:
27
+ storage: msvd/videos
28
+
29
+ instance_id_key: question_id
LAVIS-main/lavis/configs/datasets/msvd/defaults_qa_instruct.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msvd_qa_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: True
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: True
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_instruction
30
+ task: qa
31
+ modality: video
32
+ eval:
33
+ name: blip_question
34
+
35
+ build_info:
36
+ # Be careful not to append minus sign (-) before split to avoid itemizing
37
+ annotations:
38
+ train:
39
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
40
+ storage: msvd/annotations/qa_train.json
41
+ val:
42
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
43
+ storage: msvd/annotations/qa_val.json
44
+ test:
45
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
46
+ storage: msvd/annotations/qa_test.json
47
+ ans2label:
48
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
49
+ storage: msvd/annotations/qa_ans2label.json
50
+ videos:
51
+ storage: /export/share/datasets/vision_language/msvd/videos
52
+
53
+ instance_id_key: question_id
LAVIS-main/lavis/configs/datasets/music_avqa/defaults_mm_qa.yaml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ datasets:
6
+ musicavqa_mm: # name of the dataset builder
7
+ data_type: [video, audio]
8
+
9
+ video_processor:
10
+ train:
11
+ name: alpro_video_train
12
+ n_frms: 4
13
+ image_size: 224
14
+ min_scale: 0.9
15
+ max_scale: 1.0
16
+ full_video: True
17
+ eval:
18
+ name: alpro_video_eval
19
+ n_frms: 4
20
+ image_size: 224
21
+ min_scale: 0.9
22
+ max_scale: 1.0
23
+ full_video: True
24
+
25
+ text_processor:
26
+ train:
27
+ name: blip_question
28
+ eval:
29
+ name: blip_question
30
+
31
+ audio_processor:
32
+ train:
33
+ name: beats_audio
34
+ sampling_rate: 16000
35
+ eval:
36
+ name: beats_audio
37
+ sampling_rate: 16000
38
+ is_eval: False
39
+
40
+ build_info:
41
+ # Be careful not to append minus sign (-) before split to avoid itemizing
42
+ annotations:
43
+ val:
44
+ url:
45
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-val.json
46
+ # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json
47
+ storage:
48
+ - /musicavqa/annotations/avqa-val.json
49
+ # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json
50
+
51
+ test:
52
+ url:
53
+ # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-test.json
54
+ - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json
55
+ storage:
56
+ # - /musicavqa/annotations/avqa-test.json
57
+ - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json
58
+
59
+ templates: null
60
+
61
+ audio:
62
+ storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real
63
+
64
+ video:
65
+ storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real
66
+
LAVIS-main/lavis/configs/datasets/music_avqa/defaults_mm_qa_instruct.yaml ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ musicavqa_mm_instruct: # name of the dataset builder
8
+ data_type: [video, audio]
9
+
10
+ video_processor:
11
+ train:
12
+ name: alpro_video_train
13
+ n_frms: 4
14
+ image_size: 224
15
+ min_scale: 0.9
16
+ max_scale: 1.0
17
+ full_video: True
18
+ eval:
19
+ name: alpro_video_eval
20
+ n_frms: 4
21
+ image_size: 224
22
+ min_scale: 0.9
23
+ max_scale: 1.0
24
+ full_video: True
25
+
26
+ text_processor:
27
+ train:
28
+ name: blip_instruction
29
+ task: qa
30
+ modality: video
31
+ eval:
32
+ name: blip_question
33
+
34
+ audio_processor:
35
+ train:
36
+ name: beats_audio
37
+ sampling_rate: 16000
38
+ eval:
39
+ name: beats_audio
40
+ sampling_rate: 16000
41
+ is_eval: False
42
+
43
+ build_info:
44
+ # Be careful not to append minus sign (-) before split to avoid itemizing
45
+ annotations:
46
+ val:
47
+ url:
48
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-val.json
49
+ # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json
50
+ storage:
51
+ - /musicavqa/annotations/avqa-val.json
52
+ # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-val.json
53
+
54
+ test:
55
+ url:
56
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/musicavqa/avqa-test.json
57
+ # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json
58
+ storage:
59
+ - /musicavqa/annotations/avqa-test.json
60
+ # - /export/video-language-dataset/data/MUSIC-AVQA/data/json/avqa-test.json
61
+
62
+ templates: null
63
+
64
+ audio:
65
+ storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real
66
+
67
+ video:
68
+ storage: /export/video-language-dataset/data/MUSIC-AVQA/data/MUSIC-AVQA-videos-Real
69
+
LAVIS-main/lavis/configs/datasets/nlvr/defaults.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ nlvr:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json
16
+ storage: nlvr/annotations/train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
19
+ storage: nlvr/annotations/dev.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
22
+ storage: nlvr/annotations/test.json
23
+ images:
24
+ storage: /export/share/datasets/vision/NLVR2/
LAVIS-main/lavis/configs/datasets/nocaps/defaults.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ nocaps: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ val:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json
16
+ storage: nocaps/annotations/nocaps_val.json
17
+ test:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json
19
+ storage: nocaps/annotations/nocaps_test.json
20
+ images:
21
+ storage: nocaps/images
22
+ # storage: /export/share/datasets/vision/nocaps/
LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_cap.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ objaverse_mm_caption: # 651576 train examples
8
+ vis_processor:
9
+ train:
10
+ name: "clip_image_train"
11
+ image_size: 224
12
+ eval:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ pc_processor:
16
+ train:
17
+ name: "ulip_pc"
18
+ eval:
19
+ name: "ulip_pc"
20
+
21
+ text_processor:
22
+ train:
23
+ name: "blip_caption"
24
+ eval:
25
+ name: "blip_caption"
26
+
27
+ data_type: [pc, images] # [images|pc]
28
+
29
+ build_info:
30
+ # Be careful not to append minus sign (-) before split to avoid itemizing
31
+ annotations:
32
+ train:
33
+ url:
34
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_train.csv
35
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
36
+ storage:
37
+ - objaverse/annotations/train.csv
38
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
39
+
40
+ val:
41
+ url:
42
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_val.csv
43
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
44
+ storage:
45
+ - objaverse/annotations/val.csv
46
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
47
+
48
+ templates: null
49
+
50
+ pc:
51
+ storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
52
+
53
+ images:
54
+ storage: /export/einstein-vision/3d_vision/objaverse_captions/images/
LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_cap_instruct.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ objaverse_mm_caption_instruct: # 651576 train examples
8
+ vis_processor:
9
+ train:
10
+ name: "clip_image_train"
11
+ image_size: 224
12
+ eval:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ pc_processor:
16
+ train:
17
+ name: "ulip_pc"
18
+ eval:
19
+ name: "ulip_pc"
20
+ text_processor:
21
+ train:
22
+ name: "blip_instruction"
23
+ modality: pc
24
+ task: caption
25
+ eval:
26
+ name: "blip_caption"
27
+
28
+ data_type: [pc, images] # [images|pc]
29
+
30
+ build_info:
31
+ # Be careful not to append minus sign (-) before split to avoid itemizing
32
+ annotations:
33
+ train:
34
+ url:
35
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_train.csv
36
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
37
+ storage:
38
+ - objaverse/annotations/train.csv
39
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
40
+
41
+ # val:
42
+ # url:
43
+ # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/cap3d_cap_final_val.csv
44
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
45
+ # storage:
46
+ # # - objaverse/annotations/val.csv
47
+ # - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
48
+
49
+ templates: null
50
+
51
+ pc:
52
+ storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
53
+
54
+ images:
55
+ storage: /export/einstein-vision/3d_vision/objaverse_captions/images/
LAVIS-main/lavis/configs/datasets/objaverse/defaults_mm_qa.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ objaverse_mm_qa: # 250070
8
+ vis_processor:
9
+ train:
10
+ name: "clip_image_train"
11
+ image_size: 224
12
+ eval:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ pc_processor:
16
+ train:
17
+ name: "ulip_pc"
18
+ eval:
19
+ name: "ulip_pc"
20
+ text_processor:
21
+ train:
22
+ name: "blip_instruction"
23
+ modality: pc
24
+ task: qa
25
+ eval:
26
+ name: "blip_question"
27
+
28
+
29
+ data_type: pc # [images|pc]
30
+
31
+ build_info:
32
+ kwargs:
33
+ add_binary: True
34
+ remove_model_answer: True
35
+ # Be careful not to append minus sign (-) before split to avoid itemizing
36
+ annotations:
37
+ train:
38
+ url:
39
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/CAP3DQA_final.csv
40
+ # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final.csv
41
+ storage:
42
+ - objaverse_qa/annotations/train.csv
43
+ # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final.csv
44
+ # val:
45
+ # url:
46
+ # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/objaverse/CAP3DQA_final_val.csv
47
+ # # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final_val.csv
48
+ # storage:
49
+ # - objaverse_qa/annotations/val.csv
50
+ # # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/3d_qa_data/CAP3DQA_final_val.csv
51
+
52
+ templates: null
53
+
54
+ pc:
55
+ storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
LAVIS-main/lavis/configs/datasets/ocrvqa/defaults.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ ocr_vqa: # 1002146 train examples
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+
16
+ text_processor:
17
+ train:
18
+ name: "blip_question"
19
+ eval:
20
+ name: blip_question
21
+
22
+ build_info:
23
+ # Be careful not to append minus sign (-) before split to avoid itemizing
24
+ annotations:
25
+ train:
26
+ url:
27
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/ocrvqa/ocrvqa.json
28
+ # - /export/video-language-dataset/ocrvqa/ocrvqa.json
29
+ storage:
30
+ - ocrvqa/annotations/ocrvqa.json
31
+ # - /export/video-language-dataset/ocrvqa/ocrvqa.json
32
+ images:
33
+ storage: /export/video-language-dataset/ocrvqa/images/
LAVIS-main/lavis/configs/datasets/ocrvqa/defaults_instruct.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ ocr_vqa_instruct: # 1002146 train examples
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+
16
+ text_processor:
17
+ train:
18
+ name: blip_instruction
19
+ modality: image
20
+ task: qa
21
+ eval:
22
+ name: blip_question
23
+
24
+ build_info:
25
+ # Be careful not to append minus sign (-) before split to avoid itemizing
26
+ annotations:
27
+ train:
28
+ url:
29
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/ocrvqa/ocrvqa.json
30
+ # - /export/video-language-dataset/ocrvqa/ocrvqa.json
31
+ storage:
32
+ - ocrvqa/annotations/ocrvqa.json
33
+ # - /export/video-language-dataset/ocrvqa/ocrvqa.json
34
+ images:
35
+ storage: /export/video-language-dataset/ocrvqa/images/
LAVIS-main/lavis/configs/datasets/okvqa/defaults.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ ok_vqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url:
16
+ # TODO make this order insensitive
17
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
18
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
19
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
20
+ storage:
21
+ - okvqa/annotations/okvqa_train.json
22
+ # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
23
+ # - okvqa/annotations/mscoco_train2014_annotations.json
24
+ test:
25
+ url:
26
+ # TODO make this order insensitive
27
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
28
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
29
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
30
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
31
+ storage:
32
+ - okvqa/annotations/vqa_val_eval.json
33
+ - okvqa/annotations/answer_list.json
34
+ - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
35
+ - okvqa/annotations/mscoco_val2014_annotations.json
36
+ images:
37
+ storage: coco/images/
LAVIS-main/lavis/configs/datasets/okvqa/defaults_instruct.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ ok_vqa_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ modality: image
23
+ task: qa
24
+ eval:
25
+ name: blip_question
26
+
27
+ build_info:
28
+ # Be careful not to append minus sign (-) before split to avoid itemizing
29
+ annotations:
30
+ train:
31
+ url:
32
+ # TODO make this order insensitive
33
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
34
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
35
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
36
+ storage:
37
+ - okvqa/annotations/okvqa_train.json
38
+ # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
39
+ # - okvqa/annotations/mscoco_train2014_annotations.json
40
+ # test:
41
+ # url:
42
+ # # TODO make this order insensitive
43
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
44
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
45
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
46
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
47
+ # storage:
48
+ # - okvqa/annotations/vqa_val_eval.json
49
+ # - okvqa/annotations/answer_list.json
50
+ # - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
51
+ # - okvqa/annotations/mscoco_val2014_annotations.json
52
+ images:
53
+ storage: /export/share/datasets/vision/coco/images
LAVIS-main/lavis/configs/datasets/sbu_caption/defaults.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ sbu_caption:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url:
16
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json
17
+ # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json
18
+ storage:
19
+ - sbu_captions/annotations/sbu.json
20
+ images:
21
+ storage: sbu_captions/images
22
+ # storage: /export/share/datasets/vision_language/sbu_resize
LAVIS-main/lavis/configs/datasets/sbu_caption/defaults_instruct.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ sbu_caption_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ modality: image
23
+ task: caption
24
+ eval:
25
+ name: blip_caption
26
+
27
+ build_info:
28
+ # Be careful not to append minus sign (-) before split to avoid itemizing
29
+ annotations:
30
+ train:
31
+ url:
32
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json
33
+ # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json
34
+ storage:
35
+ - sbu_captions/annotations/sbu.json
36
+ images:
37
+ storage: sbu_captions/images
38
+ # storage: /export/share/datasets/vision_language/sbu_resize
LAVIS-main/lavis/configs/datasets/scienceqa/defaults.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ scienceqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_question
22
+ eval:
23
+ name: blip_question
24
+
25
+ build_info:
26
+ # Be careful not to append minus sign (-) before split to avoid itemizing
27
+ train:
28
+ url:
29
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_train.json
30
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json
31
+ storage:
32
+ - scienceqa/annotations/problems_train.json
33
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json
34
+ val:
35
+ url:
36
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_val.json
37
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json
38
+ storage:
39
+ - scienceqa/annotations/problems_val.json
40
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json
41
+ test:
42
+ url:
43
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_test.json
44
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json
45
+ storage:
46
+ - scienceqa/annotations/problems_test.json
47
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json
48
+
49
+ images:
50
+ storage: /export/video-language-dataset/ScienceQA/data/scienceqa/images/
51
+
LAVIS-main/lavis/configs/datasets/scienceqa/defaults_instruct.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ scienceqa_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ modality: image
23
+ task: qa
24
+ eval:
25
+ name: blip_question
26
+
27
+ build_info:
28
+ # Be careful not to append minus sign (-) before split to avoid itemizing
29
+ annotations:
30
+ train:
31
+ url:
32
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_train.json
33
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json
34
+ storage:
35
+ - scienceqa/annotations/problems_train.json
36
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_train.json
37
+ val:
38
+ url:
39
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_val.json
40
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json
41
+ storage:
42
+ - scienceqa/annotations/problems_val.json
43
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_val.json
44
+ test:
45
+ url:
46
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/scienceqa/problems_test.json
47
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json
48
+ storage:
49
+ - scienceqa/annotations/problems_test.json
50
+ # - /export/video-language-dataset/ScienceQA/data/scienceqa/problems_test.json
51
+
52
+ images:
53
+ storage: /export/video-language-dataset/ScienceQA/data/scienceqa/images/
54
+
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright 2022 Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven Hoi. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ MIT License
6
+
7
+ Copyright (c) 2019 Igor Brigadir
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ -->
7
+
8
+ # Download Conceptual Captions Data
9
+
10
+ Place data from: https://ai.google.com/research/ConceptualCaptions/download in this folder
11
+
12
+ `Train_GCC-training.tsv / cc3m.tsv` Training Split (3,318,333)
13
+
14
+ run `download_data_cc3m.py` or `download_data_cc12m.py`.
15
+
16
+ Images will be in the default LAVIS cache folders. You can stop and resume; the settings for splitting downloads into chunks/threads are not optimal, but they maxed out my connection, so I kept them as is.
17
+
18
+ Note: A previous version of this script used a different file naming scheme, this changed and if you are resuming a previously started download, you will get duplicates.
19
+
20
+ A bunch of them will fail to download and return web pages instead. These will need to be cleaned up later. See `downloaded_validation_report.tsv` after the download finishes for HTTP errors. Around 8% of images are gone, based on validation-set results. Setting the user agent might fix some additional errors, though it is unclear whether any sites reject requests based on it.
21
+
22
+ It should take about a day or two to download the training data, keep an eye on disk space.
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_12m.ipynb ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 15,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import json\n",
11
+ "\n",
12
+ "import pandas as pd\n",
13
+ "from tqdm import tqdm\n",
14
+ "from lavis.common.utils import get_abs_path, get_cache_path"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "cc12m = pd.read_csv(\"downloaded_cc12m_report.tsv.gz\", compression=\"gzip\", sep=\"\\t\", names=[\"caption\", \"path\", \"dataset\", \"mimetype\", \"size\", \"status\", \"url\"])"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 7,
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "data": {
33
+ "text/plain": [
34
+ "caption a very typical bus station\n",
35
+ "path /export/home/.cache/lavis/conceptual_caption/i...\n",
36
+ "dataset cc3m\n",
37
+ "mimetype image/jpeg\n",
38
+ "size 36078\n",
39
+ "status 200\n",
40
+ "url http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...\n",
41
+ "Name: 0, dtype: object"
42
+ ]
43
+ },
44
+ "execution_count": 7,
45
+ "metadata": {},
46
+ "output_type": "execute_result"
47
+ }
48
+ ],
49
+ "source": [
50
+ "cc12m.iloc[0]"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 3,
56
+ "metadata": {},
57
+ "outputs": [
58
+ {
59
+ "data": {
60
+ "text/plain": [
61
+ "3318333"
62
+ ]
63
+ },
64
+ "execution_count": 3,
65
+ "metadata": {},
66
+ "output_type": "execute_result"
67
+ }
68
+ ],
69
+ "source": [
70
+ "len(cc12m)"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 21,
76
+ "metadata": {},
77
+ "outputs": [
78
+ {
79
+ "name": "stderr",
80
+ "output_type": "stream",
81
+ "text": [
82
+ "100%|██████████| 3130587/3130587 [17:28<00:00, 2986.08it/s]"
83
+ ]
84
+ },
85
+ {
86
+ "name": "stdout",
87
+ "output_type": "stream",
88
+ "text": [
89
+ "Found 2759017 valid records\n"
90
+ ]
91
+ },
92
+ {
93
+ "name": "stderr",
94
+ "output_type": "stream",
95
+ "text": [
96
+ "\n"
97
+ ]
98
+ }
99
+ ],
100
+ "source": [
101
+ "cnt = 0\n",
102
+ "\n",
103
+ "valid_records = []\n",
104
+ "\n",
105
+ "for i, path in tqdm(enumerate(cc12m.path.unique()), total=len(cc12m.path.unique())):\n",
106
+ " path = str(path)\n",
107
+ " if os.path.exists(path):\n",
108
+ " record = cc12m.iloc[i]\n",
109
+ " valid_records.append({\"image\": record[\"path\"], \"caption\": record[\"caption\"]})\n",
110
+ "\n",
111
+ " cnt += 1\n",
112
+ "\n",
113
+ "print(\"Found {} valid records\".format(cnt))"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 22,
119
+ "metadata": {},
120
+ "outputs": [
121
+ {
122
+ "data": {
123
+ "text/plain": [
124
+ "2759017"
125
+ ]
126
+ },
127
+ "execution_count": 22,
128
+ "metadata": {},
129
+ "output_type": "execute_result"
130
+ }
131
+ ],
132
+ "source": [
133
+ "len(valid_records)"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 24,
139
+ "metadata": {},
140
+ "outputs": [
141
+ {
142
+ "data": {
143
+ "text/plain": [
144
+ "{'image': '/export/home/.cache/lavis/conceptual_caption/images/1_3239086386.jpg',\n",
145
+ " 'caption': 'sierra looked stunning in this top and this skirt while performing with person at their former university'}"
146
+ ]
147
+ },
148
+ "execution_count": 24,
149
+ "metadata": {},
150
+ "output_type": "execute_result"
151
+ }
152
+ ],
153
+ "source": [
154
+ "valid_records[1]"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": 28,
160
+ "metadata": {},
161
+ "outputs": [
162
+ {
163
+ "name": "stdout",
164
+ "output_type": "stream",
165
+ "text": [
166
+ "/export/home/.cache/lavis/conceptual_caption/annotations/cc3m.json already exists\n"
167
+ ]
168
+ },
169
+ {
170
+ "ename": "",
171
+ "evalue": "",
172
+ "output_type": "error",
173
+ "traceback": [
174
+ "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
175
+ ]
176
+ }
177
+ ],
178
+ "source": [
179
+ "from omegaconf import OmegaConf\n",
180
+ "\n",
181
+ "\n",
182
+ "config_path = get_abs_path(\"configs/datasets/conceptual_caption/defaults_12m.yaml\")\n",
183
+ "\n",
184
+ "ann_path = OmegaConf.load(\n",
185
+ " config_path\n",
186
+ ").datasets.conceptual_caption_12m.build_info.annotations.train.storage[0]\n",
187
+ "\n",
188
+ "ann_path = get_cache_path(ann_path)\n",
189
+ "\n",
190
+ "if os.path.exists(ann_path):\n",
191
+ " # abort\n",
192
+ " print(\"{} already exists\".format(ann_path))\n",
193
+ "else:\n",
194
+ " # Save the valid records to a json file\n",
195
+ " with open(ann_path, \"w\") as f:\n",
196
+ " f.write(json.dumps(valid_records))"
197
+ ]
198
+ }
199
+ ],
200
+ "metadata": {
201
+ "kernelspec": {
202
+ "display_name": "Python 3.8.10 ('base')",
203
+ "language": "python",
204
+ "name": "python3"
205
+ },
206
+ "language_info": {
207
+ "codemirror_mode": {
208
+ "name": "ipython",
209
+ "version": 3
210
+ },
211
+ "file_extension": ".py",
212
+ "mimetype": "text/x-python",
213
+ "name": "python",
214
+ "nbconvert_exporter": "python",
215
+ "pygments_lexer": "ipython3",
216
+ "version": "3.8.10"
217
+ },
218
+ "orig_nbformat": 4,
219
+ "vscode": {
220
+ "interpreter": {
221
+ "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
222
+ }
223
+ }
224
+ },
225
+ "nbformat": 4,
226
+ "nbformat_minor": 2
227
+ }
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_3m.ipynb ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 15,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import json\n",
11
+ "\n",
12
+ "import pandas as pd\n",
13
+ "from tqdm import tqdm\n",
14
+ "from lavis.common.utils import get_abs_path, get_cache_path"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "cc3m = pd.read_csv(\"downloaded_cc3m_report.tsv.gz\", compression=\"gzip\", sep=\"\\t\", names=[\"caption\", \"path\", \"dataset\", \"mimetype\", \"size\", \"status\", \"url\"])"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 7,
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "data": {
33
+ "text/plain": [
34
+ "caption a very typical bus station\n",
35
+ "path /export/home/.cache/lavis/conceptual_caption/i...\n",
36
+ "dataset cc3m\n",
37
+ "mimetype image/jpeg\n",
38
+ "size 36078\n",
39
+ "status 200\n",
40
+ "url http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...\n",
41
+ "Name: 0, dtype: object"
42
+ ]
43
+ },
44
+ "execution_count": 7,
45
+ "metadata": {},
46
+ "output_type": "execute_result"
47
+ }
48
+ ],
49
+ "source": [
50
+ "cc3m.iloc[0]"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 3,
56
+ "metadata": {},
57
+ "outputs": [
58
+ {
59
+ "data": {
60
+ "text/plain": [
61
+ "3318333"
62
+ ]
63
+ },
64
+ "execution_count": 3,
65
+ "metadata": {},
66
+ "output_type": "execute_result"
67
+ }
68
+ ],
69
+ "source": [
70
+ "len(cc3m)"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 21,
76
+ "metadata": {},
77
+ "outputs": [
78
+ {
79
+ "name": "stderr",
80
+ "output_type": "stream",
81
+ "text": [
82
+ "100%|██████████| 3130587/3130587 [17:28<00:00, 2986.08it/s]"
83
+ ]
84
+ },
85
+ {
86
+ "name": "stdout",
87
+ "output_type": "stream",
88
+ "text": [
89
+ "Found 2759017 valid records\n"
90
+ ]
91
+ },
92
+ {
93
+ "name": "stderr",
94
+ "output_type": "stream",
95
+ "text": [
96
+ "\n"
97
+ ]
98
+ }
99
+ ],
100
+ "source": [
101
+ "cnt = 0\n",
102
+ "\n",
103
+ "valid_records = []\n",
104
+ "\n",
105
+ "for i, path in tqdm(enumerate(cc3m.path.unique()), total=len(cc3m.path.unique())):\n",
106
+ " path = str(path)\n",
107
+ " if os.path.exists(path):\n",
108
+ " record = cc3m.iloc[i]\n",
109
+ " valid_records.append({\"image\": record[\"path\"], \"caption\": record[\"caption\"]})\n",
110
+ "\n",
111
+ " cnt += 1\n",
112
+ "\n",
113
+ "print(\"Found {} valid records\".format(cnt))"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 22,
119
+ "metadata": {},
120
+ "outputs": [
121
+ {
122
+ "data": {
123
+ "text/plain": [
124
+ "2759017"
125
+ ]
126
+ },
127
+ "execution_count": 22,
128
+ "metadata": {},
129
+ "output_type": "execute_result"
130
+ }
131
+ ],
132
+ "source": [
133
+ "len(valid_records)"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 24,
139
+ "metadata": {},
140
+ "outputs": [
141
+ {
142
+ "data": {
143
+ "text/plain": [
144
+ "{'image': '/export/home/.cache/lavis/conceptual_caption/images/1_3239086386.jpg',\n",
145
+ " 'caption': 'sierra looked stunning in this top and this skirt while performing with person at their former university'}"
146
+ ]
147
+ },
148
+ "execution_count": 24,
149
+ "metadata": {},
150
+ "output_type": "execute_result"
151
+ }
152
+ ],
153
+ "source": [
154
+ "valid_records[1]"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": 28,
160
+ "metadata": {},
161
+ "outputs": [
162
+ {
163
+ "name": "stdout",
164
+ "output_type": "stream",
165
+ "text": [
166
+ "/export/home/.cache/lavis/conceptual_caption/annotations/cc3m.json already exists\n"
167
+ ]
168
+ },
169
+ {
170
+ "ename": "",
171
+ "evalue": "",
172
+ "output_type": "error",
173
+ "traceback": [
174
+ "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
175
+ ]
176
+ }
177
+ ],
178
+ "source": [
179
+ "from omegaconf import OmegaConf\n",
180
+ "\n",
181
+ "\n",
182
+ "config_path = get_abs_path(\"configs/datasets/conceptual_caption/defaults_3m.yaml\")\n",
183
+ "\n",
184
+ "ann_path = OmegaConf.load(\n",
185
+ " config_path\n",
186
+ ").datasets.conceptual_caption_3m.build_info.annotations.train.storage[0]\n",
187
+ "\n",
188
+ "ann_path = get_cache_path(ann_path)\n",
189
+ "\n",
190
+ "if os.path.exists(ann_path):\n",
191
+ " # abort\n",
192
+ " print(\"{} already exists\".format(ann_path))\n",
193
+ "else:\n",
194
+ " # Save the valid records to a json file\n",
195
+ " with open(ann_path, \"w\") as f:\n",
196
+ " f.write(json.dumps(valid_records))"
197
+ ]
198
+ }
199
+ ],
200
+ "metadata": {
201
+ "kernelspec": {
202
+ "display_name": "Python 3.8.10 ('base')",
203
+ "language": "python",
204
+ "name": "python3"
205
+ },
206
+ "language_info": {
207
+ "codemirror_mode": {
208
+ "name": "ipython",
209
+ "version": 3
210
+ },
211
+ "file_extension": ".py",
212
+ "mimetype": "text/x-python",
213
+ "name": "python",
214
+ "nbconvert_exporter": "python",
215
+ "pygments_lexer": "ipython3",
216
+ "version": "3.8.10"
217
+ },
218
+ "orig_nbformat": 4,
219
+ "vscode": {
220
+ "interpreter": {
221
+ "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
222
+ }
223
+ }
224
+ },
225
+ "nbformat": 4,
226
+ "nbformat_minor": 2
227
+ }
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+
8
+ import time
9
+ from PIL import Image
10
+ from lavis.common.utils import get_abs_path, get_cache_path
11
+ from multiprocessing import Pool
12
+ from omegaconf import OmegaConf
13
+ from pathlib import Path
14
+ from torchvision.transforms import functional as TF
15
+ from tqdm import tqdm
16
+ import glob
17
+ import io
18
+ import json
19
+ import magic # pip install python-magic
20
+ import numpy as np
21
+ import os
22
+ import pandas as pd
23
+ import requests
24
+ import shelve
25
+ import zlib
26
+
27
+ headers = {
28
+ #'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
29
+ "User-Agent": "Googlebot-Image/1.0", # Pretend to be googlebot
30
+ "X-Forwarded-For": "64.18.15.200",
31
+ }
32
+
33
+
34
+ def _df_split_apply(tup_arg):
35
+ split_ind, subset, func = tup_arg
36
+ r = subset.apply(func, axis=1)
37
+ return (split_ind, r)
38
+
39
+
40
+ def df_multiprocess(df, processes, chunk_size, func, dataset_name):
41
+ print("Generating parts...")
42
+ with shelve.open(
43
+ "%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
44
+ ) as results:
45
+
46
+ pbar = tqdm(total=len(df), position=0)
47
+ # Resume:
48
+ finished_chunks = set([int(k) for k in results.keys()])
49
+ pbar.desc = "Resuming"
50
+ for k in results.keys():
51
+ pbar.update(len(results[str(k)][1]))
52
+
53
+ pool_data = (
54
+ (index, df[i : i + chunk_size], func)
55
+ for index, i in enumerate(range(0, len(df), chunk_size))
56
+ if index not in finished_chunks
57
+ )
58
+ print(
59
+ int(len(df) / chunk_size),
60
+ "parts.",
61
+ chunk_size,
62
+ "per part.",
63
+ "Using",
64
+ processes,
65
+ "processes",
66
+ )
67
+
68
+ pbar.desc = "Downloading"
69
+ with Pool(processes) as pool:
70
+ for i, result in enumerate(
71
+ pool.imap_unordered(_df_split_apply, pool_data, 2)
72
+ ):
73
+ results[str(result[0])] = result
74
+ pbar.update(len(result[1]))
75
+ pbar.close()
76
+
77
+ print("Finished Downloading.")
78
+ return
79
+
80
+
81
+ # Unique name based on url
82
# Unique, deterministic image path based on row index + CRC32 of the URL.
def _file_name(row):
    """Return the on-disk path for ``row``'s image.

    Uses the module-level ``storage_dir``; the CRC32 of the URL makes the
    name stable across runs so resuming finds already-downloaded files.
    """
    name = (
        "%s/%s_%s"
        % (
            storage_dir,
            row.name,
            (zlib.crc32(row["url"].encode("utf-8")) & 0xFFFFFFFF),
        )
        + ".jpg"
    )
    return name


# For checking mimetypes separately without download
def check_mimetype(row):
    """Fill ``mimetype``/``size`` for an already-downloaded file, if present."""
    if os.path.isfile(str(row["file"])):
        row["mimetype"] = magic.from_file(row["file"], mime=True)
        row["size"] = os.stat(row["file"]).st_size
    return row


# Don't download image, just check with a HEAD request; can't resume.
# Can use this instead of download_image to get HTTP status codes.
def check_download(row):
    """HEAD-check ``row['url']`` without downloading the body.

    Sets ``status`` (408 on any request failure) and ``headers``; on a
    successful response also records the target ``file`` path.
    """
    fname = _file_name(row)
    try:
        # not all sites will support HEAD
        response = requests.head(
            row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
        )
        row["status"] = response.status_code
        row["headers"] = dict(response.headers)
    except requests.RequestException:
        # log errors later; record any request failure as a 408 timeout
        row["status"] = 408
        return row
    if response.ok:
        row["file"] = fname
    return row
122
+
123
+
124
def resize_img(req):
    """Decode an image from a file-like object and resize it.

    A single int ``size`` makes torchvision resize the shorter edge to the
    module-level ``resize_size`` while preserving aspect ratio.
    """
    image = Image.open(req).convert("RGB")
    image = TF.resize(image, size=resize_size)  # , interpolation=Image.LANCZOS)
    return image


def download_image(row):
    """Download, resize and save the image for ``row``; annotate the row.

    On success sets ``status``/``file``/``mimetype``/``size``. Files that
    already exist on disk are skipped so interrupted runs resume cheaply.
    Any network or decode failure is recorded as status 408 and triaged
    later from the report TSV.
    """
    fname = _file_name(row)
    # Skip already-downloaded files; failed rows are retried on a later run.
    if os.path.isfile(fname):
        row["status"] = 200
        row["file"] = fname
        row["mimetype"] = magic.from_file(row["file"], mime=True)
        row["size"] = os.stat(row["file"]).st_size
        return row

    try:
        # smaller timeout skips slow hosts, at the cost of some failed downloads
        response = requests.get(
            row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
        )
        row["status"] = response.status_code
    except Exception:
        # log errors later; record any request failure as a 408 timeout
        row["status"] = 408
        return row

    if response.ok:
        try:
            # some sites respond with gzip transport encoding
            response.raw.decode_content = True
            img = resize_img(io.BytesIO(response.content))
            img.save(fname)

            row["mimetype"] = magic.from_file(fname, mime=True)
            row["size"] = os.stat(fname).st_size
        except Exception:
            # covers decode failures and mid-download timeouts
            row["status"] = 408

    row["file"] = fname
    return row
172
+
173
+
174
def open_tsv(fname, folder):
    """Load a CC12M TSV (columns: url, caption) and tag rows with ``folder``.

    Returns a DataFrame with columns ``url``, ``caption``, ``folder``.
    """
    print("Opening %s Data File..." % fname)
    df = pd.read_csv(fname, sep="\t", names=["url", "caption"])
    df["folder"] = folder
    print("Processing", len(df), " Images:")
    return df


def df_from_shelve(chunk_size, func, dataset_name):
    """Reassemble the per-chunk results shelve into one DataFrame.

    Chunks are concatenated in chunk-index order so the report is stable.
    """
    print("Generating Dataframe from results...")
    with shelve.open(
        "%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
    ) as results:
        keylist = sorted(int(k) for k in results.keys())
        df = pd.concat([results[str(k)][1] for k in keylist], sort=True)
    return df
192
+
193
+
194
# ---- Script configuration & entry point (CC12M) -----------------------------

# Shorter edge of saved images; a single int preserves aspect ratio.
resize_size = 384

config_path = get_abs_path("configs/datasets/conceptual_caption/defaults_12m.yaml")

# Resolve the image storage directory from the LAVIS dataset config.
storage_dir = OmegaConf.load(
    config_path
).datasets.conceptual_caption_12m.build_info.images.storage
storage_dir = Path(get_cache_path(storage_dir))

os.makedirs(storage_dir, exist_ok=True)

# number of processes in the pool can be larger than cores (downloads are I/O bound)
num_processes = 96
# chunk_size is how many images per chunk per process - changing this resets progress when restarting.
images_per_part = 100

data_name = "cc12m"

df = open_tsv("cc12m.tsv", data_name)
df_multiprocess(
    df=df,
    processes=num_processes,
    chunk_size=images_per_part,
    func=download_image,
    dataset_name=data_name,
)
df = df_from_shelve(
    chunk_size=images_per_part, func=download_image, dataset_name=data_name
)
# Per-row report (caption, path, dataset, mimetype, size, status, url) used
# by the create_annotation notebooks to build the final annotation JSON.
df.to_csv(
    "downloaded_%s_report.tsv.gz" % data_name,
    compression="gzip",
    sep="\t",
    header=False,
    index=False,
)
print("Saved.")
LAVIS-main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+
8
+ import glob
9
+ from pathlib import Path
10
+ import time
11
+ from omegaconf import OmegaConf
12
+ import pandas as pd
13
+ import numpy as np
14
+ import requests
15
+ import zlib
16
+ import os
17
+ import io
18
+ import shelve
19
+ from lavis.common.utils import get_abs_path, get_cache_path
20
+ import magic # pip install python-magic
21
+ import json
22
+ from multiprocessing import Pool
23
+ from tqdm import tqdm
24
+ from PIL import Image
25
+ from torchvision.transforms import functional as TF
26
+
27
+ headers = {
28
+ #'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
29
+ "User-Agent": "Googlebot-Image/1.0", # Pretend to be googlebot
30
+ "X-Forwarded-For": "64.18.15.200",
31
+ }
32
+
33
+
34
+ def _df_split_apply(tup_arg):
35
+ split_ind, subset, func = tup_arg
36
+ r = subset.apply(func, axis=1)
37
+ return (split_ind, r)
38
+
39
+
40
+ def df_multiprocess(df, processes, chunk_size, func, dataset_name):
41
+ print("Generating parts...")
42
+ with shelve.open(
43
+ "%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
44
+ ) as results:
45
+
46
+ pbar = tqdm(total=len(df), position=0)
47
+ # Resume:
48
+ finished_chunks = set([int(k) for k in results.keys()])
49
+ pbar.desc = "Resuming"
50
+ for k in results.keys():
51
+ pbar.update(len(results[str(k)][1]))
52
+
53
+ pool_data = (
54
+ (index, df[i : i + chunk_size], func)
55
+ for index, i in enumerate(range(0, len(df), chunk_size))
56
+ if index not in finished_chunks
57
+ )
58
+ print(
59
+ int(len(df) / chunk_size),
60
+ "parts.",
61
+ chunk_size,
62
+ "per part.",
63
+ "Using",
64
+ processes,
65
+ "processes",
66
+ )
67
+
68
+ pbar.desc = "Downloading"
69
+ with Pool(processes) as pool:
70
+ for i, result in enumerate(
71
+ pool.imap_unordered(_df_split_apply, pool_data, 2)
72
+ ):
73
+ results[str(result[0])] = result
74
+ pbar.update(len(result[1]))
75
+ pbar.close()
76
+
77
+ print("Finished Downloading.")
78
+ return
79
+
80
+
81
+ # Unique name based on url
82
# Unique, deterministic image path based on row index + CRC32 of the URL.
def _file_name(row):
    """Return the on-disk path for ``row``'s image.

    Uses the module-level ``storage_dir``; the CRC32 of the URL makes the
    name stable across runs so resuming finds already-downloaded files.
    """
    name = (
        "%s/%s_%s"
        % (
            storage_dir,
            row.name,
            (zlib.crc32(row["url"].encode("utf-8")) & 0xFFFFFFFF),
        )
        + ".jpg"
    )
    return name


# For checking mimetypes separately without download
def check_mimetype(row):
    """Fill ``mimetype``/``size`` for an already-downloaded file, if present."""
    if os.path.isfile(str(row["file"])):
        row["mimetype"] = magic.from_file(row["file"], mime=True)
        row["size"] = os.stat(row["file"]).st_size
    return row


# Don't download image, just check with a HEAD request; can't resume.
# Can use this instead of download_image to get HTTP status codes.
def check_download(row):
    """HEAD-check ``row['url']`` without downloading the body.

    Sets ``status`` (408 on any request failure) and ``headers``; on a
    successful response also records the target ``file`` path.
    """
    fname = _file_name(row)
    try:
        # not all sites will support HEAD
        response = requests.head(
            row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
        )
        row["status"] = response.status_code
        row["headers"] = dict(response.headers)
    except requests.RequestException:
        # log errors later; record any request failure as a 408 timeout
        row["status"] = 408
        return row
    if response.ok:
        row["file"] = fname
    return row
122
+
123
+
124
def resize_img(req):
    """Decode an image from a file-like object and resize it.

    A single int ``size`` makes torchvision resize the shorter edge to the
    module-level ``resize_size`` while preserving aspect ratio.
    """
    image = Image.open(req).convert("RGB")
    image = TF.resize(image, size=resize_size)  # , interpolation=Image.LANCZOS)
    return image


def download_image(row):
    """Download, resize and save the image for ``row``; annotate the row.

    On success sets ``status``/``file``/``mimetype``/``size``. Files that
    already exist on disk are skipped so interrupted runs resume cheaply.
    Any network or decode failure is recorded as status 408 and triaged
    later from the report TSV.
    """
    fname = _file_name(row)
    # Skip already-downloaded files; failed rows are retried on a later run.
    if os.path.isfile(fname):
        row["status"] = 200
        row["file"] = fname
        row["mimetype"] = magic.from_file(row["file"], mime=True)
        row["size"] = os.stat(row["file"]).st_size
        return row

    try:
        # smaller timeout skips slow hosts, at the cost of some failed downloads
        response = requests.get(
            row["url"], stream=False, timeout=5, allow_redirects=True, headers=headers
        )
        row["status"] = response.status_code
    except Exception:
        # log errors later; record any request failure as a 408 timeout
        row["status"] = 408
        return row

    if response.ok:
        try:
            # some sites respond with gzip transport encoding
            response.raw.decode_content = True
            img = resize_img(io.BytesIO(response.content))
            img.save(fname)

            row["mimetype"] = magic.from_file(fname, mime=True)
            row["size"] = os.stat(fname).st_size
        except Exception:
            # covers decode failures and mid-download timeouts
            row["status"] = 408

    row["file"] = fname
    return row
172
+
173
+
174
def open_tsv(fname, folder):
    """Load a CC3M TSV (columns: caption, url) and tag rows with ``folder``.

    NOTE: column order is caption-first here, unlike the CC12M script.
    Returns a DataFrame with columns ``caption``, ``url``, ``folder``.
    """
    print("Opening %s Data File..." % fname)
    df = pd.read_csv(fname, sep="\t", names=["caption", "url"])
    df["folder"] = folder
    print("Processing", len(df), " Images:")
    return df


def df_from_shelve(chunk_size, func, dataset_name):
    """Reassemble the per-chunk results shelve into one DataFrame.

    Chunks are concatenated in chunk-index order so the report is stable.
    """
    print("Generating Dataframe from results...")
    with shelve.open(
        "%s_%s_%s_results.tmp" % (dataset_name, func.__name__, chunk_size)
    ) as results:
        keylist = sorted(int(k) for k in results.keys())
        df = pd.concat([results[str(k)][1] for k in keylist], sort=True)
    return df
192
+
193
+
194
# ---- Script configuration & entry point (CC3M) ------------------------------

# Shorter edge of saved images; a single int preserves aspect ratio.
resize_size = 384

config_path = get_abs_path("configs/datasets/conceptual_caption/defaults_3m.yaml")

# Resolve the image storage directory from the LAVIS dataset config.
storage_dir = OmegaConf.load(
    config_path
).datasets.conceptual_caption_3m.build_info.images.storage
storage_dir = Path(get_cache_path(storage_dir))

os.makedirs(storage_dir, exist_ok=True)

# number of processes in the pool can be larger than cores (downloads are I/O bound)
num_processes = 32
# chunk_size is how many images per chunk per process - changing this resets progress when restarting.
images_per_part = 100

data_name = "cc3m"

df = open_tsv("Train_GCC-training.tsv", data_name)
df_multiprocess(
    df=df,
    processes=num_processes,
    chunk_size=images_per_part,
    func=download_image,
    dataset_name=data_name,
)
df = df_from_shelve(
    chunk_size=images_per_part, func=download_image, dataset_name=data_name
)
# Per-row report (caption, path, dataset, mimetype, size, status, url) used
# by the create_annotation notebooks to build the final annotation JSON.
df.to_csv(
    "downloaded_%s_report.tsv.gz" % data_name,
    compression="gzip",
    sep="\t",
    header=False,
    index=False,
)
print("Saved.")
LAVIS-main/lavis/models/__init__.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+
8
+ import logging
9
+ import torch
10
+ from omegaconf import OmegaConf
11
+ from lavis.common.registry import registry
12
+
13
+ from lavis.models.base_model import BaseModel
14
+
15
+ from lavis.models.albef_models.albef_classification import AlbefClassification
16
+ from lavis.models.albef_models.albef_feature_extractor import AlbefFeatureExtractor
17
+ from lavis.models.albef_models.albef_nlvr import AlbefNLVR
18
+ from lavis.models.albef_models.albef_pretrain import AlbefPretrain
19
+ from lavis.models.albef_models.albef_retrieval import AlbefRetrieval
20
+ from lavis.models.albef_models.albef_vqa import AlbefVQA
21
+ from lavis.models.alpro_models.alpro_qa import AlproQA
22
+ from lavis.models.alpro_models.alpro_retrieval import AlproRetrieval
23
+
24
+ from lavis.models.blip_models.blip import BlipBase
25
+ from lavis.models.blip_models.blip_caption import BlipCaption
26
+ from lavis.models.blip_models.blip_classification import BlipClassification
27
+ from lavis.models.blip_models.blip_feature_extractor import BlipFeatureExtractor
28
+ from lavis.models.blip_models.blip_image_text_matching import BlipITM
29
+ from lavis.models.blip_models.blip_nlvr import BlipNLVR
30
+ from lavis.models.blip_models.blip_pretrain import BlipPretrain
31
+ from lavis.models.blip_models.blip_retrieval import BlipRetrieval
32
+ from lavis.models.blip_models.blip_vqa import BlipVQA
33
+
34
+ from lavis.models.blip2_models.blip2 import Blip2Base
35
+ from lavis.models.blip2_models.blip2_opt import Blip2OPT
36
+ from lavis.models.blip2_models.blip2_t5 import Blip2T5
37
+ from lavis.models.blip2_models.blip2_qformer import Blip2Qformer
38
+ from lavis.models.blip2_models.blip2_image_text_matching import Blip2ITM
39
+
40
+ from lavis.models.blip2_models.blip2_t5_instruct import Blip2T5Instruct
41
+ from lavis.models.blip2_models.blip2_vicuna_instruct import Blip2VicunaInstruct
42
+ from lavis.models.blip2_models.blip2_vicuna_xinstruct import Blip2VicunaXInstruct
43
+
44
+ from lavis.models.blip_diffusion_models.blip_diffusion import BlipDiffusion
45
+
46
+ from lavis.models.pnp_vqa_models.pnp_vqa import PNPVQA
47
+ from lavis.models.pnp_vqa_models.pnp_unifiedqav2_fid import PNPUnifiedQAv2FiD
48
+ from lavis.models.img2prompt_models.img2prompt_vqa import Img2PromptVQA
49
+ from lavis.models.med import XBertLMHeadDecoder
50
+ from lavis.models.vit import VisionTransformerEncoder
51
+ from lavis.models.clip_models.model import CLIP
52
+
53
+ from lavis.models.gpt_models.gpt_dialogue import GPTDialogue
54
+
55
+ from lavis.processors.base_processor import BaseProcessor
56
+
57
+
58
# Public API of lavis.models: the model classes imported above plus the
# loader helper defined below. Star-imports expose exactly these names.
__all__ = [
    "load_model",
    "AlbefClassification",
    "AlbefFeatureExtractor",
    "AlbefNLVR",
    "AlbefVQA",
    "AlbefPretrain",
    "AlbefRetrieval",
    "AlproQA",
    "AlproRetrieval",
    "BaseModel",
    "BlipBase",
    "BlipFeatureExtractor",
    "BlipCaption",
    "BlipClassification",
    "BlipDiffusion",
    "BlipITM",
    "BlipNLVR",
    "BlipPretrain",
    "BlipRetrieval",
    "BlipVQA",
    "Blip2Qformer",
    "Blip2Base",
    "Blip2ITM",
    "Blip2OPT",
    "Blip2T5",
    "Blip2T5Instruct",
    "Blip2VicunaInstruct",
    "Blip2VicunaXInstruct",
    "PNPVQA",
    "Img2PromptVQA",
    "PNPUnifiedQAv2FiD",
    "CLIP",
    "VisionTransformerEncoder",
    "XBertLMHeadDecoder",
    "GPTDialogue",
]
95
+
96
+
97
def load_model(name, model_type, is_eval=False, device="cpu", checkpoint=None):
    """
    Load supported models.

    To list all available models and types in registry:
    >>> from lavis.models import model_zoo
    >>> print(model_zoo)

    Args:
        name (str): name of the model.
        model_type (str): type of the model.
        is_eval (bool): whether the model is in eval mode. Default: False.
        device (str): device to use. Default: "cpu".
        checkpoint (str): path or to checkpoint. Default: None.
            Note that expecting the checkpoint to have the same keys in state_dict as the model.

    Returns:
        model (torch.nn.Module): model.
    """

    model = registry.get_model_class(name).from_pretrained(model_type=model_type)

    if checkpoint is not None:
        model.load_checkpoint(checkpoint)

    if is_eval:
        model.eval()

    # Half-precision weights are unsafe on CPU; promote to float32. Accept
    # both the string "cpu" and torch.device("cpu"), for consistency with
    # load_model_and_preprocess below.
    if device == "cpu" or device == torch.device("cpu"):
        model = model.float()

    return model.to(device)
129
+
130
+
131
def load_preprocess(config):
    """
    Build visual and text preprocessors from a preprocessor config.

    Any section missing from the config falls back to BaseProcessor, which
    performs no preprocessing at all.

    Args:
        config (dict): preprocessor configs, with optional "vis_processor"
            and "text_processor" sections, each with optional "train" and
            "eval" entries.

    Returns:
        vis_processors (dict): preprocessors for visual inputs.
        txt_processors (dict): preprocessors for text inputs.

        Key is "train" or "eval" for processors used in training and evaluation respectively.
    """

    def _build_proc_from_cfg(cfg):
        # Missing config -> no-op processor.
        if cfg is None:
            return BaseProcessor()
        return registry.get_processor_class(cfg.name).from_config(cfg)

    def _split_section(section):
        # A missing section yields (None, None) -> two BaseProcessors.
        if section is None:
            return None, None
        return section.get("train"), section.get("eval")

    vis_train_cfg, vis_eval_cfg = _split_section(config.get("vis_processor"))
    txt_train_cfg, txt_eval_cfg = _split_section(config.get("text_processor"))

    vis_processors = {
        "train": _build_proc_from_cfg(vis_train_cfg),
        "eval": _build_proc_from_cfg(vis_eval_cfg),
    }
    txt_processors = {
        "train": _build_proc_from_cfg(txt_train_cfg),
        "eval": _build_proc_from_cfg(txt_eval_cfg),
    }

    return vis_processors, txt_processors
181
+
182
+
183
def load_model_and_preprocess(name, model_type, is_eval=False, device="cpu"):
    """
    Load a model together with the preprocessors it expects.

    List all available models and types in registry:
    >>> from lavis.models import model_zoo
    >>> print(model_zoo)

    Args:
        name (str): name of the model.
        model_type (str): type of the model.
        is_eval (bool): whether the model is in eval mode. Default: False.
        device (str): device to use. Default: "cpu".

    Returns:
        model (torch.nn.Module): model.
        vis_processors (dict): preprocessors for visual inputs.
        txt_processors (dict): preprocessors for text inputs.
    """
    model_cls = registry.get_model_class(name)

    # Instantiate the requested pretrained variant.
    model = model_cls.from_pretrained(model_type=model_type)

    if is_eval:
        model.eval()

    # Build the preprocessors declared in the model's default config.
    cfg = OmegaConf.load(model_cls.default_config_path(model_type))
    if cfg is None:
        vis_processors, txt_processors = None, None
        logging.info(
            f"""No default preprocess for model {name} ({model_type}).
                This can happen if the model is not finetuned on downstream datasets,
                or it is not intended for direct use without finetuning.
            """
        )
    else:
        vis_processors, txt_processors = load_preprocess(cfg.preprocess)

    # Half-precision weights are not supported on CPU; cast to float32.
    if device == "cpu" or device == torch.device("cpu"):
        model = model.float()

    return model.to(device), vis_processors, txt_processors
229
+
230
+
231
class ModelZoo:
    """
    String representation of all registered model architectures and the
    pretrained config types each one provides.

    >>> from lavis.models import model_zoo
    >>> # list all available models
    >>> print(model_zoo)
    >>> # show total number of models
    >>> print(len(model_zoo))
    """

    def __init__(self) -> None:
        self.model_zoo = {
            arch: list(model_cls.PRETRAINED_MODEL_CONFIG_DICT.keys())
            for arch, model_cls in registry.mapping["model_name_mapping"].items()
        }

    def __str__(self) -> str:
        rule = "=" * 50
        heading = f"{'Architectures':<30} {'Types'}"
        rows = [
            f"{name:<30} {', '.join(types)}"
            for name, types in self.model_zoo.items()
        ]
        return "\n".join([rule, heading, rule]) + "\n" + "\n".join(rows)

    def __iter__(self):
        return iter(self.model_zoo.items())

    def __len__(self):
        return sum(len(types) for types in self.model_zoo.values())
268
+
269
+
270
+ model_zoo = ModelZoo()
LAVIS-main/lavis/models/albef_models/__init__.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+
8
+ import datetime
9
+ import logging
10
+ import os
11
+ import time
12
+
13
+ import lavis.common.dist_utils as dist_utils
14
+ import torch
15
+ import torch.distributed as dist
16
+ import torch.nn.functional as F
17
+ from lavis.common.dist_utils import download_cached_file
18
+ from lavis.common.logger import MetricLogger
19
+ from lavis.common.utils import is_url
20
+ from lavis.models.base_model import BaseModel
21
+ from lavis.models.vit import interpolate_pos_embed
22
+ from transformers import BertTokenizer
23
+
24
+
25
class AlbefBase(BaseModel):
    """Shared functionality for ALBEF model variants: tokenizer construction
    and checkpoint loading with position-embedding interpolation."""

    @classmethod
    def init_tokenizer(cls):
        """Return the BERT tokenizer used by all ALBEF variants."""
        return BertTokenizer.from_pretrained("bert-base-uncased")

    def load_from_pretrained(self, url_or_filename, rename_text_keys=True):
        """Load weights from a checkpoint URL or local file.

        Interpolates visual position embeddings to the current encoder's
        grid, optionally strips the "bert." prefix from text-encoder keys,
        drops entries whose shapes do not match this model, then loads
        non-strictly.

        Args:
            url_or_filename (str): http(s) URL or local path to checkpoint.
            rename_text_keys (bool): strip "bert." from key names so HF BERT
                checkpoints line up with the text encoder. Default: True.

        Returns:
            NamedTuple with ``missing_keys`` / ``unexpected_keys`` from
            ``load_state_dict``.

        Raises:
            RuntimeError: if ``url_or_filename`` is neither a URL nor a file.
        """
        if is_url(url_or_filename):
            cached_file = download_cached_file(
                url_or_filename, check_hash=False, progress=True
            )
            checkpoint = torch.load(cached_file, map_location="cpu")
        elif os.path.isfile(url_or_filename):
            checkpoint = torch.load(url_or_filename, map_location="cpu")
        else:
            raise RuntimeError("checkpoint url or path is invalid")

        if "model" in checkpoint:
            state_dict = checkpoint["model"]
        else:
            state_dict = checkpoint

        # Resize checkpoint position embeddings to this model's patch grid.
        state_dict["visual_encoder.pos_embed"] = interpolate_pos_embed(
            state_dict["visual_encoder.pos_embed"], self.visual_encoder
        )
        if (
            "visual_encoder_m.pos_embed" in self.state_dict().keys()
            and "visual_encoder_m.pos_embed" in state_dict
        ):
            state_dict["visual_encoder_m.pos_embed"] = interpolate_pos_embed(
                state_dict["visual_encoder_m.pos_embed"], self.visual_encoder_m
            )

        if rename_text_keys:
            for key in list(state_dict.keys()):
                if "bert" in key:
                    new_key = key.replace("bert.", "")
                    # Guard: when the key contains "bert" but not "bert.",
                    # new_key == key and the original set-then-delete would
                    # silently drop the entry. Only move genuinely renamed keys.
                    if new_key != key:
                        state_dict[new_key] = state_dict[key]
                        del state_dict[key]

        # self.state_dict() rebuilds the (large) dict on every call; fetch it
        # once instead of twice per key.
        own_state = self.state_dict()
        for key in list(state_dict.keys()):
            if key in own_state and state_dict[key].shape != own_state[key].shape:
                del state_dict[key]

        msg = self.load_state_dict(state_dict, strict=False)

        logging.info("Missing keys {}".format(msg.missing_keys))
        logging.info("load checkpoint from %s" % url_or_filename)
        return msg
74
+
75
+
76
def compute_sim_matrix(model, data_loader, **kwargs):
    """Compute image-to-text and text-to-image similarity matrices for
    retrieval evaluation.

    A fast dual-encoder pass ranks all pairs by cosine similarity; only the
    top-``k_test`` candidates per query are then re-scored with the more
    expensive cross-attention ITM head. Rows are sharded across distributed
    ranks and merged with an all-reduce.

    Args:
        model: ALBEF/BLIP-style model exposing ``tokenizer``,
            ``text_encoder``, ``visual_encoder``, ``text_proj``,
            ``vision_proj`` and ``itm_head``.
        data_loader: loader over an eval dataset providing ``.text`` and
            ``.image`` collections.
        **kwargs: must contain ``k_test`` (int), the re-ranking depth.

    Returns:
        tuple of numpy arrays ``(score_i2t, score_t2i)`` with shapes
        (num_images, num_texts) and (num_texts, num_images).
    """
    k_test = kwargs.pop("k_test")

    metric_logger = MetricLogger(delimiter="  ")
    header = "Evaluation:"

    logging.info("Computing features for evaluation...")
    start_time = time.time()

    # --- encode all texts in batches of text_bs -------------------------
    texts = data_loader.dataset.text
    num_text = len(texts)
    text_bs = 256
    text_ids = []
    text_embeds = []
    text_atts = []
    for i in range(0, num_text, text_bs):
        text = texts[i : min(num_text, i + text_bs)]
        text_input = model.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=35,
            return_tensors="pt",
        ).to(model.device)
        text_output = model.text_encoder.forward_text(text_input)
        # Project the [CLS] embedding and L2-normalize for cosine similarity.
        text_embed = F.normalize(
            model.text_proj(text_output.last_hidden_state[:, 0, :])
        )
        text_embeds.append(text_embed)
        text_ids.append(text_input.input_ids)
        text_atts.append(text_input.attention_mask)

    text_embeds = torch.cat(text_embeds, dim=0)
    text_ids = torch.cat(text_ids, dim=0)
    text_atts = torch.cat(text_atts, dim=0)
    # Some tokenizers mark matching-mode inputs with a dedicated first token.
    if hasattr(model.tokenizer, "enc_token_id"):
        text_ids[:, 0] = model.tokenizer.enc_token_id

    # --- encode all images ---------------------------------------------
    image_feats = []
    image_embeds = []
    for samples in data_loader:
        image = samples["image"]

        image = image.to(model.device)
        image_feat = model.visual_encoder.forward_features(image)
        image_embed = model.vision_proj(image_feat[:, 0, :])
        image_embed = F.normalize(image_embed, dim=-1)

        # Full patch features are large: keep them on CPU and move only the
        # top-k candidates back to the device during re-ranking.
        image_feats.append(image_feat.cpu())
        image_embeds.append(image_embed)

    image_feats = torch.cat(image_feats, dim=0)
    image_embeds = torch.cat(image_embeds, dim=0)

    # Dual-encoder similarity used to pre-select re-ranking candidates.
    sims_matrix = image_embeds @ text_embeds.t()
    # -100.0 sentinel: entries outside a query's top-k stay strongly negative
    # so they never outrank re-scored candidates.
    score_matrix_i2t = torch.full(
        (len(data_loader.dataset.image), len(texts)), -100.0
    ).to(model.device)

    # Shard rows of the similarity matrix across distributed workers.
    num_tasks = dist_utils.get_world_size()
    rank = dist_utils.get_rank()
    step = sims_matrix.size(0) // num_tasks + 1
    start = rank * step
    end = min(sims_matrix.size(0), start + step)

    # --- image -> text re-ranking --------------------------------------
    for i, sims in enumerate(
        metric_logger.log_every(sims_matrix[start:end], 50, header)
    ):
        # topk_sim, topk_idx = sims.topk(k=config["k_test"], dim=0)
        topk_sim, topk_idx = sims.topk(k=k_test, dim=0)

        encoder_output = image_feats[start + i].repeat(k_test, 1, 1).to(model.device)
        encoder_att = torch.ones(encoder_output.size()[:-1], dtype=torch.long).to(
            model.device
        )
        output = model.text_encoder(
            text_ids[topk_idx],
            attention_mask=text_atts[topk_idx],
            encoder_hidden_states=encoder_output,
            encoder_attention_mask=encoder_att,
            return_dict=True,
        )
        # ITM logit for the "match" class (index 1).
        score = model.itm_head(output.last_hidden_state[:, 0, :])[:, 1]
        score_matrix_i2t[start + i, topk_idx] = score + topk_sim

    # --- text -> image re-ranking (same scheme on the transpose) -------
    sims_matrix = sims_matrix.t()
    score_matrix_t2i = torch.full(
        (len(texts), len(data_loader.dataset.image)), -100.0
    ).to(model.device)

    step = sims_matrix.size(0) // num_tasks + 1
    start = rank * step
    end = min(sims_matrix.size(0), start + step)

    for i, sims in enumerate(
        metric_logger.log_every(sims_matrix[start:end], 50, header)
    ):

        topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
        encoder_output = image_feats[topk_idx.cpu()].to(model.device)
        encoder_att = torch.ones(encoder_output.size()[:-1], dtype=torch.long).to(
            model.device
        )
        output = model.text_encoder(
            text_ids[start + i].repeat(k_test, 1),
            attention_mask=text_atts[start + i].repeat(k_test, 1),
            encoder_hidden_states=encoder_output,
            encoder_attention_mask=encoder_att,
            return_dict=True,
        )
        score = model.itm_head(output.last_hidden_state[:, 0, :])[:, 1]
        score_matrix_t2i[start + i, topk_idx] = score + topk_sim

    # Each rank filled a disjoint row range while the rest hold the -100
    # sentinel, so a SUM all-reduce assembles the full matrices.
    # NOTE(review): SUM also adds (num_tasks - 1) * -100 to every entry;
    # within-row ordering is preserved, but absolute scores are shifted.
    if dist_utils.is_dist_avail_and_initialized():
        dist.barrier()
        torch.distributed.all_reduce(
            score_matrix_i2t, op=torch.distributed.ReduceOp.SUM
        )
        torch.distributed.all_reduce(
            score_matrix_t2i, op=torch.distributed.ReduceOp.SUM
        )

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    logging.info("Evaluation time {}".format(total_time_str))

    return score_matrix_i2t.cpu().numpy(), score_matrix_t2i.cpu().numpy()