yuccaaa committed on
Commit
5c8f92e
·
verified ·
1 Parent(s): 9627ce0

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap.yaml +51 -0
  2. LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap_instruct.yaml +53 -0
  3. LAVIS-main/lavis/configs/datasets/snli_ve/defaults.yaml +25 -0
  4. LAVIS-main/lavis/configs/datasets/snli_ve/defaults_instruct.yaml +49 -0
  5. LAVIS-main/lavis/configs/datasets/textcaps/defaults.yaml +46 -0
  6. LAVIS-main/lavis/configs/datasets/textcaps/defaults_instruct.yaml +47 -0
  7. LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap.yaml +68 -0
  8. LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap_instruct.yaml +70 -0
  9. LAVIS-main/lavis/configs/datasets/vatex/defaults_cap.yaml +24 -0
  10. LAVIS-main/lavis/configs/datasets/vatex/defaults_cap_instruct.yaml +62 -0
  11. LAVIS-main/lavis/configs/datasets/vg/defaults_caption.yaml +18 -0
  12. LAVIS-main/lavis/configs/datasets/vg/defaults_caption_instruct.yaml +34 -0
  13. LAVIS-main/lavis/configs/datasets/vg/defaults_vqa.yaml +18 -0
  14. LAVIS-main/lavis/configs/datasets/vg/defaults_vqa_instruct.yaml +34 -0
  15. LAVIS-main/lavis/configs/datasets/violin/defaults_cap.yaml +51 -0
  16. LAVIS-main/lavis/configs/datasets/violin/defaults_cap_instruct.yaml +53 -0
  17. LAVIS-main/lavis/configs/datasets/violin/defaults_entail.yaml +52 -0
  18. LAVIS-main/lavis/configs/datasets/violin/defaults_entail_instruct.yaml +51 -0
  19. LAVIS-main/lavis/configs/datasets/visdial/defaults_dial.yaml +41 -0
  20. LAVIS-main/lavis/configs/datasets/visdial/defaults_dial_instruct.yaml +41 -0
  21. LAVIS-main/lavis/configs/datasets/vizwiz/defaults.yaml +43 -0
  22. LAVIS-main/lavis/configs/datasets/vlep/defaults_cap.yaml +51 -0
  23. LAVIS-main/lavis/configs/datasets/vlep/defaults_cap_instruct.yaml +53 -0
  24. LAVIS-main/lavis/configs/datasets/vsr/defaults.yaml +49 -0
  25. LAVIS-main/lavis/configs/datasets/vsr/defaults_classification.yaml +49 -0
  26. LAVIS-main/lavis/configs/datasets/vsr/defaults_classification_instruct.yaml +49 -0
  27. LAVIS-main/lavis/configs/datasets/vsr/defaults_instruct.yaml +53 -0
  28. LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap.yaml +63 -0
  29. LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml +63 -0
  30. LAVIS-main/lavis/configs/datasets/webvid/defaults_cap.yaml +41 -0
  31. LAVIS-main/lavis/configs/datasets/webvid/defaults_cap_instruct.yaml +43 -0
  32. LAVIS-main/lavis/configs/datasets/youcook/defaults_cap.yaml +51 -0
  33. LAVIS-main/lavis/configs/datasets/youcook/defaults_cap_instruct.yaml +53 -0
  34. LAVIS-main/lavis/configs/datasets/yt8m/defaults_mm_dial.yaml +62 -0
  35. LAVIS-main/lavis/configs/models/albef_classification_ve.yaml +40 -0
  36. LAVIS-main/lavis/configs/models/albef_feature_extractor.yaml +30 -0
  37. LAVIS-main/lavis/configs/models/albef_nlvr.yaml +42 -0
  38. LAVIS-main/lavis/configs/models/albef_pretrain_base.yaml +38 -0
  39. LAVIS-main/lavis/configs/models/albef_retrieval_coco.yaml +46 -0
  40. LAVIS-main/lavis/configs/models/albef_retrieval_flickr.yaml +46 -0
  41. LAVIS-main/lavis/configs/models/albef_vqav2.yaml +40 -0
  42. LAVIS-main/lavis/configs/models/alpro_qa_msrvtt.yaml +44 -0
  43. LAVIS-main/lavis/configs/models/alpro_qa_msvd.yaml +43 -0
  44. LAVIS-main/lavis/configs/models/alpro_retrieval_didemo.yaml +35 -0
  45. LAVIS-main/lavis/configs/models/alpro_retrieval_msrvtt.yaml +41 -0
  46. LAVIS-main/lavis/configs/models/bert_config.json +21 -0
  47. LAVIS-main/lavis/configs/models/bert_config_alpro.json +23 -0
  48. LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml +25 -0
  49. LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml +27 -0
  50. LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml +27 -0
LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ datasets:
6
+ shapenet_mm_caption: # name of the dataset builder
7
+ vis_processor:
8
+ train:
9
+ name: "clip_image_train"
10
+ image_size: 224
11
+ eval:
12
+ name: "clip_image_train"
13
+ image_size: 224
14
+ pc_processor:
15
+ train:
16
+ name: "ulip_pc"
17
+ eval:
18
+ name: "ulip_pc"
19
+ text_processor:
20
+ train:
21
+ name: "blip_caption"
22
+ eval:
23
+ name: "blip_caption"
24
+
25
+ data_type: [pc, images] # [images|videos|features]
26
+
27
+ build_info:
28
+ # Be careful not to append minus sign (-) before split to avoid itemizing
29
+ annotations:
30
+ train:
31
+ url:
32
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/train_ann.json
33
+ # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json
34
+ storage:
35
+ - shapenet/annotations/train_ann.json
36
+ # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json
37
+ val:
38
+ url:
39
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/test_ann.json
40
+ # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json
41
+ storage:
42
+ - shapenet/annotations/test_ann.json
43
+ # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json
44
+
45
+ templates: null
46
+
47
+ pc:
48
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/shapenet_pc
49
+
50
+ images:
51
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/rendered_images
LAVIS-main/lavis/configs/datasets/shapenet/defaults_mm_cap_instruct.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ datasets:
6
+ shapenet_mm_caption_instruct: # name of the dataset builder
7
+ vis_processor:
8
+ train:
9
+ name: "clip_image_train"
10
+ image_size: 224
11
+ eval:
12
+ name: "clip_image_train"
13
+ image_size: 224
14
+ pc_processor:
15
+ train:
16
+ name: "ulip_pc"
17
+ eval:
18
+ name: "ulip_pc"
19
+ text_processor:
20
+ train:
21
+ name: "blip_instruction"
22
+ modality: pc
23
+ task: caption
24
+ eval:
25
+ name: "blip_caption"
26
+
27
+ data_type: [pc, images] # [images|videos|features]
28
+
29
+ build_info:
30
+ # Be careful not to append minus sign (-) before split to avoid itemizing
31
+ annotations:
32
+ train:
33
+ url:
34
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/train_ann.json
35
+ # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json
36
+ storage:
37
+ - shapenet/annotations/train_ann.json
38
+ # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/train_ann.json
39
+ val:
40
+ url:
41
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/shapenet/test_ann.json
42
+ # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json
43
+ storage:
44
+ - shapenet/annotations/test_ann.json
45
+ # - /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/test_ann.json
46
+
47
+ templates: null
48
+
49
+ pc:
50
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/shapenet_pc
51
+
52
+ images:
53
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/ShapeNet55/rendered_images
LAVIS-main/lavis/configs/datasets/snli_ve/defaults.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ snli_ve:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json
16
+ storage: snli/annotations/ve_train.json
17
+ val:
18
+ url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json
19
+ storage: snli/annotations/ve_dev.json
20
+ test:
21
+ url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json
22
+ storage: snli/annotations/ve_test.json
23
+ images:
24
+ storage: flickr30k/images/flickr30k-images
25
+ # storage: /export/share/datasets/vision/flickr30k/flickr30k-images
LAVIS-main/lavis/configs/datasets/snli_ve/defaults_instruct.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ snli_ve_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_caption
22
+ eval:
23
+ name: blip_caption
24
+
25
+
26
+ build_info:
27
+ # Be careful not to append minus sign (-) before split to avoid itemizing
28
+ annotations:
29
+ train:
30
+ url:
31
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_train.json
32
+ # - /export/share/dongxuli/data/lavis/snli/ve_train.json
33
+ storage:
34
+ - snli/annotations/ve_train.json
35
+ val:
36
+ url:
37
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_dev.json
38
+ # - /export/share/dongxuli/data/lavis/snli/ve_dev.json
39
+ storage:
40
+ - snli/annotations/ve_dev.json
41
+ test:
42
+ url:
43
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/snlive/ve_test.json
44
+ # - /export/share/dongxuli/data/lavis/snli/ve_test.json
45
+ storage:
46
+ - snli/annotations/ve_test.json
47
+ images:
48
+ # storage: flickr30k/images/flickr30k-images
49
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
LAVIS-main/lavis/configs/datasets/textcaps/defaults.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ textcaps_caption: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_caption
22
+ eval:
23
+ name: blip_caption
24
+
25
+
26
+ build_info:
27
+ # Be careful not to append minus sign (-) before split to avoid itemizing
28
+ annotations:
29
+ train:
30
+ url:
31
+ - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_train.json
32
+ storage:
33
+ - TextCaps/TextCaps_0.1_train.json
34
+ val:
35
+ url:
36
+ - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_val.json
37
+ storage:
38
+ - TextCaps/TextCaps_0.1_val.json
39
+ test:
40
+ url:
41
+ - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_test.json
42
+ storage:
43
+ - TextCaps/TextCaps_0.1_test.json
44
+ images:
45
+ # storage: nocaps/images
46
+ storage: /export/share/datasets/vision_language/TextCaps/images
LAVIS-main/lavis/configs/datasets/textcaps/defaults_instruct.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ textcaps_caption_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ modality: image
23
+ task: caption
24
+ eval:
25
+ name: blip_caption
26
+
27
+ build_info:
28
+ # Be careful not to append minus sign (-) before split to avoid itemizing
29
+ annotations:
30
+ train:
31
+ url:
32
+ - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_train.json
33
+ storage:
34
+ - TextCaps/TextCaps_0.1_train.json
35
+ val:
36
+ url:
37
+ - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_val.json
38
+ storage:
39
+ - TextCaps/TextCaps_0.1_val.json
40
+ test:
41
+ url:
42
+ - https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_test.json
43
+ storage:
44
+ - TextCaps/TextCaps_0.1_test.json
45
+ images:
46
+ # storage: nocaps/images
47
+ storage: /export/share/datasets/vision_language/TextCaps/images
LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap.yaml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ valor_mm_caption: # name of the dataset builder
8
+ data_type: [video, audio]
9
+
10
+ video_processor:
11
+ train:
12
+ name: alpro_video_train
13
+ n_frms: 4
14
+ image_size: 224
15
+ min_scale: 0.9
16
+ max_scale: 1.0
17
+ full_video: True
18
+ eval:
19
+ name: alpro_video_eval
20
+ n_frms: 4
21
+ image_size: 224
22
+ min_scale: 0.9
23
+ max_scale: 1.0
24
+ full_video: True
25
+
26
+ audio_processor:
27
+ train:
28
+ name: beats_audio
29
+ sampling_rate: 16000
30
+ eval:
31
+ name: beats_audio
32
+ sampling_rate: 16000
33
+ is_eval: False
34
+
35
+ text_processor:
36
+ train:
37
+ name: blip_caption
38
+ eval:
39
+ name: blip_caption
40
+
41
+
42
+ build_info:
43
+ # Be careful not to append minus sign (-) before split to avoid itemizing
44
+ annotations:
45
+ val:
46
+ url:
47
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_val.json
48
+ # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json
49
+ storage:
50
+ - valor/annotations/desc_val.json
51
+ # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json
52
+
53
+ test:
54
+ url:
55
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_test.json
56
+ # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json
57
+ storage:
58
+ - valor/annotations/desc_test.json
59
+ # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json
60
+
61
+ templates: null
62
+
63
+ audio:
64
+ storage: /export/video-language-dataset/data/VALOR/videos
65
+
66
+ video:
67
+ storage: /export/video-language-dataset/data/VALOR/videos
68
+
LAVIS-main/lavis/configs/datasets/valor/defaults_mm_cap_instruct.yaml ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ valor_mm_caption_instruct: # name of the dataset builder
8
+ data_type: [video, audio]
9
+
10
+ video_processor:
11
+ train:
12
+ name: alpro_video_train
13
+ n_frms: 4
14
+ image_size: 224
15
+ min_scale: 0.9
16
+ max_scale: 1.0
17
+ full_video: True
18
+ eval:
19
+ name: alpro_video_eval
20
+ n_frms: 4
21
+ image_size: 224
22
+ min_scale: 0.9
23
+ max_scale: 1.0
24
+ full_video: True
25
+
26
+ audio_processor:
27
+ train:
28
+ name: beats_audio
29
+ sampling_rate: 16000
30
+ eval:
31
+ name: beats_audio
32
+ sampling_rate: 16000
33
+ is_eval: False
34
+
35
+ text_processor:
36
+ train:
37
+ name: blip_instruction
38
+ modality: image
39
+ task: caption
40
+ eval:
41
+ name: blip_caption
42
+
43
+
44
+ build_info:
45
+ # Be careful not to append minus sign (-) before split to avoid itemizing
46
+ annotations:
47
+ val:
48
+ url:
49
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_val.json
50
+ # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json
51
+ storage:
52
+ - valor/annotations/desc_val.json
53
+ # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_val.json
54
+
55
+ test:
56
+ url:
57
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/valor/desc_test.json
58
+ # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json
59
+ storage:
60
+ - valor/annotations/desc_test.json
61
+ # - /export/video-language-dataset/data/VALOR/valor-32k-annotations/desc_test.json
62
+
63
+ templates: null
64
+
65
+ audio:
66
+ storage: /export/video-language-dataset/data/VALOR/videos
67
+
68
+ video:
69
+ storage: /export/video-language-dataset/data/VALOR/videos
70
+
LAVIS-main/lavis/configs/datasets/vatex/defaults_cap.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ msvd_cap: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
16
+ storage: vatex/annotations/cap_train.json
17
+ val:
18
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
19
+ storage: vatex/annotations/cap_val.json
20
+ test:
21
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
22
+ storage: vatex/annotations/cap_test.json
23
+ videos:
24
+ storage: /export/share/dongxuli/data/vatex
LAVIS-main/lavis/configs/datasets/vatex/defaults_cap_instruct.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ vatex_caption_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ video_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: True
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: True
26
+
27
+ data_type: [video, audio]
28
+
29
+ audio_processor:
30
+ train:
31
+ name: beats_audio
32
+ sampling_rate: 16000
33
+ eval:
34
+ name: beats_audio
35
+ sampling_rate: 16000
36
+ is_eval: False
37
+
38
+
39
+ build_info:
40
+ # Be careful not to append minus sign (-) before split to avoid itemizing
41
+ annotations:
42
+ train:
43
+ url:
44
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
45
+ storage:
46
+ - vatex/annotations/cap_train.json
47
+ val:
48
+ url:
49
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
50
+ storage:
51
+ - vatex/annotations/cap_val.json
52
+ test:
53
+ url:
54
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
55
+ storage:
56
+ - vatex/annotations/cap_test.json
57
+
58
+ video:
59
+ storage: /export/video-language-dataset/data/vatex/
60
+
61
+ audio:
62
+ storage: /export/video-language-dataset/data/vatex/
LAVIS-main/lavis/configs/datasets/vg/defaults_caption.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ vg_caption:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json
16
+ storage: vg/annotations/vg_caption.json
17
+ images:
18
+ storage: vg/images/
LAVIS-main/lavis/configs/datasets/vg/defaults_caption_instruct.yaml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ vg_caption_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ task: caption
23
+ modality: image
24
+ eval:
25
+ name: blip_caption
26
+
27
+ build_info:
28
+ # Be careful not to append minus sign (-) before split to avoid itemizing
29
+ annotations:
30
+ train:
31
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json
32
+ storage: vg/annotations/vg_caption.json
33
+ images:
34
+ storage: /export/share/datasets/vision/visual-genome/ #vg/images/
LAVIS-main/lavis/configs/datasets/vg/defaults_vqa.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ vg_vqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ build_info:
12
+ # Be careful not to append minus sign (-) before split to avoid itemizing
13
+ annotations:
14
+ train:
15
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json
16
+ storage: vg/annotations/vg_qa.json
17
+ images:
18
+ storage: vg/images/
LAVIS-main/lavis/configs/datasets/vg/defaults_vqa_instruct.yaml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ vg_vqa_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_instruction
22
+ task: qa
23
+ modality: image
24
+ eval:
25
+ name: blip_question
26
+
27
+ build_info:
28
+ # Be careful not to append minus sign (-) before split to avoid itemizing
29
+ annotations:
30
+ train:
31
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json
32
+ storage: vg/annotations/vg_qa.json
33
+ images:
34
+ storage: /export/share/datasets/vision/visual-genome/ #vg/images/
LAVIS-main/lavis/configs/datasets/violin/defaults_cap.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ violin_caption: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: False
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: False
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_caption
30
+ eval:
31
+ name: blip_caption
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url:
38
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json
39
+ # - /export/video-language-dataset/data/violin/annotations_lavis.json
40
+ storage:
41
+ - violin/annotations/train.json
42
+ # - /export/video-language-dataset/data/violin/annotations_lavis.json
43
+ # val:
44
+ # url:
45
+ # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json
46
+ # - /export/video-language-dataset/data/violin/annotations_lavis_test.json
47
+ # storage:
48
+ # # - violin/annotations/test.json
49
+ # - /export/video-language-dataset/data/violin/annotations_lavis_test.json
50
+ videos:
51
+ storage: /export/video-language-dataset/data/violin/videos
LAVIS-main/lavis/configs/datasets/violin/defaults_cap_instruct.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ violin_caption_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: False
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: False
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_instruction
30
+ modality: video
31
+ task: caption
32
+ eval:
33
+ name: blip_caption
34
+
35
+ build_info:
36
+ # Be careful not to append minus sign (-) before split to avoid itemizing
37
+ annotations:
38
+ train:
39
+ url:
40
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json
41
+ # - /export/video-language-dataset/data/violin/annotations_lavis.json
42
+ storage:
43
+ - violin/annotations/train.json
44
+ # - /export/video-language-dataset/data/violin/annotations_lavis.json
45
+ # val:
46
+ # url:
47
+ # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json
48
+ # - /export/video-language-dataset/data/violin/annotations_lavis_test.json
49
+ # storage:
50
+ # # - violin/annotations/test.json
51
+ # - /export/video-language-dataset/data/violin/annotations_lavis_test.json
52
+ videos:
53
+ storage: /export/video-language-dataset/data/violin/videos
LAVIS-main/lavis/configs/datasets/violin/defaults_entail.yaml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ violin_entailment: # 22452
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: False
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: False
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_caption
30
+ eval:
31
+ name: blip_caption
32
+
33
+
34
+ build_info:
35
+ # Be careful not to append minus sign (-) before split to avoid itemizing
36
+ annotations:
37
+ train:
38
+ url:
39
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json
40
+ # - /export/video-language-dataset/data/violin/annotations_lavis.json
41
+ storage:
42
+ - violin/annotations/train.json
43
+ # - /export/video-language-dataset/data/violin/annotations_lavis.json
44
+ # val:
45
+ # url:
46
+ # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json
47
+ # - /export/video-language-dataset/data/violin/annotations_lavis_test.json
48
+ # storage:
49
+ # # - violin/annotations/test.json
50
+ # - /export/video-language-dataset/data/violin/annotations_lavis_test.json
51
+ videos:
52
+ storage: /export/video-language-dataset/data/violin/videos
LAVIS-main/lavis/configs/datasets/violin/defaults_entail_instruct.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ violin_entailment_instruct: # 22452
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: False
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: False
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_caption
30
+ eval:
31
+ name: blip_caption
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url:
38
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/train.json
39
+ # - /export/video-language-dataset/data/violin/annotations_lavis.json
40
+ storage:
41
+ - violin/annotations/train.json
42
+ # - /export/video-language-dataset/data/violin/annotations_lavis.json
43
+ # val:
44
+ # url:
45
+ # # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/violin/test.json
46
+ # - /export/video-language-dataset/data/violin/annotations_lavis_test.json
47
+ # storage:
48
+ # # - violin/annotations/test.json
49
+ # - /export/video-language-dataset/data/violin/annotations_lavis_test.json
50
+ videos:
51
+ storage: /export/video-language-dataset/data/violin/videos
LAVIS-main/lavis/configs/datasets/visdial/defaults_dial.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ visdial: # name of the dataset builder
8
+ data_type: images #extracted features of videos (I3D, VGGish) # [images|videos|features]
9
+
10
+ vis_processor:
11
+ train:
12
+ name: "clip_image_train"
13
+ image_size: 224
14
+ eval:
15
+ name: "clip_image_eval"
16
+ image_size: 224
17
+
18
+ text_processor:
19
+ train:
20
+ name: blip_caption
21
+ eval:
22
+ name: blip_caption
23
+
24
+ build_info:
25
+ # Be careful not to append minus sign (-) before split to avoid itemizing
26
+ annotations:
27
+ train:
28
+ url:
29
+ - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json
30
+ storage:
31
+ - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json
32
+ val:
33
+ url:
34
+ - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json
35
+ storage:
36
+ - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json
37
+ # test:
38
+ # url: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json
39
+ # storage: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json
40
+ images:
41
+ storage: /export/share/datasets/vision_language/visdial/
LAVIS-main/lavis/configs/datasets/visdial/defaults_dial_instruct.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ visdial_instruct: # name of the dataset builder
8
+ data_type: images #extracted features of videos (I3D, VGGish) # [images|videos|features]
9
+
10
+ vis_processor:
11
+ train:
12
+ name: "clip_image_train"
13
+ image_size: 224
14
+ eval:
15
+ name: "clip_image_eval"
16
+ image_size: 224
17
+
18
+ text_processor:
19
+ train:
20
+ name: blip_caption
21
+ eval:
22
+ name: blip_caption
23
+
24
+ build_info:
25
+ # Be careful not to append minus sign (-) before split to avoid itemizing
26
+ annotations:
27
+ train:
28
+ url:
29
+ - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json
30
+ storage:
31
+ - /export/share/datasets/vision_language/visdial/visdial_1.0_train.json
32
+ val:
33
+ url:
34
+ - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json
35
+ storage:
36
+ - /export/share/datasets/vision_language/visdial/visdial_1.0_val.json
37
+ # test:
38
+ # url: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json
39
+ # storage: /export/share/datasets/vision_language/visdial/visdial_1.0_test.json
40
+ images:
41
+ storage: /export/share/datasets/vision_language/visdial/
LAVIS-main/lavis/configs/datasets/vizwiz/defaults.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ vizwiz_vqa:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_question
22
+ eval:
23
+ name: blip_question
24
+
25
+ build_info:
26
+ # Be careful not to append minus sign (-) before split to avoid itemizing
27
+ annotations:
28
+ val:
29
+ url:
30
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vizwiz/val.json
31
+ # - /export/share/datasets/vision/vizwiz/Annotations/val.json
32
+ storage:
33
+ - vizwiz/annotations/val.json
34
+ # - /export/share/datasets/vision/vizwiz/Annotations/val.json
35
+ test:
36
+ url:
37
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vizwiz/test.json
38
+ # - /export/share/datasets/vision/vizwiz/Annotations/test.json
39
+ storage:
40
+ - vizwiz/annotations/test.json
41
+ # - /export/share/datasets/vision/vizwiz/Annotations/test.json
42
+ images:
43
+ storage: /export/share/datasets/vision/vizwiz/images
LAVIS-main/lavis/configs/datasets/vlep/defaults_cap.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ vlep_caption: # 4900
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: False
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: False
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_caption
30
+ eval:
31
+ name: blip_caption
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url:
38
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_train_existing.json
39
+ # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json
40
+ storage:
41
+ - vlep/annotations/annotations_train_existing.json
42
+ # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json
43
+ val:
44
+ url:
45
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_dev_existing.json
46
+ # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json
47
+ storage:
48
+ - vlep/annotations/annotations_dev_existing.json
49
+ # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json
50
+ videos:
51
+ storage: /export/video-language-dataset/data/vlep/videos
LAVIS-main/lavis/configs/datasets/vlep/defaults_cap_instruct.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ vlep_caption_instruct: # 4900
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: False
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: False
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_instruction
30
+ modality: image
31
+ task: caption
32
+ eval:
33
+ name: blip_caption
34
+
35
+ build_info:
36
+ # Be careful not to append minus sign (-) before split to avoid itemizing
37
+ annotations:
38
+ train:
39
+ url:
40
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_train_existing.json
41
+ # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json
42
+ storage:
43
+ - vlep/annotations/annotations_train_existing.json
44
+ # - /export/video-language-dataset/data/vlep/annotations/annotations_train_existing.json
45
+ val:
46
+ url:
47
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vlep/annotations_dev_existing.json
48
+ # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json
49
+ storage:
50
+ - vlep/annotations/annotations_dev_existing.json
51
+ # - /export/video-language-dataset/data/vlep/annotations/annotations_dev_existing.json
52
+ videos:
53
+ storage: /export/video-language-dataset/data/vlep/videos
LAVIS-main/lavis/configs/datasets/vsr/defaults.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ vsr_classification_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_caption
22
+ eval:
23
+ name: blip_caption
24
+
25
+ build_info:
26
+ # Be careful not to append minus sign (-) before split to avoid itemizing
27
+ train:
28
+ url:
29
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl
30
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
31
+ storage:
32
+ - vsr/annotations/train.jsonl
33
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
34
+ val:
35
+ url:
36
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl
37
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
38
+ storage:
39
+ - vsr/annotations/dev.jsonl
40
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
41
+ test:
42
+ url:
43
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl
44
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
45
+ storage:
46
+ - vsr/annotations/test.jsonl
47
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
48
+ images:
49
+ storage: /export/share/datasets/vision_language/VSR/images
LAVIS-main/lavis/configs/datasets/vsr/defaults_classification.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ vsr_classification:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_caption
22
+ eval:
23
+ name: blip_caption
24
+
25
+ build_info:
26
+ # Be careful not to append minus sign (-) before split to avoid itemizing
27
+ train:
28
+ url:
29
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl
30
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
31
+ storage:
32
+ - vsr/annotations/train.jsonl
33
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
34
+ val:
35
+ url:
36
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl
37
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
38
+ storage:
39
+ - vsr/annotations/dev.jsonl
40
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
41
+ test:
42
+ url:
43
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl
44
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
45
+ storage:
46
+ - vsr/annotations/test.jsonl
47
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
48
+ images:
49
+ storage: /export/share/datasets/vision_language/VSR/images
LAVIS-main/lavis/configs/datasets/vsr/defaults_classification_instruct.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ vsr_caption_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: "clip_image_train"
14
+ image_size: 224
15
+ eval:
16
+ name: "clip_image_eval"
17
+ image_size: 224
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_caption
22
+ eval:
23
+ name: blip_caption
24
+
25
+ build_info:
26
+ # Be careful not to append minus sign (-) before split to avoid itemizing
27
+ train:
28
+ url:
29
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl
30
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
31
+ storage:
32
+ - vsr/annotations/train.jsonl
33
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
34
+ val:
35
+ url:
36
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl
37
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
38
+ storage:
39
+ - vsr/annotations/dev.jsonl
40
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
41
+ test:
42
+ url:
43
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl
44
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
45
+ storage:
46
+ - vsr/annotations/test.jsonl
47
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
48
+ images:
49
+ storage: /export/share/datasets/vision_language/VSR/images
LAVIS-main/lavis/configs/datasets/vsr/defaults_instruct.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ vsr_caption_instruct:
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+
12
+ vis_processor:
13
+ train:
14
+ name: "clip_image_train"
15
+ image_size: 224
16
+ eval:
17
+ name: "clip_image_eval"
18
+ image_size: 224
19
+
20
+ text_processor:
21
+ train:
22
+ name: blip_instruction
23
+ task: caption
24
+ modality: image
25
+ eval:
26
+ name: blip_caption
27
+
28
+ build_info:
29
+ # Be careful not to append minus sign (-) before split to avoid itemizing
30
+ annotations:
31
+ train:
32
+ url:
33
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/train.jsonl
34
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
35
+ storage:
36
+ - vsr/annotations/train.jsonl
37
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/train.jsonl
38
+ val:
39
+ url:
40
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/dev.jsonl
41
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
42
+ storage:
43
+ - vsr/annotations/dev.jsonl
44
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/dev.jsonl
45
+ test:
46
+ url:
47
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/vsr/test.jsonl
48
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
49
+ storage:
50
+ - vsr/annotations/test.jsonl
51
+ # - /export/home/data/annotations/VSR/visual-spatial-reasoning/data/splits/zeroshot/test.jsonl
52
+ images:
53
+ storage: /export/share/datasets/vision_language/VSR/images
LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ datasets:
6
+ wavcaps_mm_caption: # name of the dataset builder
7
+ audio_processor:
8
+ train:
9
+ name: beats_audio
10
+ sampling_rate: 16000
11
+ n_frames: 2
12
+ frame_length: 512
13
+ eval:
14
+ name: beats_audio
15
+ sampling_rate: 16000
16
+ n_frames: 2
17
+ frame_length: 512
18
+
19
+ text_processor:
20
+ train:
21
+ name: blip_caption
22
+ eval:
23
+ name: blip_caption
24
+
25
+ data_type: [audio]
26
+
27
+ build_info:
28
+ kwargs:
29
+ cached: False
30
+ cached_dir: /export/share/datasets/audio/WavCaps/beats_features/
31
+
32
+ # Be careful not to append minus sign (-) before split to avoid itemizing
33
+ annotations:
34
+ train:
35
+ url:
36
+ - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/BBC_Sound_Effects/bbc_final.json
37
+ - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/FreeSound/fsd_final.json
38
+ - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/SoundBible/sb_final.json
39
+ - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/AudioSet_SL/as_final.json
40
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/wavcaps/json_data.json
41
+ storage:
42
+ - wavcaps/json_files/BBC_Sound_Effects/bbc_final.json
43
+ - wavcaps/json_files/FreeSound/fsd_final.json
44
+ - wavcaps/json_files/SoundBible/sb_final.json
45
+ - wavcaps/json_files/AudioSet_SL/as_final.json
46
+ - wavcaps/annotations/json_data.json
47
+ # train:
48
+ # url:
49
+ # - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
50
+ # - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
51
+ # - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
52
+ # - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
53
+ # - /export/share/datasets/audio/WavCaps/json_data.json
54
+ # storage:
55
+ # - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
56
+ # - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
57
+ # - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
58
+ # - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
59
+ # - /export/share/datasets/audio/WavCaps/json_data.json
60
+
61
+ audio:
62
+ storage: /export/share/datasets/audio/WavCaps/
63
+
LAVIS-main/lavis/configs/datasets/wavcaps/defaults_mm_cap_instruct.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ datasets:
6
+ wavcaps_mm_caption_instruct: # name of the dataset builder
7
+ audio_processor:
8
+ train:
9
+ name: beats_audio
10
+ sampling_rate: 16000
11
+ n_frames: 2
12
+ frame_length: 512
13
+ eval:
14
+ name: beats_audio
15
+ sampling_rate: 16000
16
+ n_frames: 2
17
+ frame_length: 512
18
+ text_processor:
19
+ train:
20
+ name: "blip_instruction"
21
+ modality: audio
22
+ task: caption
23
+ eval:
24
+ name: "blip_caption"
25
+
26
+ data_type: [audio]
27
+
28
+ build_info:
29
+ kwargs:
30
+ cached: True
31
+ cached_dir: /export/share/datasets/audio/WavCaps/beats_features/
32
+
33
+ # Be careful not to append minus sign (-) before split to avoid itemizing
34
+ annotations:
35
+ train:
36
+ # url:
37
+ # - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
38
+ # - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
39
+ # - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
40
+ # - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
41
+ # - /export/share/datasets/audio/WavCaps/json_data.json
42
+ # storage:
43
+ # - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
44
+ # - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
45
+ # - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
46
+ # - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
47
+ # - /export/share/datasets/audio/WavCaps/json_data.json
48
+ url:
49
+ - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/BBC_Sound_Effects/bbc_final.json
50
+ - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/FreeSound/fsd_final.json
51
+ - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/SoundBible/sb_final.json
52
+ - https://raw.githubusercontent.com/XinhaoMei/WavCaps/master/data/json_files/AudioSet_SL/as_final.json
53
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/wavcaps/json_data.json
54
+ storage:
55
+ - wavcaps/json_files/BBC_Sound_Effects/bbc_final.json
56
+ - wavcaps/json_files/FreeSound/fsd_final.json
57
+ - wavcaps/json_files/SoundBible/sb_final.json
58
+ - wavcaps/json_files/AudioSet_SL/as_final.json
59
+ - wavcaps/annotations/json_data.json
60
+
61
+ audio:
62
+ storage: /export/share/datasets/audio/WavCaps/
63
+
LAVIS-main/lavis/configs/datasets/webvid/defaults_cap.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ webvid2m_caption: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 5
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ eval:
19
+ name: alpro_video_eval
20
+ n_frms: 5
21
+ image_size: 224
22
+ min_scale: 0.9
23
+ max_scale: 1.0
24
+ text_processor:
25
+ train:
26
+ name: "blip_caption"
27
+ eval:
28
+ name: "blip_caption"
29
+
30
+ build_info:
31
+ # Be careful not to append minus sign (-) before split to avoid itemizing
32
+ annotations:
33
+ train:
34
+ url:
35
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/webvid2m/train.json
36
+ # - /export/home/LAVIS/webvid_annotation.json
37
+ storage:
38
+ - webvid2m/annotations/train.json
39
+ # - /export/home/LAVIS/webvid_annotation.json
40
+ images:
41
+ storage: /export/video-language-dataset/data/webvid2m/postprocess/downsampled_videos
LAVIS-main/lavis/configs/datasets/webvid/defaults_cap_instruct.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ webvid2m_caption_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: images # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 5
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ eval:
19
+ name: alpro_video_eval
20
+ n_frms: 5
21
+ image_size: 224
22
+ min_scale: 0.9
23
+ max_scale: 1.0
24
+ text_processor:
25
+ train:
26
+ name: "blip_instruction"
27
+ modality: video
28
+ task: caption
29
+ eval:
30
+ name: "blip_caption"
31
+
32
+ build_info:
33
+ # Be careful not to append minus sign (-) before split to avoid itemizing
34
+ annotations:
35
+ train:
36
+ url:
37
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/webvid2m/train.json
38
+ # - /export/home/LAVIS/webvid_annotation.json
39
+ storage:
40
+ - webvid2m/annotations/train.json
41
+ # - /export/home/LAVIS/webvid_annotation.json
42
+ images:
43
+ storage: /export/video-language-dataset/data/webvid2m/postprocess/downsampled_videos
LAVIS-main/lavis/configs/datasets/youcook/defaults_cap.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ youcook_caption: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: False
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: False
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_caption
30
+ eval:
31
+ name: blip_caption
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url:
38
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/train_annotations.json
39
+ # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json
40
+ storage:
41
+ - youcook/annotations/train_annotations.json
42
+ # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json
43
+ val:
44
+ url:
45
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/val_annotations.json
46
+ # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json
47
+ storage:
48
+ - youcook/annotations/val_annotations.json
49
+ # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json
50
+ videos:
51
+ storage: /export/video-language-dataset/data/youcook/raw_videos
LAVIS-main/lavis/configs/datasets/youcook/defaults_cap_instruct.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ youcook_caption_instruct: # name of the dataset builder
8
+ # data_dir: ${env.data_dir}/datasets
9
+ data_type: videos # [images|videos|features]
10
+
11
+ vis_processor:
12
+ train:
13
+ name: alpro_video_train
14
+ n_frms: 4
15
+ image_size: 224
16
+ min_scale: 0.9
17
+ max_scale: 1.0
18
+ full_video: False
19
+ eval:
20
+ name: alpro_video_eval
21
+ n_frms: 4
22
+ image_size: 224
23
+ min_scale: 0.9
24
+ max_scale: 1.0
25
+ full_video: False
26
+
27
+ text_processor:
28
+ train:
29
+ name: blip_instruction
30
+ modality: video
31
+ task: caption
32
+ eval:
33
+ name: blip_caption
34
+
35
+ build_info:
36
+ # Be careful not to append minus sign (-) before split to avoid itemizing
37
+ annotations:
38
+ train:
39
+ url:
40
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/train_annotations.json
41
+ # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json
42
+ storage:
43
+ - youcook/annotations/train_annotations.json
44
+ # - /export/video-language-dataset/data/youcook/annotations/train_annotations.json
45
+ val:
46
+ url:
47
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/youcook/val_annotations.json
48
+ # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json
49
+ storage:
50
+ - youcook/annotations/val_annotations.json
51
+ # - /export/video-language-dataset/data/youcook/annotations/val_annotations.json
52
+ videos:
53
+ storage: /export/video-language-dataset/data/youcook/raw_videos
LAVIS-main/lavis/configs/datasets/yt8m/defaults_mm_dial.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ datasets:
7
+ yt8m_mm_dialogue: # name of the dataset builder
8
+ data_type: [video] #extracted features of videos (I3D, VGGish) # [images|videos|features]
9
+
10
+ video_processor:
11
+ train:
12
+ name: alpro_video_train
13
+ n_frms: 4
14
+ image_size: 224
15
+ min_scale: 0.9
16
+ max_scale: 1.0
17
+ full_video: False
18
+ eval:
19
+ name: alpro_video_eval
20
+ n_frms: 4
21
+ image_size: 224
22
+ min_scale: 0.9
23
+ max_scale: 1.0
24
+ full_video: False
25
+
26
+ audio_processor:
27
+ train:
28
+ name: beats_audio
29
+ # sampling_rate: 16000
30
+ eval:
31
+ name: beats_audio
32
+ # sampling_rate: 16000
33
+ is_eval: True
34
+
35
+ text_processor:
36
+ train:
37
+ name: blip_caption
38
+ eval:
39
+ name: blip_caption
40
+
41
+ build_info:
42
+ # Be careful not to append minus sign (-) before split to avoid itemizing
43
+ annotations:
44
+ train:
45
+ url:
46
+ - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/train.json
47
+ storage:
48
+ - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/train.json
49
+ val:
50
+ url:
51
+ - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/validation.json
52
+ storage:
53
+ - /export/video-language-dataset/data/yt-8m/ytd_gpt3_safe_json/validation.json
54
+
55
+ templates: null
56
+
57
+ audio:
58
+ storage: /export/video-language-dataset/data/yt-8m/audios
59
+
60
+ video:
61
+ storage: /export/video-language-dataset/data/yt-8m/videos
62
+
LAVIS-main/lavis/configs/models/albef_classification_ve.yaml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: albef_classification
8
+ load_finetuned: True
9
+
10
+ finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt"
11
+ pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
12
+
13
+ num_classes: 3
14
+
15
+ use_distill: True
16
+ momentum: 0.995
17
+ alpha: 0.4
18
+
19
+ # vit encoder
20
+ vit_type: "base"
21
+ vit_grad_ckpt: False
22
+ vit_ckpt_layer: 0
23
+ vit_layer_norm_epsilon: 1e-6
24
+
25
+ image_size: 384
26
+
27
+ # bert config
28
+ med_config_path: "configs/models/med_config_albef.json"
29
+
30
+ preprocess:
31
+ vis_processor:
32
+ train:
33
+ name: "blip_image_train"
34
+ eval:
35
+ name: "blip_image_eval"
36
+ text_processor:
37
+ train:
38
+ name: "blip_caption"
39
+ eval:
40
+ name: "blip_caption"
LAVIS-main/lavis/configs/models/albef_feature_extractor.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: albef_pretrain
8
+ pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
9
+
10
+ # vit encoder
11
+ vit_type: "base"
12
+ image_size: 224
13
+ vit_ckpt_layer: 0
14
+ vit_drop_path_rate: 0
15
+ vit_layer_norm_epsilon: 1e-6
16
+ vit_grad_ckpt: False
17
+
18
+ # bert config
19
+ med_config_path: "configs/models/med_config_albef.json"
20
+
21
+ embed_dim: 256
22
+
23
+ preprocess:
24
+ vis_processor:
25
+ eval:
26
+ name: "blip_image_eval"
27
+ image_size: 224
28
+ text_processor:
29
+ eval:
30
+ name: "blip_caption"
LAVIS-main/lavis/configs/models/albef_nlvr.yaml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: albef_nlvr
8
+ load_finetuned: True
9
+
10
+ pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth"
11
+ finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt"
12
+
13
+ num_classes: 2
14
+
15
+ use_distill: True
16
+ momentum: 0.995
17
+ alpha: 0.4
18
+
19
+ # vit encoder
20
+ vit_type: "base"
21
+ vit_grad_ckpt: False
22
+ vit_ckpt_layer: 0
23
+ vit_layer_norm_epsilon: 1e-6
24
+
25
+ image_size: 384
26
+
27
+ # bert config
28
+ med_config_path: "configs/models/med_config_albef.json"
29
+
30
+ preprocess:
31
+ vis_processor:
32
+ train:
33
+ name: "blip_image_train"
34
+ image_size: 384
35
+ eval:
36
+ name: "blip_image_eval"
37
+ image_size: 384
38
+ text_processor:
39
+ train:
40
+ name: "blip_caption"
41
+ eval:
42
+ name: "blip_caption"
LAVIS-main/lavis/configs/models/albef_pretrain_base.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: albef_pretrain
8
+
9
+ load_pretrained: True
10
+ pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
11
+
12
+ # vit encoder
13
+ vit_type: "base"
14
+ image_size: 224
15
+ vit_ckpt_layer: 0
16
+ vit_drop_path_rate: 0
17
+ vit_layer_norm_epsilon: 1e-6
18
+ vit_grad_ckpt: False
19
+
20
+ # bert config
21
+ med_config_path: "configs/models/med_config_albef.json"
22
+ mlm_mask_prob: 0.15
23
+
24
+ embed_dim: 256
25
+ momentum: 0.995
26
+ alpha: 0.4
27
+ temp: 0.07
28
+
29
+ max_txt_len: 30
30
+
31
+ preprocess:
32
+ vis_processor:
33
+ train:
34
+ name: "blip_image_train"
35
+ image_size: 256
36
+ text_processor:
37
+ train:
38
+ name: "blip_caption"
LAVIS-main/lavis/configs/models/albef_retrieval_coco.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: albef_retrieval
8
+ load_finetuned: True
9
+
10
+ pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
11
+ finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt"
12
+
13
+ queue_size: 65536
14
+
15
+ # vit encoder
16
+ vit_type: "base"
17
+ image_size: 384
18
+ vit_ckpt_layer: 0
19
+ vit_drop_path_rate: 0
20
+ vit_layer_norm_epsilon: 1e-6
21
+ vit_grad_ckpt: False
22
+
23
+ # bert config
24
+ med_config_path: "configs/models/med_config_albef.json"
25
+
26
+ embed_dim: 256
27
+ momentum: 0.995
28
+ alpha: 0.4
29
+ temp: 0.07
30
+ use_distill: True
31
+
32
+ max_txt_len: 30
33
+
34
+ preprocess:
35
+ vis_processor:
36
+ train:
37
+ name: "blip_image_train"
38
+ image_size: 384
39
+ eval:
40
+ name: "blip_image_eval"
41
+ image_size: 384
42
+ text_processor:
43
+ train:
44
+ name: "blip_caption"
45
+ eval:
46
+ name: "blip_caption"
LAVIS-main/lavis/configs/models/albef_retrieval_flickr.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: albef_retrieval
8
+ load_finetuned: True
9
+
10
+ pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
11
+ finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt"
12
+
13
+ queue_size: 65536
14
+
15
+ # vit encoder
16
+ vit_type: "base"
17
+ image_size: 384
18
+ vit_ckpt_layer: 0
19
+ vit_drop_path_rate: 0
20
+ vit_layer_norm_epsilon: 1e-6
21
+ vit_grad_ckpt: False
22
+
23
+ # bert config
24
+ med_config_path: "configs/models/med_config_albef.json"
25
+
26
+ embed_dim: 256
27
+ momentum: 0.995
28
+ alpha: 0.4
29
+ temp: 0.07
30
+ use_distill: True
31
+
32
+ max_txt_len: 30
33
+
34
+ preprocess:
35
+ vis_processor:
36
+ train:
37
+ name: "blip_image_train"
38
+ image_size: 384
39
+ eval:
40
+ name: "blip_image_eval"
41
+ image_size: 384
42
+ text_processor:
43
+ train:
44
+ name: "blip_caption"
45
+ eval:
46
+ name: "blip_caption"
LAVIS-main/lavis/configs/models/albef_vqav2.yaml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: albef_vqa
8
+ load_finetuned: True
9
+
10
+ pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
11
+ finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt"
12
+
13
+ use_distill: True
14
+ momentum: 0.995
15
+ alpha: 0.4
16
+
17
+ # vit encoder
18
+ vit_type: "base"
19
+ vit_grad_ckpt: False
20
+ vit_ckpt_layer: 0
21
+ vit_layer_norm_epsilon: 1e-6
22
+
23
+ image_size: 384
24
+
25
+ # bert config
26
+ med_config_path: "configs/models/med_config_albef.json"
27
+
28
+ preprocess:
29
+ vis_processor:
30
+ train:
31
+ name: "blip_image_train"
32
+ image_size: 384
33
+ eval:
34
+ name: "blip_image_eval"
35
+ image_size: 384
36
+ text_processor:
37
+ train:
38
+ name: "blip_question"
39
+ eval:
40
+ name: "blip_question"
LAVIS-main/lavis/configs/models/alpro_qa_msrvtt.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: alpro_qa
8
+ num_classes: 1500
9
+
10
+ load_finetuned: True
11
+
12
+ finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth"
13
+ pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
14
+
15
+ timesformer:
16
+ n_frms: 16
17
+ image_size: 224
18
+
19
+ patch_size: 16
20
+ attn_drop_rate: 0.
21
+ drop_rate: 0.
22
+ drop_path_rate: 0.1
23
+
24
+ use_grad_ckpt: True
25
+ ckpt_layer: 12
26
+
27
+ # bert config
28
+ med_config_path: "configs/models/bert_config_alpro.json"
29
+
30
+ preprocess:
31
+ vis_processor:
32
+ train:
33
+ name: "alpro_video_train"
34
+ n_frms: 16
35
+ image_size: 224
36
+ eval:
37
+ name: "alpro_video_eval"
38
+ n_frms: 16
39
+ image_size: 224
40
+ text_processor:
41
+ train:
42
+ name: "blip_caption"
43
+ eval:
44
+ name: "blip_caption"
LAVIS-main/lavis/configs/models/alpro_qa_msvd.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: alpro_qa
8
+ num_classes: 2423
9
+
10
+ load_finetuned: True
11
+
12
+ finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth"
13
+ pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
14
+
15
+ timesformer:
16
+ n_frms: 16
17
+ image_size: 224
18
+
19
+ patch_size: 16
20
+ attn_drop_rate: 0.
21
+ drop_rate: 0.
22
+ drop_path_rate: 0.1
23
+ use_grad_ckpt: True
24
+ ckpt_layer: 12
25
+
26
+ # bert config
27
+ med_config_path: "configs/models/bert_config_alpro.json"
28
+
29
+ preprocess:
30
+ vis_processor:
31
+ train:
32
+ name: "alpro_video_train"
33
+ n_frms: 16
34
+ image_size: 224
35
+ eval:
36
+ name: "alpro_video_eval"
37
+ n_frms: 16
38
+ image_size: 224
39
+ text_processor:
40
+ train:
41
+ name: "blip_caption"
42
+ eval:
43
+ name: "blip_caption"
LAVIS-main/lavis/configs/models/alpro_retrieval_didemo.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: alpro_retrieval
8
+
9
+ load_finetuned: True
10
+
11
+ finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt"
12
+ pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
13
+
14
+ timesformer:
15
+ n_frms: 8
16
+ image_size: 224
17
+
18
+ patch_size: 16
19
+ attn_drop_rate: 0.
20
+ drop_rate: 0.
21
+ drop_path_rate: 0.1
22
+ use_grad_ckpt: False
23
+
24
+ # bert config
25
+ med_config_path: "configs/models/bert_config_alpro.json"
26
+
27
+ preprocess:
28
+ vis_processor:
29
+ eval:
30
+ name: "alpro_video_eval"
31
+ n_frms: 8
32
+ image_size: 224
33
+ text_processor:
34
+ eval:
35
+ name: "blip_caption"
LAVIS-main/lavis/configs/models/alpro_retrieval_msrvtt.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: alpro_retrieval
8
+
9
+ load_finetuned: True
10
+
11
+ finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt"
12
+ pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
13
+
14
+ timesformer:
15
+ n_frms: 8
16
+ image_size: 224
17
+
18
+ patch_size: 16
19
+ attn_drop_rate: 0.
20
+ drop_rate: 0.
21
+ drop_path_rate: 0.1
22
+ use_grad_ckpt: False
23
+
24
+ # bert config
25
+ med_config_path: "configs/models/bert_config_alpro.json"
26
+
27
+ preprocess:
28
+ vis_processor:
29
+ train:
30
+ name: "alpro_video_train"
31
+ n_frms: 8
32
+ image_size: 224
33
+ eval:
34
+ name: "alpro_video_eval"
35
+ n_frms: 8
36
+ image_size: 224
37
+ text_processor:
38
+ train:
39
+ name: "blip_caption"
40
+ eval:
41
+ name: "blip_caption"
LAVIS-main/lavis/configs/models/bert_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "hidden_act": "gelu",
7
+ "hidden_dropout_prob": 0.1,
8
+ "hidden_size": 768,
9
+ "initializer_range": 0.02,
10
+ "intermediate_size": 3072,
11
+ "layer_norm_eps": 1e-12,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "bert",
14
+ "num_attention_heads": 12,
15
+ "num_hidden_layers": 12,
16
+ "pad_token_id": 0,
17
+ "add_type_embeddings": false,
18
+ "vocab_size": 30522,
19
+ "encoder_width": 768,
20
+ "add_cross_attention": true
21
+ }
LAVIS-main/lavis/configs/models/bert_config_alpro.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "hidden_act": "gelu",
7
+ "hidden_dropout_prob": 0.1,
8
+ "hidden_size": 768,
9
+ "initializer_range": 0.02,
10
+ "intermediate_size": 3072,
11
+ "layer_norm_eps": 1e-12,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "bert",
14
+ "num_attention_heads": 12,
15
+ "num_hidden_layers": 12,
16
+ "pad_token_id": 0,
17
+ "add_type_embeddings": true,
18
+ "type_vocab_size": 2,
19
+ "vocab_size": 30522,
20
+ "encoder_width": 768,
21
+ "add_cross_attention": false,
22
+ "fusion_layer": 6
23
+ }
LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ vit_model: "clip_L"
3
+
4
+ qformer_num_query_token: 16
5
+ qformer_cross_attention_freq: 1
6
+
7
+ sd_train_text_encoder: False
8
+ sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
9
+
10
+ load_finetuned: False
11
+ load_pretrained: True
12
+ # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
13
+ pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz"
14
+
15
+ preprocess:
16
+ vis_processor:
17
+ train:
18
+ name: "blip_diffusion_inp_image_eval"
19
+ eval:
20
+ name: "blip_diffusion_inp_image_eval"
21
+ text_processor:
22
+ train:
23
+ name: "blip_caption"
24
+ eval:
25
+ name: "blip_caption"
LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ vit_model: "clip_L"
3
+
4
+ qformer_num_query_token: 16
5
+ qformer_cross_attention_freq: 1
6
+
7
+ sd_train_text_encoder: False
8
+ sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
9
+
10
+ load_finetuned: False
11
+ load_pretrained: True
12
+ # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
13
+ pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz"
14
+
15
+ controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-canny"
16
+
17
+ preprocess:
18
+ vis_processor:
19
+ train:
20
+ name: "blip_diffusion_inp_image_eval"
21
+ eval:
22
+ name: "blip_diffusion_inp_image_eval"
23
+ text_processor:
24
+ train:
25
+ name: "blip_caption"
26
+ eval:
27
+ name: "blip_caption"
LAVIS-main/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ vit_model: "clip_L"
3
+
4
+ qformer_num_query_token: 16
5
+ qformer_cross_attention_freq: 1
6
+
7
+ sd_train_text_encoder: False
8
+ sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
9
+
10
+ load_finetuned: False
11
+ load_pretrained: True
12
+ # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
13
+ pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz"
14
+
15
+ controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-depth"
16
+
17
+ preprocess:
18
+ vis_processor:
19
+ train:
20
+ name: "blip_diffusion_inp_image_eval"
21
+ eval:
22
+ name: "blip_diffusion_inp_image_eval"
23
+ text_processor:
24
+ train:
25
+ name: "blip_caption"
26
+ eval:
27
+ name: "blip_caption"