yuccaaa commited on
Commit
31ec239
·
verified ·
1 Parent(s): 5ea4862

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. LAVIS-main/lavis/projects/instructblip/caption_nocaps_out_domain_vicuna7b_eval.yaml +82 -0
  2. LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xl_eval.yaml +90 -0
  3. LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xxl_eval.yaml +90 -0
  4. LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna13b_eval.yaml +90 -0
  5. LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna7b_eval.yaml +91 -0
  6. LAVIS-main/lavis/projects/instructblip/classification_modelnet40_vicuna13b.yaml +101 -0
  7. LAVIS-main/lavis/projects/instructblip/classification_modelnet40_vicuna7b.yaml +100 -0
  8. LAVIS-main/lavis/projects/instructblip/classification_snlive_flant5xl.yaml +94 -0
  9. LAVIS-main/lavis/projects/instructblip/classification_snlive_flant5xxl.yaml +95 -0
  10. LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna13b.yaml +93 -0
  11. LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna13b_test.yaml +93 -0
  12. LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna7b_test.yaml +93 -0
  13. LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna7b_val.yaml +93 -0
  14. LAVIS-main/lavis/projects/instructblip/completion_modelnet40_vicuna13b.yaml +101 -0
  15. LAVIS-main/lavis/projects/instructblip/completion_modelnet40_vicuna7b.yaml +102 -0
  16. LAVIS-main/lavis/projects/instructblip/qa_msrvtt_flant5xl_eval_test.yaml +92 -0
  17. LAVIS-main/lavis/projects/instructblip/qa_msrvtt_flant5xxl_eval_test.yaml +92 -0
  18. LAVIS-main/lavis/projects/instructblip/qa_msrvtt_vicuna13b_eval_test.yaml +92 -0
  19. LAVIS-main/lavis/projects/instructblip/qa_msrvtt_vicuna7b_eval_test.yaml +92 -0
  20. LAVIS-main/lavis/projects/instructblip/qa_msvd_flant5xl_eval.yaml +100 -0
  21. LAVIS-main/lavis/projects/instructblip/qa_msvd_flant5xxl_eval.yaml +100 -0
  22. LAVIS-main/lavis/projects/instructblip/qa_msvd_vicuna13b_eval.yaml +100 -0
  23. LAVIS-main/lavis/projects/instructblip/qa_msvd_vicuna7b_eval.yaml +100 -0
  24. LAVIS-main/lavis/projects/instructblip/qa_okvqa_flant5xl_eval.yaml +90 -0
  25. LAVIS-main/lavis/projects/instructblip/qa_okvqa_flant5xxl_eval.yaml +90 -0
  26. LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna13b_eval.yaml +90 -0
  27. LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna7b_eval.yaml +90 -0
  28. LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval.yaml +60 -0
  29. LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval_3b.yaml +60 -0
  30. LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval_large.yaml +60 -0
  31. LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval.yaml +59 -0
  32. LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval_3b.yaml +59 -0
  33. LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval_large.yaml +59 -0
  34. LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval.yaml +60 -0
  35. LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval_3b.yaml +60 -0
  36. LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval_large.yaml +60 -0
  37. LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval.yaml +60 -0
  38. LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval_3b.yaml +60 -0
  39. LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval_large.yaml +60 -0
  40. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_caption.yaml +176 -0
  41. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_caption_13b.yaml +176 -0
  42. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe.yaml +176 -0
  43. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_13b.yaml +177 -0
  44. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_nocue.yaml +176 -0
  45. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_proj copy.yaml +179 -0
  46. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_proj.yaml +179 -0
  47. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_rand_init.yaml +176 -0
  48. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_caption.yaml +154 -0
  49. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_caption_13b.yaml +154 -0
  50. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_describe.yaml +154 -0
LAVIS-main/lavis/projects/instructblip/caption_nocaps_out_domain_vicuna7b_eval.yaml ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "A short image caption."
11
+
12
+ datasets:
13
+ nocaps: # name of the dataset builder
14
+ # data_dir: ${env.data_dir}/datasets
15
+ data_type: images # [images|videos|features]
16
+
17
+ vis_processor:
18
+ eval:
19
+ name: "blip_image_eval"
20
+ image_size: 224
21
+
22
+ text_processor:
23
+ eval:
24
+ name: "blip_caption"
25
+
26
+ build_info:
27
+ # Be careful not to append minus sign (-) before split to avoid itemizing
28
+ annotations:
29
+ val:
30
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json
31
+ storage: nocaps/annotations/nocaps_val.json
32
+ test:
33
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json
34
+ storage: nocaps/annotations/nocaps_test.json
35
+ images:
36
+ storage: /export/share/datasets/vision/nocaps/
37
+
38
+ run:
39
+ task: captioning
40
+ # optimizer
41
+ lr_sched: "linear_warmup_cosine_lr"
42
+ init_lr: 1e-5
43
+ min_lr: 0
44
+ warmup_lr: 1e-8
45
+ warmup_steps: 1000
46
+ weight_decay: 0.05
47
+ max_epoch: 1
48
+ batch_size_train: 16
49
+ batch_size_eval: 1
50
+ num_workers: 8
51
+ accum_grad_iters: 1
52
+
53
+ max_len: 80
54
+ min_len: 10
55
+ num_beams: 5
56
+ inference_method: "generate"
57
+ # prompt: an image that shows
58
+ length_penalty: 1.
59
+
60
+ annotation_file: https://nocaps.s3.amazonaws.com/nocaps_val_4500_captions.json
61
+
62
+
63
+ seed: 42
64
+ output_dir: "output/instructblip/nocaps_out_domain_captioning_vicuna7b/"
65
+
66
+
67
+ amp: True
68
+ resume_ckpt_path: null
69
+
70
+ evaluate: True
71
+ # train_splits: ["train"]
72
+ valid_splits: ["val"]
73
+
74
+
75
+ device: "cuda"
76
+ world_size: 1
77
+ dist_url: "env://"
78
+ distributed: True
79
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
80
+ val_freq: 1
81
+
82
+ img_ids: [2, 4, 5, 8, 15, 18, 19, 22, 27, 30, 33, 35, 41, 42, 43, 46, 47, 51, 59, 60, 64, 65, 68, 69, 71, 72, 73, 77, 79, 81, 85, 87, 88, 90, 92, 100, 101, 102, 105, 107, 109, 115, 120, 124, 125, 126, 127, 129, 133, 135, 137, 139, 140, 141, 143, 150, 153, 155, 158, 164, 165, 167, 170, 171, 173, 182, 190, 191, 196, 200, 201, 203, 205, 208, 219, 225, 226, 228, 229, 232, 239, 240, 243, 245, 250, 262, 263, 264, 267, 272, 278, 283, 284, 290, 291, 297, 301, 304, 305, 309, 310, 311, 314, 323, 325, 329, 330, 331, 333, 334, 341, 349, 350, 351, 352, 354, 358, 359, 363, 365, 366, 368, 371, 372, 379, 381, 383, 386, 388, 389, 390, 392, 405, 415, 417, 418, 420, 421, 424, 428, 429, 432, 436, 441, 443, 452, 453, 454, 455, 456, 459, 464, 465, 468, 469, 476, 477, 478, 480, 487, 488, 490, 491, 493, 500, 502, 504, 506, 509, 510, 511, 512, 515, 516, 520, 527, 529, 533, 539, 540, 541, 544, 545, 547, 551, 554, 556, 559, 577, 579, 580, 582, 586, 587, 590, 593, 594, 607, 609, 616, 617, 619, 623, 628, 631, 634, 637, 648, 651, 654, 655, 665, 673, 678, 682, 684, 685, 688, 690, 695, 696, 701, 702, 705, 707, 708, 712, 714, 718, 719, 723, 725, 726, 730, 731, 733, 734, 740, 744, 748, 750, 751, 756, 757, 760, 761, 763, 767, 775, 779, 782, 783, 784, 787, 790, 792, 794, 798, 799, 802, 805, 807, 810, 812, 816, 818, 819, 820, 821, 829, 831, 836, 841, 842, 844, 845, 849, 850, 853, 854, 857, 859, 861, 868, 871, 874, 875, 877, 879, 886, 887, 889, 890, 891, 892, 893, 894, 896, 899, 900, 905, 918, 924, 926, 927, 929, 932, 934, 935, 943, 948, 950, 952, 953, 954, 956, 957, 963, 965, 969, 972, 973, 974, 976, 980, 985, 987, 988, 990, 992, 993, 994, 1000, 1001, 1003, 1005, 1009, 1013, 1016, 1018, 1019, 1020, 1021, 1022, 1024, 1028, 1029, 1033, 1036, 1037, 1038, 1042, 1045, 1046, 1050, 1053, 1054, 1056, 1065, 1072, 1076, 1079, 1082, 1083, 1096, 1101, 1103, 1107, 1112, 1117, 1129, 1132, 1133, 1136, 1138, 1141, 1143, 1155, 1157, 1160, 1164, 1165, 1166, 1172, 1175, 1179, 1183, 1194, 1197, 1200, 1202, 1210, 1228, 
1234, 1236, 1241, 1246, 1251, 1253, 1255, 1261, 1265, 1268, 1269, 1271, 1272, 1273, 1277, 1286, 1287, 1290, 1296, 1297, 1302, 1303, 1308, 1310, 1312, 1315, 1316, 1317, 1320, 1321, 1324, 1327, 1329, 1330, 1331, 1333, 1334, 1336, 1338, 1339, 1340, 1345, 1347, 1356, 1362, 1366, 1371, 1374, 1376, 1381, 1384, 1385, 1388, 1394, 1396, 1397, 1398, 1403, 1404, 1408, 1410, 1413, 1414, 1417, 1424, 1430, 1433, 1436, 1438, 1440, 1446, 1449, 1453, 1454, 1456, 1460, 1462, 1464, 1465, 1467, 1470, 1473, 1477, 1491, 1494, 1498, 1504, 1506, 1509, 1511, 1515, 1519, 1524, 1530, 1533, 1540, 1541, 1549, 1557, 1558, 1561, 1570, 1572, 1575, 1579, 1591, 1593, 1594, 1595, 1596, 1602, 1605, 1609, 1611, 1615, 1617, 1618, 1624, 1625, 1627, 1636, 1637, 1639, 1640, 1641, 1646, 1647, 1648, 1649, 1655, 1656, 1657, 1658, 1659, 1662, 1671, 1675, 1679, 1681, 1690, 1692, 1701, 1712, 1715, 1716, 1718, 1719, 1721, 1723, 1725, 1728, 1729, 1730, 1732, 1737, 1740, 1746, 1747, 1753, 1754, 1756, 1757, 1758, 1761, 1762, 1767, 1771, 1773, 1775, 1779, 1783, 1784, 1786, 1787, 1789, 1791, 1794, 1802, 1803, 1807, 1813, 1814, 1815, 1817, 1824, 1826, 1827, 1832, 1834, 1835, 1838, 1839, 1840, 1841, 1842, 1843, 1844, 1847, 1850, 1860, 1861, 1866, 1870, 1872, 1873, 1876, 1878, 1886, 1889, 1894, 1897, 1899, 1902, 1907, 1911, 1912, 1917, 1920, 1924, 1925, 1928, 1931, 1935, 1936, 1937, 1939, 1941, 1946, 1948, 1949, 1952, 1954, 1955, 1956, 1959, 1967, 1968, 1970, 1975, 1976, 1979, 1980, 1985, 1986, 1994, 1996, 1998, 2001, 2003, 2007, 2009, 2011, 2012, 2014, 2019, 2028, 2029, 2042, 2047, 2049, 2050, 2060, 2068, 2071, 2076, 2078, 2080, 2081, 2086, 2089, 2090, 2093, 2094, 2099, 2102, 2107, 2112, 2115, 2121, 2124, 2125, 2129, 2131, 2133, 2135, 2140, 2141, 2148, 2150, 2151, 2152, 2155, 2163, 2173, 2176, 2178, 2182, 2183, 2187, 2188, 2196, 2197, 2198, 2199, 2200, 2205, 2207, 2209, 2215, 2217, 2220, 2221, 2223, 2230, 2235, 2236, 2237, 2238, 2241, 2242, 2243, 2244, 2246, 2252, 2253, 2261, 2265, 2274, 2277, 2278, 2281, 2286, 2290, 
2292, 2293, 2294, 2296, 2299, 2301, 2304, 2305, 2307, 2309, 2312, 2314, 2315, 2319, 2323, 2324, 2337, 2338, 2339, 2340, 2342, 2351, 2356, 2358, 2360, 2367, 2369, 2371, 2374, 2376, 2378, 2382, 2383, 2387, 2388, 2390, 2399, 2400, 2412, 2416, 2422, 2423, 2427, 2428, 2435, 2439, 2440, 2442, 2447, 2450, 2455, 2459, 2461, 2462, 2463, 2466, 2468, 2470, 2479, 2480, 2482, 2483, 2485, 2488, 2491, 2495, 2496, 2502, 2505, 2506, 2507, 2510, 2511, 2515, 2522, 2524, 2532, 2534, 2546, 2547, 2550, 2554, 2558, 2562, 2563, 2574, 2583, 2584, 2590, 2594, 2598, 2602, 2603, 2606, 2611, 2613, 2615, 2617, 2619, 2623, 2625, 2630, 2636, 2642, 2643, 2644, 2646, 2647, 2649, 2650, 2659, 2661, 2664, 2674, 2675, 2677, 2682, 2684, 2685, 2691, 2693, 2695, 2698, 2699, 2703, 2704, 2706, 2707, 2711, 2713, 2719, 2720, 2723, 2726, 2727, 2729, 2730, 2733, 2734, 2738, 2739, 2741, 2744, 2745, 2748, 2749, 2754, 2757, 2761, 2762, 2764, 2765, 2767, 2768, 2772, 2776, 2778, 2779, 2780, 2781, 2783, 2787, 2791, 2795, 2796, 2799, 2800, 2802, 2807, 2808, 2811, 2813, 2817, 2820, 2827, 2829, 2831, 2833, 2834, 2835, 2839, 2840, 2841, 2846, 2847, 2849, 2852, 2855, 2859, 2860, 2864, 2870, 2871, 2876, 2878, 2879, 2882, 2884, 2885, 2886, 2887, 2888, 2895, 2896, 2897, 2898, 2900, 2902, 2905, 2911, 2913, 2915, 2919, 2922, 2924, 2933, 2939, 2945, 2953, 2954, 2958, 2959, 2968, 2973, 2976, 2979, 2982, 2984, 2992, 3002, 3004, 3007, 3008, 3009, 3010, 3013, 3016, 3021, 3022, 3023, 3026, 3028, 3033, 3036, 3037, 3039, 3043, 3044, 3045, 3046, 3053, 3060, 3062, 3063, 3071, 3072, 3085, 3086, 3092, 3095, 3096, 3102, 3103, 3104, 3105, 3111, 3115, 3116, 3122, 3129, 3131, 3132, 3137, 3138, 3140, 3147, 3148, 3157, 3164, 3167, 3168, 3170, 3175, 3179, 3182, 3184, 3190, 3194, 3196, 3198, 3199, 3200, 3215, 3216, 3217, 3219, 3222, 3229, 3230, 3237, 3239, 3242, 3249, 3253, 3255, 3257, 3258, 3267, 3270, 3271, 3274, 3279, 3288, 3290, 3291, 3293, 3299, 3305, 3306, 3312, 3318, 3319, 3320, 3323, 3326, 3328, 3329, 3335, 3343, 3344, 3345, 3347, 3349, 
3350, 3353, 3356, 3362, 3364, 3366, 3369, 3374, 3377, 3379, 3381, 3382, 3384, 3385, 3388, 3389, 3395, 3399, 3403, 3409, 3411, 3416, 3419, 3421, 3423, 3424, 3425, 3427, 3428, 3431, 3437, 3438, 3439, 3444, 3450, 3452, 3453, 3456, 3457, 3460, 3461, 3462, 3464, 3466, 3467, 3471, 3472, 3477, 3478, 3482, 3484, 3486, 3492, 3499, 3500, 3501, 3502, 3511, 3525, 3529, 3531, 3533, 3534, 3536, 3552, 3553, 3555, 3557, 3562, 3567, 3568, 3570, 3571, 3573, 3577, 3578, 3584, 3585, 3586, 3587, 3595, 3600, 3601, 3604, 3609, 3610, 3612, 3615, 3616, 3619, 3620, 3624, 3625, 3631, 3632, 3636, 3637, 3638, 3640, 3643, 3651, 3654, 3655, 3656, 3657, 3662, 3667, 3668, 3671, 3677, 3684, 3686, 3689, 3693, 3694, 3696, 3697, 3698, 3699, 3700, 3701, 3703, 3704, 3707, 3708, 3709, 3711, 3712, 3713, 3714, 3719, 3721, 3723, 3726, 3737, 3741, 3742, 3744, 3750, 3752, 3757, 3760, 3761, 3764, 3765, 3767, 3770, 3772, 3774, 3776, 3778, 3780, 3781, 3796, 3797, 3805, 3818, 3819, 3820, 3821, 3824, 3841, 3845, 3848, 3851, 3858, 3866, 3870, 3871, 3876, 3879, 3880, 3883, 3893, 3896, 3900, 3903, 3904, 3908, 3909, 3913, 3914, 3916, 3924, 3927, 3937, 3940, 3942, 3943, 3949, 3950, 3953, 3954, 3959, 3963, 3966, 3969, 3972, 3978, 3981, 3983, 3984, 3986, 3989, 3990, 3991, 3999, 4000, 4004, 4005, 4006, 4012, 4014, 4016, 4017, 4019, 4020, 4030, 4035, 4046, 4049, 4051, 4052, 4053, 4057, 4061, 4065, 4066, 4068, 4073, 4074, 4075, 4079, 4080, 4082, 4084, 4086, 4090, 4091, 4093, 4094, 4095, 4096, 4100, 4102, 4104, 4106, 4113, 4114, 4115, 4116, 4118, 4124, 4126, 4127, 4128, 4131, 4133, 4134, 4142, 4145, 4149, 4156, 4160, 4171, 4174, 4178, 4179, 4180, 4183, 4186, 4190, 4191, 4195, 4197, 4215, 4220, 4229, 4234, 4245, 4249, 4251, 4252, 4254, 4257, 4259, 4264, 4265, 4266, 4267, 4275, 4276, 4277, 4282, 4284, 4285, 4288, 4291, 4294, 4295, 4301, 4302, 4313, 4315, 4320, 4328, 4333, 4336, 4339, 4342, 4345, 4346, 4350, 4354, 4372, 4374, 4375, 4377, 4379, 4380, 4386, 4388, 4389, 4392, 4396, 4402, 4404, 4408, 4410, 4424, 4426, 4428, 4431, 
4435, 4436, 4439, 4442, 4446, 4447, 4449, 4452, 4455, 4458, 4460, 4461, 4466, 4469, 4475, 4476, 4478, 4488, 4491, 4494, 4498]
LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xl_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xl
9
+ load_pretrained: True
10
+ prompt: "a short description"
11
+
12
+ datasets:
13
+ vatex_caption:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_caption"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
38
+ storage: vatex/annotations/cap_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
41
+ storage: vatex/annotations/cap_val.json
42
+ test:
43
+ # iWNXAYGh9cI_000004_000014.mp4 is corrupt and removed from youtube
44
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
45
+ storage: vatex/annotations/cap_test.json
46
+ videos:
47
+ storage: /export/video-language-dataset/data/vatex/
48
+
49
+
50
+ run:
51
+ task: captioning
52
+ # optimizer
53
+ lr_sched: "linear_warmup_cosine_lr"
54
+ init_lr: 1e-5
55
+ min_lr: 0
56
+ warmup_lr: 1e-8
57
+ warmup_steps: 1000
58
+ weight_decay: 0.05
59
+ max_epoch: 1
60
+ batch_size_train: 16
61
+ batch_size_eval: 1
62
+ num_workers: 8
63
+ accum_grad_iters: 1
64
+
65
+ max_len: 80
66
+ min_len: 10
67
+ num_beams: 5
68
+ inference_method: "generate"
69
+ prompt: "describe the video"
70
+ length_penalty: 1.
71
+
72
+
73
+ seed: 42
74
+ output_dir: "output/instructblip/vatex_caption_flant5xl/"
75
+
76
+ amp: True
77
+ resume_ckpt_path: null
78
+
79
+ evaluate: True
80
+ # train_splits: ["train"]
81
+ valid_splits: ["val"]
82
+ annotation_file: /export/home/.cache/lavis/vatex_caption_gt/vatex_caption_val_annotations.json
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
90
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xxl_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xxl
9
+ load_pretrained: True
10
+ prompt: "a short description"
11
+
12
+ datasets:
13
+ vatex_caption:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_caption"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
38
+ storage: vatex/annotations/cap_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
41
+ storage: vatex/annotations/cap_val.json
42
+ test:
43
+ # iWNXAYGh9cI_000004_000014.mp4 is corrupt and removed from youtube
44
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
45
+ storage: vatex/annotations/cap_test.json
46
+ videos:
47
+ storage: /export/video-language-dataset/data/vatex/
48
+
49
+
50
+ run:
51
+ task: captioning
52
+ # optimizer
53
+ lr_sched: "linear_warmup_cosine_lr"
54
+ init_lr: 1e-5
55
+ min_lr: 0
56
+ warmup_lr: 1e-8
57
+ warmup_steps: 1000
58
+ weight_decay: 0.05
59
+ max_epoch: 1
60
+ batch_size_train: 16
61
+ batch_size_eval: 1
62
+ num_workers: 0
63
+ accum_grad_iters: 1
64
+
65
+ max_len: 80
66
+ min_len: 10
67
+ num_beams: 5
68
+ inference_method: "generate"
69
+ prompt: "describe the video"
70
+ length_penalty: 1.
71
+
72
+
73
+ seed: 42
74
+ output_dir: "output/instructblip/vatex_caption_flant5xxl/"
75
+
76
+ amp: True
77
+ resume_ckpt_path: null
78
+
79
+ evaluate: True
80
+ # train_splits: ["train"]
81
+ valid_splits: ["val"]
82
+ annotation_file: /export/home/.cache/lavis/vatex_caption_gt/vatex_caption_val_annotations.json
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
90
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna13b_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: "describe the video"
11
+
12
+ datasets:
13
+ vatex_caption:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_caption"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
38
+ storage: vatex/annotations/cap_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
41
+ storage: vatex/annotations/cap_val.json
42
+ test:
43
+ # iWNXAYGh9cI_000004_000014.mp4 is corrupt and removed from youtube
44
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
45
+ storage: vatex/annotations/cap_test.json
46
+ videos:
47
+ storage: /export/video-language-dataset/data/vatex/
48
+
49
+
50
+ run:
51
+ task: captioning
52
+ # optimizer
53
+ lr_sched: "linear_warmup_cosine_lr"
54
+ init_lr: 1e-5
55
+ min_lr: 0
56
+ warmup_lr: 1e-8
57
+ warmup_steps: 1000
58
+ weight_decay: 0.05
59
+ max_epoch: 1
60
+ batch_size_train: 16
61
+ batch_size_eval: 1
62
+ num_workers: 8
63
+ accum_grad_iters: 1
64
+
65
+ max_len: 80
66
+ min_len: 10
67
+ num_beams: 5
68
+ inference_method: "generate"
69
+ prompt: "describe the video"
70
+ length_penalty: 0.
71
+
72
+
73
+ seed: 42
74
+ output_dir: "output/instructblip/msvd_caption_vicuna13b/"
75
+
76
+ amp: True
77
+ resume_ckpt_path: null
78
+
79
+ evaluate: True
80
+ # train_splits: ["train"]
81
+ valid_splits: ["val"]
82
+ annotation_file: /export/home/.cache/lavis/vatex_caption_gt/vatex_caption_val_annotations.json
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
90
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna7b_eval.yaml ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "a short description"
11
+
12
+
13
+ datasets:
14
+ vatex_caption:
15
+ vis_processor:
16
+ train:
17
+ name: alpro_video_train
18
+ n_frms: 4
19
+ image_size: 224
20
+ min_scale: 0.9
21
+ max_scale: 1.0
22
+ eval:
23
+ name: alpro_video_eval
24
+ n_frms: 4
25
+ image_size: 224
26
+ min_scale: 0.9
27
+ max_scale: 1.0
28
+ text_processor:
29
+ train:
30
+ name: "blip_caption"
31
+ eval:
32
+ name: "blip_caption"
33
+
34
+ build_info:
35
+ # Be careful not to append minus sign (-) before split to avoid itemizing
36
+ annotations:
37
+ train:
38
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
39
+ storage: vatex/annotations/cap_train.json
40
+ val:
41
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
42
+ storage: vatex/annotations/cap_val.json
43
+ test:
44
+ # iWNXAYGh9cI_000004_000014.mp4 is corrupt and removed from youtube
45
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
46
+ storage: vatex/annotations/cap_test.json
47
+ videos:
48
+ storage: /export/video-language-dataset/data/vatex/
49
+
50
+
51
+ run:
52
+ task: captioning
53
+ # optimizer
54
+ lr_sched: "linear_warmup_cosine_lr"
55
+ init_lr: 1e-5
56
+ min_lr: 0
57
+ warmup_lr: 1e-8
58
+ warmup_steps: 1000
59
+ weight_decay: 0.05
60
+ max_epoch: 1
61
+ batch_size_train: 16
62
+ batch_size_eval: 1
63
+ num_workers: 8
64
+ accum_grad_iters: 1
65
+
66
+ max_len: 80
67
+ min_len: 10
68
+ num_beams: 5
69
+ inference_method: "generate"
70
+ prompt: "describe the video"
71
+ length_penalty: 1.
72
+
73
+
74
+ seed: 42
75
+ output_dir: "output/instructblip/vatex_caption_vicuna7b/"
76
+
77
+ amp: True
78
+ resume_ckpt_path: null
79
+
80
+ evaluate: True
81
+ # train_splits: ["train"]
82
+ valid_splits: ["val"]
83
+ annotation_file: /export/home/.cache/lavis/vatex_caption_gt/vatex_caption_val_annotations.json
84
+
85
+
86
+ device: "cuda"
87
+ world_size: 1
88
+ dist_url: "env://"
89
+ distributed: True
90
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
91
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_modelnet40_vicuna13b.yaml ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: "describe the 3d model."
11
+ format_candidates_prompt: " a 3d model of a {}"
12
+
13
+ datasets:
14
+ modelnet40_cls: # name of the dataset builder
15
+ data_type: [pc, images]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ pc_processor:
26
+ train:
27
+ name: "ulip_pc"
28
+ eval:
29
+ name: "ulip_pc"
30
+ text_processor:
31
+ train:
32
+ name: "blip_caption"
33
+ eval:
34
+ name: "blip_caption"
35
+
36
+ build_info:
37
+ # Be careful not to append minus sign (-) before split to avoid itemizing
38
+ annotations:
39
+ train:
40
+ url:
41
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
42
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
43
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
44
+ storage:
45
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
46
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
47
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
48
+ val:
49
+ url:
50
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
51
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
52
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
53
+ storage:
54
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
55
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
56
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
57
+
58
+ pc:
59
+ storage: /export/home/ULIP/data/modelnet40_normal_resampled
60
+
61
+ images:
62
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet_images8192
63
+
64
+
65
+ run:
66
+ task: multimodal_classification
67
+ # optimizer
68
+ lr_sched: "linear_warmup_cosine_lr"
69
+ init_lr: 1e-5
70
+ min_lr: 0
71
+ warmup_lr: 1e-8
72
+ warmup_steps: 1000
73
+ weight_decay: 0.05
74
+ max_epoch: 1
75
+ batch_size_train: 16
76
+ batch_size_eval: 1
77
+ num_workers: 8
78
+ accum_grad_iters: 1
79
+ prompt: "describe the 3d model."
80
+
81
+ max_len: 3
82
+ min_len: 1
83
+ num_beams: 5
84
+
85
+ seed: 42
86
+ output_dir: "output/instructblip/modelent_classification_vicuna13b/"
87
+
88
+ amp: True
89
+ resume_ckpt_path: null
90
+
91
+ evaluate: True
92
+ # train_splits: ["train"]
93
+ valid_splits: ["val"]
94
+
95
+
96
+ device: "cuda"
97
+ world_size: 1
98
+ dist_url: "env://"
99
+ distributed: True
100
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
101
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_modelnet40_vicuna7b.yaml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "describe the 3d model."
11
+ format_candidates_prompt: " a 3d model of a {}"
12
+
13
+ datasets:
14
+ modelnet40_cls: # name of the dataset builder
15
+ data_type: [pc, images]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ pc_processor:
26
+ train:
27
+ name: "ulip_pc"
28
+ eval:
29
+ name: "ulip_pc"
30
+ text_processor:
31
+ train:
32
+ name: "blip_caption"
33
+ eval:
34
+ name: "blip_caption"
35
+
36
+ build_info:
37
+ # Be careful not to append minus sign (-) before split to avoid itemizing
38
+ annotations:
39
+ train:
40
+ url:
41
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
42
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
43
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
44
+ storage:
45
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
46
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
47
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
48
+ val:
49
+ url:
50
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
51
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
52
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
53
+ storage:
54
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
55
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
56
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
57
+
58
+ pc:
59
+ storage: /export/home/ULIP/data/modelnet40_normal_resampled
60
+
61
+ images:
62
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet_images8192
63
+
64
+ run:
65
+ task: multimodal_classification
66
+ # optimizer
67
+ lr_sched: "linear_warmup_cosine_lr"
68
+ init_lr: 1e-5
69
+ min_lr: 0
70
+ warmup_lr: 1e-8
71
+ warmup_steps: 1000
72
+ weight_decay: 0.05
73
+ max_epoch: 1
74
+ batch_size_train: 16
75
+ batch_size_eval: 1
76
+ num_workers: 8
77
+ accum_grad_iters: 1
78
+ prompt: "describe the 3d model."
79
+
80
+ max_len: 3
81
+ min_len: 1
82
+ num_beams: 5
83
+
84
+ seed: 42
85
+ output_dir: "output/instructblip/modelent_classification_vicuna7b/"
86
+
87
+ amp: True
88
+ resume_ckpt_path: null
89
+
90
+ evaluate: True
91
+ # train_splits: ["train"]
92
+ valid_splits: ["val"]
93
+
94
+
95
+ device: "cuda"
96
+ world_size: 1
97
+ dist_url: "env://"
98
+ distributed: True
99
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
100
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_snlive_flant5xl.yaml ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ ## note flant5 has been trained on snli
8
+ arch: blip2_t5_instruct
9
+ model_type: flant5xl
10
+ load_pretrained: True
11
+ prompt: ""
12
+
13
+ datasets:
14
+ snli_ve_instruct:
15
+ # data_dir: ${env.data_dir}/datasets
16
+ data_type: images # [images|videos|features]
17
+
18
+ vis_processor:
19
+ train:
20
+ name: "clip_image_train"
21
+ image_size: 224
22
+ eval:
23
+ name: "clip_image_eval"
24
+ image_size: 224
25
+
26
+ text_processor:
27
+ train:
28
+ name: "blip_caption"
29
+ eval:
30
+ name: "blip_caption"
31
+ prompt: "given the image respond to "
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url:
38
+ - /export/share/dongxuli/data/lavis/snli/ve_train.json
39
+ storage:
40
+ - snli/annotations/ve_train.json
41
+ val:
42
+ url:
43
+ - /export/share/dongxuli/data/lavis/snli/ve_dev.json
44
+ storage:
45
+ - snli/annotations/ve_dev.json
46
+ test:
47
+ url:
48
+ - /export/share/dongxuli/data/lavis/snli/ve_test.json
49
+ storage:
50
+ - snli/annotations/ve_test.json
51
+ images:
52
+ # storage: flickr30k/images/flickr30k-images
53
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
54
+
55
+
56
+ run:
57
+ task: multimodal_classification
58
+ # optimizer
59
+ lr_sched: "linear_warmup_cosine_lr"
60
+ init_lr: 1e-5
61
+ min_lr: 0
62
+ warmup_lr: 1e-8
63
+ warmup_steps: 1000
64
+ weight_decay: 0.05
65
+ max_epoch: 1
66
+ batch_size_train: 16
67
+ batch_size_eval: 1
68
+ num_workers: 8
69
+ accum_grad_iters: 1
70
+
71
+ max_len: 30
72
+ min_len: 1
73
+ num_beams: 5
74
+ inference_method: "generate"
75
+ prompt: ""
76
+ length_penalty: -1.
77
+
78
+ seed: 42
79
+ output_dir: "output/instructblip/snlive_classification_flant5xl/"
80
+
81
+ amp: True
82
+ resume_ckpt_path: null
83
+
84
+ evaluate: True
85
+ # train_splits: ["train"]
86
+ valid_splits: ["val"]
87
+
88
+
89
+ device: "cuda"
90
+ world_size: 1
91
+ dist_url: "env://"
92
+ distributed: True
93
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
94
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_snlive_flant5xxl.yaml ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ ## note flant5 has been trained on snli
8
+ arch: blip2_t5_instruct
9
+ model_type: flant5xxl
10
+ load_pretrained: True
11
+ prompt: ""
12
+
13
+
14
+ datasets:
15
+ snli_ve_instruct:
16
+ # data_dir: ${env.data_dir}/datasets
17
+ data_type: images # [images|videos|features]
18
+
19
+ vis_processor:
20
+ train:
21
+ name: "clip_image_train"
22
+ image_size: 224
23
+ eval:
24
+ name: "clip_image_eval"
25
+ image_size: 224
26
+
27
+ text_processor:
28
+ train:
29
+ name: "blip_caption"
30
+ eval:
31
+ name: "blip_caption"
32
+ prompt: "given the image respond to "
33
+
34
+ build_info:
35
+ # Be careful not to append minus sign (-) before split to avoid itemizing
36
+ annotations:
37
+ train:
38
+ url:
39
+ - /export/share/dongxuli/data/lavis/snli/ve_train.json
40
+ storage:
41
+ - snli/annotations/ve_train.json
42
+ val:
43
+ url:
44
+ - /export/share/dongxuli/data/lavis/snli/ve_dev.json
45
+ storage:
46
+ - snli/annotations/ve_dev.json
47
+ test:
48
+ url:
49
+ - /export/share/dongxuli/data/lavis/snli/ve_test.json
50
+ storage:
51
+ - snli/annotations/ve_test.json
52
+ images:
53
+ # storage: flickr30k/images/flickr30k-images
54
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
55
+
56
+
57
+ run:
58
+ task: multimodal_classification
59
+ # optimizer
60
+ lr_sched: "linear_warmup_cosine_lr"
61
+ init_lr: 1e-5
62
+ min_lr: 0
63
+ warmup_lr: 1e-8
64
+ warmup_steps: 1000
65
+ weight_decay: 0.05
66
+ max_epoch: 1
67
+ batch_size_train: 16
68
+ batch_size_eval: 1
69
+ num_workers: 8
70
+ accum_grad_iters: 1
71
+
72
+ max_len: 30
73
+ min_len: 1
74
+ num_beams: 5
75
+ inference_method: "generate"
76
+ prompt: ""
77
+ length_penalty: -1.
78
+
79
+ seed: 42
80
+ output_dir: "output/instructblip/snlive_classification_flant5xxl/"
81
+
82
+ amp: True
83
+ resume_ckpt_path: null
84
+
85
+ evaluate: True
86
+ # train_splits: ["train"]
87
+ valid_splits: ["test"]
88
+
89
+
90
+ device: "cuda"
91
+ world_size: 1
92
+ dist_url: "env://"
93
+ distributed: True
94
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
95
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna13b.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: ""
11
+
12
+ datasets:
13
+ snli_ve_instruct:
14
+ # data_dir: ${env.data_dir}/datasets
15
+ data_type: images # [images|videos|features]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ text_processor:
26
+ train:
27
+ name: "blip_caption"
28
+ eval:
29
+ name: "blip_caption"
30
+ # prompt: "how would you respond to "
31
+
32
+ build_info:
33
+ # Be careful not to append minus sign (-) before split to avoid itemizing
34
+ annotations:
35
+ train:
36
+ url:
37
+ - /export/share/dongxuli/data/lavis/snli/ve_train.json
38
+ storage:
39
+ - snli/annotations/ve_train.json
40
+ val:
41
+ url:
42
+ - /export/share/dongxuli/data/lavis/snli/ve_dev.json
43
+ storage:
44
+ - snli/annotations/ve_dev.json
45
+ test:
46
+ url:
47
+ - /export/share/dongxuli/data/lavis/snli/ve_test.json
48
+ storage:
49
+ - snli/annotations/ve_test.json
50
+ images:
51
+ # storage: flickr30k/images/flickr30k-images
52
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
53
+
54
+
55
+ run:
56
+ task: multimodal_classification
57
+ # optimizer
58
+ lr_sched: "linear_warmup_cosine_lr"
59
+ init_lr: 1e-5
60
+ min_lr: 0
61
+ warmup_lr: 1e-8
62
+ warmup_steps: 1000
63
+ weight_decay: 0.05
64
+ max_epoch: 1
65
+ batch_size_train: 16
66
+ batch_size_eval: 1
67
+ num_workers: 8
68
+ accum_grad_iters: 1
69
+
70
+ max_len: 30
71
+ min_len: 1
72
+ num_beams: 5
73
+ inference_method: "generate"
74
+ prompt: ""
75
+ length_penalty: -1.
76
+
77
+ seed: 42
78
+ output_dir: "output/instructblip/snlive_classification_vicuna13b_val/"
79
+
80
+ amp: True
81
+ resume_ckpt_path: null
82
+
83
+ evaluate: True
84
+ # train_splits: ["train"]
85
+ valid_splits: ["val"]
86
+
87
+
88
+ device: "cuda"
89
+ world_size: 1
90
+ dist_url: "env://"
91
+ distributed: True
92
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
93
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna13b_test.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: ""
11
+
12
+ datasets:
13
+ snli_ve_instruct:
14
+ # data_dir: ${env.data_dir}/datasets
15
+ data_type: images # [images|videos|features]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ text_processor:
26
+ train:
27
+ name: "blip_caption"
28
+ eval:
29
+ name: "blip_caption"
30
+ # prompt: "how would you respond to "
31
+
32
+ build_info:
33
+ # Be careful not to append minus sign (-) before split to avoid itemizing
34
+ annotations:
35
+ train:
36
+ url:
37
+ - /export/share/dongxuli/data/lavis/snli/ve_train.json
38
+ storage:
39
+ - snli/annotations/ve_train.json
40
+ val:
41
+ url:
42
+ - /export/share/dongxuli/data/lavis/snli/ve_dev.json
43
+ storage:
44
+ - snli/annotations/ve_dev.json
45
+ test:
46
+ url:
47
+ - /export/share/dongxuli/data/lavis/snli/ve_test.json
48
+ storage:
49
+ - snli/annotations/ve_test.json
50
+ images:
51
+ # storage: flickr30k/images/flickr30k-images
52
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
53
+
54
+
55
+ run:
56
+ task: multimodal_classification
57
+ # optimizer
58
+ lr_sched: "linear_warmup_cosine_lr"
59
+ init_lr: 1e-5
60
+ min_lr: 0
61
+ warmup_lr: 1e-8
62
+ warmup_steps: 1000
63
+ weight_decay: 0.05
64
+ max_epoch: 1
65
+ batch_size_train: 16
66
+ batch_size_eval: 1
67
+ num_workers: 8
68
+ accum_grad_iters: 1
69
+
70
+ max_len: 30
71
+ min_len: 1
72
+ num_beams: 5
73
+ inference_method: "generate"
74
+ prompt: ""
75
+ length_penalty: -1.
76
+
77
+ seed: 42
78
+ output_dir: "output/instructblip/snlive_classification_vicuna13b_test/"
79
+
80
+ amp: True
81
+ resume_ckpt_path: null
82
+
83
+ evaluate: True
84
+ # train_splits: ["train"]
85
+ valid_splits: ["test"]
86
+
87
+
88
+ device: "cuda"
89
+ world_size: 1
90
+ dist_url: "env://"
91
+ distributed: True
92
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
93
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna7b_test.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: ""
11
+
12
+ datasets:
13
+ snli_ve_instruct:
14
+ # data_dir: ${env.data_dir}/datasets
15
+ data_type: images # [images|videos|features]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ text_processor:
26
+ train:
27
+ name: "blip_caption"
28
+ eval:
29
+ name: "blip_caption"
30
+ # prompt: "given the image respond to "
31
+
32
+ build_info:
33
+ # Be careful not to append minus sign (-) before split to avoid itemizing
34
+ annotations:
35
+ train:
36
+ url:
37
+ - /export/share/dongxuli/data/lavis/snli/ve_train.json
38
+ storage:
39
+ - snli/annotations/ve_train.json
40
+ val:
41
+ url:
42
+ - /export/share/dongxuli/data/lavis/snli/ve_dev.json
43
+ storage:
44
+ - snli/annotations/ve_dev.json
45
+ test:
46
+ url:
47
+ - /export/share/dongxuli/data/lavis/snli/ve_test.json
48
+ storage:
49
+ - snli/annotations/ve_test.json
50
+ images:
51
+ # storage: flickr30k/images/flickr30k-images
52
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
53
+
54
+
55
+ run:
56
+ task: multimodal_classification
57
+ # optimizer
58
+ lr_sched: "linear_warmup_cosine_lr"
59
+ init_lr: 1e-5
60
+ min_lr: 0
61
+ warmup_lr: 1e-8
62
+ warmup_steps: 1000
63
+ weight_decay: 0.05
64
+ max_epoch: 1
65
+ batch_size_train: 16
66
+ batch_size_eval: 1
67
+ num_workers: 8
68
+ accum_grad_iters: 1
69
+
70
+ max_len: 30
71
+ min_len: 1
72
+ num_beams: 5
73
+ inference_method: "generate"
74
+ prompt: ""
75
+ length_penalty: -1.
76
+
77
+ seed: 42
78
+ output_dir: "output/instructblip/snlive_classification_vicuna7b_test/"
79
+
80
+ amp: True
81
+ resume_ckpt_path: null
82
+
83
+ evaluate: True
84
+ # train_splits: ["train"]
85
+ valid_splits: ["test"]
86
+
87
+
88
+ device: "cuda"
89
+ world_size: 1
90
+ dist_url: "env://"
91
+ distributed: True
92
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
93
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna7b_val.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: ""
11
+
12
+ datasets:
13
+ snli_ve_instruct:
14
+ # data_dir: ${env.data_dir}/datasets
15
+ data_type: images # [images|videos|features]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ text_processor:
26
+ train:
27
+ name: "blip_caption"
28
+ eval:
29
+ name: "blip_caption"
30
+ # prompt: "given the image respond to "
31
+
32
+ build_info:
33
+ # Be careful not to append minus sign (-) before split to avoid itemizing
34
+ annotations:
35
+ train:
36
+ url:
37
+ - /export/share/dongxuli/data/lavis/snli/ve_train.json
38
+ storage:
39
+ - snli/annotations/ve_train.json
40
+ val:
41
+ url:
42
+ - /export/share/dongxuli/data/lavis/snli/ve_dev.json
43
+ storage:
44
+ - snli/annotations/ve_dev.json
45
+ test:
46
+ url:
47
+ - /export/share/dongxuli/data/lavis/snli/ve_test.json
48
+ storage:
49
+ - snli/annotations/ve_test.json
50
+ images:
51
+ # storage: flickr30k/images/flickr30k-images
52
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
53
+
54
+
55
+ run:
56
+ task: multimodal_classification
57
+ # optimizer
58
+ lr_sched: "linear_warmup_cosine_lr"
59
+ init_lr: 1e-5
60
+ min_lr: 0
61
+ warmup_lr: 1e-8
62
+ warmup_steps: 1000
63
+ weight_decay: 0.05
64
+ max_epoch: 1
65
+ batch_size_train: 16
66
+ batch_size_eval: 1
67
+ num_workers: 8
68
+ accum_grad_iters: 1
69
+
70
+ max_len: 30
71
+ min_len: 1
72
+ num_beams: 5
73
+ inference_method: "generate"
74
+ prompt: ""
75
+ length_penalty: -1.
76
+
77
+ seed: 42
78
+ output_dir: "output/instructblip/snlive_classification_vicuna7b_val/"
79
+
80
+ amp: True
81
+ resume_ckpt_path: null
82
+
83
+ evaluate: True
84
+ # train_splits: ["train"]
85
+ valid_splits: ["val"]
86
+
87
+
88
+ device: "cuda"
89
+ world_size: 1
90
+ dist_url: "env://"
91
+ distributed: True
92
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
93
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/completion_modelnet40_vicuna13b.yaml ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: "describe the 3d model"
11
+
12
+ datasets:
13
+ modelnet40_cls: # name of the dataset builder
14
+ data_type: [images,pc]
15
+
16
+ vis_processor:
17
+ train:
18
+ name: "clip_image_train"
19
+ image_size: 224
20
+ eval:
21
+ name: "clip_image_eval"
22
+ image_size: 224
23
+
24
+ pc_processor:
25
+ train:
26
+ name: "ulip_pc"
27
+ eval:
28
+ name: "ulip_pc"
29
+ text_processor:
30
+ train:
31
+ name: "blip_caption"
32
+ eval:
33
+ name: "blip_caption"
34
+
35
+ build_info:
36
+ # Be careful not to append minus sign (-) before split to avoid itemizing
37
+ annotations:
38
+ train:
39
+ url:
40
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
41
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
42
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
43
+ storage:
44
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
45
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
46
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
47
+ val:
48
+ url:
49
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
50
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
51
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
52
+ storage:
53
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
54
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
55
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
56
+
57
+ pc:
58
+ storage: /export/home/ULIP/data/modelnet40_normal_resampled
59
+
60
+ images:
61
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet_images8192
62
+
63
+
64
+ run:
65
+ task: multimodal_classification
66
+ # optimizer
67
+ lr_sched: "linear_warmup_cosine_lr"
68
+ init_lr: 1e-5
69
+ min_lr: 0
70
+ warmup_lr: 1e-8
71
+ warmup_steps: 1000
72
+ weight_decay: 0.05
73
+ max_epoch: 1
74
+ batch_size_train: 32
75
+ batch_size_eval: 1
76
+ num_workers: 8
77
+ accum_grad_iters: 1
78
+
79
+ max_len: 80
80
+ min_len: 1
81
+ num_beams: 5
82
+ length_penalty: 0.
83
+ prompt: "describe the 3d model"
84
+
85
+ seed: 42
86
+ output_dir: "output/instructblip/modelnet_completion_vicuna13b/"
87
+
88
+
89
+ amp: True
90
+ resume_ckpt_path: null
91
+
92
+ evaluate: True
93
+ # train_splits: ["train"]
94
+ valid_splits: ["val"]
95
+ device: "cuda"
96
+ world_size: 1
97
+ dist_url: "env://"
98
+ distributed: True
99
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
100
+ val_freq: 1
101
+
LAVIS-main/lavis/projects/instructblip/completion_modelnet40_vicuna7b.yaml ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "describe the 3d model"
11
+ predict_with_gen: True
12
+
13
+ datasets:
14
+ modelnet40_cls: # name of the dataset builder
15
+ data_type: [images, pc]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ pc_processor:
26
+ train:
27
+ name: "ulip_pc"
28
+ eval:
29
+ name: "ulip_pc"
30
+ text_processor:
31
+ train:
32
+ name: "blip_caption"
33
+ eval:
34
+ name: "blip_caption"
35
+
36
+ build_info:
37
+ # Be careful not to append minus sign (-) before split to avoid itemizing
38
+ annotations:
39
+ train:
40
+ url:
41
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
42
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
43
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
44
+ storage:
45
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
46
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
47
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
48
+ val:
49
+ url:
50
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
51
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
52
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
53
+ storage:
54
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
55
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
56
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
57
+
58
+ pc:
59
+ storage: /export/home/ULIP/data/modelnet40_normal_resampled
60
+
61
+ images:
62
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet_images8192
63
+
64
+
65
+ run:
66
+ task: multimodal_classification
67
+ # optimizer
68
+ lr_sched: "linear_warmup_cosine_lr"
69
+ init_lr: 1e-5
70
+ min_lr: 0
71
+ warmup_lr: 1e-8
72
+ warmup_steps: 1000
73
+ weight_decay: 0.05
74
+ max_epoch: 1
75
+ batch_size_train: 32
76
+ batch_size_eval: 1
77
+ num_workers: 8
78
+ accum_grad_iters: 1
79
+
80
+ max_len: 80
81
+ min_len: 1
82
+ num_beams: 5
83
+ length_penalty: 0.
84
+ prompt: "describe the 3d model"
85
+
86
+ seed: 42
87
+ output_dir: "output/instructblip/modelnet_completion_vicuna7b/"
88
+
89
+
90
+ amp: True
91
+ resume_ckpt_path: null
92
+
93
+ evaluate: True
94
+ # train_splits: ["train"]
95
+ valid_splits: ["val"]
96
+ device: "cuda"
97
+ world_size: 1
98
+ dist_url: "env://"
99
+ distributed: True
100
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
101
+ val_freq: 1
102
+
LAVIS-main/lavis/projects/instructblip/qa_msrvtt_flant5xl_eval_test.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xl
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msrvtt_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
38
+ storage: msrvtt/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
41
+ storage: msrvtt/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
44
+ storage: msrvtt/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
47
+ storage: msrvtt/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msrvtt/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+
54
+ run:
55
+ task: gqa
56
+ # optimizer
57
+ lr_sched: "linear_warmup_cosine_lr"
58
+ init_lr: 1e-5
59
+ min_lr: 0
60
+ warmup_lr: 1e-8
61
+ warmup_steps: 1000
62
+ weight_decay: 0.05
63
+ max_epoch: 1
64
+ batch_size_train: 16
65
+ batch_size_eval: 1
66
+ num_workers: 8
67
+ accum_grad_iters: 1
68
+
69
+ max_len: 10
70
+ min_len: 1
71
+ num_beams: 5
72
+ inference_method: "generate"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msrvtt_qa_flant5xl_test/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+
86
+ device: "cuda"
87
+ world_size: 1
88
+ dist_url: "env://"
89
+ distributed: True
90
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
91
+ val_freq: 1
92
+
LAVIS-main/lavis/projects/instructblip/qa_msrvtt_flant5xxl_eval_test.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xxl
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msrvtt_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
38
+ storage: msrvtt/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
41
+ storage: msrvtt/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
44
+ storage: msrvtt/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
47
+ storage: msrvtt/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msrvtt/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+
54
+ run:
55
+ task: gqa
56
+ # optimizer
57
+ lr_sched: "linear_warmup_cosine_lr"
58
+ init_lr: 1e-5
59
+ min_lr: 0
60
+ warmup_lr: 1e-8
61
+ warmup_steps: 1000
62
+ weight_decay: 0.05
63
+ max_epoch: 1
64
+ batch_size_train: 16
65
+ batch_size_eval: 1
66
+ num_workers: 0
67
+ accum_grad_iters: 1
68
+
69
+ max_len: 10
70
+ min_len: 1
71
+ num_beams: 5
72
+ inference_method: "generate"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msrvtt_qa_flant5xxl_test/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+
86
+ device: "cuda"
87
+ world_size: 1
88
+ dist_url: "env://"
89
+ distributed: True
90
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
91
+ val_freq: 1
92
+
LAVIS-main/lavis/projects/instructblip/qa_msrvtt_vicuna13b_eval_test.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msrvtt_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
38
+ storage: msrvtt/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
41
+ storage: msrvtt/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
44
+ storage: msrvtt/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
47
+ storage: msrvtt/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msrvtt/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+
54
+ run:
55
+ task: gqa
56
+ # optimizer
57
+ lr_sched: "linear_warmup_cosine_lr"
58
+ init_lr: 1e-5
59
+ min_lr: 0
60
+ warmup_lr: 1e-8
61
+ warmup_steps: 1000
62
+ weight_decay: 0.05
63
+ max_epoch: 1
64
+ batch_size_train: 16
65
+ batch_size_eval: 1
66
+ num_workers: 8
67
+ accum_grad_iters: 1
68
+
69
+ max_len: 10
70
+ min_len: 1
71
+ num_beams: 5
72
+ inference_method: "generate"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msrvtt_qa_vicuna13b_test/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+
86
+ device: "cuda"
87
+ world_size: 1
88
+ dist_url: "env://"
89
+ distributed: True
90
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
91
+ val_freq: 1
92
+
LAVIS-main/lavis/projects/instructblip/qa_msrvtt_vicuna7b_eval_test.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msrvtt_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
38
+ storage: msrvtt/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
41
+ storage: msrvtt/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
44
+ storage: msrvtt/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
47
+ storage: msrvtt/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msrvtt/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+
54
+ run:
55
+ task: gqa
56
+ # optimizer
57
+ lr_sched: "linear_warmup_cosine_lr"
58
+ init_lr: 1e-5
59
+ min_lr: 0
60
+ warmup_lr: 1e-8
61
+ warmup_steps: 1000
62
+ weight_decay: 0.05
63
+ max_epoch: 1
64
+ batch_size_train: 16
65
+ batch_size_eval: 1
66
+ num_workers: 8
67
+ accum_grad_iters: 1
68
+
69
+ max_len: 10
70
+ min_len: 1
71
+ num_beams: 5
72
+ inference_method: "generate"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msrvtt_qa_vicuna7b_test/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+
86
+ device: "cuda"
87
+ world_size: 1
88
+ dist_url: "env://"
89
+ distributed: True
90
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
91
+ val_freq: 1
92
+
LAVIS-main/lavis/projects/instructblip/qa_msvd_flant5xl_eval.yaml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xl
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msvd_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
38
+ storage: msvd/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
41
+ storage: msvd/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
44
+ storage: msvd/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
47
+ storage: msvd/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msvd/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+ run:
54
+ task: gqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 30
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ prompt: "Question: {} Short Answer:"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msvd_qa_flant5xl/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+ ques_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
86
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
87
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_questions.json"}
88
+ anno_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
89
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
90
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_annotations.json"}
91
+
92
+
93
+
94
+
95
+ device: "cuda"
96
+ world_size: 1
97
+ dist_url: "env://"
98
+ distributed: True
99
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
100
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_msvd_flant5xxl_eval.yaml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xxl
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msvd_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
38
+ storage: msvd/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
41
+ storage: msvd/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
44
+ storage: msvd/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
47
+ storage: msvd/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msvd/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+ run:
54
+ task: gqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 30
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ prompt: "Question: {} Short Answer:"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msvd_qa_flant5xxl/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+ ques_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
86
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
87
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_questions.json"}
88
+ anno_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
89
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
90
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_annotations.json"}
91
+
92
+
93
+
94
+
95
+ device: "cuda"
96
+ world_size: 1
97
+ dist_url: "env://"
98
+ distributed: True
99
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
100
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_msvd_vicuna13b_eval.yaml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msvd_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
38
+ storage: msvd/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
41
+ storage: msvd/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
44
+ storage: msvd/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
47
+ storage: msvd/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msvd/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+ run:
54
+ task: gqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 30
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ prompt: "Question: {} Short Answer:"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msvd_qa_vicuna13b/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+ ques_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
86
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
87
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_questions.json"}
88
+ anno_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
89
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
90
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_annotations.json"}
91
+
92
+
93
+
94
+
95
+ device: "cuda"
96
+ world_size: 1
97
+ dist_url: "env://"
98
+ distributed: True
99
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
100
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_msvd_vicuna7b_eval.yaml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msvd_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
38
+ storage: msvd/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
41
+ storage: msvd/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
44
+ storage: msvd/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
47
+ storage: msvd/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msvd/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+ run:
54
+ task: gqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 30
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ prompt: "Question: {} Short Answer:"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msvd_qa_vicuna7b/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+ ques_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
86
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
87
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_questions.json"}
88
+ anno_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
89
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
90
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_annotations.json"}
91
+
92
+
93
+
94
+
95
+ device: "cuda"
96
+ world_size: 1
97
+ dist_url: "env://"
98
+ distributed: True
99
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
100
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_okvqa_flant5xl_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xl
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ ok_vqa:
14
+ vis_processor:
15
+ train:
16
+ name: "clip_image_train"
17
+ eval:
18
+ name: "clip_image_eval"
19
+ text_processor:
20
+ train:
21
+ name: "blip_question"
22
+ eval:
23
+ name: "blip_caption"
24
+ build_info:
25
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
26
+ annotations:
27
+ train:
28
+ url:
29
+ # TODO make this order insensitive
30
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
31
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
32
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
33
+ storage:
34
+ - okvqa/annotations/okvqa_train.json
35
+ # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
36
+ # - okvqa/annotations/mscoco_train2014_annotations.json
37
+ test:
38
+ url:
39
+ # TODO make this order insensitive
40
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
41
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
42
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
43
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
44
+ storage:
45
+ - okvqa/annotations/vqa_val_eval.json
46
+ - okvqa/annotations/answer_list.json
47
+ - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
48
+ - okvqa/annotations/mscoco_val2014_annotations.json
49
+ images:
50
+ storage: /export/share/datasets/vision/coco/images
51
+
52
+
53
+ run:
54
+ task: vqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 10
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ length_penalty: -1.
73
+
74
+ seed: 42
75
+ output_dir: "output/instructblip/okavqa_qa_flant5xl/"
76
+
77
+ amp: True
78
+ resume_ckpt_path: null
79
+
80
+ evaluate: True
81
+ # train_splits: ["train"]
82
+ valid_splits: ["test"]
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
90
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_okvqa_flant5xxl_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xxl
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ ok_vqa:
14
+ vis_processor:
15
+ train:
16
+ name: "clip_image_train"
17
+ eval:
18
+ name: "clip_image_eval"
19
+ text_processor:
20
+ train:
21
+ name: "blip_question"
22
+ eval:
23
+ name: "blip_caption"
24
+ build_info:
25
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
26
+ annotations:
27
+ train:
28
+ url:
29
+ # TODO make this order insensitive
30
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
31
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
32
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
33
+ storage:
34
+ - okvqa/annotations/okvqa_train.json
35
+ # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
36
+ # - okvqa/annotations/mscoco_train2014_annotations.json
37
+ test:
38
+ url:
39
+ # TODO make this order insensitive
40
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
41
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
42
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
43
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
44
+ storage:
45
+ - okvqa/annotations/vqa_val_eval.json
46
+ - okvqa/annotations/answer_list.json
47
+ - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
48
+ - okvqa/annotations/mscoco_val2014_annotations.json
49
+ images:
50
+ storage: /export/share/datasets/vision/coco/images
51
+
52
+
53
+ run:
54
+ task: vqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 0
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 10
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ length_penalty: -1.
73
+
74
+ seed: 42
75
+ output_dir: "output/instructblip/okavqa_qa_flant5xxl/"
76
+
77
+ amp: True
78
+ resume_ckpt_path: null
79
+
80
+ evaluate: True
81
+ # train_splits: ["train"]
82
+ valid_splits: ["test"]
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save a checkpoint every N epochs; -1 saves only the last and best checkpoints.
90
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna13b_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ ok_vqa:
14
+ vis_processor:
15
+ train:
16
+ name: "clip_image_train"
17
+ eval:
18
+ name: "clip_image_eval"
19
+ text_processor:
20
+ train:
21
+ name: "blip_question"
22
+ eval:
23
+ name: "blip_caption"
24
+ build_info:
25
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
26
+ annotations:
27
+ train:
28
+ url:
29
+ # TODO make this order insensitive
30
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
31
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
32
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
33
+ storage:
34
+ - okvqa/annotations/okvqa_train.json
35
+ # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
36
+ # - okvqa/annotations/mscoco_train2014_annotations.json
37
+ test:
38
+ url:
39
+ # TODO make this order insensitive
40
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
41
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
42
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
43
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
44
+ storage:
45
+ - okvqa/annotations/vqa_val_eval.json
46
+ - okvqa/annotations/answer_list.json
47
+ - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
48
+ - okvqa/annotations/mscoco_val2014_annotations.json
49
+ images:
50
+ storage: /export/share/datasets/vision/coco/images
51
+
52
+
53
+ run:
54
+ task: vqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 10
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ length_penalty: -1.
73
+
74
+ seed: 42
75
+ output_dir: "output/instructblip/okavqa_qa_vicuna13b/"
76
+
77
+ amp: True
78
+ resume_ckpt_path: null
79
+
80
+ evaluate: True
81
+ # train_splits: ["train"]
82
+ valid_splits: ["test"]
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save a checkpoint every N epochs; -1 saves only the last and best checkpoints.
90
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna7b_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ ok_vqa:
14
+ vis_processor:
15
+ train:
16
+ name: "clip_image_train"
17
+ eval:
18
+ name: "clip_image_eval"
19
+ text_processor:
20
+ train:
21
+ name: "blip_question"
22
+ eval:
23
+ name: "blip_caption"
24
+ build_info:
25
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
26
+ annotations:
27
+ train:
28
+ url:
29
+ # TODO make this order insensitive
30
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
31
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
32
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
33
+ storage:
34
+ - okvqa/annotations/okvqa_train.json
35
+ # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
36
+ # - okvqa/annotations/mscoco_train2014_annotations.json
37
+ test:
38
+ url:
39
+ # TODO make this order insensitive
40
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
41
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
42
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
43
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
44
+ storage:
45
+ - okvqa/annotations/vqa_val_eval.json
46
+ - okvqa/annotations/answer_list.json
47
+ - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
48
+ - okvqa/annotations/mscoco_val2014_annotations.json
49
+ images:
50
+ storage: /export/share/datasets/vision/coco/images
51
+
52
+
53
+ run:
54
+ task: vqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 10
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ length_penalty: -1.
73
+
74
+ seed: 42
75
+ output_dir: "output/instructblip/okavqa_qa_vicuna7b/"
76
+
77
+ amp: True
78
+ resume_ckpt_path: null
79
+
80
+ evaluate: True
81
+ # train_splits: ["train"]
82
+ valid_splits: ["test"]
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save a checkpoint every N epochs; -1 saves only the last and best checkpoints.
90
+ val_freq: 1
LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: base
9
+
10
+ datasets:
11
+ gqa: # name of the dataset builder
12
+ type: balanced_testdev
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: gqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 16
26
+ batch_size_eval: 16
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 5
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA/GQA"
52
+
53
+ evaluate: True
54
+ test_splits: ["val"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval_3b.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: 3b
9
+
10
+ datasets:
11
+ gqa: # name of the dataset builder
12
+ type: balanced_testdev
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: gqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 4
26
+ batch_size_eval: 4
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 5
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA-3b/GQA"
52
+
53
+ evaluate: True
54
+ test_splits: ["val"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval_large.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: large
9
+
10
+ datasets:
11
+ gqa: # name of the dataset builder
12
+ type: balanced_testdev
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: gqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 12
26
+ batch_size_eval: 12
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 5
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA-large/GQA"
52
+
53
+ evaluate: True
54
+ test_splits: ["val"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: base
9
+
10
+ datasets:
11
+ ok_vqa: # name of the dataset builder
12
+ vis_processor:
13
+ eval:
14
+ name: "blip_image_eval"
15
+ image_size: 384
16
+ text_processor:
17
+ eval:
18
+ name: "blip_question"
19
+
20
+ run:
21
+ task: vqa_reading_comprehension
22
+
23
+ # optimization-specific
24
+ batch_size_train: 16
25
+ batch_size_eval: 16
26
+ num_workers: 4
27
+
28
+ # image question matching specific
29
+ block_num: 7
30
+
31
+ # image captioning specific
32
+ top_k: 50
33
+ top_p: 1
34
+ cap_min_length: 10
35
+ cap_max_length: 20
36
+ repetition_penalty: 1
37
+ num_patches: 20
38
+ num_captions: 100
39
+ prompt: 'a picture of '
40
+
41
+ # question answering specific
42
+ internal_bsz_fid: 1
43
+ num_captions_fid: 1
44
+ min_len: 0
45
+ max_len: 20
46
+ num_beams: 1
47
+ inference_method: "generate"
48
+
49
+ seed: 42
50
+ output_dir: "output/PNP-VQA/OKVQA"
51
+
52
+ evaluate: True
53
+ test_splits: ["test"]
54
+
55
+ # distribution-specific
56
+ device: "cuda"
57
+ world_size: 1
58
+ dist_url: "env://"
59
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval_3b.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: 3b
9
+
10
+ datasets:
11
+ ok_vqa: # name of the dataset builder
12
+ vis_processor:
13
+ eval:
14
+ name: "blip_image_eval"
15
+ image_size: 384
16
+ text_processor:
17
+ eval:
18
+ name: "blip_question"
19
+
20
+ run:
21
+ task: vqa_reading_comprehension
22
+
23
+ # optimization-specific
24
+ batch_size_train: 4
25
+ batch_size_eval: 4
26
+ num_workers: 4
27
+
28
+ # image question matching specific
29
+ block_num: 7
30
+
31
+ # image captioning specific
32
+ top_k: 50
33
+ top_p: 1
34
+ cap_min_length: 10
35
+ cap_max_length: 20
36
+ repetition_penalty: 1
37
+ num_patches: 20
38
+ num_captions: 100
39
+ prompt: 'a picture of '
40
+
41
+ # question answering specific
42
+ internal_bsz_fid: 1
43
+ num_captions_fid: 1
44
+ min_len: 0
45
+ max_len: 20
46
+ num_beams: 1
47
+ inference_method: "generate"
48
+
49
+ seed: 42
50
+ output_dir: "output/PNP-VQA-3b/OKVQA"
51
+
52
+ evaluate: True
53
+ test_splits: ["test"]
54
+
55
+ # distribution-specific
56
+ device: "cuda"
57
+ world_size: 1
58
+ dist_url: "env://"
59
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval_large.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: large
9
+
10
+ datasets:
11
+ ok_vqa: # name of the dataset builder
12
+ vis_processor:
13
+ eval:
14
+ name: "blip_image_eval"
15
+ image_size: 384
16
+ text_processor:
17
+ eval:
18
+ name: "blip_question"
19
+
20
+ run:
21
+ task: vqa_reading_comprehension
22
+
23
+ # optimization-specific
24
+ batch_size_train: 12
25
+ batch_size_eval: 12
26
+ num_workers: 4
27
+
28
+ # image question matching specific
29
+ block_num: 7
30
+
31
+ # image captioning specific
32
+ top_k: 50
33
+ top_p: 1
34
+ cap_min_length: 10
35
+ cap_max_length: 20
36
+ repetition_penalty: 1
37
+ num_patches: 20
38
+ num_captions: 100
39
+ prompt: 'a picture of '
40
+
41
+ # question answering specific
42
+ internal_bsz_fid: 1
43
+ num_captions_fid: 1
44
+ min_len: 0
45
+ max_len: 20
46
+ num_beams: 1
47
+ inference_method: "generate"
48
+
49
+ seed: 42
50
+ output_dir: "output/PNP-VQA-large/OKVQA"
51
+
52
+ evaluate: True
53
+ test_splits: ["test"]
54
+
55
+ # distribution-specific
56
+ device: "cuda"
57
+ world_size: 1
58
+ dist_url: "env://"
59
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: base
9
+
10
+ datasets:
11
+ coco_vqa: # name of the dataset builder
12
+ type: eval
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: vqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 16
26
+ batch_size_eval: 16
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 1
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA/VQAv2_val"
52
+
53
+ evaluate: True
54
+ test_splits: ["val"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval_3b.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: 3b
9
+
10
+ datasets:
11
+ coco_vqa: # name of the dataset builder
12
+ type: eval
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: vqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 4
26
+ batch_size_eval: 4
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 1
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA-3b/VQAv2_val"
52
+
53
+ evaluate: True
54
+ test_splits: ["val"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval_large.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: large
9
+
10
+ datasets:
11
+ coco_vqa: # name of the dataset builder
12
+ type: eval
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: vqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 12
26
+ batch_size_eval: 12
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 1
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA-large/VQAv2_val"
52
+
53
+ evaluate: True
54
+ test_splits: ["val"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: base
9
+
10
+ datasets:
11
+ coco_vqa: # name of the dataset builder
12
+ type: default
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: vqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 16
26
+ batch_size_eval: 16
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 1
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA/VQAv2_test"
52
+
53
+ evaluate: True
54
+ test_splits: ["test"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval_3b.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: 3b
9
+
10
+ datasets:
11
+ coco_vqa: # name of the dataset builder
12
+ type: default
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: vqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 4
26
+ batch_size_eval: 4
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 1
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA-3b/VQAv2_test"
52
+
53
+ evaluate: True
54
+ test_splits: ["test"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval_large.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: large
9
+
10
+ datasets:
11
+ coco_vqa: # name of the dataset builder
12
+ type: default
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: vqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 12
26
+ batch_size_eval: 12
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 1
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA-large/VQAv2_test"
52
+
53
+ evaluate: True
54
+ test_splits: ["test"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_caption.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by reference to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: True
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+
81
+
82
+ datasets:
83
+ audio_video_discrn:
84
+ # data_dir: ${env.data_dir}/datasets
85
+ audio_processor:
86
+ train:
87
+ name: beats_audio
88
+ n_frames: 2
89
+ eval:
90
+ name: beats_audio
91
+ n_frames: 2
92
+
93
+ text_processor:
94
+ train:
95
+ name: "blip_caption"
96
+ eval:
97
+ name: "blip_caption"
98
+
99
+ video_processor:
100
+ train:
101
+ name: alpro_video_train
102
+ n_frms: 5
103
+ image_size: 224
104
+ min_scale: 0.9
105
+ max_scale: 1.0
106
+ full_video: True
107
+ eval:
108
+ name: alpro_video_eval
109
+ n_frms: 5
110
+ image_size: 224
111
+ min_scale: 0.9
112
+ max_scale: 1.0
113
+ full_video: True
114
+
115
+ data_type: [audio, video] # [images|videos|features]
116
+
117
+ build_info:
118
+ kwargs:
119
+ total: all
120
+ shuffle_modalities: False
121
+ balance_labels: True
122
+ dataset_name: audiocaps
123
+ ground_truth: False
124
+ classnames: [audio, video]
125
+ raw: True
126
+
127
+ # Be careful not to append minus sign (-) before split to avoid itemizing
128
+ annotations:
129
+ val:
130
+ url:
131
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
132
+ storage:
133
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
134
+
135
+ audio:
136
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
137
+ video:
138
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
139
+
140
+ run:
141
+ task: discrn_qa
142
+ # optimization-specific
143
+ batch_size_train: 8
144
+ batch_size_eval: 1
145
+ num_workers: 0
146
+ max_epoch: 1
147
+ segments: 1
148
+
149
+ # inference-specific
150
+ max_len: 10
151
+ min_len: 1
152
+ length_penalty: -1.
153
+ num_beams: 5
154
+ inference_method: "generate"
155
+
156
+ train_splits: ["train"]
157
+ valid_splits: ["val"]
158
+ # test_splits: ["test"]
159
+
160
+ # distribution
161
+ device: "cuda"
162
+ world_size: 1
163
+ dist_url: "env://"
164
+ distributed: True
165
+ use_dist_eval_sampler: False
166
+
167
+
168
+ # model specific
169
+ k_test: 128
170
+
171
+ # misc
172
+ seed: 42
173
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_caption"
174
+
175
+ evaluate: True
176
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_caption_13b.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna13b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: "/path/to/vicuna-13b"
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by reference to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: True
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+
81
+
82
+ datasets:
83
+ audio_video_discrn:
84
+ # data_dir: ${env.data_dir}/datasets
85
+ audio_processor:
86
+ train:
87
+ name: beats_audio
88
+ n_frames: 2
89
+ eval:
90
+ name: beats_audio
91
+ n_frames: 2
92
+
93
+ text_processor:
94
+ train:
95
+ name: "blip_caption"
96
+ eval:
97
+ name: "blip_caption"
98
+
99
+ video_processor:
100
+ train:
101
+ name: alpro_video_train
102
+ n_frms: 5
103
+ image_size: 224
104
+ min_scale: 0.9
105
+ max_scale: 1.0
106
+ full_video: True
107
+ eval:
108
+ name: alpro_video_eval
109
+ n_frms: 5
110
+ image_size: 224
111
+ min_scale: 0.9
112
+ max_scale: 1.0
113
+ full_video: True
114
+
115
+ data_type: [audio, video] # [images|videos|features]
116
+
117
+ build_info:
118
+ kwargs:
119
+ total: all
120
+ shuffle_modalities: False
121
+ balance_labels: True
122
+ dataset_name: audiocaps
123
+ ground_truth: False
124
+ classnames: [audio, video]
125
+ raw: True
126
+
127
+ # Be careful not to append minus sign (-) before split to avoid itemizing
128
+ annotations:
129
+ val:
130
+ url:
131
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
132
+ storage:
133
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
134
+
135
+ audio:
136
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
137
+ video:
138
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
139
+
140
+ run:
141
+ task: discrn_qa
142
+ # optimization-specific
143
+ batch_size_train: 8
144
+ batch_size_eval: 1
145
+ num_workers: 0
146
+ max_epoch: 1
147
+ segments: 1
148
+
149
+ # inference-specific
150
+ max_len: 10
151
+ min_len: 1
152
+ length_penalty: -1.
153
+ num_beams: 5
154
+ inference_method: "generate"
155
+
156
+ train_splits: ["train"]
157
+ valid_splits: ["val"]
158
+ # test_splits: ["test"]
159
+
160
+ # distribution
161
+ device: "cuda"
162
+ world_size: 1
163
+ dist_url: "env://"
164
+ distributed: True
165
+ use_dist_eval_sampler: False
166
+
167
+
168
+ # model specific
169
+ k_test: 128
170
+
171
+ # misc
172
+ seed: 42
173
+ output_dir: "output/xinstructblip/eval/vicuna13b/discrn/audio_video_caption"
174
+
175
+ evaluate: True
176
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by reference to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: False
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+
81
+
82
+ datasets:
83
+ audio_video_discrn:
84
+ # data_dir: ${env.data_dir}/datasets
85
+ audio_processor:
86
+ train:
87
+ name: beats_audio
88
+ n_frames: 2
89
+ eval:
90
+ name: beats_audio
91
+ n_frames: 2
92
+
93
+ text_processor:
94
+ train:
95
+ name: "blip_caption"
96
+ eval:
97
+ name: "blip_caption"
98
+
99
+ video_processor:
100
+ train:
101
+ name: alpro_video_train
102
+ n_frms: 2
103
+ image_size: 224
104
+ min_scale: 0.9
105
+ max_scale: 1.0
106
+ full_video: True
107
+ eval:
108
+ name: alpro_video_eval
109
+ n_frms: 2
110
+ image_size: 224
111
+ min_scale: 0.9
112
+ max_scale: 1.0
113
+ full_video: True
114
+
115
+ data_type: [audio, video] # [images|videos|features]
116
+
117
+ build_info:
118
+ kwargs:
119
+ total: 100
120
+ shuffle_modalities: False
121
+ balance_labels: True
122
+ dataset_name: audiocaps
123
+ ground_truth: False
124
+ classnames: [audio, video]
125
+ raw: False
126
+
127
+ # Be careful not to append minus sign (-) before split to avoid itemizing
128
+ annotations:
129
+ val:
130
+ url:
131
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
132
+ storage:
133
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
134
+
135
+ audio:
136
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
137
+ video:
138
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
139
+
140
+ run:
141
+ task: discrn_qa
142
+ # optimization-specific
143
+ batch_size_train: 8
144
+ batch_size_eval: 1
145
+ num_workers: 8
146
+ max_epoch: 1
147
+ segments: 1
148
+
149
+ # inference-specific
150
+ max_len: 10
151
+ min_len: 1
152
+ length_penalty: -1.
153
+ num_beams: 5
154
+ inference_method: "generate"
155
+
156
+ train_splits: ["train"]
157
+ valid_splits: ["val"]
158
+ # test_splits: ["test"]
159
+
160
+ # distribution
161
+ device: "cuda"
162
+ world_size: 1
163
+ dist_url: "env://"
164
+ distributed: True
165
+ use_dist_eval_sampler: False
166
+
167
+
168
+ # model specific
169
+ k_test: 128
170
+
171
+ # misc
172
+ seed: 42
173
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe"
174
+
175
+ evaluate: True
176
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_13b.yaml ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna13b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/pc_qformer_last.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: "/path/to/vicuna-13b"
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ # special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: False
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+ remove_start: True
81
+
82
+
83
+ datasets:
84
+ audio_video_discrn:
85
+ # data_dir: ${env.data_dir}/datasets
86
+ audio_processor:
87
+ train:
88
+ name: beats_audio
89
+ n_frames: 2
90
+ eval:
91
+ name: beats_audio
92
+ n_frames: 2
93
+
94
+ text_processor:
95
+ train:
96
+ name: "blip_caption"
97
+ eval:
98
+ name: "blip_caption"
99
+
100
+ video_processor:
101
+ train:
102
+ name: alpro_video_train
103
+ n_frms: 2
104
+ image_size: 224
105
+ min_scale: 0.9
106
+ max_scale: 1.0
107
+ full_video: True
108
+ eval:
109
+ name: alpro_video_eval
110
+ n_frms: 2
111
+ image_size: 224
112
+ min_scale: 0.9
113
+ max_scale: 1.0
114
+ full_video: True
115
+
116
+ data_type: [audio, video] # [images|videos|features]
117
+
118
+ build_info:
119
+ kwargs:
120
+ total: 100
121
+ shuffle_modalities: False
122
+ balance_labels: True
123
+ dataset_name: audiocaps
124
+ ground_truth: False
125
+ classnames: [audio, video]
126
+ raw: False
127
+
128
+ # Be careful not to append minus sign (-) before split to avoid itemizing
129
+ annotations:
130
+ val:
131
+ url:
132
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
133
+ storage:
134
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
135
+
136
+ audio:
137
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
138
+ video:
139
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
140
+
141
+ run:
142
+ task: discrn_qa
143
+ # optimization-specific
144
+ batch_size_train: 8
145
+ batch_size_eval: 1
146
+ num_workers: 8
147
+ max_epoch: 1
148
+ segments: 1
149
+
150
+ # inference-specific
151
+ max_len: 10
152
+ min_len: 1
153
+ length_penalty: -1.
154
+ num_beams: 5
155
+ inference_method: "generate"
156
+
157
+ train_splits: ["train"]
158
+ valid_splits: ["val"]
159
+ # test_splits: ["test"]
160
+
161
+ # distribution
162
+ device: "cuda"
163
+ world_size: 1
164
+ dist_url: "env://"
165
+ distributed: True
166
+ use_dist_eval_sampler: False
167
+
168
+
169
+ # model specific
170
+ k_test: 128
171
+
172
+ # misc
173
+ seed: 42
174
+ output_dir: "output/xinstructblip/eval/vicuna13b/discrn/audio_video_describe"
175
+
176
+ evaluate: True
177
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_nocue.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b_nocue/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b_nocue/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b_nocue/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b_nocue/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: False
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: False
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+
81
+
82
+ datasets:
83
+ audio_video_discrn:
84
+ # data_dir: ${env.data_dir}/datasets
85
+ audio_processor:
86
+ train:
87
+ name: beats_audio
88
+ n_frames: 2
89
+ eval:
90
+ name: beats_audio
91
+ n_frames: 2
92
+
93
+ text_processor:
94
+ train:
95
+ name: "blip_caption"
96
+ eval:
97
+ name: "blip_caption"
98
+
99
+ video_processor:
100
+ train:
101
+ name: alpro_video_train
102
+ n_frms: 2
103
+ image_size: 224
104
+ min_scale: 0.9
105
+ max_scale: 1.0
106
+ full_video: True
107
+ eval:
108
+ name: alpro_video_eval
109
+ n_frms: 2
110
+ image_size: 224
111
+ min_scale: 0.9
112
+ max_scale: 1.0
113
+ full_video: True
114
+
115
+ data_type: [audio, video] # [images|videos|features]
116
+
117
+ build_info:
118
+ kwargs:
119
+ total: all
120
+ shuffle_modalities: False
121
+ balance_labels: True
122
+ dataset_name: audiocaps
123
+ ground_truth: False
124
+ classnames: [audio, video]
125
+ raw: False
126
+
127
+ # Be careful not to append minus sign (-) before split to avoid itemizing
128
+ annotations:
129
+ val:
130
+ url:
131
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
132
+ storage:
133
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
134
+
135
+ audio:
136
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
137
+ video:
138
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
139
+
140
+ run:
141
+ task: discrn_qa
142
+ # optimization-specific
143
+ batch_size_train: 8
144
+ batch_size_eval: 1
145
+ num_workers: 8
146
+ max_epoch: 1
147
+ segments: 1
148
+
149
+ # inference-specific
150
+ max_len: 10
151
+ min_len: 1
152
+ length_penalty: -1.
153
+ num_beams: 5
154
+ inference_method: "generate"
155
+
156
+ train_splits: ["train"]
157
+ valid_splits: ["val"]
158
+ # test_splits: ["test"]
159
+
160
+ # distribution
161
+ device: "cuda"
162
+ world_size: 1
163
+ dist_url: "env://"
164
+ distributed: True
165
+ use_dist_eval_sampler: False
166
+
167
+
168
+ # model specific
169
+ k_test: 128
170
+
171
+ # misc
172
+ seed: 42
173
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe_nocue"
174
+
175
+ evaluate: True
176
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_proj copy.yaml ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: /export/home/LAVIS-xgen_mm/lavis/output/xinstructblip/train/vicuna7b/audio/20231115194/checkpoint_65001.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/linear_projection_7b/audio_qformer_linear.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: False
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+ projection_only_audio: True
81
+ projection_path_audio: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/linear_projection_7b/audio_qformer_linear_768.pth
82
+ proj_dim: 768
83
+
84
+
85
+ datasets:
86
+ audio_video_discrn:
87
+ # data_dir: ${env.data_dir}/datasets
88
+ audio_processor:
89
+ train:
90
+ name: beats_audio
91
+ n_frames: 2
92
+ eval:
93
+ name: beats_audio
94
+ n_frames: 2
95
+
96
+ text_processor:
97
+ train:
98
+ name: "blip_caption"
99
+ eval:
100
+ name: "blip_caption"
101
+
102
+ video_processor:
103
+ train:
104
+ name: alpro_video_train
105
+ n_frms: 2
106
+ image_size: 224
107
+ min_scale: 0.9
108
+ max_scale: 1.0
109
+ full_video: True
110
+ eval:
111
+ name: alpro_video_eval
112
+ n_frms: 2
113
+ image_size: 224
114
+ min_scale: 0.9
115
+ max_scale: 1.0
116
+ full_video: True
117
+
118
+ data_type: [audio, video] # [images|videos|features]
119
+
120
+ build_info:
121
+ kwargs:
122
+ total: all
123
+ shuffle_modalities: False
124
+ balance_labels: True
125
+ dataset_name: audiocaps
126
+ ground_truth: False
127
+ classnames: [audio, video]
128
+ raw: False
129
+
130
+ # Be careful not to append minus sign (-) before split to avoid itemizing
131
+ annotations:
132
+ val:
133
+ url:
134
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
135
+ storage:
136
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
137
+
138
+ audio:
139
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
140
+ video:
141
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
142
+
143
+ run:
144
+ task: discrn_qa
145
+ # optimization-specific
146
+ batch_size_train: 8
147
+ batch_size_eval: 1
148
+ num_workers: 8
149
+ max_epoch: 1
150
+ segments: 1
151
+
152
+ # inference-specific
153
+ max_len: 10
154
+ min_len: 1
155
+ length_penalty: -1.
156
+ num_beams: 5
157
+ inference_method: "generate"
158
+
159
+ train_splits: ["train"]
160
+ valid_splits: ["val"]
161
+ # test_splits: ["test"]
162
+
163
+ # distribution
164
+ device: "cuda"
165
+ world_size: 1
166
+ dist_url: "env://"
167
+ distributed: True
168
+ use_dist_eval_sampler: False
169
+
170
+
171
+ # model specific
172
+ k_test: 128
173
+
174
+ # misc
175
+ seed: 42
176
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe_proj"
177
+
178
+ evaluate: True
179
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_proj.yaml ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: /export/home/LAVIS-xgen_mm/lavis/output/xinstructblip/train/vicuna7b/audio/20231115194/checkpoint_65001.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/linear_projection_7b/audio_qformer_linear.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: False
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+ projection_only_audio: True
81
+ projection_path_audio: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/linear_projection_7b/audio_qformer_linear_768.pth
82
+ proj_dim: 768
83
+
84
+
85
+ datasets:
86
+ audio_video_discrn:
87
+ # data_dir: ${env.data_dir}/datasets
88
+ audio_processor:
89
+ train:
90
+ name: beats_audio
91
+ n_frames: 2
92
+ eval:
93
+ name: beats_audio
94
+ n_frames: 2
95
+
96
+ text_processor:
97
+ train:
98
+ name: "blip_caption"
99
+ eval:
100
+ name: "blip_caption"
101
+
102
+ video_processor:
103
+ train:
104
+ name: alpro_video_train
105
+ n_frms: 2
106
+ image_size: 224
107
+ min_scale: 0.9
108
+ max_scale: 1.0
109
+ full_video: True
110
+ eval:
111
+ name: alpro_video_eval
112
+ n_frms: 2
113
+ image_size: 224
114
+ min_scale: 0.9
115
+ max_scale: 1.0
116
+ full_video: True
117
+
118
+ data_type: [audio, video] # [images|videos|features]
119
+
120
+ build_info:
121
+ kwargs:
122
+ total: all
123
+ shuffle_modalities: False
124
+ balance_labels: True
125
+ dataset_name: audiocaps
126
+ ground_truth: False
127
+ classnames: [audio, video]
128
+ raw: False
129
+
130
+ # Be careful not to append minus sign (-) before split to avoid itemizing
131
+ annotations:
132
+ val:
133
+ url:
134
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
135
+ storage:
136
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
137
+
138
+ audio:
139
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
140
+ video:
141
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
142
+
143
+ run:
144
+ task: discrn_qa
145
+ # optimization-specific
146
+ batch_size_train: 8
147
+ batch_size_eval: 1
148
+ num_workers: 8
149
+ max_epoch: 1
150
+ segments: 1
151
+
152
+ # inference-specific
153
+ max_len: 10
154
+ min_len: 1
155
+ length_penalty: -1.
156
+ num_beams: 5
157
+ inference_method: "generate"
158
+
159
+ train_splits: ["train"]
160
+ valid_splits: ["val"]
161
+ # test_splits: ["test"]
162
+
163
+ # distribution
164
+ device: "cuda"
165
+ world_size: 1
166
+ dist_url: "env://"
167
+ distributed: True
168
+ use_dist_eval_sampler: False
169
+
170
+
171
+ # model specific
172
+ k_test: 128
173
+
174
+ # misc
175
+ seed: 42
176
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe_proj"
177
+
178
+ evaluate: True
179
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_rand_init.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer_no_init.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: False
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+
81
+
82
+ datasets:
83
+ audio_video_discrn:
84
+ # data_dir: ${env.data_dir}/datasets
85
+ audio_processor:
86
+ train:
87
+ name: beats_audio
88
+ n_frames: 2
89
+ eval:
90
+ name: beats_audio
91
+ n_frames: 2
92
+
93
+ text_processor:
94
+ train:
95
+ name: "blip_caption"
96
+ eval:
97
+ name: "blip_caption"
98
+
99
+ video_processor:
100
+ train:
101
+ name: alpro_video_train
102
+ n_frms: 2
103
+ image_size: 224
104
+ min_scale: 0.9
105
+ max_scale: 1.0
106
+ full_video: True
107
+ eval:
108
+ name: alpro_video_eval
109
+ n_frms: 2
110
+ image_size: 224
111
+ min_scale: 0.9
112
+ max_scale: 1.0
113
+ full_video: True
114
+
115
+ data_type: [audio, video] # [images|videos|features]
116
+
117
+ build_info:
118
+ kwargs:
119
+ total: all
120
+ shuffle_modalities: False
121
+ balance_labels: True
122
+ dataset_name: audiocaps
123
+ ground_truth: False
124
+ classnames: [audio, video]
125
+ raw: False
126
+
127
+ # Be careful not to append minus sign (-) before split to avoid itemizing
128
+ annotations:
129
+ val:
130
+ url:
131
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
132
+ storage:
133
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
134
+
135
+ audio:
136
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
137
+ video:
138
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
139
+
140
+ run:
141
+ task: discrn_qa
142
+ # optimization-specific
143
+ batch_size_train: 8
144
+ batch_size_eval: 1
145
+ num_workers: 8
146
+ max_epoch: 1
147
+ segments: 1
148
+
149
+ # inference-specific
150
+ max_len: 10
151
+ min_len: 1
152
+ length_penalty: -1.
153
+ num_beams: 5
154
+ inference_method: "generate"
155
+
156
+ train_splits: ["train"]
157
+ valid_splits: ["val"]
158
+ # test_splits: ["test"]
159
+
160
+ # distribution
161
+ device: "cuda"
162
+ world_size: 1
163
+ dist_url: "env://"
164
+ distributed: True
165
+ use_dist_eval_sampler: False
166
+
167
+
168
+ # model specific
169
+ k_test: 128
170
+
171
+ # misc
172
+ seed: 42
173
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe_rand_init"
174
+
175
+ evaluate: True
176
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_caption.yaml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ modalities : [image, pc]
57
+ use_cues: True
58
+ shared_qformer: False
59
+ pretrained_shared_qformer: Null
60
+ load_attention_shared_qformer: False
61
+ load_qformer_type_shared: ""
62
+ load_projection_shared: False
63
+ load_projection_type_shared: ""
64
+ load_ln_type_shared: ""
65
+ shared_qformer_num_features: 512
66
+ special_qformer_input_prompt: "a short description"
67
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
68
+ predict_with_gen: False
69
+ use_caption: True
70
+ use_describe: False
71
+ enumerate_inputs: False
72
+ add_space: True
73
+
74
+ datasets:
75
+ image_pc_discrn: # name of the dataset builder
76
+ vis_processor:
77
+ train:
78
+ name: "clip_image_train"
79
+ eval:
80
+ name: "clip_image_eval"
81
+ pc_processor:
82
+ train:
83
+ name: "ulip_pc"
84
+ eval:
85
+ name: "ulip_pc"
86
+ text_processor:
87
+ train:
88
+ name: "blip_caption"
89
+ eval:
90
+ name: "blip_caption"
91
+
92
+ data_type: [images, pc] # [images|videos|features]
93
+
94
+
95
+ build_info:
96
+ kwargs:
97
+ total: all
98
+ raw: True
99
+ shuffle_modalities: False
100
+ balance_labels: True
101
+ dataset_name: objaverse
102
+ classnames: [image, 3d]
103
+ ground_truth: False
104
+
105
+ # Be careful not to append minus sign (-) before split to avoid itemizing
106
+ annotations:
107
+ val:
108
+ url:
109
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
110
+ storage:
111
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
112
+ pc:
113
+ storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
114
+
115
+ images:
116
+ storage: /export/einstein-vision/3d_vision/objaverse_captions/images/
117
+
118
+ run:
119
+ task: discrn_qa
120
+ # optimization-specific
121
+ batch_size_train: 8
122
+ batch_size_eval: 1
123
+ num_workers: 10
124
+ max_epoch: 1
125
+ segments: 1
126
+
127
+ # inference-specific
128
+ max_len: 10
129
+ min_len: 1
130
+ length_penalty: -1.
131
+ num_beams: 5
132
+ inference_method: "generate"
133
+
134
+ train_splits: ["train"]
135
+ valid_splits: ["val"]
136
+ # test_splits: ["test"]
137
+
138
+ # distribution
139
+ device: "cuda"
140
+ world_size: 1
141
+ dist_url: "env://"
142
+ distributed: True
143
+ use_dist_eval_sampler: False
144
+
145
+
146
+ # model specific
147
+ k_test: 128
148
+
149
+ # misc
150
+ seed: 42
151
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/image_3d_caption"
152
+
153
+ evaluate: True
154
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_caption_13b.yaml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna13b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: "/path/to/vicuna-13b"
55
+ prompt: "question: {} answer:"
56
+ modalities : [image, pc]
57
+ use_cues: True
58
+ shared_qformer: False
59
+ pretrained_shared_qformer: Null
60
+ load_attention_shared_qformer: False
61
+ load_qformer_type_shared: ""
62
+ load_projection_shared: False
63
+ load_projection_type_shared: ""
64
+ load_ln_type_shared: ""
65
+ shared_qformer_num_features: 512
66
+ special_qformer_input_prompt: "a short description"
67
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
68
+ predict_with_gen: False
69
+ use_caption: True
70
+ use_describe: False
71
+ enumerate_inputs: False
72
+ add_space: True
73
+
74
+ datasets:
75
+ image_pc_discrn: # name of the dataset builder
76
+ vis_processor:
77
+ train:
78
+ name: "clip_image_train"
79
+ eval:
80
+ name: "clip_image_eval"
81
+ pc_processor:
82
+ train:
83
+ name: "ulip_pc"
84
+ eval:
85
+ name: "ulip_pc"
86
+ text_processor:
87
+ train:
88
+ name: "blip_caption"
89
+ eval:
90
+ name: "blip_caption"
91
+
92
+ data_type: [images, pc] # [images|videos|features]
93
+
94
+
95
+ build_info:
96
+ kwargs:
97
+ total: 100
98
+ raw: True
99
+ shuffle_modalities: False
100
+ balance_labels: True
101
+ dataset_name: objaverse
102
+ classnames: [image, 3d]
103
+ ground_truth: False
104
+
105
+ # Be careful not to append minus sign (-) before split to avoid itemizing
106
+ annotations:
107
+ val:
108
+ url:
109
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
110
+ storage:
111
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
112
+ pc:
113
+ storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
114
+
115
+ images:
116
+ storage: /export/einstein-vision/3d_vision/objaverse_captions/images/
117
+
118
+ run:
119
+ task: discrn_qa
120
+ # optimization-specific
121
+ batch_size_train: 8
122
+ batch_size_eval: 1
123
+ num_workers: 2
124
+ max_epoch: 1
125
+ segments: 1
126
+
127
+ # inference-specific
128
+ max_len: 10
129
+ min_len: 1
130
+ length_penalty: -1.
131
+ num_beams: 5
132
+ inference_method: "generate"
133
+
134
+ train_splits: ["train"]
135
+ valid_splits: ["val"]
136
+ # test_splits: ["test"]
137
+
138
+ # distribution
139
+ device: "cuda"
140
+ world_size: 1
141
+ dist_url: "env://"
142
+ distributed: True
143
+ use_dist_eval_sampler: False
144
+
145
+
146
+ # model specific
147
+ k_test: 128
148
+
149
+ # misc
150
+ seed: 42
151
+ output_dir: "output/xinstructblip/eval/vicuna13b/discrn/image_3d_caption"
152
+
153
+ evaluate: True
154
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_describe.yaml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ modalities : [image, pc]
57
+ use_cues: True
58
+ shared_qformer: False
59
+ pretrained_shared_qformer: Null
60
+ load_attention_shared_qformer: False
61
+ load_qformer_type_shared: ""
62
+ load_projection_shared: False
63
+ load_projection_type_shared: ""
64
+ load_ln_type_shared: ""
65
+ shared_qformer_num_features: 512
66
+ special_qformer_input_prompt: "a short description"
67
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
68
+ predict_with_gen: False
69
+ use_caption: False
70
+ use_describe: False
71
+ enumerate_inputs: False
72
+ add_space: True
73
+
74
+ datasets:
75
+ image_pc_discrn: # name of the dataset builder
76
+ vis_processor:
77
+ train:
78
+ name: "clip_image_train"
79
+ eval:
80
+ name: "clip_image_eval"
81
+ pc_processor:
82
+ train:
83
+ name: "ulip_pc"
84
+ eval:
85
+ name: "ulip_pc"
86
+ text_processor:
87
+ train:
88
+ name: "blip_caption"
89
+ eval:
90
+ name: "blip_caption"
91
+
92
+ data_type: [images, pc] # [images|videos|features]
93
+
94
+
95
+ build_info:
96
+ kwargs:
97
+ total: all
98
+ raw: False
99
+ shuffle_modalities: False
100
+ balance_labels: True
101
+ dataset_name: objaverse
102
+ classnames: [image, 3d]
103
+ ground_truth: False
104
+
105
+ # Be careful not to append minus sign (-) before split to avoid itemizing
106
+ annotations:
107
+ val:
108
+ url:
109
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
110
+ storage:
111
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
112
+ pc:
113
+ storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
114
+
115
+ images:
116
+ storage: /export/einstein-vision/3d_vision/objaverse_captions/images/
117
+
118
+ run:
119
+ task: discrn_qa
120
+ # optimization-specific
121
+ batch_size_train: 8
122
+ batch_size_eval: 1
123
+ num_workers: 10
124
+ max_epoch: 1
125
+ segments: 1
126
+
127
+ # inference-specific
128
+ max_len: 10
129
+ min_len: 1
130
+ length_penalty: 1.
131
+ num_beams: 5
132
+ inference_method: "generate"
133
+
134
+ train_splits: ["train"]
135
+ valid_splits: ["val"]
136
+ # test_splits: ["test"]
137
+
138
+ # distribution
139
+ device: "cuda"
140
+ world_size: 1
141
+ dist_url: "env://"
142
+ distributed: True
143
+ use_dist_eval_sampler: False
144
+
145
+
146
+ # model specific
147
+ k_test: 128
148
+
149
+ # misc
150
+ seed: 42
151
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/image_3d_describe"
152
+
153
+ evaluate: True
154
+ save_freq: -1