yuccaaa commited on
Commit
31ec239
·
verified ·
1 Parent(s): 5ea4862

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. LAVIS-main/lavis/projects/instructblip/caption_nocaps_out_domain_vicuna7b_eval.yaml +82 -0
  2. LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xl_eval.yaml +90 -0
  3. LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xxl_eval.yaml +90 -0
  4. LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna13b_eval.yaml +90 -0
  5. LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna7b_eval.yaml +91 -0
  6. LAVIS-main/lavis/projects/instructblip/classification_modelnet40_vicuna13b.yaml +101 -0
  7. LAVIS-main/lavis/projects/instructblip/classification_modelnet40_vicuna7b.yaml +100 -0
  8. LAVIS-main/lavis/projects/instructblip/classification_snlive_flant5xl.yaml +94 -0
  9. LAVIS-main/lavis/projects/instructblip/classification_snlive_flant5xxl.yaml +95 -0
  10. LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna13b.yaml +93 -0
  11. LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna13b_test.yaml +93 -0
  12. LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna7b_test.yaml +93 -0
  13. LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna7b_val.yaml +93 -0
  14. LAVIS-main/lavis/projects/instructblip/completion_modelnet40_vicuna13b.yaml +101 -0
  15. LAVIS-main/lavis/projects/instructblip/completion_modelnet40_vicuna7b.yaml +102 -0
  16. LAVIS-main/lavis/projects/instructblip/qa_msrvtt_flant5xl_eval_test.yaml +92 -0
  17. LAVIS-main/lavis/projects/instructblip/qa_msrvtt_flant5xxl_eval_test.yaml +92 -0
  18. LAVIS-main/lavis/projects/instructblip/qa_msrvtt_vicuna13b_eval_test.yaml +92 -0
  19. LAVIS-main/lavis/projects/instructblip/qa_msrvtt_vicuna7b_eval_test.yaml +92 -0
  20. LAVIS-main/lavis/projects/instructblip/qa_msvd_flant5xl_eval.yaml +100 -0
  21. LAVIS-main/lavis/projects/instructblip/qa_msvd_flant5xxl_eval.yaml +100 -0
  22. LAVIS-main/lavis/projects/instructblip/qa_msvd_vicuna13b_eval.yaml +100 -0
  23. LAVIS-main/lavis/projects/instructblip/qa_msvd_vicuna7b_eval.yaml +100 -0
  24. LAVIS-main/lavis/projects/instructblip/qa_okvqa_flant5xl_eval.yaml +90 -0
  25. LAVIS-main/lavis/projects/instructblip/qa_okvqa_flant5xxl_eval.yaml +90 -0
  26. LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna13b_eval.yaml +90 -0
  27. LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna7b_eval.yaml +90 -0
  28. LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval.yaml +60 -0
  29. LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval_3b.yaml +60 -0
  30. LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval_large.yaml +60 -0
  31. LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval.yaml +59 -0
  32. LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval_3b.yaml +59 -0
  33. LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval_large.yaml +59 -0
  34. LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval.yaml +60 -0
  35. LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval_3b.yaml +60 -0
  36. LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval_large.yaml +60 -0
  37. LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval.yaml +60 -0
  38. LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval_3b.yaml +60 -0
  39. LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval_large.yaml +60 -0
  40. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_caption.yaml +176 -0
  41. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_caption_13b.yaml +176 -0
  42. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe.yaml +176 -0
  43. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_13b.yaml +177 -0
  44. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_nocue.yaml +176 -0
  45. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_proj copy.yaml +179 -0
  46. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_proj.yaml +179 -0
  47. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_rand_init.yaml +176 -0
  48. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_caption.yaml +154 -0
  49. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_caption_13b.yaml +154 -0
  50. LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_describe.yaml +154 -0
LAVIS-main/lavis/projects/instructblip/caption_nocaps_out_domain_vicuna7b_eval.yaml ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "A short image caption."
11
+
12
+ datasets:
13
+ nocaps: # name of the dataset builder
14
+ # data_dir: ${env.data_dir}/datasets
15
+ data_type: images # [images|videos|features]
16
+
17
+ vis_processor:
18
+ eval:
19
+ name: "blip_image_eval"
20
+ image_size: 224
21
+
22
+ text_processor:
23
+ eval:
24
+ name: "blip_caption"
25
+
26
+ build_info:
27
+ # Be careful not to append minus sign (-) before split to avoid itemizing
28
+ annotations:
29
+ val:
30
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json
31
+ storage: nocaps/annotations/nocaps_val.json
32
+ test:
33
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json
34
+ storage: nocaps/annotations/nocaps_test.json
35
+ images:
36
+ storage: /export/share/datasets/vision/nocaps/
37
+
38
+ run:
39
+ task: captioning
40
+ # optimizer
41
+ lr_sched: "linear_warmup_cosine_lr"
42
+ init_lr: 1e-5
43
+ min_lr: 0
44
+ warmup_lr: 1e-8
45
+ warmup_steps: 1000
46
+ weight_decay: 0.05
47
+ max_epoch: 1
48
+ batch_size_train: 16
49
+ batch_size_eval: 1
50
+ num_workers: 8
51
+ accum_grad_iters: 1
52
+
53
+ max_len: 80
54
+ min_len: 10
55
+ num_beams: 5
56
+ inference_method: "generate"
57
+ # prompt: an image that shows
58
+ length_penalty: 1.
59
+
60
+ annotation_file: https://nocaps.s3.amazonaws.com/nocaps_val_4500_captions.json
61
+
62
+
63
+ seed: 42
64
+ output_dir: "output/instructblip/nocaps_out_domain_captioning_vicuna7b/"
65
+
66
+
67
+ amp: True
68
+ resume_ckpt_path: null
69
+
70
+ evaluate: True
71
+ # train_splits: ["train"]
72
+ valid_splits: ["val"]
73
+
74
+
75
+ device: "cuda"
76
+ world_size: 1
77
+ dist_url: "env://"
78
+ distributed: True
79
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
80
+ val_freq: 1
81
+
82
+ img_ids: [2, 4, 5, 8, 15, 18, 19, 22, 27, 30, 33, 35, 41, 42, 43, 46, 47, 51, 59, 60, 64, 65, 68, 69, 71, 72, 73, 77, 79, 81, 85, 87, 88, 90, 92, 100, 101, 102, 105, 107, 109, 115, 120, 124, 125, 126, 127, 129, 133, 135, 137, 139, 140, 141, 143, 150, 153, 155, 158, 164, 165, 167, 170, 171, 173, 182, 190, 191, 196, 200, 201, 203, 205, 208, 219, 225, 226, 228, 229, 232, 239, 240, 243, 245, 250, 262, 263, 264, 267, 272, 278, 283, 284, 290, 291, 297, 301, 304, 305, 309, 310, 311, 314, 323, 325, 329, 330, 331, 333, 334, 341, 349, 350, 351, 352, 354, 358, 359, 363, 365, 366, 368, 371, 372, 379, 381, 383, 386, 388, 389, 390, 392, 405, 415, 417, 418, 420, 421, 424, 428, 429, 432, 436, 441, 443, 452, 453, 454, 455, 456, 459, 464, 465, 468, 469, 476, 477, 478, 480, 487, 488, 490, 491, 493, 500, 502, 504, 506, 509, 510, 511, 512, 515, 516, 520, 527, 529, 533, 539, 540, 541, 544, 545, 547, 551, 554, 556, 559, 577, 579, 580, 582, 586, 587, 590, 593, 594, 607, 609, 616, 617, 619, 623, 628, 631, 634, 637, 648, 651, 654, 655, 665, 673, 678, 682, 684, 685, 688, 690, 695, 696, 701, 702, 705, 707, 708, 712, 714, 718, 719, 723, 725, 726, 730, 731, 733, 734, 740, 744, 748, 750, 751, 756, 757, 760, 761, 763, 767, 775, 779, 782, 783, 784, 787, 790, 792, 794, 798, 799, 802, 805, 807, 810, 812, 816, 818, 819, 820, 821, 829, 831, 836, 841, 842, 844, 845, 849, 850, 853, 854, 857, 859, 861, 868, 871, 874, 875, 877, 879, 886, 887, 889, 890, 891, 892, 893, 894, 896, 899, 900, 905, 918, 924, 926, 927, 929, 932, 934, 935, 943, 948, 950, 952, 953, 954, 956, 957, 963, 965, 969, 972, 973, 974, 976, 980, 985, 987, 988, 990, 992, 993, 994, 1000, 1001, 1003, 1005, 1009, 1013, 1016, 1018, 1019, 1020, 1021, 1022, 1024, 1028, 1029, 1033, 1036, 1037, 1038, 1042, 1045, 1046, 1050, 1053, 1054, 1056, 1065, 1072, 1076, 1079, 1082, 1083, 1096, 1101, 1103, 1107, 1112, 1117, 1129, 1132, 1133, 1136, 1138, 1141, 1143, 1155, 1157, 1160, 1164, 1165, 1166, 1172, 1175, 1179, 1183, 1194, 1197, 1200, 1202, 1210, 1228, 
1234, 1236, 1241, 1246, 1251, 1253, 1255, 1261, 1265, 1268, 1269, 1271, 1272, 1273, 1277, 1286, 1287, 1290, 1296, 1297, 1302, 1303, 1308, 1310, 1312, 1315, 1316, 1317, 1320, 1321, 1324, 1327, 1329, 1330, 1331, 1333, 1334, 1336, 1338, 1339, 1340, 1345, 1347, 1356, 1362, 1366, 1371, 1374, 1376, 1381, 1384, 1385, 1388, 1394, 1396, 1397, 1398, 1403, 1404, 1408, 1410, 1413, 1414, 1417, 1424, 1430, 1433, 1436, 1438, 1440, 1446, 1449, 1453, 1454, 1456, 1460, 1462, 1464, 1465, 1467, 1470, 1473, 1477, 1491, 1494, 1498, 1504, 1506, 1509, 1511, 1515, 1519, 1524, 1530, 1533, 1540, 1541, 1549, 1557, 1558, 1561, 1570, 1572, 1575, 1579, 1591, 1593, 1594, 1595, 1596, 1602, 1605, 1609, 1611, 1615, 1617, 1618, 1624, 1625, 1627, 1636, 1637, 1639, 1640, 1641, 1646, 1647, 1648, 1649, 1655, 1656, 1657, 1658, 1659, 1662, 1671, 1675, 1679, 1681, 1690, 1692, 1701, 1712, 1715, 1716, 1718, 1719, 1721, 1723, 1725, 1728, 1729, 1730, 1732, 1737, 1740, 1746, 1747, 1753, 1754, 1756, 1757, 1758, 1761, 1762, 1767, 1771, 1773, 1775, 1779, 1783, 1784, 1786, 1787, 1789, 1791, 1794, 1802, 1803, 1807, 1813, 1814, 1815, 1817, 1824, 1826, 1827, 1832, 1834, 1835, 1838, 1839, 1840, 1841, 1842, 1843, 1844, 1847, 1850, 1860, 1861, 1866, 1870, 1872, 1873, 1876, 1878, 1886, 1889, 1894, 1897, 1899, 1902, 1907, 1911, 1912, 1917, 1920, 1924, 1925, 1928, 1931, 1935, 1936, 1937, 1939, 1941, 1946, 1948, 1949, 1952, 1954, 1955, 1956, 1959, 1967, 1968, 1970, 1975, 1976, 1979, 1980, 1985, 1986, 1994, 1996, 1998, 2001, 2003, 2007, 2009, 2011, 2012, 2014, 2019, 2028, 2029, 2042, 2047, 2049, 2050, 2060, 2068, 2071, 2076, 2078, 2080, 2081, 2086, 2089, 2090, 2093, 2094, 2099, 2102, 2107, 2112, 2115, 2121, 2124, 2125, 2129, 2131, 2133, 2135, 2140, 2141, 2148, 2150, 2151, 2152, 2155, 2163, 2173, 2176, 2178, 2182, 2183, 2187, 2188, 2196, 2197, 2198, 2199, 2200, 2205, 2207, 2209, 2215, 2217, 2220, 2221, 2223, 2230, 2235, 2236, 2237, 2238, 2241, 2242, 2243, 2244, 2246, 2252, 2253, 2261, 2265, 2274, 2277, 2278, 2281, 2286, 2290, 
2292, 2293, 2294, 2296, 2299, 2301, 2304, 2305, 2307, 2309, 2312, 2314, 2315, 2319, 2323, 2324, 2337, 2338, 2339, 2340, 2342, 2351, 2356, 2358, 2360, 2367, 2369, 2371, 2374, 2376, 2378, 2382, 2383, 2387, 2388, 2390, 2399, 2400, 2412, 2416, 2422, 2423, 2427, 2428, 2435, 2439, 2440, 2442, 2447, 2450, 2455, 2459, 2461, 2462, 2463, 2466, 2468, 2470, 2479, 2480, 2482, 2483, 2485, 2488, 2491, 2495, 2496, 2502, 2505, 2506, 2507, 2510, 2511, 2515, 2522, 2524, 2532, 2534, 2546, 2547, 2550, 2554, 2558, 2562, 2563, 2574, 2583, 2584, 2590, 2594, 2598, 2602, 2603, 2606, 2611, 2613, 2615, 2617, 2619, 2623, 2625, 2630, 2636, 2642, 2643, 2644, 2646, 2647, 2649, 2650, 2659, 2661, 2664, 2674, 2675, 2677, 2682, 2684, 2685, 2691, 2693, 2695, 2698, 2699, 2703, 2704, 2706, 2707, 2711, 2713, 2719, 2720, 2723, 2726, 2727, 2729, 2730, 2733, 2734, 2738, 2739, 2741, 2744, 2745, 2748, 2749, 2754, 2757, 2761, 2762, 2764, 2765, 2767, 2768, 2772, 2776, 2778, 2779, 2780, 2781, 2783, 2787, 2791, 2795, 2796, 2799, 2800, 2802, 2807, 2808, 2811, 2813, 2817, 2820, 2827, 2829, 2831, 2833, 2834, 2835, 2839, 2840, 2841, 2846, 2847, 2849, 2852, 2855, 2859, 2860, 2864, 2870, 2871, 2876, 2878, 2879, 2882, 2884, 2885, 2886, 2887, 2888, 2895, 2896, 2897, 2898, 2900, 2902, 2905, 2911, 2913, 2915, 2919, 2922, 2924, 2933, 2939, 2945, 2953, 2954, 2958, 2959, 2968, 2973, 2976, 2979, 2982, 2984, 2992, 3002, 3004, 3007, 3008, 3009, 3010, 3013, 3016, 3021, 3022, 3023, 3026, 3028, 3033, 3036, 3037, 3039, 3043, 3044, 3045, 3046, 3053, 3060, 3062, 3063, 3071, 3072, 3085, 3086, 3092, 3095, 3096, 3102, 3103, 3104, 3105, 3111, 3115, 3116, 3122, 3129, 3131, 3132, 3137, 3138, 3140, 3147, 3148, 3157, 3164, 3167, 3168, 3170, 3175, 3179, 3182, 3184, 3190, 3194, 3196, 3198, 3199, 3200, 3215, 3216, 3217, 3219, 3222, 3229, 3230, 3237, 3239, 3242, 3249, 3253, 3255, 3257, 3258, 3267, 3270, 3271, 3274, 3279, 3288, 3290, 3291, 3293, 3299, 3305, 3306, 3312, 3318, 3319, 3320, 3323, 3326, 3328, 3329, 3335, 3343, 3344, 3345, 3347, 3349, 
3350, 3353, 3356, 3362, 3364, 3366, 3369, 3374, 3377, 3379, 3381, 3382, 3384, 3385, 3388, 3389, 3395, 3399, 3403, 3409, 3411, 3416, 3419, 3421, 3423, 3424, 3425, 3427, 3428, 3431, 3437, 3438, 3439, 3444, 3450, 3452, 3453, 3456, 3457, 3460, 3461, 3462, 3464, 3466, 3467, 3471, 3472, 3477, 3478, 3482, 3484, 3486, 3492, 3499, 3500, 3501, 3502, 3511, 3525, 3529, 3531, 3533, 3534, 3536, 3552, 3553, 3555, 3557, 3562, 3567, 3568, 3570, 3571, 3573, 3577, 3578, 3584, 3585, 3586, 3587, 3595, 3600, 3601, 3604, 3609, 3610, 3612, 3615, 3616, 3619, 3620, 3624, 3625, 3631, 3632, 3636, 3637, 3638, 3640, 3643, 3651, 3654, 3655, 3656, 3657, 3662, 3667, 3668, 3671, 3677, 3684, 3686, 3689, 3693, 3694, 3696, 3697, 3698, 3699, 3700, 3701, 3703, 3704, 3707, 3708, 3709, 3711, 3712, 3713, 3714, 3719, 3721, 3723, 3726, 3737, 3741, 3742, 3744, 3750, 3752, 3757, 3760, 3761, 3764, 3765, 3767, 3770, 3772, 3774, 3776, 3778, 3780, 3781, 3796, 3797, 3805, 3818, 3819, 3820, 3821, 3824, 3841, 3845, 3848, 3851, 3858, 3866, 3870, 3871, 3876, 3879, 3880, 3883, 3893, 3896, 3900, 3903, 3904, 3908, 3909, 3913, 3914, 3916, 3924, 3927, 3937, 3940, 3942, 3943, 3949, 3950, 3953, 3954, 3959, 3963, 3966, 3969, 3972, 3978, 3981, 3983, 3984, 3986, 3989, 3990, 3991, 3999, 4000, 4004, 4005, 4006, 4012, 4014, 4016, 4017, 4019, 4020, 4030, 4035, 4046, 4049, 4051, 4052, 4053, 4057, 4061, 4065, 4066, 4068, 4073, 4074, 4075, 4079, 4080, 4082, 4084, 4086, 4090, 4091, 4093, 4094, 4095, 4096, 4100, 4102, 4104, 4106, 4113, 4114, 4115, 4116, 4118, 4124, 4126, 4127, 4128, 4131, 4133, 4134, 4142, 4145, 4149, 4156, 4160, 4171, 4174, 4178, 4179, 4180, 4183, 4186, 4190, 4191, 4195, 4197, 4215, 4220, 4229, 4234, 4245, 4249, 4251, 4252, 4254, 4257, 4259, 4264, 4265, 4266, 4267, 4275, 4276, 4277, 4282, 4284, 4285, 4288, 4291, 4294, 4295, 4301, 4302, 4313, 4315, 4320, 4328, 4333, 4336, 4339, 4342, 4345, 4346, 4350, 4354, 4372, 4374, 4375, 4377, 4379, 4380, 4386, 4388, 4389, 4392, 4396, 4402, 4404, 4408, 4410, 4424, 4426, 4428, 4431, 
4435, 4436, 4439, 4442, 4446, 4447, 4449, 4452, 4455, 4458, 4460, 4461, 4466, 4469, 4475, 4476, 4478, 4488, 4491, 4494, 4498]
LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xl_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xl
9
+ load_pretrained: True
10
+ prompt: "a short description"
11
+
12
+ datasets:
13
+ vatex_caption:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_caption"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
38
+ storage: vatex/annotations/cap_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
41
+ storage: vatex/annotations/cap_val.json
42
+ test:
43
+ # iWNXAYGh9cI_000004_000014.mp4 is corrupt and removed from youtube
44
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
45
+ storage: vatex/annotations/cap_test.json
46
+ videos:
47
+ storage: /export/video-language-dataset/data/vatex/
48
+
49
+
50
+ run:
51
+ task: captioning
52
+ # optimizer
53
+ lr_sched: "linear_warmup_cosine_lr"
54
+ init_lr: 1e-5
55
+ min_lr: 0
56
+ warmup_lr: 1e-8
57
+ warmup_steps: 1000
58
+ weight_decay: 0.05
59
+ max_epoch: 1
60
+ batch_size_train: 16
61
+ batch_size_eval: 1
62
+ num_workers: 8
63
+ accum_grad_iters: 1
64
+
65
+ max_len: 80
66
+ min_len: 10
67
+ num_beams: 5
68
+ inference_method: "generate"
69
+ prompt: "describe the video"
70
+ length_penalty: 1.
71
+
72
+
73
+ seed: 42
74
+ output_dir: "output/instructblip/vatex_caption_flant5xl/"
75
+
76
+ amp: True
77
+ resume_ckpt_path: null
78
+
79
+ evaluate: True
80
+ # train_splits: ["train"]
81
+ valid_splits: ["val"]
82
+ annotation_file: /export/home/.cache/lavis/vatex_caption_gt/vatex_caption_val_annotations.json
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
90
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xxl_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xxl
9
+ load_pretrained: True
10
+ prompt: "a short description"
11
+
12
+ datasets:
13
+ vatex_caption:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_caption"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
38
+ storage: vatex/annotations/cap_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
41
+ storage: vatex/annotations/cap_val.json
42
+ test:
43
+ # iWNXAYGh9cI_000004_000014.mp4 is corrupt and removed from youtube
44
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
45
+ storage: vatex/annotations/cap_test.json
46
+ videos:
47
+ storage: /export/video-language-dataset/data/vatex/
48
+
49
+
50
+ run:
51
+ task: captioning
52
+ # optimizer
53
+ lr_sched: "linear_warmup_cosine_lr"
54
+ init_lr: 1e-5
55
+ min_lr: 0
56
+ warmup_lr: 1e-8
57
+ warmup_steps: 1000
58
+ weight_decay: 0.05
59
+ max_epoch: 1
60
+ batch_size_train: 16
61
+ batch_size_eval: 1
62
+ num_workers: 0
63
+ accum_grad_iters: 1
64
+
65
+ max_len: 80
66
+ min_len: 10
67
+ num_beams: 5
68
+ inference_method: "generate"
69
+ prompt: "describe the video"
70
+ length_penalty: 1.
71
+
72
+
73
+ seed: 42
74
+ output_dir: "output/instructblip/vatex_caption_flant5xxl/"
75
+
76
+ amp: True
77
+ resume_ckpt_path: null
78
+
79
+ evaluate: True
80
+ # train_splits: ["train"]
81
+ valid_splits: ["val"]
82
+ annotation_file: /export/home/.cache/lavis/vatex_caption_gt/vatex_caption_val_annotations.json
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
90
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna13b_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: "describe the video"
11
+
12
+ datasets:
13
+ vatex_caption:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_caption"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
38
+ storage: vatex/annotations/cap_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
41
+ storage: vatex/annotations/cap_val.json
42
+ test:
43
+ # iWNXAYGh9cI_000004_000014.mp4 is corrupt and removed from youtube
44
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
45
+ storage: vatex/annotations/cap_test.json
46
+ videos:
47
+ storage: /export/video-language-dataset/data/vatex/
48
+
49
+
50
+ run:
51
+ task: captioning
52
+ # optimizer
53
+ lr_sched: "linear_warmup_cosine_lr"
54
+ init_lr: 1e-5
55
+ min_lr: 0
56
+ warmup_lr: 1e-8
57
+ warmup_steps: 1000
58
+ weight_decay: 0.05
59
+ max_epoch: 1
60
+ batch_size_train: 16
61
+ batch_size_eval: 1
62
+ num_workers: 8
63
+ accum_grad_iters: 1
64
+
65
+ max_len: 80
66
+ min_len: 10
67
+ num_beams: 5
68
+ inference_method: "generate"
69
+ prompt: "describe the video"
70
+ length_penalty: 0.
71
+
72
+
73
+ seed: 42
74
+ output_dir: "output/instructblip/msvd_caption_vicuna13b/"
75
+
76
+ amp: True
77
+ resume_ckpt_path: null
78
+
79
+ evaluate: True
80
+ # train_splits: ["train"]
81
+ valid_splits: ["val"]
82
+ annotation_file: /export/home/.cache/lavis/vatex_caption_gt/vatex_caption_val_annotations.json
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
90
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna7b_eval.yaml ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "a short description"
11
+
12
+
13
+ datasets:
14
+ vatex_caption:
15
+ vis_processor:
16
+ train:
17
+ name: alpro_video_train
18
+ n_frms: 4
19
+ image_size: 224
20
+ min_scale: 0.9
21
+ max_scale: 1.0
22
+ eval:
23
+ name: alpro_video_eval
24
+ n_frms: 4
25
+ image_size: 224
26
+ min_scale: 0.9
27
+ max_scale: 1.0
28
+ text_processor:
29
+ train:
30
+ name: "blip_caption"
31
+ eval:
32
+ name: "blip_caption"
33
+
34
+ build_info:
35
+ # Be careful not to append minus sign (-) before split to avoid itemizing
36
+ annotations:
37
+ train:
38
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
39
+ storage: vatex/annotations/cap_train.json
40
+ val:
41
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
42
+ storage: vatex/annotations/cap_val.json
43
+ test:
44
+ # iWNXAYGh9cI_000004_000014.mp4 is corrupt and removed from youtube
45
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
46
+ storage: vatex/annotations/cap_test.json
47
+ videos:
48
+ storage: /export/video-language-dataset/data/vatex/
49
+
50
+
51
+ run:
52
+ task: captioning
53
+ # optimizer
54
+ lr_sched: "linear_warmup_cosine_lr"
55
+ init_lr: 1e-5
56
+ min_lr: 0
57
+ warmup_lr: 1e-8
58
+ warmup_steps: 1000
59
+ weight_decay: 0.05
60
+ max_epoch: 1
61
+ batch_size_train: 16
62
+ batch_size_eval: 1
63
+ num_workers: 8
64
+ accum_grad_iters: 1
65
+
66
+ max_len: 80
67
+ min_len: 10
68
+ num_beams: 5
69
+ inference_method: "generate"
70
+ prompt: "describe the video"
71
+ length_penalty: 1.
72
+
73
+
74
+ seed: 42
75
+ output_dir: "output/instructblip/vatex_caption_vicuna7b/"
76
+
77
+ amp: True
78
+ resume_ckpt_path: null
79
+
80
+ evaluate: True
81
+ # train_splits: ["train"]
82
+ valid_splits: ["val"]
83
+ annotation_file: /export/home/.cache/lavis/vatex_caption_gt/vatex_caption_val_annotations.json
84
+
85
+
86
+ device: "cuda"
87
+ world_size: 1
88
+ dist_url: "env://"
89
+ distributed: True
90
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
91
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_modelnet40_vicuna13b.yaml ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: "describe the 3d model."
11
+ format_candidates_prompt: " a 3d model of a {}"
12
+
13
+ datasets:
14
+ modelnet40_cls: # name of the dataset builder
15
+ data_type: [pc, images]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ pc_processor:
26
+ train:
27
+ name: "ulip_pc"
28
+ eval:
29
+ name: "ulip_pc"
30
+ text_processor:
31
+ train:
32
+ name: "blip_caption"
33
+ eval:
34
+ name: "blip_caption"
35
+
36
+ build_info:
37
+ # Be careful not to append minus sign (-) before split to avoid itemizing
38
+ annotations:
39
+ train:
40
+ url:
41
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
42
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
43
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
44
+ storage:
45
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
46
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
47
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
48
+ val:
49
+ url:
50
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
51
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
52
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
53
+ storage:
54
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
55
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
56
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
57
+
58
+ pc:
59
+ storage: /export/home/ULIP/data/modelnet40_normal_resampled
60
+
61
+ images:
62
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet_images8192
63
+
64
+
65
+ run:
66
+ task: multimodal_classification
67
+ # optimizer
68
+ lr_sched: "linear_warmup_cosine_lr"
69
+ init_lr: 1e-5
70
+ min_lr: 0
71
+ warmup_lr: 1e-8
72
+ warmup_steps: 1000
73
+ weight_decay: 0.05
74
+ max_epoch: 1
75
+ batch_size_train: 16
76
+ batch_size_eval: 1
77
+ num_workers: 8
78
+ accum_grad_iters: 1
79
+ prompt: "describe the 3d model."
80
+
81
+ max_len: 3
82
+ min_len: 1
83
+ num_beams: 5
84
+
85
+ seed: 42
86
+ output_dir: "output/instructblip/modelent_classification_vicuna13b/"
87
+
88
+ amp: True
89
+ resume_ckpt_path: null
90
+
91
+ evaluate: True
92
+ # train_splits: ["train"]
93
+ valid_splits: ["val"]
94
+
95
+
96
+ device: "cuda"
97
+ world_size: 1
98
+ dist_url: "env://"
99
+ distributed: True
100
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
101
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_modelnet40_vicuna7b.yaml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "describe the 3d model."
11
+ format_candidates_prompt: " a 3d model of a {}"
12
+
13
+ datasets:
14
+ modelnet40_cls: # name of the dataset builder
15
+ data_type: [pc, images]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ pc_processor:
26
+ train:
27
+ name: "ulip_pc"
28
+ eval:
29
+ name: "ulip_pc"
30
+ text_processor:
31
+ train:
32
+ name: "blip_caption"
33
+ eval:
34
+ name: "blip_caption"
35
+
36
+ build_info:
37
+ # Be careful not to append minus sign (-) before split to avoid itemizing
38
+ annotations:
39
+ train:
40
+ url:
41
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
42
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
43
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
44
+ storage:
45
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
46
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
47
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
48
+ val:
49
+ url:
50
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
51
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
52
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
53
+ storage:
54
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
55
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
56
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
57
+
58
+ pc:
59
+ storage: /export/home/ULIP/data/modelnet40_normal_resampled
60
+
61
+ images:
62
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet_images8192
63
+
64
+ run:
65
+ task: multimodal_classification
66
+ # optimizer
67
+ lr_sched: "linear_warmup_cosine_lr"
68
+ init_lr: 1e-5
69
+ min_lr: 0
70
+ warmup_lr: 1e-8
71
+ warmup_steps: 1000
72
+ weight_decay: 0.05
73
+ max_epoch: 1
74
+ batch_size_train: 16
75
+ batch_size_eval: 1
76
+ num_workers: 8
77
+ accum_grad_iters: 1
78
+ prompt: "describe the 3d model."
79
+
80
+ max_len: 3
81
+ min_len: 1
82
+ num_beams: 5
83
+
84
+ seed: 42
85
+ output_dir: "output/instructblip/modelent_classification_vicuna7b/"
86
+
87
+ amp: True
88
+ resume_ckpt_path: null
89
+
90
+ evaluate: True
91
+ # train_splits: ["train"]
92
+ valid_splits: ["val"]
93
+
94
+
95
+ device: "cuda"
96
+ world_size: 1
97
+ dist_url: "env://"
98
+ distributed: True
99
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
100
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_snlive_flant5xl.yaml ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ ## note flant5 has been trained on snli
8
+ arch: blip2_t5_instruct
9
+ model_type: flant5xl
10
+ load_pretrained: True
11
+ prompt: ""
12
+
13
+ datasets:
14
+ snli_ve_instruct:
15
+ # data_dir: ${env.data_dir}/datasets
16
+ data_type: images # [images|videos|features]
17
+
18
+ vis_processor:
19
+ train:
20
+ name: "clip_image_train"
21
+ image_size: 224
22
+ eval:
23
+ name: "clip_image_eval"
24
+ image_size: 224
25
+
26
+ text_processor:
27
+ train:
28
+ name: "blip_caption"
29
+ eval:
30
+ name: "blip_caption"
31
+ prompt: "given the image respond to "
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url:
38
+ - /export/share/dongxuli/data/lavis/snli/ve_train.json
39
+ storage:
40
+ - snli/annotations/ve_train.json
41
+ val:
42
+ url:
43
+ - /export/share/dongxuli/data/lavis/snli/ve_dev.json
44
+ storage:
45
+ - snli/annotations/ve_dev.json
46
+ test:
47
+ url:
48
+ - /export/share/dongxuli/data/lavis/snli/ve_test.json
49
+ storage:
50
+ - snli/annotations/ve_test.json
51
+ images:
52
+ # storage: flickr30k/images/flickr30k-images
53
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
54
+
55
+
56
+ run:
57
+ task: multimodal_classification
58
+ # optimizer
59
+ lr_sched: "linear_warmup_cosine_lr"
60
+ init_lr: 1e-5
61
+ min_lr: 0
62
+ warmup_lr: 1e-8
63
+ warmup_steps: 1000
64
+ weight_decay: 0.05
65
+ max_epoch: 1
66
+ batch_size_train: 16
67
+ batch_size_eval: 1
68
+ num_workers: 8
69
+ accum_grad_iters: 1
70
+
71
+ max_len: 30
72
+ min_len: 1
73
+ num_beams: 5
74
+ inference_method: "generate"
75
+ prompt: ""
76
+ length_penalty: -1.
77
+
78
+ seed: 42
79
+ output_dir: "output/instructblip/snlive_classification_flant5xl/"
80
+
81
+ amp: True
82
+ resume_ckpt_path: null
83
+
84
+ evaluate: True
85
+ # train_splits: ["train"]
86
+ valid_splits: ["val"]
87
+
88
+
89
+ device: "cuda"
90
+ world_size: 1
91
+ dist_url: "env://"
92
+ distributed: True
93
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
94
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_snlive_flant5xxl.yaml ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ ## note flant5 has been trained on snli
8
+ arch: blip2_t5_instruct
9
+ model_type: flant5xxl
10
+ load_pretrained: True
11
+ prompt: ""
12
+
13
+
14
+ datasets:
15
+ snli_ve_instruct:
16
+ # data_dir: ${env.data_dir}/datasets
17
+ data_type: images # [images|videos|features]
18
+
19
+ vis_processor:
20
+ train:
21
+ name: "clip_image_train"
22
+ image_size: 224
23
+ eval:
24
+ name: "clip_image_eval"
25
+ image_size: 224
26
+
27
+ text_processor:
28
+ train:
29
+ name: "blip_caption"
30
+ eval:
31
+ name: "blip_caption"
32
+ prompt: "given the image respond to "
33
+
34
+ build_info:
35
+ # Be careful not to append minus sign (-) before split to avoid itemizing
36
+ annotations:
37
+ train:
38
+ url:
39
+ - /export/share/dongxuli/data/lavis/snli/ve_train.json
40
+ storage:
41
+ - snli/annotations/ve_train.json
42
+ val:
43
+ url:
44
+ - /export/share/dongxuli/data/lavis/snli/ve_dev.json
45
+ storage:
46
+ - snli/annotations/ve_dev.json
47
+ test:
48
+ url:
49
+ - /export/share/dongxuli/data/lavis/snli/ve_test.json
50
+ storage:
51
+ - snli/annotations/ve_test.json
52
+ images:
53
+ # storage: flickr30k/images/flickr30k-images
54
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
55
+
56
+
57
+ run:
58
+ task: multimodal_classification
59
+ # optimizer
60
+ lr_sched: "linear_warmup_cosine_lr"
61
+ init_lr: 1e-5
62
+ min_lr: 0
63
+ warmup_lr: 1e-8
64
+ warmup_steps: 1000
65
+ weight_decay: 0.05
66
+ max_epoch: 1
67
+ batch_size_train: 16
68
+ batch_size_eval: 1
69
+ num_workers: 8
70
+ accum_grad_iters: 1
71
+
72
+ max_len: 30
73
+ min_len: 1
74
+ num_beams: 5
75
+ inference_method: "generate"
76
+ prompt: ""
77
+ length_penalty: -1.
78
+
79
+ seed: 42
80
+ output_dir: "output/instructblip/snlive_classification_flant5xxl/"
81
+
82
+ amp: True
83
+ resume_ckpt_path: null
84
+
85
+ evaluate: True
86
+ # train_splits: ["train"]
87
+ valid_splits: ["test"]
88
+
89
+
90
+ device: "cuda"
91
+ world_size: 1
92
+ dist_url: "env://"
93
+ distributed: True
94
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
95
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna13b.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: ""
11
+
12
+ datasets:
13
+ snli_ve_instruct:
14
+ # data_dir: ${env.data_dir}/datasets
15
+ data_type: images # [images|videos|features]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ text_processor:
26
+ train:
27
+ name: "blip_caption"
28
+ eval:
29
+ name: "blip_caption"
30
+ # prompt: "how would you respond to "
31
+
32
+ build_info:
33
+ # Be careful not to append minus sign (-) before split to avoid itemizing
34
+ annotations:
35
+ train:
36
+ url:
37
+ - /export/share/dongxuli/data/lavis/snli/ve_train.json
38
+ storage:
39
+ - snli/annotations/ve_train.json
40
+ val:
41
+ url:
42
+ - /export/share/dongxuli/data/lavis/snli/ve_dev.json
43
+ storage:
44
+ - snli/annotations/ve_dev.json
45
+ test:
46
+ url:
47
+ - /export/share/dongxuli/data/lavis/snli/ve_test.json
48
+ storage:
49
+ - snli/annotations/ve_test.json
50
+ images:
51
+ # storage: flickr30k/images/flickr30k-images
52
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
53
+
54
+
55
+ run:
56
+ task: multimodal_classification
57
+ # optimizer
58
+ lr_sched: "linear_warmup_cosine_lr"
59
+ init_lr: 1e-5
60
+ min_lr: 0
61
+ warmup_lr: 1e-8
62
+ warmup_steps: 1000
63
+ weight_decay: 0.05
64
+ max_epoch: 1
65
+ batch_size_train: 16
66
+ batch_size_eval: 1
67
+ num_workers: 8
68
+ accum_grad_iters: 1
69
+
70
+ max_len: 30
71
+ min_len: 1
72
+ num_beams: 5
73
+ inference_method: "generate"
74
+ prompt: ""
75
+ length_penalty: -1.
76
+
77
+ seed: 42
78
+ output_dir: "output/instructblip/snlive_classification_vicuna13b_val/"
79
+
80
+ amp: True
81
+ resume_ckpt_path: null
82
+
83
+ evaluate: True
84
+ # train_splits: ["train"]
85
+ valid_splits: ["val"]
86
+
87
+
88
+ device: "cuda"
89
+ world_size: 1
90
+ dist_url: "env://"
91
+ distributed: True
92
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
93
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna13b_test.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: ""
11
+
12
+ datasets:
13
+ snli_ve_instruct:
14
+ # data_dir: ${env.data_dir}/datasets
15
+ data_type: images # [images|videos|features]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ text_processor:
26
+ train:
27
+ name: "blip_caption"
28
+ eval:
29
+ name: "blip_caption"
30
+ # prompt: "how would you respond to "
31
+
32
+ build_info:
33
+ # Be careful not to append minus sign (-) before split to avoid itemizing
34
+ annotations:
35
+ train:
36
+ url:
37
+ - /export/share/dongxuli/data/lavis/snli/ve_train.json
38
+ storage:
39
+ - snli/annotations/ve_train.json
40
+ val:
41
+ url:
42
+ - /export/share/dongxuli/data/lavis/snli/ve_dev.json
43
+ storage:
44
+ - snli/annotations/ve_dev.json
45
+ test:
46
+ url:
47
+ - /export/share/dongxuli/data/lavis/snli/ve_test.json
48
+ storage:
49
+ - snli/annotations/ve_test.json
50
+ images:
51
+ # storage: flickr30k/images/flickr30k-images
52
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
53
+
54
+
55
+ run:
56
+ task: multimodal_classification
57
+ # optimizer
58
+ lr_sched: "linear_warmup_cosine_lr"
59
+ init_lr: 1e-5
60
+ min_lr: 0
61
+ warmup_lr: 1e-8
62
+ warmup_steps: 1000
63
+ weight_decay: 0.05
64
+ max_epoch: 1
65
+ batch_size_train: 16
66
+ batch_size_eval: 1
67
+ num_workers: 8
68
+ accum_grad_iters: 1
69
+
70
+ max_len: 30
71
+ min_len: 1
72
+ num_beams: 5
73
+ inference_method: "generate"
74
+ prompt: ""
75
+ length_penalty: -1.
76
+
77
+ seed: 42
78
+ output_dir: "output/instructblip/snlive_classification_vicuna13b_test/"
79
+
80
+ amp: True
81
+ resume_ckpt_path: null
82
+
83
+ evaluate: True
84
+ # train_splits: ["train"]
85
+ valid_splits: ["test"]
86
+
87
+
88
+ device: "cuda"
89
+ world_size: 1
90
+ dist_url: "env://"
91
+ distributed: True
92
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
93
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna7b_test.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: ""
11
+
12
+ datasets:
13
+ snli_ve_instruct:
14
+ # data_dir: ${env.data_dir}/datasets
15
+ data_type: images # [images|videos|features]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ text_processor:
26
+ train:
27
+ name: "blip_caption"
28
+ eval:
29
+ name: "blip_caption"
30
+ # prompt: "given the image respond to "
31
+
32
+ build_info:
33
+ # Be careful not to append minus sign (-) before split to avoid itemizing
34
+ annotations:
35
+ train:
36
+ url:
37
+ - /export/share/dongxuli/data/lavis/snli/ve_train.json
38
+ storage:
39
+ - snli/annotations/ve_train.json
40
+ val:
41
+ url:
42
+ - /export/share/dongxuli/data/lavis/snli/ve_dev.json
43
+ storage:
44
+ - snli/annotations/ve_dev.json
45
+ test:
46
+ url:
47
+ - /export/share/dongxuli/data/lavis/snli/ve_test.json
48
+ storage:
49
+ - snli/annotations/ve_test.json
50
+ images:
51
+ # storage: flickr30k/images/flickr30k-images
52
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
53
+
54
+
55
+ run:
56
+ task: multimodal_classification
57
+ # optimizer
58
+ lr_sched: "linear_warmup_cosine_lr"
59
+ init_lr: 1e-5
60
+ min_lr: 0
61
+ warmup_lr: 1e-8
62
+ warmup_steps: 1000
63
+ weight_decay: 0.05
64
+ max_epoch: 1
65
+ batch_size_train: 16
66
+ batch_size_eval: 1
67
+ num_workers: 8
68
+ accum_grad_iters: 1
69
+
70
+ max_len: 30
71
+ min_len: 1
72
+ num_beams: 5
73
+ inference_method: "generate"
74
+ prompt: ""
75
+ length_penalty: -1.
76
+
77
+ seed: 42
78
+ output_dir: "output/instructblip/snlive_classification_vicuna7b_test/"
79
+
80
+ amp: True
81
+ resume_ckpt_path: null
82
+
83
+ evaluate: True
84
+ # train_splits: ["train"]
85
+ valid_splits: ["test"]
86
+
87
+
88
+ device: "cuda"
89
+ world_size: 1
90
+ dist_url: "env://"
91
+ distributed: True
92
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
93
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna7b_val.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: ""
11
+
12
+ datasets:
13
+ snli_ve_instruct:
14
+ # data_dir: ${env.data_dir}/datasets
15
+ data_type: images # [images|videos|features]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ text_processor:
26
+ train:
27
+ name: "blip_caption"
28
+ eval:
29
+ name: "blip_caption"
30
+ # prompt: "given the image respond to "
31
+
32
+ build_info:
33
+ # Be careful not to append minus sign (-) before split to avoid itemizing
34
+ annotations:
35
+ train:
36
+ url:
37
+ - /export/share/dongxuli/data/lavis/snli/ve_train.json
38
+ storage:
39
+ - snli/annotations/ve_train.json
40
+ val:
41
+ url:
42
+ - /export/share/dongxuli/data/lavis/snli/ve_dev.json
43
+ storage:
44
+ - snli/annotations/ve_dev.json
45
+ test:
46
+ url:
47
+ - /export/share/dongxuli/data/lavis/snli/ve_test.json
48
+ storage:
49
+ - snli/annotations/ve_test.json
50
+ images:
51
+ # storage: flickr30k/images/flickr30k-images
52
+ storage: /export/share/datasets/vision/flickr30k/flickr30k-images
53
+
54
+
55
+ run:
56
+ task: multimodal_classification
57
+ # optimizer
58
+ lr_sched: "linear_warmup_cosine_lr"
59
+ init_lr: 1e-5
60
+ min_lr: 0
61
+ warmup_lr: 1e-8
62
+ warmup_steps: 1000
63
+ weight_decay: 0.05
64
+ max_epoch: 1
65
+ batch_size_train: 16
66
+ batch_size_eval: 1
67
+ num_workers: 8
68
+ accum_grad_iters: 1
69
+
70
+ max_len: 30
71
+ min_len: 1
72
+ num_beams: 5
73
+ inference_method: "generate"
74
+ prompt: ""
75
+ length_penalty: -1.
76
+
77
+ seed: 42
78
+ output_dir: "output/instructblip/snlive_classification_vicuna7b_val/"
79
+
80
+ amp: True
81
+ resume_ckpt_path: null
82
+
83
+ evaluate: True
84
+ # train_splits: ["train"]
85
+ valid_splits: ["val"]
86
+
87
+
88
+ device: "cuda"
89
+ world_size: 1
90
+ dist_url: "env://"
91
+ distributed: True
92
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
93
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/completion_modelnet40_vicuna13b.yaml ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: "describe the 3d model"
11
+
12
+ datasets:
13
+ modelnet40_cls: # name of the dataset builder
14
+ data_type: [images,pc]
15
+
16
+ vis_processor:
17
+ train:
18
+ name: "clip_image_train"
19
+ image_size: 224
20
+ eval:
21
+ name: "clip_image_eval"
22
+ image_size: 224
23
+
24
+ pc_processor:
25
+ train:
26
+ name: "ulip_pc"
27
+ eval:
28
+ name: "ulip_pc"
29
+ text_processor:
30
+ train:
31
+ name: "blip_caption"
32
+ eval:
33
+ name: "blip_caption"
34
+
35
+ build_info:
36
+ # Be careful not to append minus sign (-) before split to avoid itemizing
37
+ annotations:
38
+ train:
39
+ url:
40
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
41
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
42
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
43
+ storage:
44
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
45
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
46
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
47
+ val:
48
+ url:
49
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
50
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
51
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
52
+ storage:
53
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
54
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
55
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
56
+
57
+ pc:
58
+ storage: /export/home/ULIP/data/modelnet40_normal_resampled
59
+
60
+ images:
61
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet_images8192
62
+
63
+
64
+ run:
65
+ task: multimodal_classification
66
+ # optimizer
67
+ lr_sched: "linear_warmup_cosine_lr"
68
+ init_lr: 1e-5
69
+ min_lr: 0
70
+ warmup_lr: 1e-8
71
+ warmup_steps: 1000
72
+ weight_decay: 0.05
73
+ max_epoch: 1
74
+ batch_size_train: 32
75
+ batch_size_eval: 1
76
+ num_workers: 8
77
+ accum_grad_iters: 1
78
+
79
+ max_len: 80
80
+ min_len: 1
81
+ num_beams: 5
82
+ length_penalty: 0.
83
+ prompt: "describe the 3d model"
84
+
85
+ seed: 42
86
+ output_dir: "output/instructblip/modelnet_completion_vicuna13b/"
87
+
88
+
89
+ amp: True
90
+ resume_ckpt_path: null
91
+
92
+ evaluate: True
93
+ # train_splits: ["train"]
94
+ valid_splits: ["val"]
95
+ device: "cuda"
96
+ world_size: 1
97
+ dist_url: "env://"
98
+ distributed: True
99
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
100
+ val_freq: 1
101
+
LAVIS-main/lavis/projects/instructblip/completion_modelnet40_vicuna7b.yaml ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "describe the 3d model"
11
+ predict_with_gen: True
12
+
13
+ datasets:
14
+ modelnet40_cls: # name of the dataset builder
15
+ data_type: [images, pc]
16
+
17
+ vis_processor:
18
+ train:
19
+ name: "clip_image_train"
20
+ image_size: 224
21
+ eval:
22
+ name: "clip_image_eval"
23
+ image_size: 224
24
+
25
+ pc_processor:
26
+ train:
27
+ name: "ulip_pc"
28
+ eval:
29
+ name: "ulip_pc"
30
+ text_processor:
31
+ train:
32
+ name: "blip_caption"
33
+ eval:
34
+ name: "blip_caption"
35
+
36
+ build_info:
37
+ # Be careful not to append minus sign (-) before split to avoid itemizing
38
+ annotations:
39
+ train:
40
+ url:
41
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
42
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
43
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
44
+ storage:
45
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
46
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
47
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
48
+ val:
49
+ url:
50
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
51
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
52
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
53
+ storage:
54
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
55
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
56
+ - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
57
+
58
+ pc:
59
+ storage: /export/home/ULIP/data/modelnet40_normal_resampled
60
+
61
+ images:
62
+ storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet_images8192
63
+
64
+
65
+ run:
66
+ task: multimodal_classification
67
+ # optimizer
68
+ lr_sched: "linear_warmup_cosine_lr"
69
+ init_lr: 1e-5
70
+ min_lr: 0
71
+ warmup_lr: 1e-8
72
+ warmup_steps: 1000
73
+ weight_decay: 0.05
74
+ max_epoch: 1
75
+ batch_size_train: 32
76
+ batch_size_eval: 1
77
+ num_workers: 8
78
+ accum_grad_iters: 1
79
+
80
+ max_len: 80
81
+ min_len: 1
82
+ num_beams: 5
83
+ length_penalty: 0.
84
+ prompt: "describe the 3d model"
85
+
86
+ seed: 42
87
+ output_dir: "output/instructblip/modelnet_completion_vicuna7b/"
88
+
89
+
90
+ amp: True
91
+ resume_ckpt_path: null
92
+
93
+ evaluate: True
94
+ # train_splits: ["train"]
95
+ valid_splits: ["val"]
96
+ device: "cuda"
97
+ world_size: 1
98
+ dist_url: "env://"
99
+ distributed: True
100
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
101
+ val_freq: 1
102
+
LAVIS-main/lavis/projects/instructblip/qa_msrvtt_flant5xl_eval_test.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xl
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msrvtt_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
38
+ storage: msrvtt/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
41
+ storage: msrvtt/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
44
+ storage: msrvtt/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
47
+ storage: msrvtt/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msrvtt/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+
54
+ run:
55
+ task: gqa
56
+ # optimizer
57
+ lr_sched: "linear_warmup_cosine_lr"
58
+ init_lr: 1e-5
59
+ min_lr: 0
60
+ warmup_lr: 1e-8
61
+ warmup_steps: 1000
62
+ weight_decay: 0.05
63
+ max_epoch: 1
64
+ batch_size_train: 16
65
+ batch_size_eval: 1
66
+ num_workers: 8
67
+ accum_grad_iters: 1
68
+
69
+ max_len: 10
70
+ min_len: 1
71
+ num_beams: 5
72
+ inference_method: "generate"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msrvtt_qa_flant5xl_test/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+
86
+ device: "cuda"
87
+ world_size: 1
88
+ dist_url: "env://"
89
+ distributed: True
90
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
91
+ val_freq: 1
92
+
LAVIS-main/lavis/projects/instructblip/qa_msrvtt_flant5xxl_eval_test.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xxl
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msrvtt_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
38
+ storage: msrvtt/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
41
+ storage: msrvtt/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
44
+ storage: msrvtt/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
47
+ storage: msrvtt/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msrvtt/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+
54
+ run:
55
+ task: gqa
56
+ # optimizer
57
+ lr_sched: "linear_warmup_cosine_lr"
58
+ init_lr: 1e-5
59
+ min_lr: 0
60
+ warmup_lr: 1e-8
61
+ warmup_steps: 1000
62
+ weight_decay: 0.05
63
+ max_epoch: 1
64
+ batch_size_train: 16
65
+ batch_size_eval: 1
66
+ num_workers: 0
67
+ accum_grad_iters: 1
68
+
69
+ max_len: 10
70
+ min_len: 1
71
+ num_beams: 5
72
+ inference_method: "generate"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msrvtt_qa_flant5xxl_test/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+
86
+ device: "cuda"
87
+ world_size: 1
88
+ dist_url: "env://"
89
+ distributed: True
90
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
91
+ val_freq: 1
92
+
LAVIS-main/lavis/projects/instructblip/qa_msrvtt_vicuna13b_eval_test.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msrvtt_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
38
+ storage: msrvtt/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
41
+ storage: msrvtt/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
44
+ storage: msrvtt/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
47
+ storage: msrvtt/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msrvtt/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+
54
+ run:
55
+ task: gqa
56
+ # optimizer
57
+ lr_sched: "linear_warmup_cosine_lr"
58
+ init_lr: 1e-5
59
+ min_lr: 0
60
+ warmup_lr: 1e-8
61
+ warmup_steps: 1000
62
+ weight_decay: 0.05
63
+ max_epoch: 1
64
+ batch_size_train: 16
65
+ batch_size_eval: 1
66
+ num_workers: 8
67
+ accum_grad_iters: 1
68
+
69
+ max_len: 10
70
+ min_len: 1
71
+ num_beams: 5
72
+ inference_method: "generate"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msrvtt_qa_vicuna13b_test/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+
86
+ device: "cuda"
87
+ world_size: 1
88
+ dist_url: "env://"
89
+ distributed: True
90
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
91
+ val_freq: 1
92
+
LAVIS-main/lavis/projects/instructblip/qa_msrvtt_vicuna7b_eval_test.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msrvtt_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to append minus sign (-) before split to avoid itemizing
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
38
+ storage: msrvtt/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
41
+ storage: msrvtt/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
44
+ storage: msrvtt/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
47
+ storage: msrvtt/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msrvtt/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+
54
+ run:
55
+ task: gqa
56
+ # optimizer
57
+ lr_sched: "linear_warmup_cosine_lr"
58
+ init_lr: 1e-5
59
+ min_lr: 0
60
+ warmup_lr: 1e-8
61
+ warmup_steps: 1000
62
+ weight_decay: 0.05
63
+ max_epoch: 1
64
+ batch_size_train: 16
65
+ batch_size_eval: 1
66
+ num_workers: 8
67
+ accum_grad_iters: 1
68
+
69
+ max_len: 10
70
+ min_len: 1
71
+ num_beams: 5
72
+ inference_method: "generate"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msrvtt_qa_vicuna7b_test/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+
86
+ device: "cuda"
87
+ world_size: 1
88
+ dist_url: "env://"
89
+ distributed: True
90
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
91
+ val_freq: 1
92
+
LAVIS-main/lavis/projects/instructblip/qa_msvd_flant5xl_eval.yaml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xl
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msvd_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
38
+ storage: msvd/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
41
+ storage: msvd/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
44
+ storage: msvd/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
47
+ storage: msvd/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msvd/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+ run:
54
+ task: gqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 30
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ prompt: "Question: {} Short Answer:"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msvd_qa_flant5xl/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+ ques_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
86
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
87
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_questions.json"}
88
+ anno_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
89
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
90
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_annotations.json"}
91
+
92
+
93
+
94
+
95
+ device: "cuda"
96
+ world_size: 1
97
+ dist_url: "env://"
98
+ distributed: True
99
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
100
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_msvd_flant5xxl_eval.yaml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xxl
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msvd_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
38
+ storage: msvd/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
41
+ storage: msvd/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
44
+ storage: msvd/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
47
+ storage: msvd/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msvd/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+ run:
54
+ task: gqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 30
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ prompt: "Question: {} Short Answer:"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msvd_qa_flant5xxl/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+ ques_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
86
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
87
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_questions.json"}
88
+ anno_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
89
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
90
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_annotations.json"}
91
+
92
+
93
+
94
+
95
+ device: "cuda"
96
+ world_size: 1
97
+ dist_url: "env://"
98
+ distributed: True
99
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
100
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_msvd_vicuna13b_eval.yaml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msvd_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
38
+ storage: msvd/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
41
+ storage: msvd/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
44
+ storage: msvd/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
47
+ storage: msvd/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msvd/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+ run:
54
+ task: gqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 30
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ prompt: "Question: {} Short Answer:"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msvd_qa_vicuna13b/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+ ques_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
86
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
87
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_questions.json"}
88
+ anno_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
89
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
90
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_annotations.json"}
91
+
92
+
93
+
94
+
95
+ device: "cuda"
96
+ world_size: 1
97
+ dist_url: "env://"
98
+ distributed: True
99
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
100
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_msvd_vicuna7b_eval.yaml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ msvd_qa_instruct:
14
+ vis_processor:
15
+ train:
16
+ name: alpro_video_train
17
+ n_frms: 4
18
+ image_size: 224
19
+ min_scale: 0.9
20
+ max_scale: 1.0
21
+ eval:
22
+ name: alpro_video_eval
23
+ n_frms: 4
24
+ image_size: 224
25
+ min_scale: 0.9
26
+ max_scale: 1.0
27
+ text_processor:
28
+ train:
29
+ name: "blip_question"
30
+ eval:
31
+ name: "blip_caption"
32
+
33
+ build_info:
34
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
35
+ annotations:
36
+ train:
37
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
38
+ storage: msvd/annotations/qa_train.json
39
+ val:
40
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
41
+ storage: msvd/annotations/qa_val.json
42
+ test:
43
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
44
+ storage: msvd/annotations/qa_test.json
45
+ ans2label:
46
+ url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
47
+ storage: msvd/annotations/qa_ans2label.json
48
+ videos:
49
+ storage: /export/share/datasets/vision_language/msvd/videos
50
+
51
+ instance_id_key: question_id
52
+
53
+ run:
54
+ task: gqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 30
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ prompt: "Question: {} Short Answer:"
73
+ length_penalty: -1.
74
+
75
+
76
+ seed: 42
77
+ output_dir: "output/instructblip/msvd_qa_vicuna7b/"
78
+
79
+ amp: True
80
+ resume_ckpt_path: null
81
+
82
+ evaluate: True
83
+ # train_splits: ["train"]
84
+ valid_splits: ["test"]
85
+ ques_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
86
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
87
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_questions.json"}
88
+ anno_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
89
+ "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
90
+ "test":"/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_annotations.json"}
91
+
92
+
93
+
94
+
95
+ device: "cuda"
96
+ world_size: 1
97
+ dist_url: "env://"
98
+ distributed: True
99
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
100
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_okvqa_flant5xl_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xl
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ ok_vqa:
14
+ vis_processor:
15
+ train:
16
+ name: "clip_image_train"
17
+ eval:
18
+ name: "clip_image_eval"
19
+ text_processor:
20
+ train:
21
+ name: "blip_question"
22
+ eval:
23
+ name: "blip_caption"
24
+ build_info:
25
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
26
+ annotations:
27
+ train:
28
+ url:
29
+ # TODO make this order insensitive
30
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
31
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
32
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
33
+ storage:
34
+ - okvqa/annotations/okvqa_train.json
35
+ # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
36
+ # - okvqa/annotations/mscoco_train2014_annotations.json
37
+ test:
38
+ url:
39
+ # TODO make this order insensitive
40
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
41
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
42
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
43
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
44
+ storage:
45
+ - okvqa/annotations/vqa_val_eval.json
46
+ - okvqa/annotations/answer_list.json
47
+ - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
48
+ - okvqa/annotations/mscoco_val2014_annotations.json
49
+ images:
50
+ storage: /export/share/datasets/vision/coco/images
51
+
52
+
53
+ run:
54
+ task: vqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 10
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ length_penalty: -1.
73
+
74
+ seed: 42
75
+ output_dir: "output/instructblip/okavqa_qa_flant5xl/"
76
+
77
+ amp: True
78
+ resume_ckpt_path: null
79
+
80
+ evaluate: True
81
+ # train_splits: ["train"]
82
+ valid_splits: ["test"]
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
90
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_okvqa_flant5xxl_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_t5_instruct
8
+ model_type: flant5xxl
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ ok_vqa:
14
+ vis_processor:
15
+ train:
16
+ name: "clip_image_train"
17
+ eval:
18
+ name: "clip_image_eval"
19
+ text_processor:
20
+ train:
21
+ name: "blip_question"
22
+ eval:
23
+ name: "blip_caption"
24
+ build_info:
25
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
26
+ annotations:
27
+ train:
28
+ url:
29
+ # TODO make this order insensitive
30
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
31
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
32
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
33
+ storage:
34
+ - okvqa/annotations/okvqa_train.json
35
+ # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
36
+ # - okvqa/annotations/mscoco_train2014_annotations.json
37
+ test:
38
+ url:
39
+ # TODO make this order insensitive
40
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
41
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
42
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
43
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
44
+ storage:
45
+ - okvqa/annotations/vqa_val_eval.json
46
+ - okvqa/annotations/answer_list.json
47
+ - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
48
+ - okvqa/annotations/mscoco_val2014_annotations.json
49
+ images:
50
+ storage: /export/share/datasets/vision/coco/images
51
+
52
+
53
+ run:
54
+ task: vqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 0
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 10
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ length_penalty: -1.
73
+
74
+ seed: 42
75
+ output_dir: "output/instructblip/okavqa_qa_flant5xxl/"
76
+
77
+ amp: True
78
+ resume_ckpt_path: null
79
+
80
+ evaluate: True
81
+ # train_splits: ["train"]
82
+ valid_splits: ["test"]
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save a checkpoint every N epochs; -1 saves only the last and best checkpoints.
90
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna13b_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna13b
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ ok_vqa:
14
+ vis_processor:
15
+ train:
16
+ name: "clip_image_train"
17
+ eval:
18
+ name: "clip_image_eval"
19
+ text_processor:
20
+ train:
21
+ name: "blip_question"
22
+ eval:
23
+ name: "blip_caption"
24
+ build_info:
25
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
26
+ annotations:
27
+ train:
28
+ url:
29
+ # TODO make this order insensitive
30
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
31
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
32
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
33
+ storage:
34
+ - okvqa/annotations/okvqa_train.json
35
+ # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
36
+ # - okvqa/annotations/mscoco_train2014_annotations.json
37
+ test:
38
+ url:
39
+ # TODO make this order insensitive
40
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
41
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
42
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
43
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
44
+ storage:
45
+ - okvqa/annotations/vqa_val_eval.json
46
+ - okvqa/annotations/answer_list.json
47
+ - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
48
+ - okvqa/annotations/mscoco_val2014_annotations.json
49
+ images:
50
+ storage: /export/share/datasets/vision/coco/images
51
+
52
+
53
+ run:
54
+ task: vqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 10
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ length_penalty: -1.
73
+
74
+ seed: 42
75
+ output_dir: "output/instructblip/okavqa_qa_vicuna13b/"
76
+
77
+ amp: True
78
+ resume_ckpt_path: null
79
+
80
+ evaluate: True
81
+ # train_splits: ["train"]
82
+ valid_splits: ["test"]
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save a checkpoint every N epochs; -1 saves only the last and best checkpoints.
90
+ val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna7b_eval.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_vicuna_instruct
8
+ model_type: vicuna7b
9
+ load_pretrained: True
10
+ prompt: "Question: {} Short Answer:"
11
+
12
+ datasets:
13
+ ok_vqa:
14
+ vis_processor:
15
+ train:
16
+ name: "clip_image_train"
17
+ eval:
18
+ name: "clip_image_eval"
19
+ text_processor:
20
+ train:
21
+ name: "blip_question"
22
+ eval:
23
+ name: "blip_caption"
24
+ build_info:
25
+ # Be careful not to prefix a split name with a minus sign (-); YAML would parse it as a list item.
26
+ annotations:
27
+ train:
28
+ url:
29
+ # TODO make this order insensitive
30
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
31
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
32
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
33
+ storage:
34
+ - okvqa/annotations/okvqa_train.json
35
+ # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
36
+ # - okvqa/annotations/mscoco_train2014_annotations.json
37
+ test:
38
+ url:
39
+ # TODO make this order insensitive
40
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
41
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
42
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
43
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
44
+ storage:
45
+ - okvqa/annotations/vqa_val_eval.json
46
+ - okvqa/annotations/answer_list.json
47
+ - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
48
+ - okvqa/annotations/mscoco_val2014_annotations.json
49
+ images:
50
+ storage: /export/share/datasets/vision/coco/images
51
+
52
+
53
+ run:
54
+ task: vqa
55
+ # optimizer
56
+ lr_sched: "linear_warmup_cosine_lr"
57
+ init_lr: 1e-5
58
+ min_lr: 0
59
+ warmup_lr: 1e-8
60
+ warmup_steps: 1000
61
+ weight_decay: 0.05
62
+ max_epoch: 1
63
+ batch_size_train: 16
64
+ batch_size_eval: 1
65
+ num_workers: 8
66
+ accum_grad_iters: 1
67
+
68
+ max_len: 10
69
+ min_len: 1
70
+ num_beams: 5
71
+ inference_method: "generate"
72
+ length_penalty: -1.
73
+
74
+ seed: 42
75
+ output_dir: "output/instructblip/okavqa_qa_vicuna7b/"
76
+
77
+ amp: True
78
+ resume_ckpt_path: null
79
+
80
+ evaluate: True
81
+ # train_splits: ["train"]
82
+ valid_splits: ["test"]
83
+
84
+
85
+ device: "cuda"
86
+ world_size: 1
87
+ dist_url: "env://"
88
+ distributed: True
89
+ save_freq: -1 # save a checkpoint every N epochs; -1 saves only the last and best checkpoints.
90
+ val_freq: 1
LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: base
9
+
10
+ datasets:
11
+ gqa: # name of the dataset builder
12
+ type: balanced_testdev
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: gqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 16
26
+ batch_size_eval: 16
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 5
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA/GQA"
52
+
53
+ evaluate: True
54
+ test_splits: ["val"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval_3b.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: 3b
9
+
10
+ datasets:
11
+ gqa: # name of the dataset builder
12
+ type: balanced_testdev
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: gqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 4
26
+ batch_size_eval: 4
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 5
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA-3b/GQA"
52
+
53
+ evaluate: True
54
+ test_splits: ["val"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval_large.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: large
9
+
10
+ datasets:
11
+ gqa: # name of the dataset builder
12
+ type: balanced_testdev
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: gqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 12
26
+ batch_size_eval: 12
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 5
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA-large/GQA"
52
+
53
+ evaluate: True
54
+ test_splits: ["val"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: base
9
+
10
+ datasets:
11
+ ok_vqa: # name of the dataset builder
12
+ vis_processor:
13
+ eval:
14
+ name: "blip_image_eval"
15
+ image_size: 384
16
+ text_processor:
17
+ eval:
18
+ name: "blip_question"
19
+
20
+ run:
21
+ task: vqa_reading_comprehension
22
+
23
+ # optimization-specific
24
+ batch_size_train: 16
25
+ batch_size_eval: 16
26
+ num_workers: 4
27
+
28
+ # image question matching specific
29
+ block_num: 7
30
+
31
+ # image captioning specific
32
+ top_k: 50
33
+ top_p: 1
34
+ cap_min_length: 10
35
+ cap_max_length: 20
36
+ repetition_penalty: 1
37
+ num_patches: 20
38
+ num_captions: 100
39
+ prompt: 'a picture of '
40
+
41
+ # question answering specific
42
+ internal_bsz_fid: 1
43
+ num_captions_fid: 1
44
+ min_len: 0
45
+ max_len: 20
46
+ num_beams: 1
47
+ inference_method: "generate"
48
+
49
+ seed: 42
50
+ output_dir: "output/PNP-VQA/OKVQA"
51
+
52
+ evaluate: True
53
+ test_splits: ["test"]
54
+
55
+ # distribution-specific
56
+ device: "cuda"
57
+ world_size: 1
58
+ dist_url: "env://"
59
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval_3b.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: 3b
9
+
10
+ datasets:
11
+ ok_vqa: # name of the dataset builder
12
+ vis_processor:
13
+ eval:
14
+ name: "blip_image_eval"
15
+ image_size: 384
16
+ text_processor:
17
+ eval:
18
+ name: "blip_question"
19
+
20
+ run:
21
+ task: vqa_reading_comprehension
22
+
23
+ # optimization-specific
24
+ batch_size_train: 4
25
+ batch_size_eval: 4
26
+ num_workers: 4
27
+
28
+ # image question matching specific
29
+ block_num: 7
30
+
31
+ # image captioning specific
32
+ top_k: 50
33
+ top_p: 1
34
+ cap_min_length: 10
35
+ cap_max_length: 20
36
+ repetition_penalty: 1
37
+ num_patches: 20
38
+ num_captions: 100
39
+ prompt: 'a picture of '
40
+
41
+ # question answering specific
42
+ internal_bsz_fid: 1
43
+ num_captions_fid: 1
44
+ min_len: 0
45
+ max_len: 20
46
+ num_beams: 1
47
+ inference_method: "generate"
48
+
49
+ seed: 42
50
+ output_dir: "output/PNP-VQA-3b/OKVQA"
51
+
52
+ evaluate: True
53
+ test_splits: ["test"]
54
+
55
+ # distribution-specific
56
+ device: "cuda"
57
+ world_size: 1
58
+ dist_url: "env://"
59
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval_large.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: large
9
+
10
+ datasets:
11
+ ok_vqa: # name of the dataset builder
12
+ vis_processor:
13
+ eval:
14
+ name: "blip_image_eval"
15
+ image_size: 384
16
+ text_processor:
17
+ eval:
18
+ name: "blip_question"
19
+
20
+ run:
21
+ task: vqa_reading_comprehension
22
+
23
+ # optimization-specific
24
+ batch_size_train: 12
25
+ batch_size_eval: 12
26
+ num_workers: 4
27
+
28
+ # image question matching specific
29
+ block_num: 7
30
+
31
+ # image captioning specific
32
+ top_k: 50
33
+ top_p: 1
34
+ cap_min_length: 10
35
+ cap_max_length: 20
36
+ repetition_penalty: 1
37
+ num_patches: 20
38
+ num_captions: 100
39
+ prompt: 'a picture of '
40
+
41
+ # question answering specific
42
+ internal_bsz_fid: 1
43
+ num_captions_fid: 1
44
+ min_len: 0
45
+ max_len: 20
46
+ num_beams: 1
47
+ inference_method: "generate"
48
+
49
+ seed: 42
50
+ output_dir: "output/PNP-VQA-large/OKVQA"
51
+
52
+ evaluate: True
53
+ test_splits: ["test"]
54
+
55
+ # distribution-specific
56
+ device: "cuda"
57
+ world_size: 1
58
+ dist_url: "env://"
59
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: base
9
+
10
+ datasets:
11
+ coco_vqa: # name of the dataset builder
12
+ type: eval
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: vqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 16
26
+ batch_size_eval: 16
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 1
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA/VQAv2_val"
52
+
53
+ evaluate: True
54
+ test_splits: ["val"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval_3b.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: 3b
9
+
10
+ datasets:
11
+ coco_vqa: # name of the dataset builder
12
+ type: eval
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: vqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 4
26
+ batch_size_eval: 4
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 1
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA-3b/VQAv2_val"
52
+
53
+ evaluate: True
54
+ test_splits: ["val"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval_large.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: large
9
+
10
+ datasets:
11
+ coco_vqa: # name of the dataset builder
12
+ type: eval
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: vqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 12
26
+ batch_size_eval: 12
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 1
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA-large/VQAv2_val"
52
+
53
+ evaluate: True
54
+ test_splits: ["val"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: base
9
+
10
+ datasets:
11
+ coco_vqa: # name of the dataset builder
12
+ type: default
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: vqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 16
26
+ batch_size_eval: 16
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 1
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA/VQAv2_test"
52
+
53
+ evaluate: True
54
+ test_splits: ["test"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval_3b.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: 3b
9
+
10
+ datasets:
11
+ coco_vqa: # name of the dataset builder
12
+ type: default
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: vqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 4
26
+ batch_size_eval: 4
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 1
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA-3b/VQAv2_test"
52
+
53
+ evaluate: True
54
+ test_splits: ["test"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval_large.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: pnp_vqa
8
+ model_type: large
9
+
10
+ datasets:
11
+ coco_vqa: # name of the dataset builder
12
+ type: default
13
+ vis_processor:
14
+ eval:
15
+ name: "blip_image_eval"
16
+ image_size: 384
17
+ text_processor:
18
+ eval:
19
+ name: "blip_question"
20
+
21
+ run:
22
+ task: vqa_reading_comprehension
23
+
24
+ # optimization-specific
25
+ batch_size_train: 12
26
+ batch_size_eval: 12
27
+ num_workers: 4
28
+
29
+ # image question matching specific
30
+ block_num: 7
31
+
32
+ # image captioning specific
33
+ top_k: 50
34
+ top_p: 1
35
+ cap_min_length: 10
36
+ cap_max_length: 20
37
+ repetition_penalty: 1
38
+ num_patches: 20
39
+ num_captions: 100
40
+ prompt: 'a picture of '
41
+
42
+ # question answering specific
43
+ internal_bsz_fid: 1
44
+ num_captions_fid: 1
45
+ min_len: 0
46
+ max_len: 20
47
+ num_beams: 1
48
+ inference_method: "generate"
49
+
50
+ seed: 42
51
+ output_dir: "output/PNP-VQA-large/VQAv2_test"
52
+
53
+ evaluate: True
54
+ test_splits: ["test"]
55
+
56
+ # distribution-specific
57
+ device: "cuda"
58
+ world_size: 1
59
+ dist_url: "env://"
60
+ distributed: True
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_caption.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by reference to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: True
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+
81
+
82
+ datasets:
83
+ audio_video_discrn:
84
+ # data_dir: ${env.data_dir}/datasets
85
+ audio_processor:
86
+ train:
87
+ name: beats_audio
88
+ n_frames: 2
89
+ eval:
90
+ name: beats_audio
91
+ n_frames: 2
92
+
93
+ text_processor:
94
+ train:
95
+ name: "blip_caption"
96
+ eval:
97
+ name: "blip_caption"
98
+
99
+ video_processor:
100
+ train:
101
+ name: alpro_video_train
102
+ n_frms: 5
103
+ image_size: 224
104
+ min_scale: 0.9
105
+ max_scale: 1.0
106
+ full_video: True
107
+ eval:
108
+ name: alpro_video_eval
109
+ n_frms: 5
110
+ image_size: 224
111
+ min_scale: 0.9
112
+ max_scale: 1.0
113
+ full_video: True
114
+
115
+ data_type: [audio, video] # [images|videos|features]
116
+
117
+ build_info:
118
+ kwargs:
119
+ total: all
120
+ shuffle_modalities: False
121
+ balance_labels: True
122
+ dataset_name: audiocaps
123
+ ground_truth: False
124
+ classnames: [audio, video]
125
+ raw: True
126
+
127
+ # Be careful not to append minus sign (-) before split to avoid itemizing
128
+ annotations:
129
+ val:
130
+ url:
131
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
132
+ storage:
133
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
134
+
135
+ audio:
136
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
137
+ video:
138
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
139
+
140
+ run:
141
+ task: discrn_qa
142
+ # optimization-specific
143
+ batch_size_train: 8
144
+ batch_size_eval: 1
145
+ num_workers: 0
146
+ max_epoch: 1
147
+ segments: 1
148
+
149
+ # inference-specific
150
+ max_len: 10
151
+ min_len: 1
152
+ length_penalty: -1.
153
+ num_beams: 5
154
+ inference_method: "generate"
155
+
156
+ train_splits: ["train"]
157
+ valid_splits: ["val"]
158
+ # test_splits: ["test"]
159
+
160
+ # distribution
161
+ device: "cuda"
162
+ world_size: 1
163
+ dist_url: "env://"
164
+ distributed: True
165
+ use_dist_eval_sampler: False
166
+
167
+
168
+ # model specific
169
+ k_test: 128
170
+
171
+ # misc
172
+ seed: 42
173
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_caption"
174
+
175
+ evaluate: True
176
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_caption_13b.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna13b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: "/path/to/vicuna-13b"
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by reference to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: True
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+
81
+
82
+ datasets:
83
+ audio_video_discrn:
84
+ # data_dir: ${env.data_dir}/datasets
85
+ audio_processor:
86
+ train:
87
+ name: beats_audio
88
+ n_frames: 2
89
+ eval:
90
+ name: beats_audio
91
+ n_frames: 2
92
+
93
+ text_processor:
94
+ train:
95
+ name: "blip_caption"
96
+ eval:
97
+ name: "blip_caption"
98
+
99
+ video_processor:
100
+ train:
101
+ name: alpro_video_train
102
+ n_frms: 5
103
+ image_size: 224
104
+ min_scale: 0.9
105
+ max_scale: 1.0
106
+ full_video: True
107
+ eval:
108
+ name: alpro_video_eval
109
+ n_frms: 5
110
+ image_size: 224
111
+ min_scale: 0.9
112
+ max_scale: 1.0
113
+ full_video: True
114
+
115
+ data_type: [audio, video] # [images|videos|features]
116
+
117
+ build_info:
118
+ kwargs:
119
+ total: all
120
+ shuffle_modalities: False
121
+ balance_labels: True
122
+ dataset_name: audiocaps
123
+ ground_truth: False
124
+ classnames: [audio, video]
125
+ raw: True
126
+
127
+ # Be careful not to append minus sign (-) before split to avoid itemizing
128
+ annotations:
129
+ val:
130
+ url:
131
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
132
+ storage:
133
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
134
+
135
+ audio:
136
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
137
+ video:
138
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
139
+
140
+ run:
141
+ task: discrn_qa
142
+ # optimization-specific
143
+ batch_size_train: 8
144
+ batch_size_eval: 1
145
+ num_workers: 0
146
+ max_epoch: 1
147
+ segments: 1
148
+
149
+ # inference-specific
150
+ max_len: 10
151
+ min_len: 1
152
+ length_penalty: -1.
153
+ num_beams: 5
154
+ inference_method: "generate"
155
+
156
+ train_splits: ["train"]
157
+ valid_splits: ["val"]
158
+ # test_splits: ["test"]
159
+
160
+ # distribution
161
+ device: "cuda"
162
+ world_size: 1
163
+ dist_url: "env://"
164
+ distributed: True
165
+ use_dist_eval_sampler: False
166
+
167
+
168
+ # model specific
169
+ k_test: 128
170
+
171
+ # misc
172
+ seed: 42
173
+ output_dir: "output/xinstructblip/eval/vicuna13b/discrn/audio_video_caption"
174
+
175
+ evaluate: True
176
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by reference to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: False
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+
81
+
82
+ datasets:
83
+ audio_video_discrn:
84
+ # data_dir: ${env.data_dir}/datasets
85
+ audio_processor:
86
+ train:
87
+ name: beats_audio
88
+ n_frames: 2
89
+ eval:
90
+ name: beats_audio
91
+ n_frames: 2
92
+
93
+ text_processor:
94
+ train:
95
+ name: "blip_caption"
96
+ eval:
97
+ name: "blip_caption"
98
+
99
+ video_processor:
100
+ train:
101
+ name: alpro_video_train
102
+ n_frms: 2
103
+ image_size: 224
104
+ min_scale: 0.9
105
+ max_scale: 1.0
106
+ full_video: True
107
+ eval:
108
+ name: alpro_video_eval
109
+ n_frms: 2
110
+ image_size: 224
111
+ min_scale: 0.9
112
+ max_scale: 1.0
113
+ full_video: True
114
+
115
+ data_type: [audio, video] # [images|videos|features]
116
+
117
+ build_info:
118
+ kwargs:
119
+ total: 100
120
+ shuffle_modalities: False
121
+ balance_labels: True
122
+ dataset_name: audiocaps
123
+ ground_truth: False
124
+ classnames: [audio, video]
125
+ raw: False
126
+
127
+ # Be careful not to append minus sign (-) before split to avoid itemizing
128
+ annotations:
129
+ val:
130
+ url:
131
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
132
+ storage:
133
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
134
+
135
+ audio:
136
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
137
+ video:
138
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
139
+
140
+ run:
141
+ task: discrn_qa
142
+ # optimization-specific
143
+ batch_size_train: 8
144
+ batch_size_eval: 1
145
+ num_workers: 8
146
+ max_epoch: 1
147
+ segments: 1
148
+
149
+ # inference-specific
150
+ max_len: 10
151
+ min_len: 1
152
+ length_penalty: -1.
153
+ num_beams: 5
154
+ inference_method: "generate"
155
+
156
+ train_splits: ["train"]
157
+ valid_splits: ["val"]
158
+ # test_splits: ["test"]
159
+
160
+ # distribution
161
+ device: "cuda"
162
+ world_size: 1
163
+ dist_url: "env://"
164
+ distributed: True
165
+ use_dist_eval_sampler: False
166
+
167
+
168
+ # model specific
169
+ k_test: 128
170
+
171
+ # misc
172
+ seed: 42
173
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe"
174
+
175
+ evaluate: True
176
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_13b.yaml ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna13b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/pc_qformer_last.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: "/path/to/vicuna-13b"
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ # special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: False
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+ remove_start: True
81
+
82
+
83
+ datasets:
84
+ audio_video_discrn:
85
+ # data_dir: ${env.data_dir}/datasets
86
+ audio_processor:
87
+ train:
88
+ name: beats_audio
89
+ n_frames: 2
90
+ eval:
91
+ name: beats_audio
92
+ n_frames: 2
93
+
94
+ text_processor:
95
+ train:
96
+ name: "blip_caption"
97
+ eval:
98
+ name: "blip_caption"
99
+
100
+ video_processor:
101
+ train:
102
+ name: alpro_video_train
103
+ n_frms: 2
104
+ image_size: 224
105
+ min_scale: 0.9
106
+ max_scale: 1.0
107
+ full_video: True
108
+ eval:
109
+ name: alpro_video_eval
110
+ n_frms: 2
111
+ image_size: 224
112
+ min_scale: 0.9
113
+ max_scale: 1.0
114
+ full_video: True
115
+
116
+ data_type: [audio, video] # [images|videos|features]
117
+
118
+ build_info:
119
+ kwargs:
120
+ total: 100
121
+ shuffle_modalities: False
122
+ balance_labels: True
123
+ dataset_name: audiocaps
124
+ ground_truth: False
125
+ classnames: [audio, video]
126
+ raw: False
127
+
128
+ # Be careful not to append minus sign (-) before split to avoid itemizing
129
+ annotations:
130
+ val:
131
+ url:
132
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
133
+ storage:
134
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
135
+
136
+ audio:
137
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
138
+ video:
139
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
140
+
141
+ run:
142
+ task: discrn_qa
143
+ # optimization-specific
144
+ batch_size_train: 8
145
+ batch_size_eval: 1
146
+ num_workers: 8
147
+ max_epoch: 1
148
+ segments: 1
149
+
150
+ # inference-specific
151
+ max_len: 10
152
+ min_len: 1
153
+ length_penalty: -1.
154
+ num_beams: 5
155
+ inference_method: "generate"
156
+
157
+ train_splits: ["train"]
158
+ valid_splits: ["val"]
159
+ # test_splits: ["test"]
160
+
161
+ # distribution
162
+ device: "cuda"
163
+ world_size: 1
164
+ dist_url: "env://"
165
+ distributed: True
166
+ use_dist_eval_sampler: False
167
+
168
+
169
+ # model specific
170
+ k_test: 128
171
+
172
+ # misc
173
+ seed: 42
174
+ output_dir: "output/xinstructblip/eval/vicuna13b/discrn/audio_video_describe"
175
+
176
+ evaluate: True
177
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_nocue.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b_nocue/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b_nocue/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b_nocue/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b_nocue/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: False
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: False
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+
81
+
82
+ datasets:
83
+ audio_video_discrn:
84
+ # data_dir: ${env.data_dir}/datasets
85
+ audio_processor:
86
+ train:
87
+ name: beats_audio
88
+ n_frames: 2
89
+ eval:
90
+ name: beats_audio
91
+ n_frames: 2
92
+
93
+ text_processor:
94
+ train:
95
+ name: "blip_caption"
96
+ eval:
97
+ name: "blip_caption"
98
+
99
+ video_processor:
100
+ train:
101
+ name: alpro_video_train
102
+ n_frms: 2
103
+ image_size: 224
104
+ min_scale: 0.9
105
+ max_scale: 1.0
106
+ full_video: True
107
+ eval:
108
+ name: alpro_video_eval
109
+ n_frms: 2
110
+ image_size: 224
111
+ min_scale: 0.9
112
+ max_scale: 1.0
113
+ full_video: True
114
+
115
+ data_type: [audio, video] # [images|videos|features]
116
+
117
+ build_info:
118
+ kwargs:
119
+ total: all
120
+ shuffle_modalities: False
121
+ balance_labels: True
122
+ dataset_name: audiocaps
123
+ ground_truth: False
124
+ classnames: [audio, video]
125
+ raw: False
126
+
127
+ # Be careful not to append minus sign (-) before split to avoid itemizing
128
+ annotations:
129
+ val:
130
+ url:
131
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
132
+ storage:
133
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
134
+
135
+ audio:
136
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
137
+ video:
138
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
139
+
140
+ run:
141
+ task: discrn_qa
142
+ # optimization-specific
143
+ batch_size_train: 8
144
+ batch_size_eval: 1
145
+ num_workers: 8
146
+ max_epoch: 1
147
+ segments: 1
148
+
149
+ # inference-specific
150
+ max_len: 10
151
+ min_len: 1
152
+ length_penalty: -1.
153
+ num_beams: 5
154
+ inference_method: "generate"
155
+
156
+ train_splits: ["train"]
157
+ valid_splits: ["val"]
158
+ # test_splits: ["test"]
159
+
160
+ # distribution
161
+ device: "cuda"
162
+ world_size: 1
163
+ dist_url: "env://"
164
+ distributed: True
165
+ use_dist_eval_sampler: False
166
+
167
+
168
+ # model specific
169
+ k_test: 128
170
+
171
+ # misc
172
+ seed: 42
173
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe_nocue"
174
+
175
+ evaluate: True
176
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_proj copy.yaml ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: /export/home/LAVIS-xgen_mm/lavis/output/xinstructblip/train/vicuna7b/audio/20231115194/checkpoint_65001.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/linear_projection_7b/audio_qformer_linear.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: False
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+ projection_only_audio: True
81
+ projection_path_audio: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/linear_projection_7b/audio_qformer_linear_768.pth
82
+ proj_dim: 768
83
+
84
+
85
+ datasets:
86
+ audio_video_discrn:
87
+ # data_dir: ${env.data_dir}/datasets
88
+ audio_processor:
89
+ train:
90
+ name: beats_audio
91
+ n_frames: 2
92
+ eval:
93
+ name: beats_audio
94
+ n_frames: 2
95
+
96
+ text_processor:
97
+ train:
98
+ name: "blip_caption"
99
+ eval:
100
+ name: "blip_caption"
101
+
102
+ video_processor:
103
+ train:
104
+ name: alpro_video_train
105
+ n_frms: 2
106
+ image_size: 224
107
+ min_scale: 0.9
108
+ max_scale: 1.0
109
+ full_video: True
110
+ eval:
111
+ name: alpro_video_eval
112
+ n_frms: 2
113
+ image_size: 224
114
+ min_scale: 0.9
115
+ max_scale: 1.0
116
+ full_video: True
117
+
118
+ data_type: [audio, video] # [images|videos|features]
119
+
120
+ build_info:
121
+ kwargs:
122
+ total: all
123
+ shuffle_modalities: False
124
+ balance_labels: True
125
+ dataset_name: audiocaps
126
+ ground_truth: False
127
+ classnames: [audio, video]
128
+ raw: False
129
+
130
+ # Be careful not to append minus sign (-) before split to avoid itemizing
131
+ annotations:
132
+ val:
133
+ url:
134
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
135
+ storage:
136
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
137
+
138
+ audio:
139
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
140
+ video:
141
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
142
+
143
+ run:
144
+ task: discrn_qa
145
+ # optimization-specific
146
+ batch_size_train: 8
147
+ batch_size_eval: 1
148
+ num_workers: 8
149
+ max_epoch: 1
150
+ segments: 1
151
+
152
+ # inference-specific
153
+ max_len: 10
154
+ min_len: 1
155
+ length_penalty: -1.
156
+ num_beams: 5
157
+ inference_method: "generate"
158
+
159
+ train_splits: ["train"]
160
+ valid_splits: ["val"]
161
+ # test_splits: ["test"]
162
+
163
+ # distribution
164
+ device: "cuda"
165
+ world_size: 1
166
+ dist_url: "env://"
167
+ distributed: True
168
+ use_dist_eval_sampler: False
169
+
170
+
171
+ # model specific
172
+ k_test: 128
173
+
174
+ # misc
175
+ seed: 42
176
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe_proj"
177
+
178
+ evaluate: True
179
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_proj.yaml ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: /export/home/LAVIS-xgen_mm/lavis/output/xinstructblip/train/vicuna7b/audio/20231115194/checkpoint_65001.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/linear_projection_7b/audio_qformer_linear.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: False
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+ projection_only_audio: True
81
+ projection_path_audio: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/linear_projection_7b/audio_qformer_linear_768.pth
82
+ proj_dim: 768
83
+
84
+
85
+ datasets:
86
+ audio_video_discrn:
87
+ # data_dir: ${env.data_dir}/datasets
88
+ audio_processor:
89
+ train:
90
+ name: beats_audio
91
+ n_frames: 2
92
+ eval:
93
+ name: beats_audio
94
+ n_frames: 2
95
+
96
+ text_processor:
97
+ train:
98
+ name: "blip_caption"
99
+ eval:
100
+ name: "blip_caption"
101
+
102
+ video_processor:
103
+ train:
104
+ name: alpro_video_train
105
+ n_frms: 2
106
+ image_size: 224
107
+ min_scale: 0.9
108
+ max_scale: 1.0
109
+ full_video: True
110
+ eval:
111
+ name: alpro_video_eval
112
+ n_frms: 2
113
+ image_size: 224
114
+ min_scale: 0.9
115
+ max_scale: 1.0
116
+ full_video: True
117
+
118
+ data_type: [audio, video] # [images|videos|features]
119
+
120
+ build_info:
121
+ kwargs:
122
+ total: all
123
+ shuffle_modalities: False
124
+ balance_labels: True
125
+ dataset_name: audiocaps
126
+ ground_truth: False
127
+ classnames: [audio, video]
128
+ raw: False
129
+
130
+ # Be careful not to append minus sign (-) before split to avoid itemizing
131
+ annotations:
132
+ val:
133
+ url:
134
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
135
+ storage:
136
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
137
+
138
+ audio:
139
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
140
+ video:
141
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
142
+
143
+ run:
144
+ task: discrn_qa
145
+ # optimization-specific
146
+ batch_size_train: 8
147
+ batch_size_eval: 1
148
+ num_workers: 8
149
+ max_epoch: 1
150
+ segments: 1
151
+
152
+ # inference-specific
153
+ max_len: 10
154
+ min_len: 1
155
+ length_penalty: -1.
156
+ num_beams: 5
157
+ inference_method: "generate"
158
+
159
+ train_splits: ["train"]
160
+ valid_splits: ["val"]
161
+ # test_splits: ["test"]
162
+
163
+ # distribution
164
+ device: "cuda"
165
+ world_size: 1
166
+ dist_url: "env://"
167
+ distributed: True
168
+ use_dist_eval_sampler: False
169
+
170
+
171
+ # model specific
172
+ k_test: 128
173
+
174
+ # misc
175
+ seed: 42
176
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe_proj"
177
+
178
+ evaluate: True
179
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_rand_init.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer_no_init.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ max_txt_len: 128
57
+ max_output_txt_len: 256
58
+ apply_lemmatizer: False
59
+ num_few_shot_examples: 0
60
+ few_shot_prob: 0
61
+ qformer_text_input: True
62
+ llm_text_input: True
63
+ modalities : [audio, video]
64
+ use_cues: True
65
+ shared_qformer: False
66
+ pretrained_shared_qformer: Null
67
+ load_attention_shared_qformer: False
68
+ load_qformer_type_shared: ""
69
+ load_projection_shared: False
70
+ load_projection_type_shaped: ""
71
+ load_ln_type_shared: ""
72
+ shared_qformer_num_features: 512
73
+ special_qformer_input_prompt: "a short description"
74
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
75
+ predict_with_gen: False
76
+ use_caption: False
77
+ use_describe: False
78
+ enumerate_inputs: False
79
+ add_space: True
80
+
81
+
82
+ datasets:
83
+ audio_video_discrn:
84
+ # data_dir: ${env.data_dir}/datasets
85
+ audio_processor:
86
+ train:
87
+ name: beats_audio
88
+ n_frames: 2
89
+ eval:
90
+ name: beats_audio
91
+ n_frames: 2
92
+
93
+ text_processor:
94
+ train:
95
+ name: "blip_caption"
96
+ eval:
97
+ name: "blip_caption"
98
+
99
+ video_processor:
100
+ train:
101
+ name: alpro_video_train
102
+ n_frms: 2
103
+ image_size: 224
104
+ min_scale: 0.9
105
+ max_scale: 1.0
106
+ full_video: True
107
+ eval:
108
+ name: alpro_video_eval
109
+ n_frms: 2
110
+ image_size: 224
111
+ min_scale: 0.9
112
+ max_scale: 1.0
113
+ full_video: True
114
+
115
+ data_type: [audio, video] # [images|videos|features]
116
+
117
+ build_info:
118
+ kwargs:
119
+ total: all
120
+ shuffle_modalities: False
121
+ balance_labels: True
122
+ dataset_name: audiocaps
123
+ ground_truth: False
124
+ classnames: [audio, video]
125
+ raw: False
126
+
127
+ # Be careful not to append minus sign (-) before split to avoid itemizing
128
+ annotations:
129
+ val:
130
+ url:
131
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
132
+ storage:
133
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
134
+
135
+ audio:
136
+ storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
137
+ video:
138
+ storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
139
+
140
+ run:
141
+ task: discrn_qa
142
+ # optimization-specific
143
+ batch_size_train: 8
144
+ batch_size_eval: 1
145
+ num_workers: 8
146
+ max_epoch: 1
147
+ segments: 1
148
+
149
+ # inference-specific
150
+ max_len: 10
151
+ min_len: 1
152
+ length_penalty: -1.
153
+ num_beams: 5
154
+ inference_method: "generate"
155
+
156
+ train_splits: ["train"]
157
+ valid_splits: ["val"]
158
+ # test_splits: ["test"]
159
+
160
+ # distribution
161
+ device: "cuda"
162
+ world_size: 1
163
+ dist_url: "env://"
164
+ distributed: True
165
+ use_dist_eval_sampler: False
166
+
167
+
168
+ # model specific
169
+ k_test: 128
170
+
171
+ # misc
172
+ seed: 42
173
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe_rand_init"
174
+
175
+ evaluate: True
176
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_caption.yaml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ modalities : [image, pc]
57
+ use_cues: True
58
+ shared_qformer: False
59
+ pretrained_shared_qformer: Null
60
+ load_attention_shared_qformer: False
61
+ load_qformer_type_shared: ""
62
+ load_projection_shared: False
63
+ load_projection_type_shared: ""
64
+ load_ln_type_shared: ""
65
+ shared_qformer_num_features: 512
66
+ special_qformer_input_prompt: "a short description"
67
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
68
+ predict_with_gen: False
69
+ use_caption: True
70
+ use_describe: False
71
+ enumerate_inputs: False
72
+ add_space: True
73
+
74
+ datasets:
75
+ image_pc_discrn: # name of the dataset builder
76
+ vis_processor:
77
+ train:
78
+ name: "clip_image_train"
79
+ eval:
80
+ name: "clip_image_eval"
81
+ pc_processor:
82
+ train:
83
+ name: "ulip_pc"
84
+ eval:
85
+ name: "ulip_pc"
86
+ text_processor:
87
+ train:
88
+ name: "blip_caption"
89
+ eval:
90
+ name: "blip_caption"
91
+
92
+ data_type: [images, pc] # [images|videos|features]
93
+
94
+
95
+ build_info:
96
+ kwargs:
97
+ total: all
98
+ raw: True
99
+ shuffle_modalities: False
100
+ balance_labels: True
101
+ dataset_name: objaverse
102
+ classnames: [image, 3d]
103
+ ground_truth: False
104
+
105
+ # Be careful not to append minus sign (-) before split to avoid itemizing
106
+ annotations:
107
+ val:
108
+ url:
109
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
110
+ storage:
111
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
112
+ pc:
113
+ storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
114
+
115
+ images:
116
+ storage: /export/einstein-vision/3d_vision/objaverse_captions/images/
117
+
118
+ run:
119
+ task: discrn_qa
120
+ # optimization-specific
121
+ batch_size_train: 8
122
+ batch_size_eval: 1
123
+ num_workers: 10
124
+ max_epoch: 1
125
+ segments: 1
126
+
127
+ # inference-specific
128
+ max_len: 10
129
+ min_len: 1
130
+ length_penalty: -1.
131
+ num_beams: 5
132
+ inference_method: "generate"
133
+
134
+ train_splits: ["train"]
135
+ valid_splits: ["val"]
136
+ # test_splits: ["test"]
137
+
138
+ # distribution
139
+ device: "cuda"
140
+ world_size: 1
141
+ dist_url: "env://"
142
+ distributed: True
143
+ use_dist_eval_sampler: False
144
+
145
+
146
+ # model specific
147
+ k_test: 128
148
+
149
+ # misc
150
+ seed: 42
151
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/image_3d_caption"
152
+
153
+ evaluate: True
154
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_caption_13b.yaml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna13b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: "/path/to/vicuna-13b"
55
+ prompt: "question: {} answer:"
56
+ modalities : [image, pc]
57
+ use_cues: True
58
+ shared_qformer: False
59
+ pretrained_shared_qformer: Null
60
+ load_attention_shared_qformer: False
61
+ load_qformer_type_shared: ""
62
+ load_projection_shared: False
63
+ load_projection_type_shared: ""
64
+ load_ln_type_shared: ""
65
+ shared_qformer_num_features: 512
66
+ special_qformer_input_prompt: "a short description"
67
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
68
+ predict_with_gen: False
69
+ use_caption: True
70
+ use_describe: False
71
+ enumerate_inputs: False
72
+ add_space: True
73
+
74
+ datasets:
75
+ image_pc_discrn: # name of the dataset builder
76
+ vis_processor:
77
+ train:
78
+ name: "clip_image_train"
79
+ eval:
80
+ name: "clip_image_eval"
81
+ pc_processor:
82
+ train:
83
+ name: "ulip_pc"
84
+ eval:
85
+ name: "ulip_pc"
86
+ text_processor:
87
+ train:
88
+ name: "blip_caption"
89
+ eval:
90
+ name: "blip_caption"
91
+
92
+ data_type: [images, pc] # [images|videos|features]
93
+
94
+
95
+ build_info:
96
+ kwargs:
97
+ total: 100
98
+ raw: True
99
+ shuffle_modalities: False
100
+ balance_labels: True
101
+ dataset_name: objaverse
102
+ classnames: [image, 3d]
103
+ ground_truth: False
104
+
105
+ # Be careful not to append minus sign (-) before split to avoid itemizing
106
+ annotations:
107
+ val:
108
+ url:
109
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
110
+ storage:
111
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
112
+ pc:
113
+ storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
114
+
115
+ images:
116
+ storage: /export/einstein-vision/3d_vision/objaverse_captions/images/
117
+
118
+ run:
119
+ task: discrn_qa
120
+ # optimization-specific
121
+ batch_size_train: 8
122
+ batch_size_eval: 1
123
+ num_workers: 2
124
+ max_epoch: 1
125
+ segments: 1
126
+
127
+ # inference-specific
128
+ max_len: 10
129
+ min_len: 1
130
+ length_penalty: -1.
131
+ num_beams: 5
132
+ inference_method: "generate"
133
+
134
+ train_splits: ["train"]
135
+ valid_splits: ["val"]
136
+ # test_splits: ["test"]
137
+
138
+ # distribution
139
+ device: "cuda"
140
+ world_size: 1
141
+ dist_url: "env://"
142
+ distributed: True
143
+ use_dist_eval_sampler: False
144
+
145
+
146
+ # model specific
147
+ k_test: 128
148
+
149
+ # misc
150
+ seed: 42
151
+ output_dir: "output/xinstructblip/eval/vicuna13b/discrn/image_3d_caption"
152
+
153
+ evaluate: True
154
+ save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_describe.yaml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+ model:
6
+ arch: blip2_vicuna_xinstruct
7
+ model_type: vicuna7b
8
+ load_pretrained: True
9
+ # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
10
+ load_finetuned: False
11
+ finetuned: ""
12
+ stage1_url_or_filename: null
13
+ image_model: "eva_clip_g"
14
+ pc_model: "ulip2_pointbert"
15
+ video_model: "eva_clip_g"
16
+ audio_model: "beats"
17
+ pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
18
+ pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
19
+ pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
20
+ pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth
21
+ load_attention_image_qformer: True
22
+ load_attention_pc_qformer: True
23
+ load_attention_video_qformer: True
24
+ load_attention_audio_qformer: True
25
+ load_ln_type_image: "image"
26
+ load_ln_type_video: "video"
27
+ load_ln_type_pc: "pc"
28
+ load_ln_type_audio: "audio"
29
+ load_qformer_type_image: "image"
30
+ load_qformer_type_pc: "pc"
31
+ load_qformer_type_video: "video"
32
+ load_qformer_type_audio: "audio"
33
+ load_projection_image: True
34
+ load_projection_pc: True
35
+ load_projection_video: True
36
+ load_projection_audio: True
37
+ load_projection_type_image: "image"
38
+ load_projection_type_pc: "pc"
39
+ load_projection_type_video: "video"
40
+ load_projection_type_audio: "audio"
41
+ image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
42
+ pc_encoder_kwargs : {}
43
+ video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
44
+ audio_encoder_kwargs : {}
45
+ image_precision: "fp16"
46
+ pc_precision: "fp16"
47
+ video_precision: "fp16"
48
+ audio_precision: "fp16"
49
+ freeze_image: True
50
+ freeze_pc: True
51
+ freeze_video: True
52
+ freeze_audio: True
53
+ num_query_token: 32
54
+ llm_model: /path/to/vicuna-7b
55
+ prompt: "question: {} answer:"
56
+ modalities : [image, pc]
57
+ use_cues: True
58
+ shared_qformer: False
59
+ pretrained_shared_qformer: Null
60
+ load_attention_shared_qformer: False
61
+ load_qformer_type_shared: ""
62
+ load_projection_shared: False
63
+ load_projection_type_shared: ""
64
+ load_ln_type_shared: ""
65
+ shared_qformer_num_features: 512
66
+ special_qformer_input_prompt: "a short description"
67
+ prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
68
+ predict_with_gen: False
69
+ use_caption: False
70
+ use_describe: False
71
+ enumerate_inputs: False
72
+ add_space: True
73
+
74
+ datasets:
75
+ image_pc_discrn: # name of the dataset builder
76
+ vis_processor:
77
+ train:
78
+ name: "clip_image_train"
79
+ eval:
80
+ name: "clip_image_eval"
81
+ pc_processor:
82
+ train:
83
+ name: "ulip_pc"
84
+ eval:
85
+ name: "ulip_pc"
86
+ text_processor:
87
+ train:
88
+ name: "blip_caption"
89
+ eval:
90
+ name: "blip_caption"
91
+
92
+ data_type: [images, pc] # [images|videos|features]
93
+
94
+
95
+ build_info:
96
+ kwargs:
97
+ total: all
98
+ raw: False
99
+ shuffle_modalities: False
100
+ balance_labels: True
101
+ dataset_name: objaverse
102
+ classnames: [image, 3d]
103
+ ground_truth: False
104
+
105
+ # Be careful not to append minus sign (-) before split to avoid itemizing
106
+ annotations:
107
+ val:
108
+ url:
109
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
110
+ storage:
111
+ - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
112
+ pc:
113
+ storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel
114
+
115
+ images:
116
+ storage: /export/einstein-vision/3d_vision/objaverse_captions/images/
117
+
118
+ run:
119
+ task: discrn_qa
120
+ # optimization-specific
121
+ batch_size_train: 8
122
+ batch_size_eval: 1
123
+ num_workers: 10
124
+ max_epoch: 1
125
+ segments: 1
126
+
127
+ # inference-specific
128
+ max_len: 10
129
+ min_len: 1
130
+ length_penalty: 1.
131
+ num_beams: 5
132
+ inference_method: "generate"
133
+
134
+ train_splits: ["train"]
135
+ valid_splits: ["val"]
136
+ # test_splits: ["test"]
137
+
138
+ # distribution
139
+ device: "cuda"
140
+ world_size: 1
141
+ dist_url: "env://"
142
+ distributed: True
143
+ use_dist_eval_sampler: False
144
+
145
+
146
+ # model specific
147
+ k_test: 128
148
+
149
+ # misc
150
+ seed: 42
151
+ output_dir: "output/xinstructblip/eval/vicuna7b/discrn/image_3d_describe"
152
+
153
+ evaluate: True
154
+ save_freq: -1