Add files using upload-large-folder tool
This view is limited to 50 files because the commit contains too many changes.
- LAVIS-main/lavis/projects/instructblip/caption_nocaps_out_domain_vicuna7b_eval.yaml +82 -0
- LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xl_eval.yaml +90 -0
- LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xxl_eval.yaml +90 -0
- LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna13b_eval.yaml +90 -0
- LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna7b_eval.yaml +91 -0
- LAVIS-main/lavis/projects/instructblip/classification_modelnet40_vicuna13b.yaml +101 -0
- LAVIS-main/lavis/projects/instructblip/classification_modelnet40_vicuna7b.yaml +100 -0
- LAVIS-main/lavis/projects/instructblip/classification_snlive_flant5xl.yaml +94 -0
- LAVIS-main/lavis/projects/instructblip/classification_snlive_flant5xxl.yaml +95 -0
- LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna13b.yaml +93 -0
- LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna13b_test.yaml +93 -0
- LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna7b_test.yaml +93 -0
- LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna7b_val.yaml +93 -0
- LAVIS-main/lavis/projects/instructblip/completion_modelnet40_vicuna13b.yaml +101 -0
- LAVIS-main/lavis/projects/instructblip/completion_modelnet40_vicuna7b.yaml +102 -0
- LAVIS-main/lavis/projects/instructblip/qa_msrvtt_flant5xl_eval_test.yaml +92 -0
- LAVIS-main/lavis/projects/instructblip/qa_msrvtt_flant5xxl_eval_test.yaml +92 -0
- LAVIS-main/lavis/projects/instructblip/qa_msrvtt_vicuna13b_eval_test.yaml +92 -0
- LAVIS-main/lavis/projects/instructblip/qa_msrvtt_vicuna7b_eval_test.yaml +92 -0
- LAVIS-main/lavis/projects/instructblip/qa_msvd_flant5xl_eval.yaml +100 -0
- LAVIS-main/lavis/projects/instructblip/qa_msvd_flant5xxl_eval.yaml +100 -0
- LAVIS-main/lavis/projects/instructblip/qa_msvd_vicuna13b_eval.yaml +100 -0
- LAVIS-main/lavis/projects/instructblip/qa_msvd_vicuna7b_eval.yaml +100 -0
- LAVIS-main/lavis/projects/instructblip/qa_okvqa_flant5xl_eval.yaml +90 -0
- LAVIS-main/lavis/projects/instructblip/qa_okvqa_flant5xxl_eval.yaml +90 -0
- LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna13b_eval.yaml +90 -0
- LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna7b_eval.yaml +90 -0
- LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval.yaml +60 -0
- LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval_3b.yaml +60 -0
- LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval_large.yaml +60 -0
- LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval.yaml +59 -0
- LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval_3b.yaml +59 -0
- LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval_large.yaml +59 -0
- LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval.yaml +60 -0
- LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval_3b.yaml +60 -0
- LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval_large.yaml +60 -0
- LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval.yaml +60 -0
- LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval_3b.yaml +60 -0
- LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval_large.yaml +60 -0
- LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_caption.yaml +176 -0
- LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_caption_13b.yaml +176 -0
- LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe.yaml +176 -0
- LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_13b.yaml +177 -0
- LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_nocue.yaml +176 -0
- LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_proj copy.yaml +179 -0
- LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_proj.yaml +179 -0
- LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_rand_init.yaml +176 -0
- LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_caption.yaml +154 -0
- LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_caption_13b.yaml +154 -0
- LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_describe.yaml +154 -0
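All of the added configs share the same three-block LAVIS layout (`model`, `datasets`, `run`) and are meant to be passed to the repo's entry scripts (typically something like `python evaluate.py --cfg-path <config>`). As a quick sanity check, here is a minimal sketch that loads one of the added files with OmegaConf, the YAML loader LAVIS builds its `Config` on; the relative path assumes you run from the directory containing `LAVIS-main`:

```python
# Minimal sketch: peek at one of the added LAVIS project configs with OmegaConf.
# Assumes omegaconf is installed and the repo is checked out locally.
from omegaconf import OmegaConf

cfg = OmegaConf.load(
    "LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xl_eval.yaml"
)
print(cfg.model.arch)           # e.g. blip2_t5_instruct
print(cfg.run.task)             # e.g. captioning
print(cfg.run.batch_size_eval)  # evaluation batch size
```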
LAVIS-main/lavis/projects/instructblip/caption_nocaps_out_domain_vicuna7b_eval.yaml
ADDED
@@ -0,0 +1,82 @@
```yaml
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_vicuna_instruct
  model_type: vicuna7b
  load_pretrained: True
  prompt: "A short image caption."

datasets:
  nocaps: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    vis_processor:
      eval:
        name: "blip_image_eval"
        image_size: 224

    text_processor:
      eval:
        name: "blip_caption"

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json
          storage: nocaps/annotations/nocaps_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json
          storage: nocaps/annotations/nocaps_test.json
      images:
        storage: /export/share/datasets/vision/nocaps/

run:
  task: captioning
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 1
  batch_size_train: 16
  batch_size_eval: 1
  num_workers: 8
  accum_grad_iters: 1

  max_len: 80
  min_len: 10
  num_beams: 5
  inference_method: "generate"
  # prompt: an image that shows
  length_penalty: 1.

  annotation_file: https://nocaps.s3.amazonaws.com/nocaps_val_4500_captions.json

  seed: 42
  output_dir: "output/instructblip/nocaps_out_domain_captioning_vicuna7b/"

  amp: True
  resume_ckpt_path: null

  evaluate: True
  # train_splits: ["train"]
  valid_splits: ["val"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
  val_freq: 1

img_ids: [2, 4, 5, 8, 15, 18, 19, 22, 27, 30, 33, 35, 41, 42, 43, 46, 47, 51, 59, 60, 64, 65, 68, 69, 71, 72, 73, 77, 79, 81, 85, 87, 88, 90, 92, 100, 101, 102, 105, 107, 109, 115, 120, 124, 125, 126, 127, 129, 133, 135, 137, 139, 140, 141, 143, 150, 153, 155, 158, 164, 165, 167, 170, 171, 173, 182, 190, 191, 196, 200, 201, 203, 205, 208, 219, 225, 226, 228, 229, 232, 239, 240, 243, 245, 250, 262, 263, 264, 267, 272, 278, 283, 284, 290, 291, 297, 301, 304, 305, 309, 310, 311, 314, 323, 325, 329, 330, 331, 333, 334, 341, 349, 350, 351, 352, 354, 358, 359, 363, 365, 366, 368, 371, 372, 379, 381, 383, 386, 388, 389, 390, 392, 405, 415, 417, 418, 420, 421, 424, 428, 429, 432, 436, 441, 443, 452, 453, 454, 455, 456, 459, 464, 465, 468, 469, 476, 477, 478, 480, 487, 488, 490, 491, 493, 500, 502, 504, 506, 509, 510, 511, 512, 515, 516, 520, 527, 529, 533, 539, 540, 541, 544, 545, 547, 551, 554, 556, 559, 577, 579, 580, 582, 586, 587, 590, 593, 594, 607, 609, 616, 617, 619, 623, 628, 631, 634, 637, 648, 651, 654, 655, 665, 673, 678, 682, 684, 685, 688, 690, 695, 696, 701, 702, 705, 707, 708, 712, 714, 718, 719, 723, 725, 726, 730, 731, 733, 734, 740, 744, 748, 750, 751, 756, 757, 760, 761, 763, 767, 775, 779, 782, 783, 784, 787, 790, 792, 794, 798, 799, 802, 805, 807, 810, 812, 816, 818, 819, 820, 821, 829, 831, 836, 841, 842, 844, 845, 849, 850, 853, 854, 857, 859, 861, 868, 871, 874, 875, 877, 879, 886, 887, 889, 890, 891, 892, 893, 894, 896, 899, 900, 905, 918, 924, 926, 927, 929, 932, 934, 935, 943, 948, 950, 952, 953, 954, 956, 957, 963, 965, 969, 972, 973, 974, 976, 980, 985, 987, 988, 990, 992, 993, 994, 1000, 1001, 1003, 1005, 1009, 1013, 1016, 1018, 1019, 1020, 1021, 1022, 1024, 1028, 1029, 1033, 1036, 1037, 1038, 1042, 1045, 1046, 1050, 1053, 1054, 1056, 1065, 1072, 1076, 1079, 1082, 1083, 1096, 1101, 1103, 1107, 1112, 1117, 1129, 1132, 1133, 1136, 1138, 1141, 1143, 1155, 1157, 1160, 1164, 1165, 1166, 1172, 1175, 1179, 1183, 1194, 1197, 1200, 1202, 1210, 1228, 1234, 1236, 1241, 1246, 1251, 1253, 1255, 1261, 1265, 1268, 1269, 1271, 1272, 1273, 1277, 1286, 1287, 1290, 1296, 1297, 1302, 1303, 1308, 1310, 1312, 1315, 1316, 1317, 1320, 1321, 1324, 1327, 1329, 1330, 1331, 1333, 1334, 1336, 1338, 1339, 1340, 1345, 1347, 1356, 1362, 1366, 1371, 1374, 1376, 1381, 1384, 1385, 1388, 1394, 1396, 1397, 1398, 1403, 1404, 1408, 1410, 1413, 1414, 1417, 1424, 1430, 1433, 1436, 1438, 1440, 1446, 1449, 1453, 1454, 1456, 1460, 1462, 1464, 1465, 1467, 1470, 1473, 1477, 1491, 1494, 1498, 1504, 1506, 1509, 1511, 1515, 1519, 1524, 1530, 1533, 1540, 1541, 1549, 1557, 1558, 1561, 1570, 1572, 1575, 1579, 1591, 1593, 1594, 1595, 1596, 1602, 1605, 1609, 1611, 1615, 1617, 1618, 1624, 1625, 1627, 1636, 1637, 1639, 1640, 1641, 1646, 1647, 1648, 1649, 1655, 1656, 1657, 1658, 1659, 1662, 1671, 1675, 1679, 1681, 1690, 1692, 1701, 1712, 1715, 1716, 1718, 1719, 1721, 1723, 1725, 1728, 1729, 1730, 1732, 1737, 1740, 1746, 1747, 1753, 1754, 1756, 1757, 1758, 1761, 1762, 1767, 1771, 1773, 1775, 1779, 1783, 1784, 1786, 1787, 1789, 1791, 1794, 1802, 1803, 1807, 1813, 1814, 1815, 1817, 1824, 1826, 1827, 1832, 1834, 1835, 1838, 1839, 1840, 1841, 1842, 1843, 1844, 1847, 1850, 1860, 1861, 1866, 1870, 1872, 1873, 1876, 1878, 1886, 1889, 1894, 1897, 1899, 1902, 1907, 1911, 1912, 1917, 1920, 1924, 1925, 1928, 1931, 1935, 1936, 1937, 1939, 1941, 1946, 1948, 1949, 1952, 1954, 1955, 1956, 1959, 1967, 1968, 1970, 1975, 1976, 1979, 1980, 1985, 1986, 1994, 1996, 1998, 2001, 2003, 2007, 2009, 2011, 2012, 2014, 2019, 2028, 2029, 2042, 2047, 
2049, 2050, 2060, 2068, 2071, 2076, 2078, 2080, 2081, 2086, 2089, 2090, 2093, 2094, 2099, 2102, 2107, 2112, 2115, 2121, 2124, 2125, 2129, 2131, 2133, 2135, 2140, 2141, 2148, 2150, 2151, 2152, 2155, 2163, 2173, 2176, 2178, 2182, 2183, 2187, 2188, 2196, 2197, 2198, 2199, 2200, 2205, 2207, 2209, 2215, 2217, 2220, 2221, 2223, 2230, 2235, 2236, 2237, 2238, 2241, 2242, 2243, 2244, 2246, 2252, 2253, 2261, 2265, 2274, 2277, 2278, 2281, 2286, 2290, 2292, 2293, 2294, 2296, 2299, 2301, 2304, 2305, 2307, 2309, 2312, 2314, 2315, 2319, 2323, 2324, 2337, 2338, 2339, 2340, 2342, 2351, 2356, 2358, 2360, 2367, 2369, 2371, 2374, 2376, 2378, 2382, 2383, 2387, 2388, 2390, 2399, 2400, 2412, 2416, 2422, 2423, 2427, 2428, 2435, 2439, 2440, 2442, 2447, 2450, 2455, 2459, 2461, 2462, 2463, 2466, 2468, 2470, 2479, 2480, 2482, 2483, 2485, 2488, 2491, 2495, 2496, 2502, 2505, 2506, 2507, 2510, 2511, 2515, 2522, 2524, 2532, 2534, 2546, 2547, 2550, 2554, 2558, 2562, 2563, 2574, 2583, 2584, 2590, 2594, 2598, 2602, 2603, 2606, 2611, 2613, 2615, 2617, 2619, 2623, 2625, 2630, 2636, 2642, 2643, 2644, 2646, 2647, 2649, 2650, 2659, 2661, 2664, 2674, 2675, 2677, 2682, 2684, 2685, 2691, 2693, 2695, 2698, 2699, 2703, 2704, 2706, 2707, 2711, 2713, 2719, 2720, 2723, 2726, 2727, 2729, 2730, 2733, 2734, 2738, 2739, 2741, 2744, 2745, 2748, 2749, 2754, 2757, 2761, 2762, 2764, 2765, 2767, 2768, 2772, 2776, 2778, 2779, 2780, 2781, 2783, 2787, 2791, 2795, 2796, 2799, 2800, 2802, 2807, 2808, 2811, 2813, 2817, 2820, 2827, 2829, 2831, 2833, 2834, 2835, 2839, 2840, 2841, 2846, 2847, 2849, 2852, 2855, 2859, 2860, 2864, 2870, 2871, 2876, 2878, 2879, 2882, 2884, 2885, 2886, 2887, 2888, 2895, 2896, 2897, 2898, 2900, 2902, 2905, 2911, 2913, 2915, 2919, 2922, 2924, 2933, 2939, 2945, 2953, 2954, 2958, 2959, 2968, 2973, 2976, 2979, 2982, 2984, 2992, 3002, 3004, 3007, 3008, 3009, 3010, 3013, 3016, 3021, 3022, 3023, 3026, 3028, 3033, 3036, 3037, 3039, 3043, 3044, 3045, 3046, 3053, 3060, 3062, 3063, 3071, 3072, 3085, 3086, 3092, 3095, 3096, 3102, 3103, 3104, 3105, 3111, 3115, 3116, 3122, 3129, 3131, 3132, 3137, 3138, 3140, 3147, 3148, 3157, 3164, 3167, 3168, 3170, 3175, 3179, 3182, 3184, 3190, 3194, 3196, 3198, 3199, 3200, 3215, 3216, 3217, 3219, 3222, 3229, 3230, 3237, 3239, 3242, 3249, 3253, 3255, 3257, 3258, 3267, 3270, 3271, 3274, 3279, 3288, 3290, 3291, 3293, 3299, 3305, 3306, 3312, 3318, 3319, 3320, 3323, 3326, 3328, 3329, 3335, 3343, 3344, 3345, 3347, 3349, 3350, 3353, 3356, 3362, 3364, 3366, 3369, 3374, 3377, 3379, 3381, 3382, 3384, 3385, 3388, 3389, 3395, 3399, 3403, 3409, 3411, 3416, 3419, 3421, 3423, 3424, 3425, 3427, 3428, 3431, 3437, 3438, 3439, 3444, 3450, 3452, 3453, 3456, 3457, 3460, 3461, 3462, 3464, 3466, 3467, 3471, 3472, 3477, 3478, 3482, 3484, 3486, 3492, 3499, 3500, 3501, 3502, 3511, 3525, 3529, 3531, 3533, 3534, 3536, 3552, 3553, 3555, 3557, 3562, 3567, 3568, 3570, 3571, 3573, 3577, 3578, 3584, 3585, 3586, 3587, 3595, 3600, 3601, 3604, 3609, 3610, 3612, 3615, 3616, 3619, 3620, 3624, 3625, 3631, 3632, 3636, 3637, 3638, 3640, 3643, 3651, 3654, 3655, 3656, 3657, 3662, 3667, 3668, 3671, 3677, 3684, 3686, 3689, 3693, 3694, 3696, 3697, 3698, 3699, 3700, 3701, 3703, 3704, 3707, 3708, 3709, 3711, 3712, 3713, 3714, 3719, 3721, 3723, 3726, 3737, 3741, 3742, 3744, 3750, 3752, 3757, 3760, 3761, 3764, 3765, 3767, 3770, 3772, 3774, 3776, 3778, 3780, 3781, 3796, 3797, 3805, 3818, 3819, 3820, 3821, 3824, 3841, 3845, 3848, 3851, 3858, 3866, 3870, 3871, 3876, 3879, 3880, 3883, 3893, 3896, 3900, 3903, 3904, 3908, 3909, 3913, 3914, 3916, 3924, 3927, 
3937, 3940, 3942, 3943, 3949, 3950, 3953, 3954, 3959, 3963, 3966, 3969, 3972, 3978, 3981, 3983, 3984, 3986, 3989, 3990, 3991, 3999, 4000, 4004, 4005, 4006, 4012, 4014, 4016, 4017, 4019, 4020, 4030, 4035, 4046, 4049, 4051, 4052, 4053, 4057, 4061, 4065, 4066, 4068, 4073, 4074, 4075, 4079, 4080, 4082, 4084, 4086, 4090, 4091, 4093, 4094, 4095, 4096, 4100, 4102, 4104, 4106, 4113, 4114, 4115, 4116, 4118, 4124, 4126, 4127, 4128, 4131, 4133, 4134, 4142, 4145, 4149, 4156, 4160, 4171, 4174, 4178, 4179, 4180, 4183, 4186, 4190, 4191, 4195, 4197, 4215, 4220, 4229, 4234, 4245, 4249, 4251, 4252, 4254, 4257, 4259, 4264, 4265, 4266, 4267, 4275, 4276, 4277, 4282, 4284, 4285, 4288, 4291, 4294, 4295, 4301, 4302, 4313, 4315, 4320, 4328, 4333, 4336, 4339, 4342, 4345, 4346, 4350, 4354, 4372, 4374, 4375, 4377, 4379, 4380, 4386, 4388, 4389, 4392, 4396, 4402, 4404, 4408, 4410, 4424, 4426, 4428, 4431, 4435, 4436, 4439, 4442, 4446, 4447, 4449, 4452, 4455, 4458, 4460, 4461, 4466, 4469, 4475, 4476, 4478, 4488, 4491, 4494, 4498]
```
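The trailing `img_ids` list appears to pin evaluation to a fixed subset of NoCaps images (the out-of-domain split named in `output_dir`). That reading is an assumption; a hypothetical filtering step consistent with it, using a made-up `img_id` field on each annotation record:

```python
# Hypothetical sketch: restrict a NoCaps annotation list to the ids in
# run.img_ids, assuming each annotation record carries an integer "img_id".
import json
from omegaconf import OmegaConf

cfg = OmegaConf.load(
    "LAVIS-main/lavis/projects/instructblip/caption_nocaps_out_domain_vicuna7b_eval.yaml"
)
keep = set(cfg.run.img_ids)

with open("nocaps/annotations/nocaps_val.json") as f:
    anns = json.load(f)

subset = [a for a in anns if a.get("img_id") in keep]
print(f"kept {len(subset)} of {len(anns)} annotations")
```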
LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xl_eval.yaml
ADDED
@@ -0,0 +1,90 @@
```yaml
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_t5_instruct
  model_type: flant5xl
  load_pretrained: True
  prompt: "a short description"

datasets:
  vatex_caption:
    vis_processor:
      train:
        name: alpro_video_train
        n_frms: 4
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
      eval:
        name: alpro_video_eval
        n_frms: 4
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
          storage: vatex/annotations/cap_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
          storage: vatex/annotations/cap_val.json
        test:
          # iWNXAYGh9cI_000004_000014.mp4 is corrupt and removed from youtube
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
          storage: vatex/annotations/cap_test.json
      videos:
        storage: /export/video-language-dataset/data/vatex/

run:
  task: captioning
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 1
  batch_size_train: 16
  batch_size_eval: 1
  num_workers: 8
  accum_grad_iters: 1

  max_len: 80
  min_len: 10
  num_beams: 5
  inference_method: "generate"
  prompt: "describe the video"
  length_penalty: 1.

  seed: 42
  output_dir: "output/instructblip/vatex_caption_flant5xl/"

  amp: True
  resume_ckpt_path: null

  evaluate: True
  # train_splits: ["train"]
  valid_splits: ["val"]
  annotation_file: /export/home/.cache/lavis/vatex_caption_gt/vatex_caption_val_annotations.json

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
  val_freq: 1
```
LAVIS-main/lavis/projects/instructblip/caption_vatex_flant5xxl_eval.yaml
ADDED
@@ -0,0 +1,90 @@
```yaml
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_t5_instruct
  model_type: flant5xxl
  load_pretrained: True
  prompt: "a short description"

datasets:
  vatex_caption:
    vis_processor:
      train:
        name: alpro_video_train
        n_frms: 4
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
      eval:
        name: alpro_video_eval
        n_frms: 4
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
          storage: vatex/annotations/cap_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
          storage: vatex/annotations/cap_val.json
        test:
          # iWNXAYGh9cI_000004_000014.mp4 is corrupt and removed from youtube
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
          storage: vatex/annotations/cap_test.json
      videos:
        storage: /export/video-language-dataset/data/vatex/

run:
  task: captioning
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 1
  batch_size_train: 16
  batch_size_eval: 1
  num_workers: 0
  accum_grad_iters: 1

  max_len: 80
  min_len: 10
  num_beams: 5
  inference_method: "generate"
  prompt: "describe the video"
  length_penalty: 1.

  seed: 42
  output_dir: "output/instructblip/vatex_caption_flant5xxl/"

  amp: True
  resume_ckpt_path: null

  evaluate: True
  # train_splits: ["train"]
  valid_splits: ["val"]
  annotation_file: /export/home/.cache/lavis/vatex_caption_gt/vatex_caption_val_annotations.json

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
  val_freq: 1
```
LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna13b_eval.yaml
ADDED
@@ -0,0 +1,90 @@
```yaml
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_vicuna_instruct
  model_type: vicuna13b
  load_pretrained: True
  prompt: "describe the video"

datasets:
  vatex_caption:
    vis_processor:
      train:
        name: alpro_video_train
        n_frms: 4
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
      eval:
        name: alpro_video_eval
        n_frms: 4
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
          storage: vatex/annotations/cap_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
          storage: vatex/annotations/cap_val.json
        test:
          # iWNXAYGh9cI_000004_000014.mp4 is corrupt and removed from youtube
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
          storage: vatex/annotations/cap_test.json
      videos:
        storage: /export/video-language-dataset/data/vatex/

run:
  task: captioning
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 1
  batch_size_train: 16
  batch_size_eval: 1
  num_workers: 8
  accum_grad_iters: 1

  max_len: 80
  min_len: 10
  num_beams: 5
  inference_method: "generate"
  prompt: "describe the video"
  length_penalty: 0.

  seed: 42
  output_dir: "output/instructblip/msvd_caption_vicuna13b/"

  amp: True
  resume_ckpt_path: null

  evaluate: True
  # train_splits: ["train"]
  valid_splits: ["val"]
  annotation_file: /export/home/.cache/lavis/vatex_caption_gt/vatex_caption_val_annotations.json

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
  val_freq: 1
```
LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna7b_eval.yaml
ADDED
@@ -0,0 +1,91 @@
```yaml
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_vicuna_instruct
  model_type: vicuna7b
  load_pretrained: True
  prompt: "a short description"

datasets:
  vatex_caption:
    vis_processor:
      train:
        name: alpro_video_train
        n_frms: 4
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
      eval:
        name: alpro_video_eval
        n_frms: 4
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
          storage: vatex/annotations/cap_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
          storage: vatex/annotations/cap_val.json
        test:
          # iWNXAYGh9cI_000004_000014.mp4 is corrupt and removed from youtube
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
          storage: vatex/annotations/cap_test.json
      videos:
        storage: /export/video-language-dataset/data/vatex/

run:
  task: captioning
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 1
  batch_size_train: 16
  batch_size_eval: 1
  num_workers: 8
  accum_grad_iters: 1

  max_len: 80
  min_len: 10
  num_beams: 5
  inference_method: "generate"
  prompt: "describe the video"
  length_penalty: 1.

  seed: 42
  output_dir: "output/instructblip/vatex_caption_vicuna7b/"

  amp: True
  resume_ckpt_path: null

  evaluate: True
  # train_splits: ["train"]
  valid_splits: ["val"]
  annotation_file: /export/home/.cache/lavis/vatex_caption_gt/vatex_caption_val_annotations.json

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
  val_freq: 1
```
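The four VATEX configs above differ only in the `model` block and a couple of `run` knobs (`num_workers`, `length_penalty`, `output_dir`), so small variations are often easier to express as overrides than as extra files. Below is a sketch of dot-list overrides merged with OmegaConf, which is how a CLI `--options`-style flag would typically apply them; the override keys are illustrative:

```python
# Sketch: apply dot-list overrides the way a CLI "--options" flag might,
# using OmegaConf's merge. Key names follow the configs above.
from omegaconf import OmegaConf

base = OmegaConf.load(
    "LAVIS-main/lavis/projects/instructblip/caption_vatex_vicuna7b_eval.yaml"
)
overrides = OmegaConf.from_dotlist([
    "run.num_beams=3",
    "run.batch_size_eval=4",
])
cfg = OmegaConf.merge(base, overrides)
print(cfg.run.num_beams, cfg.run.batch_size_eval)  # 3 4
```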
LAVIS-main/lavis/projects/instructblip/classification_modelnet40_vicuna13b.yaml
ADDED
@@ -0,0 +1,101 @@
```yaml
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_vicuna_instruct
  model_type: vicuna13b
  load_pretrained: True
  prompt: "describe the 3d model."
  format_candidates_prompt: " a 3d model of a {}"

datasets:
  modelnet40_cls: # name of the dataset builder
    data_type: [pc, images]

    vis_processor:
      train:
        name: "clip_image_train"
        image_size: 224
      eval:
        name: "clip_image_eval"
        image_size: 224

    pc_processor:
      train:
        name: "ulip_pc"
      eval:
        name: "ulip_pc"
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
          storage:
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
        val:
          url:
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
          storage:
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt

      pc:
        storage: /export/home/ULIP/data/modelnet40_normal_resampled

      images:
        storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet_images8192

run:
  task: multimodal_classification
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 1
  batch_size_train: 16
  batch_size_eval: 1
  num_workers: 8
  accum_grad_iters: 1
  prompt: "describe the 3d model."

  max_len: 3
  min_len: 1
  num_beams: 5

  seed: 42
  output_dir: "output/instructblip/modelent_classification_vicuna13b/"

  amp: True
  resume_ckpt_path: null

  evaluate: True
  # train_splits: ["train"]
  valid_splits: ["val"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
  val_freq: 1
```
LAVIS-main/lavis/projects/instructblip/classification_modelnet40_vicuna7b.yaml
ADDED
@@ -0,0 +1,100 @@
```yaml
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_vicuna_instruct
  model_type: vicuna7b
  load_pretrained: True
  prompt: "describe the 3d model."
  format_candidates_prompt: " a 3d model of a {}"

datasets:
  modelnet40_cls: # name of the dataset builder
    data_type: [pc, images]

    vis_processor:
      train:
        name: "clip_image_train"
        image_size: 224
      eval:
        name: "clip_image_eval"
        image_size: 224

    pc_processor:
      train:
        name: "ulip_pc"
      eval:
        name: "ulip_pc"
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
          storage:
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
        val:
          url:
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
          storage:
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt

      pc:
        storage: /export/home/ULIP/data/modelnet40_normal_resampled

      images:
        storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet_images8192

run:
  task: multimodal_classification
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 1
  batch_size_train: 16
  batch_size_eval: 1
  num_workers: 8
  accum_grad_iters: 1
  prompt: "describe the 3d model."

  max_len: 3
  min_len: 1
  num_beams: 5

  seed: 42
  output_dir: "output/instructblip/modelent_classification_vicuna7b/"

  amp: True
  resume_ckpt_path: null

  evaluate: True
  # train_splits: ["train"]
  valid_splits: ["val"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
  val_freq: 1
```
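For the ModelNet40 classification task, `format_candidates_prompt` is a template expanded once per class name, so the model scores a fixed candidate string per class rather than generating free-form text. A short illustration of that expansion; the three shape names stand in for the contents of `modelnet40_shape_names.txt`:

```python
# Illustration only: expand format_candidates_prompt over class names,
# as a multimodal_classification task would before scoring candidates.
format_candidates_prompt = " a 3d model of a {}"  # from the configs above
shape_names = ["airplane", "bathtub", "bed"]      # stand-in for modelnet40_shape_names.txt

candidates = [format_candidates_prompt.format(name) for name in shape_names]
print(candidates)
# [' a 3d model of a airplane', ' a 3d model of a bathtub', ' a 3d model of a bed']
```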
LAVIS-main/lavis/projects/instructblip/classification_snlive_flant5xl.yaml
ADDED
@@ -0,0 +1,94 @@
```yaml
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  ## note flant5 has been trained on snli
  arch: blip2_t5_instruct
  model_type: flant5xl
  load_pretrained: True
  prompt: ""

datasets:
  snli_ve_instruct:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    vis_processor:
      train:
        name: "clip_image_train"
        image_size: 224
      eval:
        name: "clip_image_eval"
        image_size: 224

    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"
        prompt: "given the image respond to "

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_train.json
          storage:
            - snli/annotations/ve_train.json
        val:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_dev.json
          storage:
            - snli/annotations/ve_dev.json
        test:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_test.json
          storage:
            - snli/annotations/ve_test.json
      images:
        # storage: flickr30k/images/flickr30k-images
        storage: /export/share/datasets/vision/flickr30k/flickr30k-images

run:
  task: multimodal_classification
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 1
  batch_size_train: 16
  batch_size_eval: 1
  num_workers: 8
  accum_grad_iters: 1

  max_len: 30
  min_len: 1
  num_beams: 5
  inference_method: "generate"
  prompt: ""
  length_penalty: -1.

  seed: 42
  output_dir: "output/instructblip/snlive_classification_flant5xl/"

  amp: True
  resume_ckpt_path: null

  evaluate: True
  # train_splits: ["train"]
  valid_splits: ["val"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
  val_freq: 1
```
LAVIS-main/lavis/projects/instructblip/classification_snlive_flant5xxl.yaml
ADDED
@@ -0,0 +1,95 @@
```yaml
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  ## note flant5 has been trained on snli
  arch: blip2_t5_instruct
  model_type: flant5xxl
  load_pretrained: True
  prompt: ""

datasets:
  snli_ve_instruct:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    vis_processor:
      train:
        name: "clip_image_train"
        image_size: 224
      eval:
        name: "clip_image_eval"
        image_size: 224

    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"
        prompt: "given the image respond to "

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_train.json
          storage:
            - snli/annotations/ve_train.json
        val:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_dev.json
          storage:
            - snli/annotations/ve_dev.json
        test:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_test.json
          storage:
            - snli/annotations/ve_test.json
      images:
        # storage: flickr30k/images/flickr30k-images
        storage: /export/share/datasets/vision/flickr30k/flickr30k-images

run:
  task: multimodal_classification
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 1
  batch_size_train: 16
  batch_size_eval: 1
  num_workers: 8
  accum_grad_iters: 1

  max_len: 30
  min_len: 1
  num_beams: 5
  inference_method: "generate"
  prompt: ""
  length_penalty: -1.

  seed: 42
  output_dir: "output/instructblip/snlive_classification_flant5xxl/"

  amp: True
  resume_ckpt_path: null

  evaluate: True
  # train_splits: ["train"]
  valid_splits: ["test"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
  val_freq: 1
```
LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna13b.yaml
ADDED
@@ -0,0 +1,93 @@
```yaml
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_vicuna_instruct
  model_type: vicuna13b
  load_pretrained: True
  prompt: ""

datasets:
  snli_ve_instruct:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    vis_processor:
      train:
        name: "clip_image_train"
        image_size: 224
      eval:
        name: "clip_image_eval"
        image_size: 224

    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"
        # prompt: "how would you respond to "

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_train.json
          storage:
            - snli/annotations/ve_train.json
        val:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_dev.json
          storage:
            - snli/annotations/ve_dev.json
        test:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_test.json
          storage:
            - snli/annotations/ve_test.json
      images:
        # storage: flickr30k/images/flickr30k-images
        storage: /export/share/datasets/vision/flickr30k/flickr30k-images

run:
  task: multimodal_classification
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 1
  batch_size_train: 16
  batch_size_eval: 1
  num_workers: 8
  accum_grad_iters: 1

  max_len: 30
  min_len: 1
  num_beams: 5
  inference_method: "generate"
  prompt: ""
  length_penalty: -1.

  seed: 42
  output_dir: "output/instructblip/snlive_classification_vicuna13b_val/"

  amp: True
  resume_ckpt_path: null

  evaluate: True
  # train_splits: ["train"]
  valid_splits: ["val"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
  val_freq: 1
```
LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna13b_test.yaml
ADDED
@@ -0,0 +1,93 @@
```yaml
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_vicuna_instruct
  model_type: vicuna13b
  load_pretrained: True
  prompt: ""

datasets:
  snli_ve_instruct:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    vis_processor:
      train:
        name: "clip_image_train"
        image_size: 224
      eval:
        name: "clip_image_eval"
        image_size: 224

    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"
        # prompt: "how would you respond to "

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_train.json
          storage:
            - snli/annotations/ve_train.json
        val:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_dev.json
          storage:
            - snli/annotations/ve_dev.json
        test:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_test.json
          storage:
            - snli/annotations/ve_test.json
      images:
        # storage: flickr30k/images/flickr30k-images
        storage: /export/share/datasets/vision/flickr30k/flickr30k-images

run:
  task: multimodal_classification
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 1
  batch_size_train: 16
  batch_size_eval: 1
  num_workers: 8
  accum_grad_iters: 1

  max_len: 30
  min_len: 1
  num_beams: 5
  inference_method: "generate"
  prompt: ""
  length_penalty: -1.

  seed: 42
  output_dir: "output/instructblip/snlive_classification_vicuna13b_test/"

  amp: True
  resume_ckpt_path: null

  evaluate: True
  # train_splits: ["train"]
  valid_splits: ["test"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
  val_freq: 1
```
LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna7b_test.yaml
ADDED
@@ -0,0 +1,93 @@
```yaml
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_vicuna_instruct
  model_type: vicuna7b
  load_pretrained: True
  prompt: ""

datasets:
  snli_ve_instruct:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    vis_processor:
      train:
        name: "clip_image_train"
        image_size: 224
      eval:
        name: "clip_image_eval"
        image_size: 224

    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"
        # prompt: "given the image respond to "

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_train.json
          storage:
            - snli/annotations/ve_train.json
        val:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_dev.json
          storage:
            - snli/annotations/ve_dev.json
        test:
          url:
            - /export/share/dongxuli/data/lavis/snli/ve_test.json
          storage:
            - snli/annotations/ve_test.json
      images:
        # storage: flickr30k/images/flickr30k-images
        storage: /export/share/datasets/vision/flickr30k/flickr30k-images

run:
  task: multimodal_classification
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 1
  batch_size_train: 16
  batch_size_eval: 1
  num_workers: 8
  accum_grad_iters: 1

  max_len: 30
  min_len: 1
  num_beams: 5
  inference_method: "generate"
  prompt: ""
  length_penalty: -1.

  seed: 42
  output_dir: "output/instructblip/snlive_classification_vicuna7b_test/"

  amp: True
  resume_ckpt_path: null

  evaluate: True
  # train_splits: ["train"]
  valid_splits: ["test"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
  val_freq: 1
```
LAVIS-main/lavis/projects/instructblip/classification_snlive_vicuna7b_val.yaml
ADDED
@@ -0,0 +1,93 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_vicuna_instruct
+  model_type: vicuna7b
+  load_pretrained: True
+  prompt: ""
+
+datasets:
+  snli_ve_instruct:
+    # data_dir: ${env.data_dir}/datasets
+    data_type: images # [images|videos|features]
+
+    vis_processor:
+      train:
+        name: "clip_image_train"
+        image_size: 224
+      eval:
+        name: "clip_image_eval"
+        image_size: 224
+
+    text_processor:
+      train:
+        name: "blip_caption"
+      eval:
+        name: "blip_caption"
+        # prompt: "given the image respond to "
+
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url:
+            - /export/share/dongxuli/data/lavis/snli/ve_train.json
+          storage:
+            - snli/annotations/ve_train.json
+        val:
+          url:
+            - /export/share/dongxuli/data/lavis/snli/ve_dev.json
+          storage:
+            - snli/annotations/ve_dev.json
+        test:
+          url:
+            - /export/share/dongxuli/data/lavis/snli/ve_test.json
+          storage:
+            - snli/annotations/ve_test.json
+      images:
+        # storage: flickr30k/images/flickr30k-images
+        storage: /export/share/datasets/vision/flickr30k/flickr30k-images
+
+
+run:
+  task: multimodal_classification
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 30
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  prompt: ""
+  length_penalty: -1.
+
+  seed: 42
+  output_dir: "output/instructblip/snlive_classification_vicuna7b_val/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["val"]
+
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
LAVIS-main/lavis/projects/instructblip/completion_modelnet40_vicuna13b.yaml
ADDED
@@ -0,0 +1,101 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_vicuna_instruct
+  model_type: vicuna13b
+  load_pretrained: True
+  prompt: "describe the 3d model"
+
+datasets:
+  modelnet40_cls: # name of the dataset builder
+    data_type: [images,pc]
+
+    vis_processor:
+      train:
+        name: "clip_image_train"
+        image_size: 224
+      eval:
+        name: "clip_image_eval"
+        image_size: 224
+
+    pc_processor:
+      train:
+        name: "ulip_pc"
+      eval:
+        name: "ulip_pc"
+    text_processor:
+      train:
+        name: "blip_caption"
+      eval:
+        name: "blip_caption"
+
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url:
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
+          storage:
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
+        val:
+          url:
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
+          storage:
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
+
+      pc:
+        storage: /export/home/ULIP/data/modelnet40_normal_resampled
+
+      images:
+        storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet_images8192
+
+
+run:
+  task: multimodal_classification
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 32
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 80
+  min_len: 1
+  num_beams: 5
+  length_penalty: 0.
+  prompt: "describe the 3d model"
+
+  seed: 42
+  output_dir: "output/instructblip/modelent_completion_vicuna13b/"
+
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["val"]
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
+
LAVIS-main/lavis/projects/instructblip/completion_modelnet40_vicuna7b.yaml
ADDED
@@ -0,0 +1,102 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_vicuna_instruct
+  model_type: vicuna7b
+  load_pretrained: True
+  prompt: "describe the 3d model"
+  predict_with_gen: True
+
+datasets:
+  modelnet40_cls: # name of the dataset builder
+    data_type: [images, pc]
+
+    vis_processor:
+      train:
+        name: "clip_image_train"
+        image_size: 224
+      eval:
+        name: "clip_image_eval"
+        image_size: 224
+
+    pc_processor:
+      train:
+        name: "ulip_pc"
+      eval:
+        name: "ulip_pc"
+    text_processor:
+      train:
+        name: "blip_caption"
+      eval:
+        name: "blip_caption"
+
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url:
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
+          storage:
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train_8192pts_fps.dat
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_train.txt
+        val:
+          url:
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
+          storage:
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test_8192pts_fps.dat
+            - /export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_test.txt
+
+      pc:
+        storage: /export/home/ULIP/data/modelnet40_normal_resampled
+
+      images:
+        storage: /export/einstein-vision/3d_vision/3d_object_datasets/modelnet_images8192
+
+
+run:
+  task: multimodal_classification
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 32
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 80
+  min_len: 1
+  num_beams: 5
+  length_penalty: 0.
+  prompt: "describe the 3d model"
+
+  seed: 42
+  output_dir: "output/instructblip/modelent_completion_vicuna7b/"
+
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["val"]
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
+
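In the build_info.annotations blocks above, url and storage are parallel, order-sensitive lists: the entry at position i of url is the source for the cache location at position i of storage (here both sides are the same absolute cluster paths, so nothing actually needs to be fetched). A purely illustrative sketch of that pairing; fetch is a hypothetical helper, not a LAVIS function:

import shutil
from pathlib import Path

def fetch(url: str, storage: str) -> None:
    # Hypothetical helper: copy a local source into its storage slot.
    src, dst = Path(url), Path(storage)
    if src == dst:  # identical absolute paths, as in these ModelNet40 configs
        return
    dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(src, dst)  # a real builder would download http(s) urls instead

train = {
    "url": ["/export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt"],
    "storage": ["/export/home/ULIP/data/modelnet40_normal_resampled/modelnet40_shape_names.txt"],
}
for url, storage in zip(train["url"], train["storage"]):
    fetch(url, storage)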
LAVIS-main/lavis/projects/instructblip/qa_msrvtt_flant5xl_eval_test.yaml
ADDED
@@ -0,0 +1,92 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_t5_instruct
+  model_type: flant5xl
+  load_pretrained: True
+  prompt: "Question: {} Short Answer:"
+
+datasets:
+  msrvtt_qa_instruct:
+    vis_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+    text_processor:
+      train:
+        name: "blip_question"
+      eval:
+        name: "blip_caption"
+
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
+          storage: msrvtt/annotations/qa_train.json
+        val:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
+          storage: msrvtt/annotations/qa_val.json
+        test:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
+          storage: msrvtt/annotations/qa_test.json
+        ans2label:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
+          storage: msrvtt/annotations/qa_ans2label.json
+      videos:
+        storage: /export/share/datasets/vision_language/msrvtt/videos
+
+    instance_id_key: question_id
+
+
+run:
+  task: gqa
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 10
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  length_penalty: -1.
+
+
+  seed: 42
+  output_dir: "output/instructblip/msrvtt_qa_flant5xl_test/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["test"]
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
+
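The prompt field in these QA configs is a Python format template: {} is filled with each processed question before generation, which is why the same string also appears at the run level in some of the configs below. What the substitution produces, using plain str.format and a made-up MSRVTT-QA question:

prompt = "Question: {} Short Answer:"
question = "what is the man riding"  # hypothetical example question
print(prompt.format(question))
# Question: what is the man riding Short Answer: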
LAVIS-main/lavis/projects/instructblip/qa_msrvtt_flant5xxl_eval_test.yaml
ADDED
@@ -0,0 +1,92 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_t5_instruct
+  model_type: flant5xxl
+  load_pretrained: True
+  prompt: "Question: {} Short Answer:"
+
+datasets:
+  msrvtt_qa_instruct:
+    vis_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+    text_processor:
+      train:
+        name: "blip_question"
+      eval:
+        name: "blip_caption"
+
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
+          storage: msrvtt/annotations/qa_train.json
+        val:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
+          storage: msrvtt/annotations/qa_val.json
+        test:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
+          storage: msrvtt/annotations/qa_test.json
+        ans2label:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
+          storage: msrvtt/annotations/qa_ans2label.json
+      videos:
+        storage: /export/share/datasets/vision_language/msrvtt/videos
+
+    instance_id_key: question_id
+
+
+run:
+  task: gqa
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 0
+  accum_grad_iters: 1
+
+  max_len: 10
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  length_penalty: -1.
+
+
+  seed: 42
+  output_dir: "output/instructblip/msrvtt_qa_flant5xxl_test/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["test"]
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
+
LAVIS-main/lavis/projects/instructblip/qa_msrvtt_vicuna13b_eval_test.yaml
ADDED
@@ -0,0 +1,92 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_vicuna_instruct
+  model_type: vicuna13b
+  load_pretrained: True
+  prompt: "Question: {} Short Answer:"
+
+datasets:
+  msrvtt_qa_instruct:
+    vis_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+    text_processor:
+      train:
+        name: "blip_question"
+      eval:
+        name: "blip_caption"
+
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
+          storage: msrvtt/annotations/qa_train.json
+        val:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
+          storage: msrvtt/annotations/qa_val.json
+        test:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
+          storage: msrvtt/annotations/qa_test.json
+        ans2label:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
+          storage: msrvtt/annotations/qa_ans2label.json
+      videos:
+        storage: /export/share/datasets/vision_language/msrvtt/videos
+
+    instance_id_key: question_id
+
+
+run:
+  task: gqa
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 10
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  length_penalty: -1.
+
+
+  seed: 42
+  output_dir: "output/instructblip/msrvtt_qa_vicuna13b_test/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["test"]
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
+
LAVIS-main/lavis/projects/instructblip/qa_msrvtt_vicuna7b_eval_test.yaml
ADDED
@@ -0,0 +1,92 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_vicuna_instruct
+  model_type: vicuna7b
+  load_pretrained: True
+  prompt: "Question: {} Short Answer:"
+
+datasets:
+  msrvtt_qa_instruct:
+    vis_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+    text_processor:
+      train:
+        name: "blip_question"
+      eval:
+        name: "blip_caption"
+
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
+          storage: msrvtt/annotations/qa_train.json
+        val:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
+          storage: msrvtt/annotations/qa_val.json
+        test:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
+          storage: msrvtt/annotations/qa_test.json
+        ans2label:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
+          storage: msrvtt/annotations/qa_ans2label.json
+      videos:
+        storage: /export/share/datasets/vision_language/msrvtt/videos
+
+    instance_id_key: question_id
+
+
+run:
+  task: gqa
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 10
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  length_penalty: -1.
+
+
+  seed: 42
+  output_dir: "output/instructblip/msrvtt_qa_vicuna7b_test/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["test"]
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
+
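All four MSRVTT configs reuse task: gqa, i.e. LAVIS's open-ended QA task: answers are generated with beam search rather than classified over the ans2label vocabulary, and are then compared to the ground-truth answer as strings. A rough sketch of that style of scoring (illustrative only, not the exact LAVIS metric code):

def normalize(ans: str) -> str:
    # Lowercase and strip whitespace/trailing punctuation before comparison.
    return ans.strip().lower().rstrip(".")

def exact_match_accuracy(predictions, ground_truths):
    hits = sum(normalize(p) == normalize(g)
               for p, g in zip(predictions, ground_truths))
    return hits / len(predictions)

print(exact_match_accuracy(["horse."], ["Horse"]))  # 1.0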
LAVIS-main/lavis/projects/instructblip/qa_msvd_flant5xl_eval.yaml
ADDED
@@ -0,0 +1,100 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_t5_instruct
+  model_type: flant5xl
+  load_pretrained: True
+  prompt: "Question: {} Short Answer:"
+
+datasets:
+  msvd_qa_instruct:
+    vis_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+    text_processor:
+      train:
+        name: "blip_question"
+      eval:
+        name: "blip_caption"
+
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
+          storage: msvd/annotations/qa_train.json
+        val:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
+          storage: msvd/annotations/qa_val.json
+        test:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
+          storage: msvd/annotations/qa_test.json
+        ans2label:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
+          storage: msvd/annotations/qa_ans2label.json
+      videos:
+        storage: /export/share/datasets/vision_language/msvd/videos
+
+    instance_id_key: question_id
+
+run:
+  task: gqa
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 30
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  prompt: "Question: {} Short Answer:"
+  length_penalty: -1.
+
+
+  seed: 42
+  output_dir: "output/instructblip/msvd_qa_flant5xl/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["test"]
+  ques_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
+               "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
+               "test": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_questions.json"}
+  anno_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
+               "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
+               "test": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_annotations.json"}
+
+
+
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
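The decoding fields in the run block (max_len, min_len, num_beams, length_penalty) govern beam search in the underlying language model; length_penalty: -1. biases the beams toward the short answers these benchmarks expect. Assuming the usual mapping onto HuggingFace generate kwargs (a sketch of the correspondence, not LAVIS's actual call site):

run_cfg = {"max_len": 30, "min_len": 1, "num_beams": 5, "length_penalty": -1.0}

# Assumed mapping onto transformers' GenerationMixin.generate kwargs.
generate_kwargs = dict(
    max_length=run_cfg["max_len"],
    min_length=run_cfg["min_len"],
    num_beams=run_cfg["num_beams"],
    length_penalty=run_cfg["length_penalty"],  # < 0 favors shorter beams
)
# outputs = model.generate(**model_inputs, **generate_kwargs)  # model/inputs elided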
LAVIS-main/lavis/projects/instructblip/qa_msvd_flant5xxl_eval.yaml
ADDED
@@ -0,0 +1,100 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_t5_instruct
+  model_type: flant5xxl
+  load_pretrained: True
+  prompt: "Question: {} Short Answer:"
+
+datasets:
+  msvd_qa_instruct:
+    vis_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+    text_processor:
+      train:
+        name: "blip_question"
+      eval:
+        name: "blip_caption"
+
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
+          storage: msvd/annotations/qa_train.json
+        val:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
+          storage: msvd/annotations/qa_val.json
+        test:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
+          storage: msvd/annotations/qa_test.json
+        ans2label:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
+          storage: msvd/annotations/qa_ans2label.json
+      videos:
+        storage: /export/share/datasets/vision_language/msvd/videos
+
+    instance_id_key: question_id
+
+run:
+  task: gqa
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 30
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  prompt: "Question: {} Short Answer:"
+  length_penalty: -1.
+
+
+  seed: 42
+  output_dir: "output/instructblip/msvd_qa_flant5xxl/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["test"]
+  ques_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
+               "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
+               "test": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_questions.json"}
+  anno_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
+               "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
+               "test": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_annotations.json"}
+
+
+
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_msvd_vicuna13b_eval.yaml
ADDED
@@ -0,0 +1,100 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_vicuna_instruct
+  model_type: vicuna13b
+  load_pretrained: True
+  prompt: "Question: {} Short Answer:"
+
+datasets:
+  msvd_qa_instruct:
+    vis_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+    text_processor:
+      train:
+        name: "blip_question"
+      eval:
+        name: "blip_caption"
+
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
+          storage: msvd/annotations/qa_train.json
+        val:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
+          storage: msvd/annotations/qa_val.json
+        test:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
+          storage: msvd/annotations/qa_test.json
+        ans2label:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
+          storage: msvd/annotations/qa_ans2label.json
+      videos:
+        storage: /export/share/datasets/vision_language/msvd/videos
+
+    instance_id_key: question_id
+
+run:
+  task: gqa
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 30
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  prompt: "Question: {} Short Answer:"
+  length_penalty: -1.
+
+
+  seed: 42
+  output_dir: "output/instructblip/msvd_qa_vicuna13b/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["test"]
+  ques_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
+               "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
+               "test": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_questions.json"}
+  anno_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
+               "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
+               "test": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_annotations.json"}
+
+
+
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_msvd_vicuna7b_eval.yaml
ADDED
@@ -0,0 +1,100 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_vicuna_instruct
+  model_type: vicuna7b
+  load_pretrained: True
+  prompt: "Question: {} Short Answer:"
+
+datasets:
+  msvd_qa_instruct:
+    vis_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+      eval:
+        name: alpro_video_eval
+        n_frms: 4
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+    text_processor:
+      train:
+        name: "blip_question"
+      eval:
+        name: "blip_caption"
+
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
+          storage: msvd/annotations/qa_train.json
+        val:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
+          storage: msvd/annotations/qa_val.json
+        test:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
+          storage: msvd/annotations/qa_test.json
+        ans2label:
+          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
+          storage: msvd/annotations/qa_ans2label.json
+      videos:
+        storage: /export/share/datasets/vision_language/msvd/videos
+
+    instance_id_key: question_id
+
+run:
+  task: gqa
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 30
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  prompt: "Question: {} Short Answer:"
+  length_penalty: -1.
+
+
+  seed: 42
+  output_dir: "output/instructblip/msvd_qa_vicuna7b/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["test"]
+  ques_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
+               "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_questions.json",
+               "test": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_questions.json"}
+  anno_files: {"train": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
+               "val": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_val_annotations.json",
+               "test": "/export/home/.cache/lavis/msvd_qa_instruct_gt/msvd_qa_instruct_test_annotations.json"}
+
+
+
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_okvqa_flant5xl_eval.yaml
ADDED
@@ -0,0 +1,90 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_t5_instruct
+  model_type: flant5xl
+  load_pretrained: True
+  prompt: "Question: {} Short Answer:"
+
+datasets:
+  ok_vqa:
+    vis_processor:
+      train:
+        name: "clip_image_train"
+      eval:
+        name: "clip_image_eval"
+    text_processor:
+      train:
+        name: "blip_question"
+      eval:
+        name: "blip_caption"
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url:
+            # TODO make this order insensitive
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
+            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
+            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
+          storage:
+            - okvqa/annotations/okvqa_train.json
+            # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
+            # - okvqa/annotations/mscoco_train2014_annotations.json
+        test:
+          url:
+            # TODO make this order insensitive
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
+          storage:
+            - okvqa/annotations/vqa_val_eval.json
+            - okvqa/annotations/answer_list.json
+            - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
+            - okvqa/annotations/mscoco_val2014_annotations.json
+      images:
+        storage: /export/share/datasets/vision/coco/images
+
+
+run:
+  task: vqa
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 10
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  length_penalty: -1.
+
+  seed: 42
+  output_dir: "output/instructblip/okavqa_qa_flant5xl/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["test"]
+
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
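Note the mixed path styles above: the annotation storage entries are relative (okvqa/annotations/...) while the image storage is absolute. LAVIS keeps absolute paths as-is and resolves relative ones against its configured cache root; judging by the ques_files paths in the MSVD configs earlier, that root is /export/home/.cache/lavis in this setup (an assumption on our part, since the root is set in the library's default config rather than in these files). A small sketch of the convention; resolve_storage is our name, not a LAVIS API:

from pathlib import Path

def resolve_storage(storage: str, cache_root: str = "/export/home/.cache/lavis") -> Path:
    # Hypothetical helper: relative storage paths live under the cache root.
    p = Path(storage)
    return p if p.is_absolute() else Path(cache_root) / p

print(resolve_storage("okvqa/annotations/okvqa_train.json"))
# /export/home/.cache/lavis/okvqa/annotations/okvqa_train.json
print(resolve_storage("/export/share/datasets/vision/coco/images"))
# /export/share/datasets/vision/coco/images  (absolute path kept as-is)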
LAVIS-main/lavis/projects/instructblip/qa_okvqa_flant5xxl_eval.yaml
ADDED
@@ -0,0 +1,90 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_t5_instruct
+  model_type: flant5xxl
+  load_pretrained: True
+  prompt: "Question: {} Short Answer:"
+
+datasets:
+  ok_vqa:
+    vis_processor:
+      train:
+        name: "clip_image_train"
+      eval:
+        name: "clip_image_eval"
+    text_processor:
+      train:
+        name: "blip_question"
+      eval:
+        name: "blip_caption"
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url:
+            # TODO make this order insensitive
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
+            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
+            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
+          storage:
+            - okvqa/annotations/okvqa_train.json
+            # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
+            # - okvqa/annotations/mscoco_train2014_annotations.json
+        test:
+          url:
+            # TODO make this order insensitive
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
+          storage:
+            - okvqa/annotations/vqa_val_eval.json
+            - okvqa/annotations/answer_list.json
+            - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
+            - okvqa/annotations/mscoco_val2014_annotations.json
+      images:
+        storage: /export/share/datasets/vision/coco/images
+
+
+run:
+  task: vqa
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 0
+  accum_grad_iters: 1
+
+  max_len: 10
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  length_penalty: -1.
+
+  seed: 42
+  output_dir: "output/instructblip/okavqa_qa_flant5xxl/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["test"]
+
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save epoch every xxx epochs -1 only save last and best.
+  val_freq: 1
LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna13b_eval.yaml
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_vicuna_instruct
+  model_type: vicuna13b
+  load_pretrained: True
+  prompt: "Question: {} Short Answer:"
+
+datasets:
+  ok_vqa:
+    vis_processor:
+      train:
+        name: "clip_image_train"
+      eval:
+        name: "clip_image_eval"
+    text_processor:
+      train:
+        name: "blip_question"
+      eval:
+        name: "blip_caption"
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url:
+            # TODO make this order insensitive
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
+            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
+            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
+          storage:
+            - okvqa/annotations/okvqa_train.json
+            # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
+            # - okvqa/annotations/mscoco_train2014_annotations.json
+        test:
+          url:
+            # TODO make this order insensitive
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
+          storage:
+            - okvqa/annotations/vqa_val_eval.json
+            - okvqa/annotations/answer_list.json
+            - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
+            - okvqa/annotations/mscoco_val2014_annotations.json
+      images:
+        storage: /export/share/datasets/vision/coco/images
+
+
+run:
+  task: vqa
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 10
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  length_penalty: -1.
+
+  seed: 42
+  output_dir: "output/instructblip/okavqa_qa_vicuna13b/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["test"]
+
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save a checkpoint every xxx epochs; -1 saves only the last and best
+  val_freq: 1
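Aside (not part of the uploaded diff): these eval configs are plain YAML parsed with OmegaConf, which LAVIS's config system builds on, so a quick way to sanity-check one before launching is to load it directly. A minimal sketch, assuming omegaconf is installed and the repo root is the working directory; the sample question is made up for illustration.

# Minimal sketch: inspect an eval config before launching (path and question
# are illustrative assumptions, not part of the uploaded diff).
from omegaconf import OmegaConf

cfg = OmegaConf.load("LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna13b_eval.yaml")
print(cfg.model.arch)            # blip2_vicuna_instruct
print(cfg.run.batch_size_eval)   # 1
# The prompt template is filled with each question before generation:
print(cfg.model.prompt.format("What sport is being played?"))
# -> Question: What sport is being played? Short Answer: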
LAVIS-main/lavis/projects/instructblip/qa_okvqa_vicuna7b_eval.yaml
ADDED
@@ -0,0 +1,90 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_vicuna_instruct
+  model_type: vicuna7b
+  load_pretrained: True
+  prompt: "Question: {} Short Answer:"
+
+datasets:
+  ok_vqa:
+    vis_processor:
+      train:
+        name: "clip_image_train"
+      eval:
+        name: "clip_image_eval"
+    text_processor:
+      train:
+        name: "blip_question"
+      eval:
+        name: "blip_caption"
+    build_info:
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        train:
+          url:
+            # TODO make this order insensitive
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
+            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
+            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
+          storage:
+            - okvqa/annotations/okvqa_train.json
+            # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
+            # - okvqa/annotations/mscoco_train2014_annotations.json
+        test:
+          url:
+            # TODO make this order insensitive
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
+          storage:
+            - okvqa/annotations/vqa_val_eval.json
+            - okvqa/annotations/answer_list.json
+            - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
+            - okvqa/annotations/mscoco_val2014_annotations.json
+      images:
+        storage: /export/share/datasets/vision/coco/images
+
+
+run:
+  task: vqa
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  init_lr: 1e-5
+  min_lr: 0
+  warmup_lr: 1e-8
+  warmup_steps: 1000
+  weight_decay: 0.05
+  max_epoch: 1
+  batch_size_train: 16
+  batch_size_eval: 1
+  num_workers: 8
+  accum_grad_iters: 1
+
+  max_len: 10
+  min_len: 1
+  num_beams: 5
+  inference_method: "generate"
+  length_penalty: -1.
+
+  seed: 42
+  output_dir: "output/instructblip/okavqa_qa_vicuna7b/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: True
+  # train_splits: ["train"]
+  valid_splits: ["test"]
+
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  save_freq: -1 # save a checkpoint every xxx epochs; -1 saves only the last and best
+  val_freq: 1
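Aside: these configs run with evaluate: True, so the optimizer block is inert here, but the keys describe a linear warmup from warmup_lr to init_lr over warmup_steps, followed by cosine decay toward min_lr. A sketch of that shape with this file's values (my own helper, not LAVIS's lr_scheduler implementation, which steps per epoch):

# Sketch of the schedule the keys above describe (illustrative only).
import math

def lr_at(step, total_steps, init_lr=1e-5, min_lr=0.0, warmup_lr=1e-8, warmup_steps=1000):
    if step < warmup_steps:  # linear warmup: warmup_lr -> init_lr
        return warmup_lr + (init_lr - warmup_lr) * step / warmup_steps
    # cosine decay over the remaining steps: init_lr -> min_lr
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + 0.5 * (init_lr - min_lr) * (1 + math.cos(math.pi * progress))

print(lr_at(0, 10_000))       # ~1e-08 at the first step
print(lr_at(1_000, 10_000))   # 1e-05 once warmup completes
print(lr_at(10_000, 10_000))  # ~0.0, i.e. min_lr at the end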
LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval.yaml
ADDED
@@ -0,0 +1,60 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: pnp_vqa
+  model_type: base
+
+datasets:
+  gqa: # name of the dataset builder
+    type: balanced_testdev
+    vis_processor:
+      eval:
+        name: "blip_image_eval"
+        image_size: 384
+    text_processor:
+      eval:
+        name: "blip_question"
+
+run:
+  task: gqa_reading_comprehension
+
+  # optimization-specific
+  batch_size_train: 16
+  batch_size_eval: 16
+  num_workers: 4
+
+  # image question matching specific
+  block_num: 7
+
+  # image captioning specific
+  top_k: 50
+  top_p: 1
+  cap_min_length: 10
+  cap_max_length: 20
+  repetition_penalty: 1
+  num_patches: 20
+  num_captions: 100
+  prompt: 'a picture of '
+
+  # question answering specific
+  internal_bsz_fid: 1
+  num_captions_fid: 5
+  min_len: 0
+  max_len: 20
+  num_beams: 1
+  inference_method: "generate"
+
+  seed: 42
+  output_dir: "output/PNP-VQA/GQA"
+
+  evaluate: True
+  test_splits: ["val"]
+
+  # distribution-specific
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
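Aside: the three commented groups in the run block mirror PnP-VQA's pipeline: question-guided patch selection (block_num, num_patches), caption generation over those patches (num_captions with top_k/top_p sampling), and a fusion-in-decoder reader that consumes num_captions_fid captions per question, internal_bsz_fid questions at a time. A rough sketch of the layout those keys imply; the variable names and string format are mine, not LAVIS internals:

# Illustrative layout implied by the "question answering specific" keys.
num_captions = 100     # captions sampled per image in the captioning stage
num_captions_fid = 5   # captions that actually reach the FiD reader
internal_bsz_fid = 1   # questions per internal reader forward pass

question = "what color is the car"  # placeholder question
captions = [f"a picture of ... ({i})" for i in range(num_captions)]
# After relevance ranking, only the top num_captions_fid captions survive;
# each (question, caption) pair is encoded separately and fused at decoding:
reader_inputs = [f"question: {question} caption: {c}" for c in captions[:num_captions_fid]]
print(len(reader_inputs))  # 5 encoder passes, fused in one decoding pass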
LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval_3b.yaml
ADDED
@@ -0,0 +1,60 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: pnp_vqa
+  model_type: 3b
+
+datasets:
+  gqa: # name of the dataset builder
+    type: balanced_testdev
+    vis_processor:
+      eval:
+        name: "blip_image_eval"
+        image_size: 384
+    text_processor:
+      eval:
+        name: "blip_question"
+
+run:
+  task: gqa_reading_comprehension
+
+  # optimization-specific
+  batch_size_train: 4
+  batch_size_eval: 4
+  num_workers: 4
+
+  # image question matching specific
+  block_num: 7
+
+  # image captioning specific
+  top_k: 50
+  top_p: 1
+  cap_min_length: 10
+  cap_max_length: 20
+  repetition_penalty: 1
+  num_patches: 20
+  num_captions: 100
+  prompt: 'a picture of '
+
+  # question answering specific
+  internal_bsz_fid: 1
+  num_captions_fid: 5
+  min_len: 0
+  max_len: 20
+  num_beams: 1
+  inference_method: "generate"
+
+  seed: 42
+  output_dir: "output/PNP-VQA-3b/GQA"
+
+  evaluate: True
+  test_splits: ["val"]
+
+  # distribution-specific
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/gqa_eval_large.yaml
ADDED
@@ -0,0 +1,60 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: pnp_vqa
+  model_type: large
+
+datasets:
+  gqa: # name of the dataset builder
+    type: balanced_testdev
+    vis_processor:
+      eval:
+        name: "blip_image_eval"
+        image_size: 384
+    text_processor:
+      eval:
+        name: "blip_question"
+
+run:
+  task: gqa_reading_comprehension
+
+  # optimization-specific
+  batch_size_train: 12
+  batch_size_eval: 12
+  num_workers: 4
+
+  # image question matching specific
+  block_num: 7
+
+  # image captioning specific
+  top_k: 50
+  top_p: 1
+  cap_min_length: 10
+  cap_max_length: 20
+  repetition_penalty: 1
+  num_patches: 20
+  num_captions: 100
+  prompt: 'a picture of '
+
+  # question answering specific
+  internal_bsz_fid: 1
+  num_captions_fid: 5
+  min_len: 0
+  max_len: 20
+  num_beams: 1
+  inference_method: "generate"
+
+  seed: 42
+  output_dir: "output/PNP-VQA-large/GQA"
+
+  evaluate: True
+  test_splits: ["val"]
+
+  # distribution-specific
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval.yaml
ADDED
@@ -0,0 +1,59 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: pnp_vqa
+  model_type: base
+
+datasets:
+  ok_vqa: # name of the dataset builder
+    vis_processor:
+      eval:
+        name: "blip_image_eval"
+        image_size: 384
+    text_processor:
+      eval:
+        name: "blip_question"
+
+run:
+  task: vqa_reading_comprehension
+
+  # optimization-specific
+  batch_size_train: 16
+  batch_size_eval: 16
+  num_workers: 4
+
+  # image question matching specific
+  block_num: 7
+
+  # image captioning specific
+  top_k: 50
+  top_p: 1
+  cap_min_length: 10
+  cap_max_length: 20
+  repetition_penalty: 1
+  num_patches: 20
+  num_captions: 100
+  prompt: 'a picture of '
+
+  # question answering specific
+  internal_bsz_fid: 1
+  num_captions_fid: 1
+  min_len: 0
+  max_len: 20
+  num_beams: 1
+  inference_method: "generate"
+
+  seed: 42
+  output_dir: "output/PNP-VQA/OKVQA"
+
+  evaluate: True
+  test_splits: ["test"]
+
+  # distribution-specific
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval_3b.yaml
ADDED
@@ -0,0 +1,59 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: pnp_vqa
+  model_type: 3b
+
+datasets:
+  ok_vqa: # name of the dataset builder
+    vis_processor:
+      eval:
+        name: "blip_image_eval"
+        image_size: 384
+    text_processor:
+      eval:
+        name: "blip_question"
+
+run:
+  task: vqa_reading_comprehension
+
+  # optimization-specific
+  batch_size_train: 4
+  batch_size_eval: 4
+  num_workers: 4
+
+  # image question matching specific
+  block_num: 7
+
+  # image captioning specific
+  top_k: 50
+  top_p: 1
+  cap_min_length: 10
+  cap_max_length: 20
+  repetition_penalty: 1
+  num_patches: 20
+  num_captions: 100
+  prompt: 'a picture of '
+
+  # question answering specific
+  internal_bsz_fid: 1
+  num_captions_fid: 1
+  min_len: 0
+  max_len: 20
+  num_beams: 1
+  inference_method: "generate"
+
+  seed: 42
+  output_dir: "output/PNP-VQA-3b/OKVQA"
+
+  evaluate: True
+  test_splits: ["test"]
+
+  # distribution-specific
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/okvqa_eval_large.yaml
ADDED
@@ -0,0 +1,59 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: pnp_vqa
+  model_type: large
+
+datasets:
+  ok_vqa: # name of the dataset builder
+    vis_processor:
+      eval:
+        name: "blip_image_eval"
+        image_size: 384
+    text_processor:
+      eval:
+        name: "blip_question"
+
+run:
+  task: vqa_reading_comprehension
+
+  # optimization-specific
+  batch_size_train: 12
+  batch_size_eval: 12
+  num_workers: 4
+
+  # image question matching specific
+  block_num: 7
+
+  # image captioning specific
+  top_k: 50
+  top_p: 1
+  cap_min_length: 10
+  cap_max_length: 20
+  repetition_penalty: 1
+  num_patches: 20
+  num_captions: 100
+  prompt: 'a picture of '
+
+  # question answering specific
+  internal_bsz_fid: 1
+  num_captions_fid: 1
+  min_len: 0
+  max_len: 20
+  num_beams: 1
+  inference_method: "generate"
+
+  seed: 42
+  output_dir: "output/PNP-VQA-large/OKVQA"
+
+  evaluate: True
+  test_splits: ["test"]
+
+  # distribution-specific
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval.yaml
ADDED
@@ -0,0 +1,60 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: pnp_vqa
+  model_type: base
+
+datasets:
+  coco_vqa: # name of the dataset builder
+    type: eval
+    vis_processor:
+      eval:
+        name: "blip_image_eval"
+        image_size: 384
+    text_processor:
+      eval:
+        name: "blip_question"
+
+run:
+  task: vqa_reading_comprehension
+
+  # optimization-specific
+  batch_size_train: 16
+  batch_size_eval: 16
+  num_workers: 4
+
+  # image question matching specific
+  block_num: 7
+
+  # image captioning specific
+  top_k: 50
+  top_p: 1
+  cap_min_length: 10
+  cap_max_length: 20
+  repetition_penalty: 1
+  num_patches: 20
+  num_captions: 100
+  prompt: 'a picture of '
+
+  # question answering specific
+  internal_bsz_fid: 1
+  num_captions_fid: 1
+  min_len: 0
+  max_len: 20
+  num_beams: 1
+  inference_method: "generate"
+
+  seed: 42
+  output_dir: "output/PNP-VQA/VQAv2_val"
+
+  evaluate: True
+  test_splits: ["val"]
+
+  # distribution-specific
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval_3b.yaml
ADDED
@@ -0,0 +1,60 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: pnp_vqa
+  model_type: 3b
+
+datasets:
+  coco_vqa: # name of the dataset builder
+    type: eval
+    vis_processor:
+      eval:
+        name: "blip_image_eval"
+        image_size: 384
+    text_processor:
+      eval:
+        name: "blip_question"
+
+run:
+  task: vqa_reading_comprehension
+
+  # optimization-specific
+  batch_size_train: 4
+  batch_size_eval: 4
+  num_workers: 4
+
+  # image question matching specific
+  block_num: 7
+
+  # image captioning specific
+  top_k: 50
+  top_p: 1
+  cap_min_length: 10
+  cap_max_length: 20
+  repetition_penalty: 1
+  num_patches: 20
+  num_captions: 100
+  prompt: 'a picture of '
+
+  # question answering specific
+  internal_bsz_fid: 1
+  num_captions_fid: 1
+  min_len: 0
+  max_len: 20
+  num_beams: 1
+  inference_method: "generate"
+
+  seed: 42
+  output_dir: "output/PNP-VQA-3b/VQAv2_val"
+
+  evaluate: True
+  test_splits: ["val"]
+
+  # distribution-specific
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_eval_large.yaml
ADDED
@@ -0,0 +1,60 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: pnp_vqa
+  model_type: large
+
+datasets:
+  coco_vqa: # name of the dataset builder
+    type: eval
+    vis_processor:
+      eval:
+        name: "blip_image_eval"
+        image_size: 384
+    text_processor:
+      eval:
+        name: "blip_question"
+
+run:
+  task: vqa_reading_comprehension
+
+  # optimization-specific
+  batch_size_train: 12
+  batch_size_eval: 12
+  num_workers: 4
+
+  # image question matching specific
+  block_num: 7
+
+  # image captioning specific
+  top_k: 50
+  top_p: 1
+  cap_min_length: 10
+  cap_max_length: 20
+  repetition_penalty: 1
+  num_patches: 20
+  num_captions: 100
+  prompt: 'a picture of '
+
+  # question answering specific
+  internal_bsz_fid: 1
+  num_captions_fid: 1
+  min_len: 0
+  max_len: 20
+  num_beams: 1
+  inference_method: "generate"
+
+  seed: 42
+  output_dir: "output/PNP-VQA-large/VQAv2_val"
+
+  evaluate: True
+  test_splits: ["val"]
+
+  # distribution-specific
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval.yaml
ADDED
@@ -0,0 +1,60 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: pnp_vqa
+  model_type: base
+
+datasets:
+  coco_vqa: # name of the dataset builder
+    type: default
+    vis_processor:
+      eval:
+        name: "blip_image_eval"
+        image_size: 384
+    text_processor:
+      eval:
+        name: "blip_question"
+
+run:
+  task: vqa_reading_comprehension
+
+  # optimization-specific
+  batch_size_train: 16
+  batch_size_eval: 16
+  num_workers: 4
+
+  # image question matching specific
+  block_num: 7
+
+  # image captioning specific
+  top_k: 50
+  top_p: 1
+  cap_min_length: 10
+  cap_max_length: 20
+  repetition_penalty: 1
+  num_patches: 20
+  num_captions: 100
+  prompt: 'a picture of '
+
+  # question answering specific
+  internal_bsz_fid: 1
+  num_captions_fid: 1
+  min_len: 0
+  max_len: 20
+  num_beams: 1
+  inference_method: "generate"
+
+  seed: 42
+  output_dir: "output/PNP-VQA/VQAv2_test"
+
+  evaluate: True
+  test_splits: ["test"]
+
+  # distribution-specific
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval_3b.yaml
ADDED
@@ -0,0 +1,60 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: pnp_vqa
+  model_type: 3b
+
+datasets:
+  coco_vqa: # name of the dataset builder
+    type: default
+    vis_processor:
+      eval:
+        name: "blip_image_eval"
+        image_size: 384
+    text_processor:
+      eval:
+        name: "blip_question"
+
+run:
+  task: vqa_reading_comprehension
+
+  # optimization-specific
+  batch_size_train: 4
+  batch_size_eval: 4
+  num_workers: 4
+
+  # image question matching specific
+  block_num: 7
+
+  # image captioning specific
+  top_k: 50
+  top_p: 1
+  cap_min_length: 10
+  cap_max_length: 20
+  repetition_penalty: 1
+  num_patches: 20
+  num_captions: 100
+  prompt: 'a picture of '
+
+  # question answering specific
+  internal_bsz_fid: 1
+  num_captions_fid: 1
+  min_len: 0
+  max_len: 20
+  num_beams: 1
+  inference_method: "generate"
+
+  seed: 42
+  output_dir: "output/PNP-VQA-3b/VQAv2_test"
+
+  evaluate: True
+  test_splits: ["test"]
+
+  # distribution-specific
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
LAVIS-main/lavis/projects/pnp-vqa/eval/vqav2_test_eval_large.yaml
ADDED
@@ -0,0 +1,60 @@
+# Copyright (c) 2022, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: pnp_vqa
+  model_type: large
+
+datasets:
+  coco_vqa: # name of the dataset builder
+    type: default
+    vis_processor:
+      eval:
+        name: "blip_image_eval"
+        image_size: 384
+    text_processor:
+      eval:
+        name: "blip_question"
+
+run:
+  task: vqa_reading_comprehension
+
+  # optimization-specific
+  batch_size_train: 12
+  batch_size_eval: 12
+  num_workers: 4
+
+  # image question matching specific
+  block_num: 7
+
+  # image captioning specific
+  top_k: 50
+  top_p: 1
+  cap_min_length: 10
+  cap_max_length: 20
+  repetition_penalty: 1
+  num_patches: 20
+  num_captions: 100
+  prompt: 'a picture of '
+
+  # question answering specific
+  internal_bsz_fid: 1
+  num_captions_fid: 1
+  min_len: 0
+  max_len: 20
+  num_beams: 1
+  inference_method: "generate"
+
+  seed: 42
+  output_dir: "output/PNP-VQA-large/VQAv2_test"
+
+  evaluate: True
+  test_splits: ["test"]
+
+  # distribution-specific
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_caption.yaml
ADDED
@@ -0,0 +1,176 @@
+# Copyright (c) 2023, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+model:
+  arch: blip2_vicuna_xinstruct
+  model_type: vicuna7b
+  load_pretrained: True
+  # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
+  load_finetuned: False
+  finetuned: ""
+  stage1_url_or_filename: null
+  image_model: "eva_clip_g"
+  pc_model: "ulip2_pointbert"
+  video_model: "eva_clip_g"
+  audio_model: "beats"
+  pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
+  pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
+  pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
+  pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth
+  load_attention_image_qformer: True
+  load_attention_pc_qformer: True
+  load_attention_video_qformer: True
+  load_attention_audio_qformer: True
+  load_ln_type_image: "image"
+  load_ln_type_video: "video"
+  load_ln_type_pc: "pc"
+  load_ln_type_audio: "audio"
+  load_qformer_type_image: "image"
+  load_qformer_type_pc: "pc"
+  load_qformer_type_video: "video"
+  load_qformer_type_audio: "audio"
+  load_projection_image: True
+  load_projection_pc: True
+  load_projection_video: True
+  load_projection_audio: True
+  load_projection_type_image: "image"
+  load_projection_type_pc: "pc"
+  load_projection_type_video: "video"
+  load_projection_type_audio: "audio"
+  image_encoder_kwargs: {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
+  pc_encoder_kwargs: {}
+  video_encoder_kwargs: {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
+  audio_encoder_kwargs: {}
+  image_precision: "fp16"
+  pc_precision: "fp16"
+  video_precision: "fp16"
+  audio_precision: "fp16"
+  freeze_image: True
+  freeze_pc: True
+  freeze_video: True
+  freeze_audio: True
+  num_query_token: 32
+  llm_model: /path/to/vicuna-7b
+  prompt: "question: {} answer:"
+  max_txt_len: 128
+  max_output_txt_len: 256
+  apply_lemmatizer: False
+  num_few_shot_examples: 0
+  few_shot_prob: 0
+  qformer_text_input: True
+  llm_text_input: True
+  modalities: [audio, video]
+  use_cues: True
+  shared_qformer: False
+  pretrained_shared_qformer: Null
+  load_attention_shared_qformer: False
+  load_qformer_type_shared: ""
+  load_projection_shared: False
+  load_projection_type_shaped: ""
+  load_ln_type_shared: ""
+  shared_qformer_num_features: 512
+  special_qformer_input_prompt: "a short description"
+  prefix: "You are given two inputs. Select exactly one of the two by reference to its relative position (first or second, left or right) that best answers the question. "
+  predict_with_gen: False
+  use_caption: True
+  use_describe: False
+  enumerate_inputs: False
+  add_space: True
+
+
+datasets:
+  audio_video_discrn:
+    # data_dir: ${env.data_dir}/datasets
+    audio_processor:
+      train:
+        name: beats_audio
+        n_frames: 2
+      eval:
+        name: beats_audio
+        n_frames: 2
+
+    text_processor:
+      train:
+        name: "blip_caption"
+      eval:
+        name: "blip_caption"
+
+    video_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 5
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: True
+      eval:
+        name: alpro_video_eval
+        n_frms: 5
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: True
+
+    data_type: [audio, video] # [images|videos|features]
+
+    build_info:
+      kwargs:
+        total: all
+        shuffle_modalities: False
+        balance_labels: True
+        dataset_name: audiocaps
+        ground_truth: False
+        classnames: [audio, video]
+        raw: True
+
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
+          storage:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
+
+      audio:
+        storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
+      video:
+        storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
+
+run:
+  task: discrn_qa
+  # optimization-specific
+  batch_size_train: 8
+  batch_size_eval: 1
+  num_workers: 0
+  max_epoch: 1
+  segments: 1
+
+  # inference-specific
+  max_len: 10
+  min_len: 1
+  length_penalty: -1.
+  num_beams: 5
+  inference_method: "generate"
+
+  train_splits: ["train"]
+  valid_splits: ["val"]
+  # test_splits: ["test"]
+
+  # distribution
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  use_dist_eval_sampler: False
+
+
+  # model specific
+  k_test: 128
+
+  # misc
+  seed: 42
+  output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_caption"
+
+  evaluate: True
+  save_freq: -1
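Aside: the discriminative-reasoning (DisCRN) task pairs two inputs (here audio and video) and asks the model to pick one by position; prefix, prompt, and use_cues shape the text the LLM sees. A rough reconstruction of that prompt from the fields above; the assembly order is my assumption, not the exact X-InstructBLIP code, and the question is made up:

# Rough reconstruction of the two-input DisCRN prompt (illustrative only).
prefix = ("You are given two inputs. Select exactly one of the two by "
          "reference to its relative position (first or second, left or right) "
          "that best answers the question. ")
prompt = "question: {} answer:"
question = "Which entity is a dog barking?"  # placeholder question
text = prefix + prompt.format(question)
print(text)
# With use_cues: True, the query-token outputs of the audio and video
# Q-Formers are spliced in around this text at their modality-cue positions.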
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_caption_13b.yaml
ADDED
@@ -0,0 +1,176 @@
+# Copyright (c) 2023, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+model:
+  arch: blip2_vicuna_xinstruct
+  model_type: vicuna13b
+  load_pretrained: True
+  # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
+  load_finetuned: False
+  finetuned: ""
+  stage1_url_or_filename: null
+  image_model: "eva_clip_g"
+  pc_model: "ulip2_pointbert"
+  video_model: "eva_clip_g"
+  audio_model: "beats"
+  pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
+  pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/pc_qformer.pth
+  pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/video_qformer.pth
+  pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/audio_qformer.pth
+  load_attention_image_qformer: True
+  load_attention_pc_qformer: True
+  load_attention_video_qformer: True
+  load_attention_audio_qformer: True
+  load_ln_type_image: "image"
+  load_ln_type_video: "video"
+  load_ln_type_pc: "pc"
+  load_ln_type_audio: "audio"
+  load_qformer_type_image: "image"
+  load_qformer_type_pc: "pc"
+  load_qformer_type_video: "video"
+  load_qformer_type_audio: "audio"
+  load_projection_image: True
+  load_projection_pc: True
+  load_projection_video: True
+  load_projection_audio: True
+  load_projection_type_image: "image"
+  load_projection_type_pc: "pc"
+  load_projection_type_video: "video"
+  load_projection_type_audio: "audio"
+  image_encoder_kwargs: {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
+  pc_encoder_kwargs: {}
+  video_encoder_kwargs: {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
+  audio_encoder_kwargs: {}
+  image_precision: "fp16"
+  pc_precision: "fp16"
+  video_precision: "fp16"
+  audio_precision: "fp16"
+  freeze_image: True
+  freeze_pc: True
+  freeze_video: True
+  freeze_audio: True
+  num_query_token: 32
+  llm_model: "/path/to/vicuna-13b"
+  prompt: "question: {} answer:"
+  max_txt_len: 128
+  max_output_txt_len: 256
+  apply_lemmatizer: False
+  num_few_shot_examples: 0
+  few_shot_prob: 0
+  qformer_text_input: True
+  llm_text_input: True
+  modalities: [audio, video]
+  use_cues: True
+  shared_qformer: False
+  pretrained_shared_qformer: Null
+  load_attention_shared_qformer: False
+  load_qformer_type_shared: ""
+  load_projection_shared: False
+  load_projection_type_shaped: ""
+  load_ln_type_shared: ""
+  shared_qformer_num_features: 512
+  special_qformer_input_prompt: "a short description"
+  prefix: "You are given two inputs. Select exactly one of the two by reference to its relative position (first or second, left or right) that best answers the question. "
+  predict_with_gen: False
+  use_caption: True
+  use_describe: False
+  enumerate_inputs: False
+  add_space: True
+
+
+datasets:
+  audio_video_discrn:
+    # data_dir: ${env.data_dir}/datasets
+    audio_processor:
+      train:
+        name: beats_audio
+        n_frames: 2
+      eval:
+        name: beats_audio
+        n_frames: 2
+
+    text_processor:
+      train:
+        name: "blip_caption"
+      eval:
+        name: "blip_caption"
+
+    video_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 5
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: True
+      eval:
+        name: alpro_video_eval
+        n_frms: 5
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: True
+
+    data_type: [audio, video] # [images|videos|features]
+
+    build_info:
+      kwargs:
+        total: all
+        shuffle_modalities: False
+        balance_labels: True
+        dataset_name: audiocaps
+        ground_truth: False
+        classnames: [audio, video]
+        raw: True
+
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
+          storage:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
+
+      audio:
+        storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
+      video:
+        storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
+
+run:
+  task: discrn_qa
+  # optimization-specific
+  batch_size_train: 8
+  batch_size_eval: 1
+  num_workers: 0
+  max_epoch: 1
+  segments: 1
+
+  # inference-specific
+  max_len: 10
+  min_len: 1
+  length_penalty: -1.
+  num_beams: 5
+  inference_method: "generate"
+
+  train_splits: ["train"]
+  valid_splits: ["val"]
+  # test_splits: ["test"]
+
+  # distribution
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  use_dist_eval_sampler: False
+
+
+  # model specific
+  k_test: 128
+
+  # misc
+  seed: 42
+  output_dir: "output/xinstructblip/eval/vicuna13b/discrn/audio_video_caption"
+
+  evaluate: True
+  save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe.yaml
ADDED
@@ -0,0 +1,176 @@
+# Copyright (c) 2023, salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+model:
+  arch: blip2_vicuna_xinstruct
+  model_type: vicuna7b
+  load_pretrained: True
+  # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
+  load_finetuned: False
+  finetuned: ""
+  stage1_url_or_filename: null
+  image_model: "eva_clip_g"
+  pc_model: "ulip2_pointbert"
+  video_model: "eva_clip_g"
+  audio_model: "beats"
+  pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
+  pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
+  pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
+  pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth
+  load_attention_image_qformer: True
+  load_attention_pc_qformer: True
+  load_attention_video_qformer: True
+  load_attention_audio_qformer: True
+  load_ln_type_image: "image"
+  load_ln_type_video: "video"
+  load_ln_type_pc: "pc"
+  load_ln_type_audio: "audio"
+  load_qformer_type_image: "image"
+  load_qformer_type_pc: "pc"
+  load_qformer_type_video: "video"
+  load_qformer_type_audio: "audio"
+  load_projection_image: True
+  load_projection_pc: True
+  load_projection_video: True
+  load_projection_audio: True
+  load_projection_type_image: "image"
+  load_projection_type_pc: "pc"
+  load_projection_type_video: "video"
+  load_projection_type_audio: "audio"
+  image_encoder_kwargs: {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
+  pc_encoder_kwargs: {}
+  video_encoder_kwargs: {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
+  audio_encoder_kwargs: {}
+  image_precision: "fp16"
+  pc_precision: "fp16"
+  video_precision: "fp16"
+  audio_precision: "fp16"
+  freeze_image: True
+  freeze_pc: True
+  freeze_video: True
+  freeze_audio: True
+  num_query_token: 32
+  llm_model: /path/to/vicuna-7b
+  prompt: "question: {} answer:"
+  max_txt_len: 128
+  max_output_txt_len: 256
+  apply_lemmatizer: False
+  num_few_shot_examples: 0
+  few_shot_prob: 0
+  qformer_text_input: True
+  llm_text_input: True
+  modalities: [audio, video]
+  use_cues: True
+  shared_qformer: False
+  pretrained_shared_qformer: Null
+  load_attention_shared_qformer: False
+  load_qformer_type_shared: ""
+  load_projection_shared: False
+  load_projection_type_shaped: ""
+  load_ln_type_shared: ""
+  shared_qformer_num_features: 512
+  special_qformer_input_prompt: "a short description"
+  prefix: "You are given two inputs. Select exactly one of the two by reference to its relative position (first or second, left or right) that best answers the question. "
+  predict_with_gen: False
+  use_caption: False
+  use_describe: False
+  enumerate_inputs: False
+  add_space: True
+
+
+datasets:
+  audio_video_discrn:
+    # data_dir: ${env.data_dir}/datasets
+    audio_processor:
+      train:
+        name: beats_audio
+        n_frames: 2
+      eval:
+        name: beats_audio
+        n_frames: 2
+
+    text_processor:
+      train:
+        name: "blip_caption"
+      eval:
+        name: "blip_caption"
+
+    video_processor:
+      train:
+        name: alpro_video_train
+        n_frms: 2
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: True
+      eval:
+        name: alpro_video_eval
+        n_frms: 2
+        image_size: 224
+        min_scale: 0.9
+        max_scale: 1.0
+        full_video: True
+
+    data_type: [audio, video] # [images|videos|features]
+
+    build_info:
+      kwargs:
+        total: 100
+        shuffle_modalities: False
+        balance_labels: True
+        dataset_name: audiocaps
+        ground_truth: False
+        classnames: [audio, video]
+        raw: False
+
+      # Be careful not to append minus sign (-) before split to avoid itemizing
+      annotations:
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
+          storage:
+            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
+
+      audio:
+        storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
+      video:
+        storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val
+
+run:
+  task: discrn_qa
+  # optimization-specific
+  batch_size_train: 8
+  batch_size_eval: 1
+  num_workers: 8
+  max_epoch: 1
+  segments: 1
+
+  # inference-specific
+  max_len: 10
+  min_len: 1
+  length_penalty: -1.
+  num_beams: 5
+  inference_method: "generate"
+
+  train_splits: ["train"]
+  valid_splits: ["val"]
+  # test_splits: ["test"]
+
+  # distribution
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  use_dist_eval_sampler: False
+
+
+  # model specific
+  k_test: 128
+
+  # misc
+  seed: 42
+  output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe"
+
+  evaluate: True
+  save_freq: -1
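Aside: relative to the caption variant, this describe config drops to n_frms: 2 and caps the dataset at total: 100. For intuition about what an n_frms setting does, here is a toy uniform frame sampler (illustrative only; the alpro processors implement their own sampling logic):

# Toy uniform frame sampler for a clip of total_frames frames.
import numpy as np

def sample_frame_indices(total_frames: int, n_frms: int) -> list:
    # Spread n_frms indices evenly across the clip.
    return np.linspace(0, total_frames - 1, n_frms).round().astype(int).tolist()

print(sample_frame_indices(300, 2))  # [0, 299]
print(sample_frame_indices(300, 5))  # [0, 75, 150, 224, 299]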
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_13b.yaml
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
  arch: blip2_vicuna_xinstruct
  model_type: vicuna13b
  load_pretrained: True
  # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
  load_finetuned: False
  finetuned: ""
  stage1_url_or_filename: null
  image_model: "eva_clip_g"
  pc_model: "ulip2_pointbert"
  video_model: "eva_clip_g"
  audio_model: "beats"
  pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
  pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/pc_qformer_last.pth
  pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/video_qformer.pth
  pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/audio_qformer.pth
  load_attention_image_qformer: True
  load_attention_pc_qformer: True
  load_attention_video_qformer: True
  load_attention_audio_qformer: True
  load_ln_type_image: "image"
  load_ln_type_video: "video"
  load_ln_type_pc: "pc"
  load_ln_type_audio: "audio"
  load_qformer_type_image: "image"
  load_qformer_type_pc: "pc"
  load_qformer_type_video: "video"
  load_qformer_type_audio: "audio"
  load_projection_image: True
  load_projection_pc: True
  load_projection_video: True
  load_projection_audio: True
  load_projection_type_image: "image"
  load_projection_type_pc: "pc"
  load_projection_type_video: "video"
  load_projection_type_audio: "audio"
  image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  pc_encoder_kwargs : {}
  video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  audio_encoder_kwargs : {}
  image_precision: "fp16"
  pc_precision: "fp16"
  video_precision: "fp16"
  audio_precision: "fp16"
  freeze_image: True
  freeze_pc: True
  freeze_video: True
  freeze_audio: True
  num_query_token: 32
  llm_model: "/path/to/vicuna-13b"
  prompt: "question: {} answer:"
  max_txt_len: 128
  max_output_txt_len: 256
  apply_lemmatizer: False
  num_few_shot_examples: 0
  few_shot_prob: 0
  qformer_text_input: True
  llm_text_input: True
  modalities : [audio, video]
  use_cues: True
  shared_qformer: False
  pretrained_shared_qformer: Null
  load_attention_shared_qformer: False
  load_qformer_type_shared: ""
  load_projection_shared: False
  load_projection_type_shaped: ""
  load_ln_type_shared: ""
  shared_qformer_num_features: 512
  # special_qformer_input_prompt: "a short description"
  prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
  predict_with_gen: False
  use_caption: False
  use_describe: False
  enumerate_inputs: False
  add_space: True
  remove_start: True


datasets:
  audio_video_discrn:
    # data_dir: ${env.data_dir}/datasets
    audio_processor:
      train:
        name: beats_audio
        n_frames: 2
      eval:
        name: beats_audio
        n_frames: 2

    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    video_processor:
      train:
        name: alpro_video_train
        n_frms: 2
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
        full_video: True
      eval:
        name: alpro_video_eval
        n_frms: 2
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
        full_video: True

    data_type: [audio, video] # [images|videos|features]

    build_info:
      kwargs:
        total: 100
        shuffle_modalities: False
        balance_labels: True
        dataset_name: audiocaps
        ground_truth: False
        classnames: [audio, video]
        raw: False

      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
          storage:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json

      audio:
        storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
      video:
        storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val

run:
  task: discrn_qa
  # optimization-specific
  batch_size_train: 8
  batch_size_eval: 1
  num_workers: 8
  max_epoch: 1
  segments: 1

  # inference-specific
  max_len: 10
  min_len: 1
  length_penalty: -1.
  num_beams: 5
  inference_method: "generate"

  train_splits: ["train"]
  valid_splits: ["val"]
  # test_splits: ["test"]

  # distribution
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  use_dist_eval_sampler: False


  # model specific
  k_test: 128

  # misc
  seed: 42
  output_dir: "output/xinstructblip/eval/vicuna13b/discrn/audio_video_describe"

  evaluate: True
  save_freq: -1
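Since the 7B and 13B variants are near-identical, a quick way to verify what actually changes between them is to flatten and diff the two YAML trees. A sketch with PyYAML (paths illustrative; expect `model_type`, the Q-Former checkpoint URLs, `llm_model`, and `run.output_dir` to be the only differences):

```python
import yaml

def flatten(tree, prefix=""):
    """Yield dotted-key/value pairs from a nested dict."""
    for key, value in tree.items():
        dotted = f"{prefix}{key}"
        if isinstance(value, dict):
            yield from flatten(value, dotted + ".")
        else:
            yield dotted, value

def config_diff(path_a, path_b):
    with open(path_a) as fa, open(path_b) as fb:
        a = dict(flatten(yaml.safe_load(fa)))
        b = dict(flatten(yaml.safe_load(fb)))
    for key in sorted(set(a) | set(b)):
        if a.get(key) != b.get(key):
            print(f"{key}: {a.get(key)!r} -> {b.get(key)!r}")

# Illustrative paths, relative to the LAVIS-main checkout.
config_diff(
    "lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe.yaml",
    "lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_13b.yaml",
)
```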
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_nocue.yaml
ADDED
@@ -0,0 +1,176 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
  arch: blip2_vicuna_xinstruct
  model_type: vicuna7b
  load_pretrained: True
  # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
  load_finetuned: False
  finetuned: ""
  stage1_url_or_filename: null
  image_model: "eva_clip_g"
  pc_model: "ulip2_pointbert"
  video_model: "eva_clip_g"
  audio_model: "beats"
  pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b_nocue/image_qformer.pth
  pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b_nocue/pc_qformer.pth
  pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b_nocue/video_qformer.pth
  pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b_nocue/audio_qformer.pth
  load_attention_image_qformer: True
  load_attention_pc_qformer: True
  load_attention_video_qformer: True
  load_attention_audio_qformer: True
  load_ln_type_image: "image"
  load_ln_type_video: "video"
  load_ln_type_pc: "pc"
  load_ln_type_audio: "audio"
  load_qformer_type_image: "image"
  load_qformer_type_pc: "pc"
  load_qformer_type_video: "video"
  load_qformer_type_audio: "audio"
  load_projection_image: True
  load_projection_pc: True
  load_projection_video: True
  load_projection_audio: True
  load_projection_type_image: "image"
  load_projection_type_pc: "pc"
  load_projection_type_video: "video"
  load_projection_type_audio: "audio"
  image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  pc_encoder_kwargs : {}
  video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  audio_encoder_kwargs : {}
  image_precision: "fp16"
  pc_precision: "fp16"
  video_precision: "fp16"
  audio_precision: "fp16"
  freeze_image: True
  freeze_pc: True
  freeze_video: True
  freeze_audio: True
  num_query_token: 32
  llm_model: /path/to/vicuna-7b
  prompt: "question: {} answer:"
  max_txt_len: 128
  max_output_txt_len: 256
  apply_lemmatizer: False
  num_few_shot_examples: 0
  few_shot_prob: 0
  qformer_text_input: True
  llm_text_input: True
  modalities : [audio, video]
  use_cues: False
  shared_qformer: False
  pretrained_shared_qformer: Null
  load_attention_shared_qformer: False
  load_qformer_type_shared: ""
  load_projection_shared: False
  load_projection_type_shaped: ""
  load_ln_type_shared: ""
  shared_qformer_num_features: 512
  special_qformer_input_prompt: "a short description"
  prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
  predict_with_gen: False
  use_caption: False
  use_describe: False
  enumerate_inputs: False
  add_space: True


datasets:
  audio_video_discrn:
    # data_dir: ${env.data_dir}/datasets
    audio_processor:
      train:
        name: beats_audio
        n_frames: 2
      eval:
        name: beats_audio
        n_frames: 2

    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    video_processor:
      train:
        name: alpro_video_train
        n_frms: 2
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
        full_video: True
      eval:
        name: alpro_video_eval
        n_frms: 2
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
        full_video: True

    data_type: [audio, video] # [images|videos|features]

    build_info:
      kwargs:
        total: all
        shuffle_modalities: False
        balance_labels: True
        dataset_name: audiocaps
        ground_truth: False
        classnames: [audio, video]
        raw: False

      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
          storage:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json

      audio:
        storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
      video:
        storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val

run:
  task: discrn_qa
  # optimization-specific
  batch_size_train: 8
  batch_size_eval: 1
  num_workers: 8
  max_epoch: 1
  segments: 1

  # inference-specific
  max_len: 10
  min_len: 1
  length_penalty: -1.
  num_beams: 5
  inference_method: "generate"

  train_splits: ["train"]
  valid_splits: ["val"]
  # test_splits: ["test"]

  # distribution
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  use_dist_eval_sampler: False


  # model specific
  k_test: 128

  # misc
  seed: 42
  output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe_nocue"

  evaluate: True
  save_freq: -1
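Relative to the cued configs above, this variant flips `use_cues` to `False` and loads checkpoints trained without cues (the `vicuna7b_nocue/` paths). A schematic, not the repo's code, of what the flag controls: whether a short textual hint naming each modality is spliced in ahead of that modality's query tokens when the LLM prompt is assembled:

```python
# Schematic only: strings stand in for embedded cue text and query tokens.
def assemble_prompt(modality_tokens, use_cues=True):
    parts = []
    for name, tokens in modality_tokens.items():
        if use_cues:
            parts.append(f"<{name} cue>")  # cue announcing the modality
        parts.extend(tokens)               # that modality's query tokens
    return parts

tokens = {"audio": ["<a1>", "<a2>"], "video": ["<v1>", "<v2>"]}
print(assemble_prompt(tokens, use_cues=True))   # cued prompt
print(assemble_prompt(tokens, use_cues=False))  # the *_nocue setting
```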
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_proj copy.yaml
ADDED
@@ -0,0 +1,179 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
  arch: blip2_vicuna_xinstruct
  model_type: vicuna7b
  load_pretrained: True
  # pretrained: /export/home/LAVIS-xgen_mm/lavis/output/xinstructblip/train/vicuna7b/audio/20231115194/checkpoint_65001.pth
  load_finetuned: False
  finetuned: ""
  stage1_url_or_filename: null
  image_model: "eva_clip_g"
  pc_model: "ulip2_pointbert"
  video_model: "eva_clip_g"
  audio_model: "beats"
  pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
  pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
  pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
  pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/linear_projection_7b/audio_qformer_linear.pth
  load_attention_image_qformer: True
  load_attention_pc_qformer: True
  load_attention_video_qformer: True
  load_attention_audio_qformer: True
  load_ln_type_image: "image"
  load_ln_type_video: "video"
  load_ln_type_pc: "pc"
  load_ln_type_audio: "audio"
  load_qformer_type_image: "image"
  load_qformer_type_pc: "pc"
  load_qformer_type_video: "video"
  load_qformer_type_audio: "audio"
  load_projection_image: True
  load_projection_pc: True
  load_projection_video: True
  load_projection_audio: True
  load_projection_type_image: "image"
  load_projection_type_pc: "pc"
  load_projection_type_video: "video"
  load_projection_type_audio: "audio"
  image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  pc_encoder_kwargs : {}
  video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  audio_encoder_kwargs : {}
  image_precision: "fp16"
  pc_precision: "fp16"
  video_precision: "fp16"
  audio_precision: "fp16"
  freeze_image: True
  freeze_pc: True
  freeze_video: True
  freeze_audio: True
  num_query_token: 32
  llm_model: /path/to/vicuna-7b
  prompt: "question: {} answer:"
  max_txt_len: 128
  max_output_txt_len: 256
  apply_lemmatizer: False
  num_few_shot_examples: 0
  few_shot_prob: 0
  qformer_text_input: True
  llm_text_input: True
  modalities : [audio, video]
  use_cues: True
  shared_qformer: False
  pretrained_shared_qformer: Null
  load_attention_shared_qformer: False
  load_qformer_type_shared: ""
  load_projection_shared: False
  load_projection_type_shaped: ""
  load_ln_type_shared: ""
  shared_qformer_num_features: 512
  special_qformer_input_prompt: "a short description"
  prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
  predict_with_gen: False
  use_caption: False
  use_describe: False
  enumerate_inputs: False
  add_space: True
  projection_only_audio: True
  projection_path_audio: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/linear_projection_7b/audio_qformer_linear_768.pth
  proj_dim: 768


datasets:
  audio_video_discrn:
    # data_dir: ${env.data_dir}/datasets
    audio_processor:
      train:
        name: beats_audio
        n_frames: 2
      eval:
        name: beats_audio
        n_frames: 2

    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    video_processor:
      train:
        name: alpro_video_train
        n_frms: 2
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
        full_video: True
      eval:
        name: alpro_video_eval
        n_frms: 2
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
        full_video: True

    data_type: [audio, video] # [images|videos|features]

    build_info:
      kwargs:
        total: all
        shuffle_modalities: False
        balance_labels: True
        dataset_name: audiocaps
        ground_truth: False
        classnames: [audio, video]
        raw: False

      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
          storage:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json

      audio:
        storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
      video:
        storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val

run:
  task: discrn_qa
  # optimization-specific
  batch_size_train: 8
  batch_size_eval: 1
  num_workers: 8
  max_epoch: 1
  segments: 1

  # inference-specific
  max_len: 10
  min_len: 1
  length_penalty: -1.
  num_beams: 5
  inference_method: "generate"

  train_splits: ["train"]
  valid_splits: ["val"]
  # test_splits: ["test"]

  # distribution
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  use_dist_eval_sampler: False


  # model specific
  k_test: 128

  # misc
  seed: 42
  output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe_proj"

  evaluate: True
  save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_proj.yaml
ADDED
@@ -0,0 +1,179 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
  arch: blip2_vicuna_xinstruct
  model_type: vicuna7b
  load_pretrained: True
  # pretrained: /export/home/LAVIS-xgen_mm/lavis/output/xinstructblip/train/vicuna7b/audio/20231115194/checkpoint_65001.pth
  load_finetuned: False
  finetuned: ""
  stage1_url_or_filename: null
  image_model: "eva_clip_g"
  pc_model: "ulip2_pointbert"
  video_model: "eva_clip_g"
  audio_model: "beats"
  pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
  pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
  pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
  pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/linear_projection_7b/audio_qformer_linear.pth
  load_attention_image_qformer: True
  load_attention_pc_qformer: True
  load_attention_video_qformer: True
  load_attention_audio_qformer: True
  load_ln_type_image: "image"
  load_ln_type_video: "video"
  load_ln_type_pc: "pc"
  load_ln_type_audio: "audio"
  load_qformer_type_image: "image"
  load_qformer_type_pc: "pc"
  load_qformer_type_video: "video"
  load_qformer_type_audio: "audio"
  load_projection_image: True
  load_projection_pc: True
  load_projection_video: True
  load_projection_audio: True
  load_projection_type_image: "image"
  load_projection_type_pc: "pc"
  load_projection_type_video: "video"
  load_projection_type_audio: "audio"
  image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  pc_encoder_kwargs : {}
  video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  audio_encoder_kwargs : {}
  image_precision: "fp16"
  pc_precision: "fp16"
  video_precision: "fp16"
  audio_precision: "fp16"
  freeze_image: True
  freeze_pc: True
  freeze_video: True
  freeze_audio: True
  num_query_token: 32
  llm_model: /path/to/vicuna-7b
  prompt: "question: {} answer:"
  max_txt_len: 128
  max_output_txt_len: 256
  apply_lemmatizer: False
  num_few_shot_examples: 0
  few_shot_prob: 0
  qformer_text_input: True
  llm_text_input: True
  modalities : [audio, video]
  use_cues: True
  shared_qformer: False
  pretrained_shared_qformer: Null
  load_attention_shared_qformer: False
  load_qformer_type_shared: ""
  load_projection_shared: False
  load_projection_type_shaped: ""
  load_ln_type_shared: ""
  shared_qformer_num_features: 512
  special_qformer_input_prompt: "a short description"
  prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
  predict_with_gen: False
  use_caption: False
  use_describe: False
  enumerate_inputs: False
  add_space: True
  projection_only_audio: True
  projection_path_audio: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/linear_projection_7b/audio_qformer_linear_768.pth
  proj_dim: 768


datasets:
  audio_video_discrn:
    # data_dir: ${env.data_dir}/datasets
    audio_processor:
      train:
        name: beats_audio
        n_frames: 2
      eval:
        name: beats_audio
        n_frames: 2

    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    video_processor:
      train:
        name: alpro_video_train
        n_frms: 2
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
        full_video: True
      eval:
        name: alpro_video_eval
        n_frms: 2
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
        full_video: True

    data_type: [audio, video] # [images|videos|features]

    build_info:
      kwargs:
        total: all
        shuffle_modalities: False
        balance_labels: True
        dataset_name: audiocaps
        ground_truth: False
        classnames: [audio, video]
        raw: False

      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
          storage:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json

      audio:
        storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
      video:
        storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val

run:
  task: discrn_qa
  # optimization-specific
  batch_size_train: 8
  batch_size_eval: 1
  num_workers: 8
  max_epoch: 1
  segments: 1

  # inference-specific
  max_len: 10
  min_len: 1
  length_penalty: -1.
  num_beams: 5
  inference_method: "generate"

  train_splits: ["train"]
  valid_splits: ["val"]
  # test_splits: ["test"]

  # distribution
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  use_dist_eval_sampler: False


  # model specific
  k_test: 128

  # misc
  seed: 42
  output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe_proj"

  evaluate: True
  save_freq: -1
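The `*_proj` configs swap the audio Q-Former for a linear projection (`projection_only_audio: True`, checkpoint `audio_qformer_linear_768.pth`, `proj_dim: 768`). A sketch of the shape arithmetic, under the assumption that 768-dim audio encoder features are mapped straight into Vicuna-7B's 4096-dim embedding space (the module below is illustrative, not the repo's class):

```python
import torch
import torch.nn as nn

proj_dim, llm_dim = 768, 4096           # proj_dim from the config; 4096 is Vicuna-7B's hidden size
audio_proj = nn.Linear(proj_dim, llm_dim)

feats = torch.randn(1, 32, proj_dim)    # dummy audio encoder features
llm_inputs = audio_proj(feats)          # (1, 32, 4096), ready to splice into the LLM prompt
print(llm_inputs.shape)
```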
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/audio_video_describe_rand_init.yaml
ADDED
@@ -0,0 +1,176 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
  arch: blip2_vicuna_xinstruct
  model_type: vicuna7b
  load_pretrained: True
  # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
  load_finetuned: False
  finetuned: ""
  stage1_url_or_filename: null
  image_model: "eva_clip_g"
  pc_model: "ulip2_pointbert"
  video_model: "eva_clip_g"
  audio_model: "beats"
  pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
  pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
  pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
  pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer_no_init.pth
  load_attention_image_qformer: True
  load_attention_pc_qformer: True
  load_attention_video_qformer: True
  load_attention_audio_qformer: True
  load_ln_type_image: "image"
  load_ln_type_video: "video"
  load_ln_type_pc: "pc"
  load_ln_type_audio: "audio"
  load_qformer_type_image: "image"
  load_qformer_type_pc: "pc"
  load_qformer_type_video: "video"
  load_qformer_type_audio: "audio"
  load_projection_image: True
  load_projection_pc: True
  load_projection_video: True
  load_projection_audio: True
  load_projection_type_image: "image"
  load_projection_type_pc: "pc"
  load_projection_type_video: "video"
  load_projection_type_audio: "audio"
  image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  pc_encoder_kwargs : {}
  video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  audio_encoder_kwargs : {}
  image_precision: "fp16"
  pc_precision: "fp16"
  video_precision: "fp16"
  audio_precision: "fp16"
  freeze_image: True
  freeze_pc: True
  freeze_video: True
  freeze_audio: True
  num_query_token: 32
  llm_model: /path/to/vicuna-7b
  prompt: "question: {} answer:"
  max_txt_len: 128
  max_output_txt_len: 256
  apply_lemmatizer: False
  num_few_shot_examples: 0
  few_shot_prob: 0
  qformer_text_input: True
  llm_text_input: True
  modalities : [audio, video]
  use_cues: True
  shared_qformer: False
  pretrained_shared_qformer: Null
  load_attention_shared_qformer: False
  load_qformer_type_shared: ""
  load_projection_shared: False
  load_projection_type_shaped: ""
  load_ln_type_shared: ""
  shared_qformer_num_features: 512
  special_qformer_input_prompt: "a short description"
  prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
  predict_with_gen: False
  use_caption: False
  use_describe: False
  enumerate_inputs: False
  add_space: True


datasets:
  audio_video_discrn:
    # data_dir: ${env.data_dir}/datasets
    audio_processor:
      train:
        name: beats_audio
        n_frames: 2
      eval:
        name: beats_audio
        n_frames: 2

    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    video_processor:
      train:
        name: alpro_video_train
        n_frms: 2
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
        full_video: True
      eval:
        name: alpro_video_eval
        n_frms: 2
        image_size: 224
        min_scale: 0.9
        max_scale: 1.0
        full_video: True

    data_type: [audio, video] # [images|videos|features]

    build_info:
      kwargs:
        total: all
        shuffle_modalities: False
        balance_labels: True
        dataset_name: audiocaps
        ground_truth: False
        classnames: [audio, video]
        raw: False

      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json
          storage:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/audiocaps.json

      audio:
        storage: /audiocaps/AUDIOCAPS_32000Hz/audio/val
      video:
        storage: /export/einstein-vision/audio_datasets/audiocaps/video/AUDIOCAPS_32000Hz/audio/val

run:
  task: discrn_qa
  # optimization-specific
  batch_size_train: 8
  batch_size_eval: 1
  num_workers: 8
  max_epoch: 1
  segments: 1

  # inference-specific
  max_len: 10
  min_len: 1
  length_penalty: -1.
  num_beams: 5
  inference_method: "generate"

  train_splits: ["train"]
  valid_splits: ["val"]
  # test_splits: ["test"]

  # distribution
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  use_dist_eval_sampler: False


  # model specific
  k_test: 128

  # misc
  seed: 42
  output_dir: "output/xinstructblip/eval/vicuna7b/discrn/audio_video_describe_rand_init"

  evaluate: True
  save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_caption.yaml
ADDED
@@ -0,0 +1,154 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
  arch: blip2_vicuna_xinstruct
  model_type: vicuna7b
  load_pretrained: True
  # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
  load_finetuned: False
  finetuned: ""
  stage1_url_or_filename: null
  image_model: "eva_clip_g"
  pc_model: "ulip2_pointbert"
  video_model: "eva_clip_g"
  audio_model: "beats"
  pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
  pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
  pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
  pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth
  load_attention_image_qformer: True
  load_attention_pc_qformer: True
  load_attention_video_qformer: True
  load_attention_audio_qformer: True
  load_ln_type_image: "image"
  load_ln_type_video: "video"
  load_ln_type_pc: "pc"
  load_ln_type_audio: "audio"
  load_qformer_type_image: "image"
  load_qformer_type_pc: "pc"
  load_qformer_type_video: "video"
  load_qformer_type_audio: "audio"
  load_projection_image: True
  load_projection_pc: True
  load_projection_video: True
  load_projection_audio: True
  load_projection_type_image: "image"
  load_projection_type_pc: "pc"
  load_projection_type_video: "video"
  load_projection_type_audio: "audio"
  image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  pc_encoder_kwargs : {}
  video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  audio_encoder_kwargs : {}
  image_precision: "fp16"
  pc_precision: "fp16"
  video_precision: "fp16"
  audio_precision: "fp16"
  freeze_image: True
  freeze_pc: True
  freeze_video: True
  freeze_audio: True
  num_query_token: 32
  llm_model: /path/to/vicuna-7b
  prompt: "question: {} answer:"
  modalities : [image, pc]
  use_cues: True
  shared_qformer: False
  pretrained_shared_qformer: Null
  load_attention_shared_qformer: False
  load_qformer_type_shared: ""
  load_projection_shared: False
  load_projection_type_shaped: ""
  load_ln_type_shared: ""
  shared_qformer_num_features: 512
  special_qformer_input_prompt: "a short description"
  prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
  predict_with_gen: False
  use_caption: True
  use_describe: False
  enumerate_inputs: False
  add_space: True

datasets:
  image_pc_discrn: # name of the dataset builder
    vis_processor:
      train:
        name: "clip_image_train"
      eval:
        name: "clip_image_eval"
    pc_processor:
      train:
        name: "ulip_pc"
      eval:
        name: "ulip_pc"
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    data_type: [images, pc] # [images|videos|features]


    build_info:
      kwargs:
        total: all
        raw: True
        shuffle_modalities: False
        balance_labels: True
        dataset_name: objaverse
        classnames: [image, 3d]
        ground_truth: False

      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
          storage:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
      pc:
        storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel

      images:
        storage: /export/einstein-vision/3d_vision/objaverse_captions/images/

run:
  task: discrn_qa
  # optimization-specific
  batch_size_train: 8
  batch_size_eval: 1
  num_workers: 10
  max_epoch: 1
  segments: 1

  # inference-specific
  max_len: 10
  min_len: 1
  length_penalty: -1.
  num_beams: 5
  inference_method: "generate"

  train_splits: ["train"]
  valid_splits: ["val"]
  # test_splits: ["test"]

  # distribution
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  use_dist_eval_sampler: False


  # model specific
  k_test: 128

  # misc
  seed: 42
  output_dir: "output/xinstructblip/eval/vicuna7b/discrn/image_3d_caption"

  evaluate: True
  save_freq: -1
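The annotation files referenced under `build_info.annotations` are plain JSON hosted on GCS, so they can be inspected directly without committing to a schema. A quick look (URL taken verbatim from the config above; the record fields are printed rather than assumed):

```python
import json
import urllib.request

URL = "https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json"
with urllib.request.urlopen(URL) as resp:
    data = json.load(resp)

print(type(data), len(data))
# Print one record to see the fields rather than assuming them.
sample = data[0] if isinstance(data, list) else next(iter(data.items()))
print(sample)
```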
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_caption_13b.yaml
ADDED
@@ -0,0 +1,154 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
  arch: blip2_vicuna_xinstruct
  model_type: vicuna13b
  load_pretrained: True
  # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
  load_finetuned: False
  finetuned: ""
  stage1_url_or_filename: null
  image_model: "eva_clip_g"
  pc_model: "ulip2_pointbert"
  video_model: "eva_clip_g"
  audio_model: "beats"
  pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/image_qformer.pth
  pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/pc_qformer.pth
  pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/video_qformer.pth
  pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna13b/audio_qformer.pth
  load_attention_image_qformer: True
  load_attention_pc_qformer: True
  load_attention_video_qformer: True
  load_attention_audio_qformer: True
  load_ln_type_image: "image"
  load_ln_type_video: "video"
  load_ln_type_pc: "pc"
  load_ln_type_audio: "audio"
  load_qformer_type_image: "image"
  load_qformer_type_pc: "pc"
  load_qformer_type_video: "video"
  load_qformer_type_audio: "audio"
  load_projection_image: True
  load_projection_pc: True
  load_projection_video: True
  load_projection_audio: True
  load_projection_type_image: "image"
  load_projection_type_pc: "pc"
  load_projection_type_video: "video"
  load_projection_type_audio: "audio"
  image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  pc_encoder_kwargs : {}
  video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  audio_encoder_kwargs : {}
  image_precision: "fp16"
  pc_precision: "fp16"
  video_precision: "fp16"
  audio_precision: "fp16"
  freeze_image: True
  freeze_pc: True
  freeze_video: True
  freeze_audio: True
  num_query_token: 32
  llm_model: "/path/to/vicuna-13b"
  prompt: "question: {} answer:"
  modalities : [image, pc]
  use_cues: True
  shared_qformer: False
  pretrained_shared_qformer: Null
  load_attention_shared_qformer: False
  load_qformer_type_shared: ""
  load_projection_shared: False
  load_projection_type_shaped: ""
  load_ln_type_shared: ""
  shared_qformer_num_features: 512
  special_qformer_input_prompt: "a short description"
  prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
  predict_with_gen: False
  use_caption: True
  use_describe: False
  enumerate_inputs: False
  add_space: True

datasets:
  image_pc_discrn: # name of the dataset builder
    vis_processor:
      train:
        name: "clip_image_train"
      eval:
        name: "clip_image_eval"
    pc_processor:
      train:
        name: "ulip_pc"
      eval:
        name: "ulip_pc"
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    data_type: [images, pc] # [images|videos|features]


    build_info:
      kwargs:
        total: 100
        raw: True
        shuffle_modalities: False
        balance_labels: True
        dataset_name: objaverse
        classnames: [image, 3d]
        ground_truth: False

      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
          storage:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
      pc:
        storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel

      images:
        storage: /export/einstein-vision/3d_vision/objaverse_captions/images/

run:
  task: discrn_qa
  # optimization-specific
  batch_size_train: 8
  batch_size_eval: 1
  num_workers: 2
  max_epoch: 1
  segments: 1

  # inference-specific
  max_len: 10
  min_len: 1
  length_penalty: -1.
  num_beams: 5
  inference_method: "generate"

  train_splits: ["train"]
  valid_splits: ["val"]
  # test_splits: ["test"]

  # distribution
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  use_dist_eval_sampler: False


  # model specific
  k_test: 128

  # misc
  seed: 42
  output_dir: "output/xinstructblip/eval/vicuna13b/discrn/image_3d_caption"

  evaluate: True
  save_freq: -1
LAVIS-main/lavis/projects/xinstruct_blip/eval/discrn/image_3d_describe.yaml
ADDED
@@ -0,0 +1,154 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
  arch: blip2_vicuna_xinstruct
  model_type: vicuna7b
  load_pretrained: True
  # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
  load_finetuned: False
  finetuned: ""
  stage1_url_or_filename: null
  image_model: "eva_clip_g"
  pc_model: "ulip2_pointbert"
  video_model: "eva_clip_g"
  audio_model: "beats"
  pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
  pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
  pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
  pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth
  load_attention_image_qformer: True
  load_attention_pc_qformer: True
  load_attention_video_qformer: True
  load_attention_audio_qformer: True
  load_ln_type_image: "image"
  load_ln_type_video: "video"
  load_ln_type_pc: "pc"
  load_ln_type_audio: "audio"
  load_qformer_type_image: "image"
  load_qformer_type_pc: "pc"
  load_qformer_type_video: "video"
  load_qformer_type_audio: "audio"
  load_projection_image: True
  load_projection_pc: True
  load_projection_video: True
  load_projection_audio: True
  load_projection_type_image: "image"
  load_projection_type_pc: "pc"
  load_projection_type_video: "video"
  load_projection_type_audio: "audio"
  image_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  pc_encoder_kwargs : {}
  video_encoder_kwargs : {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  audio_encoder_kwargs : {}
  image_precision: "fp16"
  pc_precision: "fp16"
  video_precision: "fp16"
  audio_precision: "fp16"
  freeze_image: True
  freeze_pc: True
  freeze_video: True
  freeze_audio: True
  num_query_token: 32
  llm_model: /path/to/vicuna-7b
  prompt: "question: {} answer:"
  modalities : [image, pc]
  use_cues: True
  shared_qformer: False
  pretrained_shared_qformer: Null
  load_attention_shared_qformer: False
  load_qformer_type_shared: ""
  load_projection_shared: False
  load_projection_type_shaped: ""
  load_ln_type_shared: ""
  shared_qformer_num_features: 512
  special_qformer_input_prompt: "a short description"
  prefix: "You are given two inputs. Select exactly one of the two by referece to its relative position (first or second, left or right) that best answers the question. "
  predict_with_gen: False
  use_caption: False
  use_describe: False
  enumerate_inputs: False
  add_space: True

datasets:
  image_pc_discrn: # name of the dataset builder
    vis_processor:
      train:
        name: "clip_image_train"
      eval:
        name: "clip_image_eval"
    pc_processor:
      train:
        name: "ulip_pc"
      eval:
        name: "ulip_pc"
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"

    data_type: [images, pc] # [images|videos|features]


    build_info:
      kwargs:
        total: all
        raw: False
        shuffle_modalities: False
        balance_labels: True
        dataset_name: objaverse
        classnames: [image, 3d]
        ground_truth: False

      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
          storage:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/discrn/objaverse.json
      pc:
        storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel

      images:
        storage: /export/einstein-vision/3d_vision/objaverse_captions/images/

run:
  task: discrn_qa
  # optimization-specific
  batch_size_train: 8
  batch_size_eval: 1
  num_workers: 10
  max_epoch: 1
  segments: 1

  # inference-specific
  max_len: 10
  min_len: 1
  length_penalty: 1.
  num_beams: 5
  inference_method: "generate"

  train_splits: ["train"]
  valid_splits: ["val"]
  # test_splits: ["test"]

  # distribution
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  use_dist_eval_sampler: False


  # model specific
  k_test: 128

  # misc
  seed: 42
  output_dir: "output/xinstructblip/eval/vicuna7b/discrn/image_3d_describe"

  evaluate: True
  save_freq: -1
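Note that this describe variant sets `length_penalty: 1.` where its siblings use `-1.`. Under the usual Hugging Face beam-search semantics (beam score = cumulative log-probability divided by length raised to `length_penalty`), negative values punish long hypotheses and positive values favor them, which fits short discriminative answers versus freer descriptions. A tiny sketch of that arithmetic:

```python
def beam_score(sum_logprob, length, length_penalty):
    # Hugging Face BeamSearchScorer-style length normalization.
    return sum_logprob / (length ** length_penalty)

print(beam_score(-5.0, 10, -1.0))  # -50.0: long hypotheses score badly
print(beam_score(-5.0, 10, 1.0))   #  -0.5: long hypotheses are favored
```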