|
|
# Modules are aliased with a leading double underscore, presumably so the
# config loader does not export them as config keys (confirm against loader).
import itertools as __itertools
import os as __os
import warnings as __warnings
from copy import deepcopy as __deepcopy
|
|
|
|
|
# Root directory that holds both the annotation files and the raw videos.
data_root = "DATAS/TRAIN_TEST"

# Directory with the instruction-tuning annotation JSON files.
anno_root_it = f"{data_root}/magic_jsons"

# Registry of instruction-tuning corpora.
#
# Every value is a three-item list:
#   [annotation JSON path, directory containing the videos, "video"]
# The trailing "video" string is presumably the media-type tag read by the
# dataset loader (confirm against the consumer of this config).
#
# NOTE(review): `conversation_videochatgpt` points at `AVideo_ChatGPT` while
# `origin_videochatgpt` points at `Video_ChatGPT` -- confirm the leading "A"
# is intentional and not a typo.
available_corpus = {
    # --- captioning ---
    "caption_textvr": [
        f"{anno_root_it}/video/caption/textvr/train.json",
        f"{data_root}/videos/TextVR",
        "video",
    ],
    "caption_videochat": [
        f"{anno_root_it}/video/caption/videochat/train.json",
        f"{data_root}/videos/webvid_10m",
        "video",
    ],
    "caption_webvid": [
        f"{anno_root_it}/video/caption/webvid/train.json",
        f"{data_root}/videos/webvid_10m",
        "video",
    ],
    "caption_youcook2": [
        f"{anno_root_it}/video/caption/youcook2/train.json",
        f"{data_root}/videos/YouCook2/split_videos",
        "video",
    ],
    # --- classification ---
    "classification_k710": [
        f"{anno_root_it}/video/classification/k710/train.json",
        f"{data_root}/videos/kinetics",
        "video",
    ],
    "classification_ssv2": [
        f"{anno_root_it}/video/classification/ssv2/train.json",
        f"{data_root}/videos/20bn-something-something-v2",
        "video",
    ],
    # --- conversation ---
    "conversation_videochat1": [
        f"{anno_root_it}/video/conversation/videochat1/train.json",
        f"{data_root}/videos/webvid_10m",
        "video",
    ],
    "conversation_videochat2": [
        f"{anno_root_it}/video/conversation/videochat2/train.json",
        f"{data_root}/videos/InternVid-10M-FLT/videos",
        "video",
    ],
    "conversation_videochatgpt": [
        f"{anno_root_it}/video/conversation/videochatgpt/train.json",
        f"{data_root}/videos/AVideo_ChatGPT",
        "video",
    ],
    # --- reasoning ---
    "reasoning_next_qa": [
        f"{anno_root_it}/video/reasoning/next_qa/train.json",
        f"{data_root}/videos/NExTVideo",
        "video",
    ],
    "reasoning_clevrer_qa": [
        f"{anno_root_it}/video/reasoning/clevrer_qa/train.json",
        f"{data_root}/videos/CLEVRER",
        "video",
    ],
    "reasoning_clevrer_mc": [
        f"{anno_root_it}/video/reasoning/clevrer_mc/train.json",
        f"{data_root}/videos/CLEVRER",
        "video",
    ],
    # --- visual question answering ---
    "vqa_ego_qa": [
        f"{anno_root_it}/video/vqa/ego_qa/train.json",
        f"{data_root}/videos/ego4d_data/split_videos",
        "video",
    ],
    "vqa_tgif_frame_qa": [
        f"{anno_root_it}/video/vqa/tgif_frame_qa/train.json",
        f"{data_root}/videos/tgif",
        "video",
    ],
    "vqa_tgif_transition_qa": [
        f"{anno_root_it}/video/vqa/tgif_transition_qa/train.json",
        f"{data_root}/videos/tgif",
        "video",
    ],
    "vqa_webvid_qa": [
        f"{anno_root_it}/video/vqa/webvid_qa/train.json",
        f"{data_root}/videos/webvid_10m",
        "video",
    ],
    # --- original Video-ChatGPT annotations ---
    "origin_videochatgpt": [
        f"{anno_root_it}/video/origin_videochatgpt/train.json",
        f"{data_root}/videos/Video_ChatGPT",
        "video",
    ],
}
|
|
|
|
|
|
|
|
|
|
|
# Names of the corpora that make up the full instruction-tuning mix, in the
# order they are placed in the resulting list.
__full_corpus_names = (
    # Not registered by the `available_corpus` definition visible in this file.
    "caption_coco",
    "caption_llava",
    "caption_minigpt4",
    "caption_paragraph_captioning",
    "caption_textcaps",
    "classification_imagenet",
    "classification_coco_itm",
    "conversation_llava",
    "reasoning_clevr",
    "reasoning_visual_mrc",
    "reasoning_llava",
    "vqa_vqav2",
    "vqa_gqa",
    "vqa_okvqa",
    "vqa_a_okvqa",
    "vqa_viquae",
    "vqa_ocr_vqa",
    "vqa_text_vqa",
    "vqa_st_vqa",
    "vqa_docvqa",
    # Video corpora.
    "caption_textvr",
    "caption_youcook2",
    "classification_k710",
    "classification_ssv2",
    "conversation_videochat2",
    "conversation_videochatgpt",
    "reasoning_next_qa",
    "reasoning_clevrer_qa",
    "reasoning_clevrer_mc",
    "vqa_ego_qa",
    "vqa_tgif_frame_qa",
    "vqa_tgif_transition_qa",
    "conversation_videochat1",
    "vqa_webvid_qa",
    "caption_videochat",
    "caption_webvid",
)

# Indexing an unregistered corpus directly raised KeyError while the module
# was being imported, which made every corpus in this file unusable.  Skip
# unregistered names instead, and warn so the reduced mix is not silent.
__full_corpus_missing = [
    name for name in __full_corpus_names if name not in available_corpus
]
if __full_corpus_missing:
    __warnings.warn(
        "videochat2_instruction_full: skipping corpora that are not "
        f"registered in available_corpus: {__full_corpus_missing}"
    )

# Entries are the very same list objects stored under their own keys
# (no copies), exactly as with direct indexing.
available_corpus["videochat2_instruction_full"] = [
    available_corpus[name]
    for name in __full_corpus_names
    if name in available_corpus
]

# Drop the temporaries so the module namespace gains no extra names.
del __full_corpus_names, __full_corpus_missing
|
|
|
|
|
# Video-only instruction-tuning mix.  Each element is the same list object
# that is stored under its own key in `available_corpus` (no copies).
available_corpus["videochat2_video"] = [
    available_corpus[name]
    for name in (
        "caption_textvr",
        "caption_youcook2",
        "classification_k710",
        "classification_ssv2",
        "conversation_videochat2",
        "conversation_videochatgpt",
        "reasoning_next_qa",
        "reasoning_clevrer_qa",
        "reasoning_clevrer_mc",
        "vqa_ego_qa",
        "vqa_tgif_frame_qa",
        "vqa_tgif_transition_qa",
        "conversation_videochat1",
        "vqa_webvid_qa",
        "caption_videochat",
        "caption_webvid",
    )
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Minimal single-corpus mix; it reuses the `caption_textvr` entry object
# rather than copying it.
available_corpus["videochat2_instruction_debug"] = [
    available_corpus["caption_textvr"],
]
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Smoke check: print the combined length of a few corpus groups, then
    # the number of entries in the full instruction mix.
    __group_keys = (
        'conversation_data',
        'reasoning_data',
        'conversation_videochat2',
        'caption_data',
        'classification_data',
    )

    # Several of these keys are not registered by the `available_corpus`
    # definition visible in this file; indexing them directly raised
    # KeyError.  Count only the registered ones and report the rest.
    __present_keys = [key for key in __group_keys if key in available_corpus]
    __absent_keys = [key for key in __group_keys if key not in available_corpus]
    if __absent_keys:
        print(f"skipping keys missing from available_corpus: {__absent_keys}")

    # Summing the lengths equals the length of the chained values.
    # NOTE(review): `conversation_videochat2` is a single
    # [annotation, root, tag] entry, so it contributes its 3 fields to the
    # total -- confirm that is what this count is meant to measure.
    print(sum(len(available_corpus[key]) for key in __present_keys))

    print(len(available_corpus['videochat2_instruction_full']))