| import json | |
| from pycocotools.coco import COCO | |
| from tqdm import tqdm | |
| import string | |
| import re | |
def extract_object_name(text):
    """Return the part of ``text`` that follows the first standalone word "is".

    The split happens only on "is" as a whole word, so an "is" embedded in
    another word (e.g. "This", "disk") is never treated as the separator,
    and only the first standalone "is" is used, so the remainder of the
    sentence is returned intact.

    Args:
        text: The sentence to inspect.

    Returns:
        The text after the first standalone "is" with surrounding whitespace
        stripped, or ``None`` when ``text`` contains no standalone "is".
    """
    # maxsplit=1 keeps everything after the first "is" in a single piece.
    parts = re.split(r"\bis\b", text, maxsplit=1)
    if len(parts) > 1:
        return parts[1].strip()
    return None
text_pth = "/home/yuqian_fu/Projects/DAVIS_test_gap20.json"
save_path = "/home/yuqian_fu/Projects/DAVIS_test_gap20_instruction.json"

# Built once instead of once per annotation.
# Strips a trailing numeric index from an object name: "ball_0" -> "ball".
_TRAILING_INDEX = re.compile(r'_\d+$')
# Deletes every character in string.punctuation. That set includes "_", so
# interior underscores are removed rather than replaced by spaces.
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def _build_instruction(text, sent_id):
    """Build one instruction entry from an annotation's text.

    The text is lower-cased, a trailing ``_<digits>`` index is removed, and
    all punctuation characters are deleted; the result is stored as ``sent``
    and its whitespace-split form as ``tokens``.

    Args:
        text: The annotation's original ``"text"`` value.
        sent_id: The id to record for this entry.

    Returns:
        A dict with the keys ``"tokens"``, ``"raw"``, ``"sent_id"`` and
        ``"sent"``; ``"raw"`` holds ``text`` unchanged.
    """
    lowered = text.lower()
    without_index = _TRAILING_INDEX.sub('', lowered)
    sent = without_index.translate(_STRIP_PUNCTUATION)
    return {
        "tokens": sent.split(),
        "raw": text,
        "sent_id": sent_id,
        "sent": sent,
    }


def _add_instructions(datas):
    """Attach an ``"instruction"`` list to every record in ``datas``.

    Each record is modified in place: one instruction entry is created per
    item of its ``"first_frame_anns"`` list, in order. ``sent_id`` values
    start at 0 and keep increasing across records.

    Args:
        datas: Iterable of dicts, each with a ``"first_frame_anns"`` list
            whose items are dicts carrying a ``"text"`` string. (The original
            author's note says each record is one image frame.)

    Returns:
        A ``(new_data, sent_count)`` tuple: the list of updated records and
        the total number of instruction entries created.
    """
    new_data = []
    sent_id = 0
    for data in datas:
        instruct_list = []
        for anno in data["first_frame_anns"]:
            instruct_list.append(_build_instruction(anno["text"], sent_id))
            sent_id += 1
        data["instruction"] = instruct_list
        new_data.append(data)
    return new_data, sent_id


def main(input_path=text_pth, output_path=save_path):
    """Read the annotation JSON, add instructions, and write the result.

    Prints the total number of instruction entries and the number of records
    before writing the output file.

    Args:
        input_path: JSON file to read; defaults to ``text_pth``.
        output_path: JSON file to write; defaults to ``save_path``.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(input_path, "r", encoding="utf-8") as fp:
        datas = json.load(fp)

    new_data, sent_count = _add_instructions(datas)

    print(sent_count)
    print("len of new_data: ", len(new_data))
    with open(output_path, "w", encoding="utf-8") as fp:
        json.dump(new_data, fp)


if __name__ == "__main__":
    main()