"""Attach a normalised ``instruction`` list to every record of a JSON file.

The script loads the JSON list at ``text_pth``, converts the ``text`` of each
entry in a record's ``first_frame_anns`` into an instruction dict, stores the
resulting list under the record's ``instruction`` key and writes all records
to ``save_path``.
"""

import json
from pycocotools.coco import COCO
from tqdm import tqdm
import string
import re

# Built once at import time instead of once per annotation.
_STANDALONE_IS = re.compile(r"\bis\b")
_TRAILING_INDEX = re.compile(r"_\d+$")
_STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)


def extract_object_name(text):
    """Return the phrase that follows the first standalone word "is".

    The text is split on "is" as a whole word, so an "is" embedded in another
    word (for example in "this") no longer counts as a separator.  The piece
    right after the first separator is returned with surrounding whitespace
    removed; if the word occurs again, the returned piece stops before it.

    Args:
        text: Sentence to search.

    Returns:
        The stripped phrase after the first standalone "is", or ``None`` when
        the text contains no standalone "is".
    """
    parts = _STANDALONE_IS.split(text)
    if len(parts) > 1:
        return parts[1].strip()
    return None


def _build_instruction(text, sent_id):
    """Turn one annotation text into an instruction record.

    Args:
        text: Raw annotation text.
        sent_id: Running sentence id to store in the record.

    Returns:
        A dict with the whitespace-split tokens, the untouched raw text, the
        sentence id and the normalised sentence.
    """
    lowered = text.lower()
    # Drop a trailing numeric index from an object name: "ball_0" -> "ball".
    without_index = _TRAILING_INDEX.sub("", lowered)
    # Remove every punctuation character.
    # NOTE(review): "_" is part of string.punctuation, so an inner underscore
    # is removed without leaving a space ("red_ball" -> "redball") -- confirm
    # that this is the intended result.
    sent = without_index.translate(_STRIP_PUNCTUATION)
    return {
        "tokens": sent.split(),
        "raw": text,
        "sent_id": sent_id,
        "sent": sent,
    }


text_pth = "/home/yuqian_fu/Projects/DAVIS_test_gap20.json"
save_path = "/home/yuqian_fu/Projects/DAVIS_test_gap20_instruction.json"

new_data = []
sent_id = 0

# JSON text is read as UTF-8 explicitly rather than with the platform default.
with open(text_pth, "r", encoding="utf-8") as fp:
    datas = json.load(fp)

# Per the original author's note, each entry of ``datas`` is one image frame.
for data in datas:
    instruct_list = []
    for anno in data["first_frame_anns"]:
        # The annotation text is used as-is; extract_object_name is not
        # applied here.
        instruct_list.append(_build_instruction(anno["text"], sent_id))
        # Sentence ids keep counting across records rather than restarting.
        sent_id += 1
    data["instruction"] = instruct_list
    new_data.append(data)

print(sent_id)
print("len of new_data: ", len(new_data))

with open(save_path, "w", encoding="utf-8") as fp:
    json.dump(new_data, fp)