"""Process the text generated by LLaVA to extract object names and convert them into token format.

The input JSON (``--text_path``) is a list of records. Each record has a
``"first_frame_anns"`` list whose items carry a ``"text"`` string. For every
annotation the script extracts the phrase following the word "is", normalizes
it, and appends a ``{"tokens", "raw", "sent_id", "sent"}`` entry to the
record's new ``"instruction"`` list. The ``"text"`` key is removed from each
annotation, and the updated records are written to ``--save_path``.
"""
import argparse
import json
import re
import string

from pycocotools.coco import COCO
from tqdm import tqdm

# Matches "is" only as a standalone word, so the "is" inside words such as
# "This" or "dish" is never treated as the separator.
_IS_WORD = re.compile(r"\bis\b")

# Deletes every ASCII punctuation character; built once instead of once per
# annotation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

parser = argparse.ArgumentParser()
parser.add_argument("--text_path", type=str, required=True, help="Path to the input JSON file containing text data.")
parser.add_argument("--save_path", type=str, required=True, help="Path to save the processed JSON file.")
args = parser.parse_args()


def extract_object_name(text):
    """Return the phrase that follows the first standalone word "is" in *text*.

    Args:
        text: A sentence, e.g. ``"The object is a dish."``.

    Returns:
        Everything after the first standalone "is", stripped of surrounding
        whitespace, or ``None`` when *text* contains no standalone "is".
    """
    # maxsplit=1 keeps the whole remainder even if it contains another "is".
    parts = _IS_WORD.split(text, maxsplit=1)
    if len(parts) > 1:
        return parts[1].strip()
    return None


with open(args.text_path, "r", encoding="utf-8") as fp:
    datas = json.load(fp)

new_data = []
sent_id = 0  # running id, unique across all records and annotations

for data in datas:
    instruct_list = []
    for anno in data["first_frame_anns"]:
        text = anno["text"]
        raw = extract_object_name(text)
        if raw is None:
            # No "is" in the text: fall back to the whole text so each
            # annotation still gets exactly one instruction entry instead of
            # crashing on None.
            raw = text.strip()

        # NOTE(review): "green" is removed as a plain substring — presumably it
        # refers to the marker colour used in the LLaVA prompt; confirm.
        result = raw.lower().replace("green", "").strip()
        sent = result.translate(_PUNCTUATION_TABLE)

        instruct_list.append({
            "tokens": sent.split(),
            "raw": raw,
            "sent_id": sent_id,
            "sent": sent,
        })
        sent_id += 1

        # The raw text is replaced by the structured instruction entry.
        del anno["text"]

    data["instruction"] = instruct_list
    new_data.append(data)

with open(args.save_path, "w", encoding="utf-8") as fp:
    json.dump(new_data, fp)