ObjectRelator-plus / datasets /build_text.py
YuqianFu's picture
Upload folder using huggingface_hub
36c1e62 verified
import json
from pycocotools.coco import COCO
from tqdm import tqdm
import string
import re
import argparse
'''
Process the text generated by LLaVA to extract object names and convert them into token format
'''
# Command-line interface: both options are mandatory string paths.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--text_path",
    required=True,
    type=str,
    help="Path to the input JSON file containing text data.",
)
parser.add_argument(
    "--save_path",
    required=True,
    type=str,
    help="Path to save the processed JSON file.",
)
# Parsed once at module level; the rest of the script reads `args.text_path`
# and `args.save_path`.
args = parser.parse_args()
def extract_object_name(text):
    """Return the phrase that follows the first standalone word "is" in *text*.

    Example: ``"The object in green is a dish."`` -> ``"a dish."``.

    Args:
        text: Sentence to parse.

    Returns:
        The text after the first whole-word "is" with surrounding whitespace
        stripped, or ``None`` when *text* contains no standalone "is".
    """
    # Split on the whole word only: a plain ``text.split("is")`` also cuts
    # inside words such as "This" or "dish", which truncates the result.
    # ``maxsplit=1`` keeps any later "is" as part of the returned phrase.
    parts = re.split(r"\bis\b", text, maxsplit=1)
    if len(parts) > 1:
        return parts[1].strip()
    return None
# Load the input JSON given by --text_path. JSON interchange text is UTF-8, so
# decode it explicitly instead of relying on the platform's locale encoding.
with open(args.text_path, "r", encoding="utf-8") as fp:
    datas = json.load(fp)
# For every record, build an "instruction" list with one entry per element of
# its "first_frame_anns", derived from that annotation's "text" field.
new_data = []
sent_id = 0  # running id; keeps increasing across records
# Translation table that deletes ASCII punctuation; built once and reused.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
for data in datas:
    instruct_list = []
    for anno in data["first_frame_anns"]:
        text = anno["text"]
        raw = extract_object_name(text)
        if raw is None:
            # extract_object_name() returns None when there is nothing to
            # split on. Fall back to the whole description so the script does
            # not crash on `None.lower()` and the instruction list keeps one
            # entry per annotation.
            raw = text.strip()
        raw_lower = raw.lower()
        # Removes every occurrence of the substring "green" (presumably the
        # highlight colour mentioned in the generated text -- confirm).
        result = raw_lower.replace("green", "").strip()
        sent = result.translate(_PUNCT_TABLE)
        tokens = sent.split()
        sample = {
            "tokens": tokens,
            "raw": raw,
            "sent_id": sent_id,
            "sent": sent,
        }
        sent_id += 1
        instruct_list.append(sample)
        # The free text is replaced by the structured "instruction" entries.
        del anno["text"]
    data["instruction"] = instruct_list
    new_data.append(data)
# Serialise the processed records to the path given by --save_path.
with open(args.save_path, mode="w") as out_file:
    json.dump(obj=new_data, fp=out_file)