import json
from pycocotools.coco import COCO
from tqdm import tqdm
import string
import re
import argparse

'''
Process the text generated by LLaVA to extract object names and convert them into token format
'''

# Command-line interface: both the input and the output JSON paths are
# mandatory, so the script exits with a usage message if either is missing.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--text_path",
    required=True,
    type=str,
    help="Path to the input JSON file containing text data.",
)
parser.add_argument(
    "--save_path",
    required=True,
    type=str,
    help="Path to save the processed JSON file.",
)
args = parser.parse_args()

def extract_object_name(text):
    """Return the phrase that follows the first standalone word "is" in *text*.

    The text is split on the whole word "is" (case-sensitive). Matching on
    word boundaries keeps words that merely contain those letters, such as
    "This" or "dish", from being treated as the separator.

    Args:
        text: Sentence to search, e.g. "The object is a dog."

    Returns:
        The stripped text after the first standalone "is" (ending at the next
        standalone "is", if there is one), or None when *text* contains no
        standalone "is".
    """
    # \b anchors make "is" match only as a complete word.
    parts = re.split(r"\bis\b", text)
    if len(parts) > 1:
        return parts[1].strip()
    return None

# Load the input records. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's locale default encoding.
with open(args.text_path, "r", encoding="utf-8") as fp:
    datas = json.load(fp)


# Translation table that deletes every ASCII punctuation character. It does
# not depend on the annotation being processed, so it is built once up front.
punctuation_table = str.maketrans('', '', string.punctuation)

# For every record, turn the "text" of each first-frame annotation into an
# instruction entry (tokens / raw / sent_id / sent), attach the entries under
# "instruction", and drop the now-redundant "text" field from the annotation.
new_data = []
sent_id = 0  # running id, incremented once per annotation across all records
for data in datas:
    instruct_list = []
    for anno in data["first_frame_anns"]:
        text = anno["text"]
        raw = extract_object_name(text)
        if raw is None:
            # extract_object_name returns None when it finds nothing to split
            # on; calling .lower() on that would raise AttributeError. Fall
            # back to the whole caption so that every annotation still yields
            # exactly one instruction entry.
            raw = text.strip()
        raw_lower = raw.lower()
        # Remove the word "green" from the description before tokenizing.
        result = raw_lower.replace("green", "").strip()
        sent = result.translate(punctuation_table)
        tokens = sent.split()
        sample = {
            "tokens": tokens,
            "raw": raw,
            "sent_id": sent_id,
            "sent": sent
        }
        sent_id += 1
        instruct_list.append(sample)
        del anno["text"]
    data["instruction"] = instruct_list
    new_data.append(data)

# Serialize the augmented records to the requested output path.
with open(args.save_path, "w") as out_file:
    json.dump(new_data, out_file)