YuqianFu
/

ObjectRelator-plus

Model card Files Files and versions

Metrics Training metrics Community

ObjectRelator-plus / datasets /build_text.py

YuqianFu's picture

Upload folder using huggingface_hub

36c1e62 verified about 2 months ago

history blame contribute delete

1.43 kB

	import json
	from pycocotools.coco import COCO
	from tqdm import tqdm
	import string
	import re
	import argparse

	'''
	Process the text generated by LLaVA: extract the object name from each annotation and store it as a tokenized instruction entry (tokens, raw text, sentence id, cleaned sentence).
	'''

	# Command-line interface: two mandatory string options, one for the input
	# JSON file and one for the output JSON file.
	parser = argparse.ArgumentParser()
	for option, description in (
	    ("--text_path", "Path to the input JSON file containing text data."),
	    ("--save_path", "Path to save the processed JSON file."),
	):
	    parser.add_argument(option, type=str, required=True, help=description)
	args = parser.parse_args()

	def extract_object_name(text):
	    """Return the phrase that follows the first standalone word "is".

	    The text is split on the word "is" (case-sensitive, matched on word
	    boundaries so that "is" inside words such as "This", "dish" or
	    "scissors" is ignored).  The segment after the first "is" -- up to a
	    second standalone "is", if there is one -- is returned with surrounding
	    whitespace removed.

	    Args:
	        text: Sentence to extract the phrase from.

	    Returns:
	        The stripped phrase, or ``None`` when ``text`` contains no
	        standalone "is".
	    """
	    # A plain ``text.split("is")`` would also cut inside words containing
	    # the letters "is"; the word-boundary pattern avoids that.
	    parts = re.split(r"\bis\b", text)
	    if len(parts) > 1:
	        return parts[1].strip()
	    return None

	# Load the input JSON.  The loop below treats it as an iterable of dicts,
	# each with a "first_frame_anns" list whose items carry a "text" field.
	with open(args.text_path, "r", encoding="utf-8") as fp:
	    datas = json.load(fp)

	# Translation table that deletes every ASCII punctuation character; built
	# once because it does not depend on the annotation being processed.
	punctuation_table = str.maketrans('', '', string.punctuation)

	new_data = []
	sent_id = 0  # running sentence id, unique across all entries in the file
	for data in datas:
	    instruct_list = []
	    for anno in data["first_frame_anns"]:
	        text = anno["text"]
	        raw = extract_object_name(text)
	        if raw is None:
	            # No object name could be extracted.  Fall back to the whole
	            # annotation text instead of crashing on ``None.lower()``, so
	            # "instruction" keeps one entry per annotation.
	            raw = text.strip()
	        # NOTE(review): "green" is presumably the highlight colour used in
	        # the LLaVA prompt -- confirm.  Every occurrence of the substring is
	        # removed, including inside longer words.
	        result = raw.lower().replace("green", "").strip()
	        sent = result.translate(punctuation_table)
	        tokens = sent.split()
	        sample = {
	            "tokens": tokens,
	            "raw": raw,
	            "sent_id": sent_id,
	            "sent": sent,
	        }
	        sent_id += 1
	        instruct_list.append(sample)
	        # The original text is dropped from the annotation; its processed
	        # form is kept in the entry's "instruction" list.
	        del anno["text"]
	    data["instruction"] = instruct_list
	    new_data.append(data)

	# Write the processed entries to the output path.
	with open(args.save_path, "w", encoding="utf-8") as fp:
	    json.dump(new_data, fp)