# deepspeed/scripts/tools/merge_img_tsv_into_json_for_vdtk.py
# xingzhikb's picture
# init
# 002bd9b
import click
from utils.git_utils.tsv_io import TSVFile
import json
import tqdm
from collections import defaultdict
import logging
# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def build_mapping_for_json(json_data):
    """Index JSON records by image id, then by region id.

    Args:
        json_data: Iterable of dict records. Each record must carry a
            "metadata" dict holding non-None "metadata_image_id" and
            "metadata_region_id" values.

    Returns:
        A ``defaultdict(dict)`` such that ``result[image_id][region_id]`` is
        the record with those ids. When several records share the same pair
        of ids, the last one seen replaces the earlier ones.

    Raises:
        ValueError: If a record has no metadata, no image id, or no region id
            (checked in that order).
    """
    by_image = defaultdict(dict)

    for record in tqdm.tqdm(json_data):
        meta = record.get("metadata")
        if meta is None:
            raise ValueError(
                "Metadata is not found in JSON data, we need it to build image-region idx to image mapping"
            )

        img_key = meta.get("metadata_image_id")
        if img_key is None:
            raise ValueError(
                "Image ID is not found in JSON data, we need it to build image-region idx to image mapping"
            )

        reg_key = meta.get("metadata_region_id")
        if reg_key is None:
            raise ValueError(
                "Region ID is not found in JSON data, we need it to build image-region idx to image mapping"
            )

        # Two-level index: image id -> region id -> full record.
        by_image[img_key][reg_key] = record

    return by_image
@click.command()
@click.option("--tsv_path", "-t", required=True, help="Path to TSV file")
@click.option("--json_path", "-j", required=True, help="Path to JSON file")
@click.option("--output_path", "-o", required=True, help="Path to output JSON file")
def main(tsv_path, json_path, output_path):
    """Build a JSON file with images, driven by the image TSV file.

    Each TSV row is unpacked as ``(identifier, media_b64)``. The identifier is
    three integers joined by "-": ``<image_id>-<region_cnt>-<region_id>``.
    The JSON record with the same image id and region id gets the second
    column stored under its "media_b64" key and is written to the output.
    Output order follows the TSV rows.

    Args:
        tsv_path: Path to the image TSV file.
        json_path: Path to the input JSON file (a list of records).
        output_path: Path of the JSON array file to write.

    Raises:
        ValueError: If a JSON record lacks the metadata needed for indexing,
            or a TSV identifier does not split into three integers.
        KeyError: If a TSV row has no matching JSON record.
    """
    tsv_data = TSVFile(tsv_path)
    with open(json_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    tsv_data_len = len(tsv_data)
    json_data_len = len(json_data)
    mapping = build_mapping_for_json(json_data)
    if tsv_data_len != json_data_len:
        logger.warning(
            "Lengths of img TSV and JSON data are not equal: %s != %s",
            tsv_data_len,
            json_data_len,
        )

    with open(output_path, "w", encoding="utf-8") as f:
        # Open the array unconditionally so an empty TSV still yields valid
        # JSON ("[]") instead of a lone "]".
        f.write("[")
        for index, (tsv_ident, media_b64) in enumerate(tqdm.tqdm(tsv_data)):
            # The middle field (region count) is not needed for the lookup.
            image_id, _region_cnt, region_id = map(int, tsv_ident.split("-"))

            # Use .get so a miss neither inserts an empty entry into the
            # defaultdict nor surfaces as an uninformative bare KeyError.
            json_sample = mapping.get(image_id, {}).get(region_id)
            if json_sample is None:
                raise KeyError(
                    f"No JSON entry for TSV row {tsv_ident!r} "
                    f"(image_id={image_id}, region_id={region_id})"
                )

            json_sample["media_b64"] = media_b64

            # Records are streamed one at a time; every record after the
            # first is preceded by a newline and a comma.
            if index:
                f.write("\n,")
            f.write(json.dumps(json_sample))
        f.write("]")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()