import json
import logging
from collections import defaultdict

import click
import tqdm

from utils.git_utils.tsv_io import TSVFile

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def build_mapping_for_json(json_data):
    """Index JSON samples by image id, then by region id.

    Args:
        json_data: Iterable of dict samples. Each sample must carry a
            ``"metadata"`` dict holding non-None ``"metadata_image_id"`` and
            ``"metadata_region_id"`` entries.

    Returns:
        A ``defaultdict(dict)`` such that ``mapping[image_id][region_id]`` is
        the sample itself (not a copy). If two samples share the same
        (image id, region id) pair, the later one replaces the earlier one.

    Raises:
        ValueError: If a sample lacks the metadata dict, the image id or the
            region id.
    """
    mapping = defaultdict(dict)
    for data in tqdm.tqdm(json_data):
        metadata = data.get("metadata")
        if metadata is None:
            raise ValueError(
                "Metadata is not found in JSON data, we need it to build image-region idx to image mapping"
            )

        image_id = metadata.get("metadata_image_id")
        if image_id is None:
            raise ValueError(
                "Image ID is not found in JSON data, we need it to build image-region idx to image mapping"
            )

        region_id = metadata.get("metadata_region_id")
        if region_id is None:
            raise ValueError(
                "Region ID is not found in JSON data, we need it to build image-region idx to image mapping"
            )

        mapping[image_id][region_id] = data
    return mapping


@click.command()
@click.option("--tsv_path", "-t", required=True, help="Path to TSV file")
@click.option("--json_path", "-j", required=True, help="Path to JSON file")
@click.option("--output_path", "-o", required=True, help="Path to output JSON file")
def main(tsv_path: str, json_path: str, output_path: str) -> None:
    """Attach the base64 media from an image TSV to the matching JSON samples.

    Each TSV row is unpacked as ``(identifier, media_b64)`` where the
    identifier has the form ``<image_id>-<region_cnt>-<region_id>`` (three
    integers joined by ``-``). The JSON sample registered under
    ``(image_id, region_id)`` gets a ``"media_b64"`` key set to the row's
    payload and is written to the output file. Samples are emitted in TSV
    order as one JSON array, one sample per line.

    Args:
        tsv_path: Path to the image TSV file, opened with ``TSVFile``.
        json_path: Path to the JSON file holding the list of samples.
        output_path: Path of the JSON file to write.

    Raises:
        ValueError: If a JSON sample lacks the required metadata, or a TSV
            identifier is not three ``-``-separated integers.
        KeyError: If a TSV row has no matching JSON sample.
    """
    tsv_data = TSVFile(tsv_path)
    with open(json_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    mapping = build_mapping_for_json(json_data)

    tsv_data_len = len(tsv_data)
    json_data_len = len(json_data)
    if tsv_data_len != json_data_len:
        logger.warning(
            "Lengths of img TSV and JSON data are not equal: %s != %s",
            tsv_data_len,
            json_data_len,
        )

    with open(output_path, "w", encoding="utf-8") as f:
        # The opening bracket is written unconditionally so that an empty TSV
        # still yields a valid JSON document ("[]") instead of a lone "]".
        f.write("[")
        for index, (tsv_ident, media_b64) in enumerate(tqdm.tqdm(tsv_data)):
            image_id, _region_cnt, region_id = (
                int(part) for part in tsv_ident.split("-")
            )

            # NOTE(review): TSV ids are parsed as ints, so this lookup only
            # succeeds if the JSON metadata ids are ints too - confirm.
            # ``.get`` avoids the defaultdict inserting an empty entry for an
            # unknown image id.
            regions = mapping.get(image_id)
            if regions is None or region_id not in regions:
                raise KeyError(
                    f"No JSON sample found for TSV identifier {tsv_ident!r} "
                    f"(image_id={image_id}, region_id={region_id})"
                )

            json_sample = regions[region_id]
            json_sample["media_b64"] = media_b64

            # Every sample after the first starts on its own line, preceded
            # by the separating comma.
            if index:
                f.write("\n,")
            f.write(json.dumps(json_sample))
        f.write("]")


if __name__ == "__main__":
    main()