# DenseLabelDev/projects/internvl_matcher/dataset/process_functions/annotation_json_file_load.py
| import json | |
| import random | |
def RegionCaptionDataset_load_fn(data_path, repeat_time):
    """Load region-caption annotations from a JSON file and resample them.

    The JSON file must contain a list of dicts, each with the keys
    ``file_name``, ``description`` and ``annotation``.  Every entry gets an
    extra ``image`` key that mirrors ``file_name``.  Entries whose
    ``description`` and ``annotation`` lengths differ are reported on stdout
    and dropped.

    Args:
        data_path: Path of the JSON annotation file.
        repeat_time: Resampling factor applied to the valid entries.
            ``< 1`` keeps a random subset of ``int(len * repeat_time)``
            entries; ``> 1`` repeats the list ``int(repeat_time)`` times and
            appends a random subset sized by the fractional part; ``1``
            leaves the list unchanged.

    Returns:
        A tuple ``(ret, hf_ret)``.  ``ret`` is the resampled list of full
        annotation dicts; ``hf_ret`` is a parallel list of
        ``{"image", "description"}`` dicts.

    Raises:
        KeyError: If an entry lacks ``file_name``, ``description`` or
            ``annotation``.
        ValueError: If ``repeat_time`` is negative (raised by
            ``random.sample``).
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(data_path, 'r', encoding='utf-8') as f:
        json_file = json.load(f)

    ret = []
    for item in json_file:
        item['image'] = item['file_name']
        if len(item["description"]) != len(item["annotation"]):
            print("The number of description is not equal to seg !!!")
        else:
            ret.append(item)

    if repeat_time < 1:
        ret = random.sample(ret, int(len(ret) * repeat_time))
    elif repeat_time > 1:
        int_repeat_time = int(repeat_time)
        remaining_repeat_time = repeat_time - int_repeat_time
        # Draw the fractional part from the un-repeated list so each entry
        # can be picked at most once for the remainder.
        remaining_ret = []
        if remaining_repeat_time > 0:
            remaining_ret = random.sample(
                ret, int(len(ret) * remaining_repeat_time))
        # List multiplication repeats references to the same dict objects.
        ret = ret * int_repeat_time
        ret.extend(remaining_ret)

    hf_ret = [
        {"image": item["file_name"], "description": item["description"]}
        for item in ret
    ]
    return ret, hf_ret
def RegionConversationDataset_load_fn(data_path, repeat_time):
    """Load region-conversation annotations from a JSON file and resample them.

    The JSON file must contain a list of dicts.  Entries without a non-empty
    ``annotation`` value (missing key, ``null`` or empty container) are
    reported on stdout and dropped.  Every kept entry gets an extra ``image``
    key that mirrors ``file_name``.

    Args:
        data_path: Path of the JSON annotation file.
        repeat_time: Resampling factor applied to the valid entries.
            ``< 1`` keeps a random subset of ``int(len * repeat_time)``
            entries; ``> 1`` repeats the list ``int(repeat_time)`` times and
            appends a random subset sized by the fractional part; ``1``
            leaves the list unchanged.

    Returns:
        A tuple ``(ret, hf_ret)``.  ``ret`` is the resampled list of full
        annotation dicts; ``hf_ret`` is a parallel list of
        ``{'image', 'conversations', 'num_regions'}`` dicts, where
        ``num_regions`` is the length of the entry's ``annotation``.

    Raises:
        KeyError: If a kept entry lacks ``file_name`` or ``conversations``.
        ValueError: If ``repeat_time`` is negative (raised by
            ``random.sample``).
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(data_path, 'r', encoding='utf-8') as f:
        json_file = json.load(f)

    ret = []
    for dataset_info in json_file:
        # Truthiness covers a missing key, an empty list and a JSON null,
        # where calling len() on None would raise instead of filtering.
        if not dataset_info.get('annotation'):
            print("The annotation is not valid, filter out!!!")
            continue
        dataset_info['image'] = dataset_info['file_name']
        ret.append(dataset_info)

    if repeat_time < 1:
        ret = random.sample(ret, int(len(ret) * repeat_time))
    elif repeat_time > 1:
        int_repeat_time = int(repeat_time)
        remaining_repeat_time = repeat_time - int_repeat_time
        # Draw the fractional part from the un-repeated list so each entry
        # can be picked at most once for the remainder.
        remaining_ret = []
        if remaining_repeat_time > 0:
            remaining_ret = random.sample(
                ret, int(len(ret) * remaining_repeat_time))
        # List multiplication repeats references to the same dict objects.
        ret = ret * int_repeat_time
        ret.extend(remaining_ret)

    hf_ret = [
        {
            'image': dataset_info["file_name"],
            'conversations': dataset_info["conversations"],
            'num_regions': len(dataset_info['annotation']),
        }
        for dataset_info in ret
    ]
    return ret, hf_ret