| import xml.etree.ElementTree as ET |
| import jsonlines |
| import random |
| from tqdm import tqdm |
| import argparse |
| import os |
| import glob |
|
|
def get_sentence_data(fn):
    """
    Parse a sentence annotation file from the Flickr30K Entities dataset.

    input:
      fn - full file path to the sentence file to parse

    output:
      a list of dictionaries, one per sentence, with the fields:
        sentence - the sentence text with the bracket markup stripped
        phrases - a list of dictionaries, one per annotated phrase:
          phrase - the text of the annotated phrase
          first_word_index - position of the phrase's first word within
                             the cleaned sentence
          phrase_id - identifier for this phrase
          phrase_type - list of coarse categories this phrase belongs to
    """
    with open(fn, 'r') as handle:
        raw_lines = handle.read().split('\n')

    parsed = []
    for line in raw_lines:
        if not line:
            continue

        words = []
        # One (first_word_index, phrase_id, phrase_type) tuple per '[' opener,
        # in the order the openers appear; phrase_texts is filled in the same
        # order when the matching ']' closes the phrase.
        phrase_records = []
        phrase_texts = []
        buffer = []
        in_phrase = False
        for token in line.split():
            if in_phrase:
                # Inside a bracketed phrase: accumulate words until ']'.
                if token.endswith(']'):
                    in_phrase = False
                    token = token[:-1]
                    buffer.append(token)
                    phrase_texts.append(' '.join(buffer))
                    buffer = []
                else:
                    buffer.append(token)
                words.append(token)
            elif token.startswith('['):
                # Opener token carries the metadata: [/EN#<id>/<type0>/<type1>...
                # The phrase words themselves follow in later tokens.
                in_phrase = True
                pieces = token.split('/')
                phrase_records.append((len(words), pieces[1][3:], pieces[2:]))
            else:
                words.append(token)

        entry = {'sentence': ' '.join(words), 'phrases': []}
        for (first_idx, pid, ptype), text in zip(phrase_records, phrase_texts):
            entry['phrases'].append({'first_word_index': first_idx,
                                     'phrase': text,
                                     'phrase_id': pid,
                                     'phrase_type': ptype})
        parsed.append(entry)

    return parsed
|
|
def get_annotations(fn):
    """
    Parse an annotation xml file from the Flickr30K Entities dataset.

    input:
      fn - full file path to the annotations file to parse

    output:
      dictionary with the following fields:
        filename - the image file name from the xml
        width/height/depth - integer image size fields copied from <size>
        scene - list of identifiers annotated as pertaining to the
                whole scene
        nobox - list of identifiers annotated as not visible in the image
        boxes - dictionary mapping identifiers to their list of boxes in
                [xmin ymin xmax ymax] format (0-based, hence the -1 shift)
    """
    root = ET.parse(fn).getroot()
    info = {
        'filename': root.findall('filename')[0].text,
        'boxes': {},
        'scene': [],
        'nobox': [],
    }
    # Copy every <size> child (width/height/depth) as an int field.
    for dim in root.findall('size')[0]:
        info[dim.tag] = int(dim.text)

    for obj in root.findall('object'):
        for name_node in obj.findall('name'):
            box_id = name_node.text
            bnd = obj.findall('bndbox')
            if bnd:
                # Convert the 1-based xml coordinates to 0-based.
                coords = [int(bnd[0].findall(tag)[0].text) - 1
                          for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
                info['boxes'].setdefault(box_id, []).append(coords)
            else:
                # No box: record whether the entity is absent and/or scene-level.
                if int(obj.findall('nobndbox')[0].text) > 0:
                    info['nobox'].append(box_id)
                if int(obj.findall('scene')[0].text) > 0:
                    info['scene'].append(box_id)

    return info
|
|
def gen_record(sd, an):
    """
    Build one ODVG grounding record from a parsed sentence and its image
    annotations.

    input:
      sd - one sentence dict produced by get_sentence_data
      an - the annotation dict produced by get_annotations for the same image

    output:
      a dict with filename/height/width and a 'grounding' entry pairing the
      caption with one region per (phrase, box) match, or None when no phrase
      in the sentence has any box.
    """
    # One region per box of every phrase whose id has annotated boxes.
    grounded = [
        {"phrase": phrase["phrase"], "bbox": bbox}
        for phrase in sd["phrases"]
        if phrase["phrase_id"] in an["boxes"]
        for bbox in an["boxes"][phrase["phrase_id"]]
    ]
    if not grounded:
        print("no phrase regions")
        return None
    return {
        "filename": an["filename"],
        "height": an["height"],
        "width": an["width"],
        "grounding": {
            "caption": sd["sentence"],
            "regions": grounded,
        },
    }
|
|
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="flickr30k entities to ODVG List.")
    parser.add_argument("--root", type=str, default="", help="Source anno root")
    parser.add_argument("--output_file", type=str, default="flickr30k_entities_odvg.jsonl")
    parser.add_argument("--osoi", action="store_true", default=False)
    args = parser.parse_args()
    print(args)

    # Sentence and annotation files share basenames, so sorting both
    # directory listings keeps them paired index-for-index.
    sentence_files = sorted(glob.glob(os.path.join(args.root, "Sentences", "*")))
    annotation_files = sorted(glob.glob(os.path.join(args.root, "Annotations", "*")))

    odvg_anno = []
    for idx in tqdm(range(len(annotation_files))):
        sds = get_sentence_data(sentence_files[idx])
        an = get_annotations(annotation_files[idx])
        if args.osoi:
            # One-sentence-one-image mode: keep a single random caption.
            chosen = [sds[random.randint(0, len(sds) - 1)]]
        else:
            chosen = sds
        for sd in chosen:
            record = gen_record(sd, an)
            if record:
                odvg_anno.append(record)

    with jsonlines.open(args.output_file, mode="w") as fwriter:
        fwriter.write_all(odvg_anno)