| import argparse | |
| import os | |
| from collections import defaultdict | |
| from typing import Dict, Any | |
| from concrete.util import CommunicationWriterTGZ | |
| from nltk.corpus import framenet, framenet15 | |
| from tqdm import tqdm | |
| from sftp.data_reader.concrete_srl import concrete_doc | |
| from tools.framenet.fn_util import framenet_split, Sentence as TokSentence | |
def process_sentence(sent) -> Dict[str, Any]:
    """Convert one FrameNet sentence record into a plain-dict SRL structure.

    Args:
        sent: an nltk FrameNet sentence object (AttrDict-like). Must expose
            ``.text``; frame annotations come either from ``sent.annotationSet``
            (full-text documents) or from the sentence itself (exemplars, which
            carry ``Target``/``FE`` directly).

    Returns:
        Dict with keys:
            'sentence': the raw sentence text,
            'tokenization': list of (char_start, char_end) per token (end inclusive),
            'annotations': list of event dicts with 'label' (frame name),
                'span' (token-index span of the target, inclusive) and
                'children' (frame-element arg dicts with 'span' and 'label').
    """
    ret = {'sentence': sent.text, 'tokenization': list(), 'annotations': list()}
    tok_sent = TokSentence(sent.text)
    for token in tok_sent.tokens:
        # token.idx_end is exclusive; stored spans are inclusive.
        ret['tokenization'].append((token.idx, token.idx_end - 1))

    def process_one_ann_set(ann_set):
        # Translate one frame annotation set (char offsets) into token indices.
        arg_list = list()
        event = {'label': ann_set.frame.name, 'children': arg_list}
        ret['annotations'].append(event)
        target_list = list()
        for tar_start, tar_end in ann_set.Target:
            # Hoisted: compute the token span once per target fragment
            # (was called twice for start and end).
            span_start, span_end = tok_sent.span(tar_start, tar_end)
            target_list.extend(range(span_start, span_end + 1))
        target_list.sort()
        # NOTE(review): assumes Target is non-empty — guaranteed by the
        # 'Target' membership checks at the call sites below.
        event['span'] = (target_list[0], target_list[-1])
        # FE[0] holds the span-bearing frame elements; FE[1] (non-span FEs)
        # is deliberately ignored.
        for fe_start, fe_end, fe_name in ann_set.FE[0]:
            fe_start, fe_end = tok_sent.span(fe_start, fe_end)
            arg_list.append({
                'span': (fe_start, fe_end),
                'label': fe_name
            })

    if 'annotationSet' in sent:
        # Full-text style: one sentence, many annotation sets.
        for ann_item in sent.annotationSet:
            if 'Target' not in ann_item:
                continue
            process_one_ann_set(ann_item)
    if 'Target' in sent:
        # Exemplar style: the sentence itself is the annotation set.
        process_one_ann_set(sent)
    return ret
def process_doc(docs, dst_path: str):
    """Serialize FrameNet full-text documents to a Concrete .tar.gz archive.

    Args:
        docs: iterable of nltk FrameNet document objects (each with
            ``.sentence`` and ``.filename``).
        dst_path: output path for the tar.gz archive.
    """
    writer = CommunicationWriterTGZ(dst_path)
    # try/finally ensures the archive is finalized even if a sentence
    # fails to process (the original leaked the writer on exception).
    try:
        for doc in tqdm(docs):
            sentences = [process_sentence(sent) for sent in doc.sentence]
            comm = concrete_doc(sentences, doc.filename)
            writer.write(comm, comm.id + '.concrete')
    finally:
        writer.close()
def process_exemplar(dst_path, fn):
    """Export all FrameNet exemplar sentences as one Concrete document.

    Args:
        dst_path: output path for the tar.gz archive.
        fn: an nltk FrameNet corpus reader (framenet or framenet15).
    """
    bar = tqdm()
    raw_annotations = list()
    print('Loading exemplars...')
    try:
        for ann_sent in fn.annotations(full_text=False):
            # Skip annotation records without a frame target.
            if 'Target' not in ann_sent:
                continue
            bar.update()
            raw_annotations.append(ann_sent)
    except RuntimeError:
        # NOTE(review): the nltk exemplar iterator can raise partway
        # through; this is a deliberate best-effort — keep what we got.
        pass
    finally:
        bar.close()
    # Dead `char_idx_offset` accumulator removed: it was computed but never read.
    sentences = [process_sentence(sent) for sent in raw_annotations]
    comm = concrete_doc(sentences, 'exemplar')
    writer = CommunicationWriterTGZ(dst_path)
    try:
        writer.write(comm, 'exemplar.concrete')
    finally:
        # Original never closed the writer, so the tar.gz could be left
        # unfinalized; close it explicitly.
        writer.close()
def run():
    """CLI entry point: split FrameNet full-text docs into train/dev/test
    Concrete archives and export exemplars, under the destination folder."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'dst', metavar='DESTINATION', type=str,
        help='Destination folder path.'
    )
    parser.add_argument(
        '-v', metavar='VERSION', default='1.7', type=str, choices=['1.5', '1.7'],
        help='Version of FrameNet. Either 1.5 or 1.7.'
    )
    args = parser.parse_args()
    # choices= guarantees args.v is one of the two versions.
    fn = framenet15 if args.v == '1.5' else framenet
    os.makedirs(args.dst, exist_ok=True)
    # Group full-text docs by split; anything not in dev/test goes to train.
    doc_group = defaultdict(list)
    for doc in fn.docs():
        if doc.filename in framenet_split['dev']:
            split_name = 'dev'
        elif doc.filename in framenet_split['test']:
            split_name = 'test'
        else:
            split_name = 'train'
        doc_group[split_name].append(doc)
    for split_name in framenet_split:
        print(f'Loaded {len(doc_group[split_name])} docs for {split_name}.')
    for split_name in framenet_split:
        process_doc(doc_group[split_name], dst_path=os.path.join(args.dst, f'{split_name}.tar.gz'))
    process_exemplar(os.path.join(args.dst, 'exemplar.tar.gz'), fn)
# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == '__main__':
    run()