| import numpy as np | |
| def read_file(filename): | |
| sentences = [] | |
| sentence = [] | |
| lengths = [] | |
| num_sentneces_to_remove = 0 | |
| num_sentences = 0 | |
| num_tokens_to_remove = 0 | |
| num_tokens = 0 | |
| with open(filename, 'r') as file: | |
| for line in file: | |
| line = line.strip() | |
| if len(line) == 0: | |
| xx_count = 0 | |
| for row in sentence: | |
| if row[2] == 'XX': | |
| xx_count += 1 | |
| if xx_count / len(sentence) >= 0.5: | |
| num_sentneces_to_remove += 1 | |
| num_tokens_to_remove += len(sentence) | |
| else: | |
| sentences.append(sentence) | |
| lengths.append(len(sentence)) | |
| num_sentences += 1 | |
| num_tokens += len(sentence) | |
| sentence = [] | |
| continue | |
| tokens = line.split('\t') | |
| idx = tokens[0] | |
| word = tokens[1] | |
| pos = tokens[2] | |
| ner = tokens[3] | |
| arc = tokens[4] | |
| arc_tag = tokens[5] | |
| sentence.append((idx, word, pos, ner, arc, arc_tag)) | |
| print("removed %d sentences out of %d sentences" % (num_sentneces_to_remove, num_sentences)) | |
| print("removed %d tokens out of %d tokens" % (num_tokens_to_remove, num_tokens)) | |
| return sentences | |
| def write_file(filename, sentences): | |
| with open(filename, 'w') as file: | |
| for sentence in sentences: | |
| for row in sentence: | |
| file.write('\t'.join([token for token in row]) + '\n') | |
| file.write('\n') | |
| dataset_dict = {'ontonotes': 'onto'} | |
| datasets = ['ontonotes'] | |
| splits = ['test'] | |
| domains = ['all', 'wb'] | |
| for dataset in datasets: | |
| for domain in domains: | |
| for split in splits: | |
| print('dataset: %s, domain: %s, split: %s' % (dataset, domain, split)) | |
| filemame = 'data/'+ dataset_dict[dataset] + '_pos_ner_dp_' + split + '_' + domain | |
| sentences = read_file(filemame) | |
| write_filename = filemame + '_without_xx' | |
| write_file(write_filename, sentences) | |