import numpy as np


def read_file(filename):
    """Read a tab-separated, blank-line-delimited file and drop 'XX'-heavy sentences.

    Each non-blank line is split on tabs and its first six fields are kept as
    a tuple ``(idx, word, pos, ner, arc, arc_tag)``. Blank lines separate
    sentences. A sentence is discarded when at least half of its rows carry
    the value ``'XX'`` in the third field (``pos``).

    Two summary lines (removed vs. total sentences and tokens) are printed.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of kept sentences; each sentence is a list of 6-tuples of str.

    Raises:
        IndexError: If a non-blank line has fewer than six tab-separated fields.
    """
    all_sentences = []
    sentence = []
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Only flush non-empty sentences: consecutive (or leading)
                # blank lines must not produce an empty sentence, which
                # previously caused a division by zero.
                if sentence:
                    all_sentences.append(sentence)
                    sentence = []
                continue
            tokens = line.split('\t')
            sentence.append((tokens[0], tokens[1], tokens[2],
                             tokens[3], tokens[4], tokens[5]))
    # Flush the last sentence when the file does not end with a blank line;
    # otherwise it would be silently lost.
    if sentence:
        all_sentences.append(sentence)

    sentences = []
    num_sentences_to_remove = 0
    num_tokens_to_remove = 0
    num_tokens = 0
    for candidate in all_sentences:
        num_tokens += len(candidate)
        xx_count = sum(1 for row in candidate if row[2] == 'XX')
        # Integer form of ``xx_count / len(candidate) >= 0.5``.
        if 2 * xx_count >= len(candidate):
            num_sentences_to_remove += 1
            num_tokens_to_remove += len(candidate)
        else:
            sentences.append(candidate)

    # "out of" refers to everything that was read, not only what was kept.
    print("removed %d sentences out of %d sentences"
          % (num_sentences_to_remove, len(all_sentences)))
    print("removed %d tokens out of %d tokens"
          % (num_tokens_to_remove, num_tokens))
    return sentences


def write_file(filename, sentences):
    """Write sentences to ``filename`` in the format ``read_file`` parses.

    Every row is written as one tab-joined line, and each sentence is followed
    by a blank line.

    Args:
        filename: Path of the file to create or overwrite.
        sentences: Iterable of sentences, each an iterable of rows of str.
    """
    with open(filename, 'w') as file:
        for sentence in sentences:
            for row in sentence:
                file.write('\t'.join(row) + '\n')
            file.write('\n')


# Maps a dataset name to the prefix used in its data file names.
dataset_dict = {'ontonotes': 'onto'}
datasets = ['ontonotes']
splits = ['test']
domains = ['all', 'wb']


def main():
    """Filter every configured dataset/domain/split file and write the result.

    Input files are ``data/<prefix>_pos_ner_dp_<split>_<domain>``; the
    filtered output is written next to each with the suffix ``_without_xx``.
    """
    for dataset in datasets:
        for domain in domains:
            for split in splits:
                print('dataset: %s, domain: %s, split: %s'
                      % (dataset, domain, split))
                filename = ('data/' + dataset_dict[dataset] + '_pos_ner_dp_'
                            + split + '_' + domain)
                sentences = read_file(filename)
                write_filename = filename + '_without_xx'
                write_file(write_filename, sentences)


# Guard the driver so importing this module does not touch the filesystem.
if __name__ == "__main__":
    main()