# Albin Thörn Cleland
# Clean initial commit with LFS
# 19b8775
"""
Preprocess the WikiNER dataset, by
1) normalizing tags;
2) split into train (70%), dev (15%), test (15%) datasets.
"""
import os
import random
import warnings
from collections import Counter
def read_sentences(filename, encoding):
    """Read whitespace-separated word/tag sentences from a file.

    Sentences are separated by blank lines.  A line that does not split
    into exactly two fields marks the whole enclosing sentence as
    malformed; malformed sentences are dropped (with a warning per bad
    line) and counted.

    Args:
        filename: path to the input file.
        encoding: text encoding used to open the file.

    Returns:
        A list of sentences, each a list of [word, tag] pairs.
    """
    sents = []
    cache = []
    skipped = 0
    skip = False

    def flush():
        # Commit the cached sentence (or count it as skipped) and reset
        # state.  Factored out because the same logic is needed both at
        # blank-line sentence boundaries and at end-of-file.
        nonlocal cache, skip, skipped
        if cache:
            if skip:
                skipped += 1
            else:
                sents.append(cache)
        cache = []
        skip = False

    with open(filename, encoding=encoding) as infile:
        for i, line in enumerate(infile):
            line = line.rstrip()
            if not line:
                flush()
                continue
            array = line.split()
            if len(array) != 2:
                # Bad token line: warn and poison the current sentence.
                skip = True
                warnings.warn("Format error at line {}: {}".format(i+1, line))
                continue
            w, t = array
            cache.append([w, t])
    # A final sentence may lack a trailing blank line.
    flush()
    print("Skipped {} examples due to formatting issues.".format(skipped))
    return sents
def write_sentences_to_file(sents, filename):
    """Write sentences to *filename* as tab-separated word/tag lines.

    Each sentence is followed by a single blank line (the same format
    that read_sentences consumes).

    Args:
        sents: list of sentences, each a list of [word, tag] pairs.
        filename: output path; written as UTF-8.
    """
    # BUG FIX: the original message printed the literal "(unknown)" —
    # the {filename} placeholder had been lost from the f-string.
    print(f"Writing {len(sents)} sentences to {filename}")
    with open(filename, 'w', encoding='utf-8') as outfile:
        for sent in sents:
            for word, tag in sent:
                print(f"{word}\t{tag}", file=outfile)
            # Blank line terminates the sentence.
            print("", file=outfile)
def remap_labels(sents, remap):
    """Return a deep copy of *sents* with every tag mapped through *remap*.

    Tags missing from *remap* are kept unchanged.

    Args:
        sents: list of sentences, each a list of [word, tag] pairs.
        remap: dict mapping old tag -> new tag.

    Returns:
        A new list of sentences with remapped tags.
    """
    return [[[token, remap.get(tag, tag)] for token, tag in sentence]
            for sentence in sents]
def split_wikiner_data(directory, sents, prefix="", suffix="bio", remap=None, shuffle=True, train_fraction=0.7, dev_fraction=0.15, test_section=True):
    """Split sentences into train/dev(/test) portions and write them out.

    Args:
        directory: output directory for the split files.
        sents: list of sentences, each a list of [word, tag] pairs.
        prefix: optional filename prefix ("<prefix>.train.<suffix>").
        suffix: filename extension (default "bio").
        remap: optional dict mapping old tag -> new tag, applied first.
        shuffle: shuffle sentences (with a fixed seed) before splitting.
        train_fraction: fraction of data used for training.
        dev_fraction: fraction used for dev (only when test_section=True;
            otherwise dev gets everything not in train).
        test_section: whether to also write a test split.

    Raises:
        ValueError: if train_fraction + dev_fraction exceeds 1.0 when a
            test section is requested.
    """
    # Fixed seed so the split is reproducible across runs.
    random.seed(1234)
    if remap:
        sents = remap_labels(sents, remap)
    # Compute split sizes.
    num = len(sents)
    train_num = int(num*train_fraction)
    if test_section:
        dev_num = int(num*dev_fraction)
        if train_fraction + dev_fraction > 1.0:
            # BUG FIX: the original format string had three placeholders
            # but only two arguments, which raised IndexError instead of
            # the intended ValueError.
            raise ValueError("Train and dev fractions added up to more than 1: {} {}".format(train_fraction, dev_fraction))
    else:
        # No test split: dev takes everything that is not train.
        dev_num = num - train_num
    if shuffle:
        random.shuffle(sents)
    train_sents = sents[:train_num]
    dev_sents = sents[train_num:train_num+dev_num]
    if test_section:
        test_sents = sents[train_num+dev_num:]
        batches = [train_sents, dev_sents, test_sents]
        filenames = [f'train.{suffix}', f'dev.{suffix}', f'test.{suffix}']
    else:
        batches = [train_sents, dev_sents]
        filenames = [f'train.{suffix}', f'dev.{suffix}']
    if prefix:
        filenames = ['%s.%s' % (prefix, f) for f in filenames]
    for batch, filename in zip(batches, filenames):
        write_sentences_to_file(batch, os.path.join(directory, filename))
def split_wikiner(directory, *in_filenames, encoding="utf-8", **kwargs):
    """Read one or more WikiNER files and write train/dev/test splits.

    Args:
        directory: output directory, forwarded to split_wikiner_data.
        *in_filenames: one or more input files; their sentences are
            concatenated before splitting.
        encoding: text encoding of the input files.
        **kwargs: forwarded to split_wikiner_data (prefix, suffix, remap,
            shuffle, fractions, test_section).
    """
    sents = []
    for filename in in_filenames:
        new_sents = read_sentences(filename, encoding)
        # BUG FIX: the message previously printed the literal "(unknown)"
        # instead of the filename — the placeholder was lost.
        print(f"{len(new_sents)} sentences read from {filename}.")
        sents.extend(new_sents)
    split_wikiner_data(directory, sents, **kwargs)
if __name__ == "__main__":
in_filename = 'raw/wp2.txt'
directory = "."
split_wikiner(directory, in_filename)