# Albin Thörn Cleland
# Clean initial commit with LFS
# 19b8775
"""
Preprocess the WikiNER dataset, by
1) normalizing tags;
2) split into train (70%), dev (15%), test (15%) datasets.
"""
import os
import random
import warnings
from collections import Counter
def read_sentences(filename, encoding):
    """Read whitespace-separated word/tag sentences from a file.

    Sentences are separated by blank lines.  A line that does not split
    into exactly two fields marks the whole enclosing sentence as
    malformed; malformed sentences are dropped (with a warning per bad
    line) and counted.

    Args:
        filename: path to the input file.
        encoding: text encoding used to open the file.

    Returns:
        A list of sentences, each a list of [word, tag] pairs.
    """
    sents = []
    cache = []
    skipped = 0
    skip = False

    def flush():
        # Commit the cached sentence (or count it as skipped) and reset
        # state.  Factored out because the same logic is needed both at
        # blank-line sentence boundaries and at end-of-file.
        nonlocal cache, skip, skipped
        if cache:
            if skip:
                skipped += 1
            else:
                sents.append(cache)
        cache = []
        skip = False

    with open(filename, encoding=encoding) as infile:
        for i, line in enumerate(infile):
            line = line.rstrip()
            if not line:
                flush()
                continue
            array = line.split()
            if len(array) != 2:
                # Bad token line: warn and poison the current sentence.
                skip = True
                warnings.warn("Format error at line {}: {}".format(i+1, line))
                continue
            w, t = array
            cache.append([w, t])
    # A final sentence may lack a trailing blank line.
    flush()
    print("Skipped {} examples due to formatting issues.".format(skipped))
    return sents
def write_sentences_to_file(sents, filename):
    """Write sentences to *filename* as tab-separated word/tag lines.

    Each sentence is followed by a single blank line (the same format
    that read_sentences consumes).

    Args:
        sents: list of sentences, each a list of [word, tag] pairs.
        filename: output path; written as UTF-8.
    """
    # BUG FIX: the original message printed the literal "(unknown)" —
    # the {filename} placeholder had been lost from the f-string.
    print(f"Writing {len(sents)} sentences to {filename}")
    with open(filename, 'w', encoding='utf-8') as outfile:
        for sent in sents:
            for word, tag in sent:
                print(f"{word}\t{tag}", file=outfile)
            # Blank line terminates the sentence.
            print("", file=outfile)
def remap_labels(sents, remap):
    """Return a deep copy of *sents* with every tag mapped through *remap*.

    Tags missing from *remap* are kept unchanged.

    Args:
        sents: list of sentences, each a list of [word, tag] pairs.
        remap: dict mapping old tag -> new tag.

    Returns:
        A new list of sentences with remapped tags.
    """
    return [[[token, remap.get(tag, tag)] for token, tag in sentence]
            for sentence in sents]
def split_wikiner_data(directory, sents, prefix="", suffix="bio", remap=None, shuffle=True, train_fraction=0.7, dev_fraction=0.15, test_section=True):
    """Split sentences into train/dev(/test) portions and write them out.

    Args:
        directory: output directory for the split files.
        sents: list of sentences, each a list of [word, tag] pairs.
        prefix: optional filename prefix ("<prefix>.train.<suffix>").
        suffix: filename extension (default "bio").
        remap: optional dict mapping old tag -> new tag, applied first.
        shuffle: shuffle sentences (with a fixed seed) before splitting.
        train_fraction: fraction of data used for training.
        dev_fraction: fraction used for dev (only when test_section=True;
            otherwise dev gets everything not in train).
        test_section: whether to also write a test split.

    Raises:
        ValueError: if train_fraction + dev_fraction exceeds 1.0 when a
            test section is requested.
    """
    # Fixed seed so the split is reproducible across runs.
    random.seed(1234)
    if remap:
        sents = remap_labels(sents, remap)
    # Compute split sizes.
    num = len(sents)
    train_num = int(num*train_fraction)
    if test_section:
        dev_num = int(num*dev_fraction)
        if train_fraction + dev_fraction > 1.0:
            # BUG FIX: the original format string had three placeholders
            # but only two arguments, which raised IndexError instead of
            # the intended ValueError.
            raise ValueError("Train and dev fractions added up to more than 1: {} {}".format(train_fraction, dev_fraction))
    else:
        # No test split: dev takes everything that is not train.
        dev_num = num - train_num
    if shuffle:
        random.shuffle(sents)
    train_sents = sents[:train_num]
    dev_sents = sents[train_num:train_num+dev_num]
    if test_section:
        test_sents = sents[train_num+dev_num:]
        batches = [train_sents, dev_sents, test_sents]
        filenames = [f'train.{suffix}', f'dev.{suffix}', f'test.{suffix}']
    else:
        batches = [train_sents, dev_sents]
        filenames = [f'train.{suffix}', f'dev.{suffix}']
    if prefix:
        filenames = ['%s.%s' % (prefix, f) for f in filenames]
    for batch, filename in zip(batches, filenames):
        write_sentences_to_file(batch, os.path.join(directory, filename))
def split_wikiner(directory, *in_filenames, encoding="utf-8", **kwargs):
    """Read one or more WikiNER files and write train/dev/test splits.

    Args:
        directory: output directory, forwarded to split_wikiner_data.
        *in_filenames: one or more input files; their sentences are
            concatenated before splitting.
        encoding: text encoding of the input files.
        **kwargs: forwarded to split_wikiner_data (prefix, suffix, remap,
            shuffle, fractions, test_section).
    """
    sents = []
    for filename in in_filenames:
        new_sents = read_sentences(filename, encoding)
        # BUG FIX: the message previously printed the literal "(unknown)"
        # instead of the filename — the placeholder was lost.
        print(f"{len(new_sents)} sentences read from {filename}.")
        sents.extend(new_sents)
    split_wikiner_data(directory, sents, **kwargs)
if __name__ == "__main__":
in_filename = 'raw/wp2.txt'
directory = "."
split_wikiner(directory, in_filename)