# Albin Thörn Cleland
# Clean initial commit with LFS
# 19b8775
"""
Utils for the processing of NER datasets
These can be invoked from either the specific dataset scripts
or the entire prepare_ner_dataset.py script
"""
from collections import defaultdict
import io
import json
import os
import random
import zipfile
from stanza.models.common.doc import Document
import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file
# Canonical shard order used throughout this module: train, dev, test
SHARDS = ('train', 'dev', 'test')
def bioes_to_bio(tags):
    """Translate a BIOES tag sequence into BIO (not BIO2) tags.

    In BIO1 the first token of an entity that follows 'O' (or starts the
    sequence) is tagged I-; B- is only emitted when a new entity starts
    immediately after another entity.
    """
    converted = []
    inside = False
    for label in tags:
        if label == 'O':
            inside = False
            converted.append(label)
        elif inside and label.startswith(("B-", "S-")):
            # TODO: does the tag have to match the previous tag?
            # eg, does B-LOC B-PER in BIOES need a B-PER or is I-PER sufficient?
            converted.append('B-' + label[2:])
        else:
            inside = True
            converted.append('I-' + label[2:])
    return converted
def convert_bioes_to_bio(base_input_path, base_output_path, short_name):
    """
    Convert BIOES files back to BIO (not BIO2)

    Useful for preparing datasets for CoreNLP, which doesn't do great
    with the more highly split classes
    """
    for shard in SHARDS:
        in_file = os.path.join(base_input_path, '%s.%s.bioes' % (short_name, shard))
        out_file = os.path.join(base_output_path, '%s.%s.bio' % (short_name, shard))
        sentences = read_tsv(in_file, text_column=0, annotation_column=1)
        converted = []
        for sentence in sentences:
            bio_tags = bioes_to_bio([pair[1] for pair in sentence])
            converted.append([(pair[0], tag) for pair, tag in zip(sentence, bio_tags)])
        write_sentences(out_file, converted)
def convert_bio_to_json(base_input_path, base_output_path, short_name, suffix="bio", shard_names=SHARDS, shards=SHARDS):
    """
    Convert BIO files to json

    It can often be convenient to put the intermediate BIO files in
    the same directory as the output files, in which case you can pass
    in same path for both base_input_path and base_output_path.

    This also will rewrite a BIOES as json
    """
    for input_shard, output_shard in zip(shard_names, shards):
        input_filename = os.path.join(base_input_path, '%s.%s.%s' % (short_name, input_shard, suffix))
        if not os.path.exists(input_filename):
            # fall back to a filename that does not carry the dataset prefix
            alt_filename = os.path.join(base_input_path, '%s.%s' % (input_shard, suffix))
            if not os.path.exists(alt_filename):
                raise FileNotFoundError('Cannot find %s component of %s in %s or %s' % (output_shard, short_name, input_filename, alt_filename))
            input_filename = alt_filename
        output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, output_shard))
        print("Converting %s to %s" % (input_filename, output_filename))
        prepare_ner_file.process_dataset(input_filename, output_filename)
def get_tags(datasets):
    """
    return the set of tags used in these datasets

    datasets is expected to be train, dev, test but could be any list
    """
    return {tag for dataset in datasets for sentence in dataset for _, tag in sentence}
def write_sentences(output_filename, dataset):
    """
    Write exactly one output file worth of dataset

    Each token is written as word<TAB>tag; a blank line follows each
    sentence.  Extra columns beyond (word, tag) are dropped.
    """
    os.makedirs(os.path.split(output_filename)[0], exist_ok=True)
    with open(output_filename, "w", encoding="utf-8") as fout:
        for s_idx, sent in enumerate(dataset):
            for w_idx, token in enumerate(sent):
                # truncate any extra annotation columns
                token = token[:2] if len(token) > 2 else token
                try:
                    fout.write("%s\t%s\n" % token)
                except TypeError:
                    raise TypeError("Unable to process sentence %d word %d of file %s" % (s_idx, w_idx, output_filename))
            fout.write("\n")
def write_dataset(datasets, output_dir, short_name, suffix="bio", shard_names=SHARDS, shards=SHARDS):
    """
    write all three pieces of a dataset to output_dir

    datasets should be 3 lists: train, dev, test
    each list should be a list of sentences
    each sentence is a list of pairs: word, tag

    after writing to .bio files, the files will be converted to .json
    """
    for shard_name, shard_data in zip(shard_names, datasets):
        filename = os.path.join(output_dir, "%s.%s.%s" % (short_name, shard_name, suffix))
        write_sentences(filename, shard_data)
    convert_bio_to_json(output_dir, output_dir, short_name, suffix, shard_names=shard_names, shards=shards)
def write_multitag_json(output_filename, dataset):
    """Write a dataset where each word carries a primary tag and a multi tag.

    Each word triple (text, ner, multi_ner) becomes one json object.
    """
    converted = [
        [{'text': w[0], 'ner': w[1], 'multi_ner': w[2]} for w in sentence]
        for sentence in dataset
    ]
    with open(output_filename, 'w', encoding='utf-8') as fout:
        json.dump(converted, fout, indent=2)
def write_multitag_dataset(datasets, output_dir, short_name, suffix="bio", shard_names=SHARDS, shards=SHARDS):
    """Write each shard both as a tsv (suffix) file and as a multi-tag json file."""
    for shard_name, shard_data in zip(shard_names, datasets):
        # tsv output keeps only (word, primary tag); extra columns are dropped
        write_sentences(os.path.join(output_dir, "%s.%s.%s" % (short_name, shard_name, suffix)), shard_data)
    for shard_name, shard_data in zip(shard_names, datasets):
        write_multitag_json(os.path.join(output_dir, "%s.%s.json" % (short_name, shard_name)), shard_data)
def read_tsv(filename, text_column, annotation_column, remap_tag_fn=None, remap_line=None, skip_comments=True, keep_broken_tags=False, keep_all_columns=False, separator="\t", zip_filename=None):
    """
    Read sentences from a TSV file

    Returns a list of list of (word, tag)

    If keep_broken_tags==True, then None is returned for a missing tag.
    Otherwise, an IndexError is thrown
    """
    # the lines come either from a member of a zip archive or a plain file
    if zip_filename is None:
        with open(filename, encoding="utf-8") as fin:
            raw_lines = fin.readlines()
    else:
        with zipfile.ZipFile(zip_filename) as zin:
            with zin.open(filename) as fin:
                raw_lines = io.TextIOWrapper(fin, encoding='utf-8').readlines()

    sentences = []
    current = []
    for idx, line in enumerate(x.strip() for x in raw_lines):
        if line == "":
            # blank line terminates the current sentence
            if current:
                sentences.append(current)
                current = []
            continue
        if skip_comments and line.startswith("#"):
            continue
        if remap_line is not None:
            line = remap_line(line)
        pieces = line.split(separator)
        try:
            word = pieces[text_column]
        except IndexError as e:
            raise IndexError("Filename %s: could not find word index %d at line %d |%s|" % (filename, text_column, idx, line)) from e
        if word == '\x96':
            # this happens in GermEval2014 for some reason
            continue
        if keep_broken_tags:
            try:
                tag = pieces[annotation_column]
            except IndexError:
                tag = None
        else:
            try:
                tag = pieces[annotation_column]
            except IndexError as e:
                raise IndexError("Filename %s: could not find tag index %d at line %d |%s|" % (filename, annotation_column, idx, line)) from e
        if remap_tag_fn is not None:
            tag = remap_tag_fn(tag)
        if keep_all_columns:
            # keep every column but substitute the (possibly remapped) tag
            pieces[annotation_column] = tag
            current.append(pieces)
        else:
            current.append((word, tag))
    if current:
        sentences.append(current)
    return sentences
def random_shuffle_directory(input_dir, output_dir, short_name):
    """Shuffle every file in input_dir into train/dev/test splits."""
    random_shuffle_files(input_dir, sorted(os.listdir(input_dir)), output_dir, short_name)
def random_shuffle_files(input_dir, input_files, output_dir, short_name):
    """
    Shuffle the files into different chunks based on their filename

    The first piece of the filename, split by ".", is used as a random seed.
    This will make it so that adding new files or using a different
    annotation scheme (assuming that's encoding in pieces of the
    filename) won't change the distibution of the files
    """
    # every file must have a unique prefix, since the prefix is the seed
    prefix_to_file = {}
    for filename in input_files:
        prefix = filename.split(".")[0]
        if prefix in prefix_to_file:
            raise ValueError("Multiple files with the same prefix: %s and %s" % (prefix_to_file[prefix], filename))
        prefix_to_file[prefix] = filename
    assert len(prefix_to_file) == len(input_files)

    splits = {'train': [], 'dev': [], 'test': []}
    for filename in input_files:
        # "salt" the filenames when using as a seed
        # definitely not because of a dumb bug in the original implementation
        random.seed(filename.split(".")[0] + ".txt.4class.tsv", 2)
        draw = random.random()
        if draw < 0.7:
            splits['train'].append(filename)
        elif draw < 0.8:
            splits['dev'].append(filename)
        else:
            splits['test'].append(filename)
    train_files = splits['train']
    dev_files = splits['dev']
    test_files = splits['test']
    print("Train files: %d Dev files: %d Test files: %d" % (len(train_files), len(dev_files), len(test_files)))
    assert len(train_files) + len(dev_files) + len(test_files) == len(input_files)

    # read each split's files and write the combined dataset
    datasets = []
    for file_list in (train_files, dev_files, test_files):
        shard = []
        for filename in file_list:
            shard.extend(read_tsv(os.path.join(input_dir, filename), 0, 1))
        datasets.append(shard)
    write_dataset(datasets, output_dir, short_name)
    return len(train_files), len(dev_files), len(test_files)
def random_shuffle_by_prefixes(input_dir, output_dir, short_name, prefix_map):
    """Group the files in input_dir by filename prefix, then shuffle each group.

    prefix_map maps a division name to a list of filename prefixes; every
    file must match at least one prefix or a ValueError is raised.  Each
    division is shuffled into train/dev/test independently and the
    per-division shards are combined into one dataset at the end.
    """
    input_files = os.listdir(input_dir)
    # sort so the iteration order (and any error reported) is deterministic
    input_files = sorted(input_files)
    file_divisions = defaultdict(list)
    for filename in input_files:
        # find the first division with a prefix matching this filename;
        # the nested for/else/break ladder propagates the inner break outward
        for division in prefix_map.keys():
            for prefix in prefix_map[division]:
                if filename.startswith(prefix):
                    break
            else: # for/else is intentional
                # no prefix in this division matched - try the next division
                continue
            break
        else: # yes, stop asking
            raise ValueError("Could not assign %s to any of the divisions in the prefix_map" % filename)
        #print("Assigning %s to %s because of %s" % (filename, division, prefix))
        file_divisions[division].append(filename)
    # shuffle each division separately, accumulating the split sizes
    num_train_files = 0
    num_dev_files = 0
    num_test_files = 0
    for division in file_divisions.keys():
        print()
        print("Processing %d files from %s" % (len(file_divisions[division]), division))
        d_train, d_dev, d_test = random_shuffle_files(input_dir, file_divisions[division], output_dir, "%s-%s" % (short_name, division))
        num_train_files += d_train
        num_dev_files += d_dev
        num_test_files += d_test
    print()
    print("After shuffling: Train files: %d Dev files: %d Test files: %d" % (num_train_files, num_dev_files, num_test_files))
    # merge the per-division shards back into a single dataset
    dataset_divisions = ["%s-%s" % (short_name, division) for division in file_divisions]
    combine_dataset(output_dir, output_dir, dataset_divisions, short_name)
def combine_dataset(input_dir, output_dir, input_datasets, output_dataset):
    """Concatenate several json datasets, shard by shard, into one dataset.

    Reads <input_dataset>.<shard>.json for each input dataset and writes
    the merged shards under the output_dataset name.
    """
    datasets = []
    for shard in SHARDS:
        merged = []
        for name in input_datasets:
            path = os.path.join(input_dir, "%s.%s.json" % (name, shard))
            with open(path, encoding="utf-8") as fin:
                sentences = json.load(fin)
            # reduce each word down to its (text, ner) pair
            merged.extend([(word['text'], word['ner']) for word in sentence] for sentence in sentences)
        datasets.append(merged)
    write_dataset(datasets, output_dir, output_dataset)
def read_prefix_file(destination_file):
    """
    Read a prefix file such as the one for the Worldwide dataset

    the format should be

    africa:
    af_
    ...
    asia:
    cn_
    ...
    """
    prefix_map = {}
    seen_prefixes = set()
    current_label = None
    current_prefixes = []
    with open(destination_file, encoding="utf-8") as fin:
        for raw in fin:
            line = raw.strip()
            # skip comments and blank lines
            if not line or line.startswith("#"):
                continue
            if line.endswith(":"):
                # a new label: flush the previous section, if any
                if current_label is not None:
                    prefix_map[current_label] = current_prefixes
                    current_prefixes = []
                current_label = line[:-1].strip().lower().replace(" ", "_")
            else:
                if not current_label:
                    raise RuntimeError("Found a prefix before the first label was assigned when reading %s" % destination_file)
                current_prefixes.append(line)
                if line in seen_prefixes:
                    raise RuntimeError("Found the same prefix twice! %s" % line)
                seen_prefixes.add(line)
    if current_label and current_prefixes:
        prefix_map[current_label] = current_prefixes
    return prefix_map
def read_json_entities(filename):
    """
    Read entities from a file, return a list of (text, label)

    Should work on both BIOES and BIO
    """
    with open(filename) as fin:
        raw = json.load(fin)
    return list_doc_entities(Document(raw))
def list_doc_entities(doc):
    """
    Return a list of (text, label), where text is a tuple of the entity's tokens

    Should work on both BIOES and BIO
    """
    entities = []
    for sentence in doc.sentences:
        current_entity = []
        previous_label = None
        for token in sentence.tokens:
            if token.ner == 'O' or token.ner.startswith("E-"):
                # E- closes the current entity (including this token);
                # O closes it without adding the token
                if token.ner.startswith("E-"):
                    current_entity.append(token.text)
                if current_entity:
                    assert previous_label is not None
                    entities.append((current_entity, previous_label))
                current_entity = []
                previous_label = None
            elif token.ner.startswith("I-"):
                # in BIO, a label change inside a run of I- starts a new entity
                if previous_label is not None and previous_label != 'O' and previous_label != token.ner[2:]:
                    if current_entity:
                        entities.append((current_entity, previous_label))
                    current_entity = []
                previous_label = token.ner[2:]
                current_entity.append(token.text)
            elif token.ner.startswith("B-") or token.ner.startswith("S-"):
                # B-/S- always start a new entity, flushing any open one
                if current_entity:
                    assert previous_label is not None
                    entities.append((current_entity, previous_label))
                current_entity = [token.text]
                previous_label = token.ner[2:]
                if token.ner.startswith("S-"):
                    # single-token entity: close it immediately.
                    # bug fix: the label was missing from the appended pair,
                    # which made the final tuple() conversion explode the token
                    # string into characters and raise IndexError on x[1]
                    entities.append((current_entity, previous_label))
                    current_entity = []
                    previous_label = None
            else:
                raise RuntimeError("Expected BIO(ES) format in the json file!")
            # note: a stray trailing "previous_label = token.ner[2:]" was removed;
            # it reset previous_label to '' (not None) after every O token
        if current_entity:
            assert previous_label is not None
            entities.append((current_entity, previous_label))
    # freeze the token lists into tuples so the entities are hashable
    entities = [(tuple(x[0]), x[1]) for x in entities]
    return entities
def combine_files(output_filename, *input_filenames):
    """
    Combine multiple NER json files into one NER file
    """
    combined = []
    for input_filename in input_filenames:
        with open(input_filename) as fin:
            combined.extend(json.load(fin))
    with open(output_filename, "w") as fout:
        json.dump(combined, fout, indent=2)