"""
Utils for the processing of NER datasets

These can be invoked from either the specific dataset scripts
or the entire prepare_ner_dataset.py script
"""

from collections import defaultdict
import io
import json
import os
import random
import zipfile

from stanza.models.common.doc import Document
import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file

SHARDS = ('train', 'dev', 'test')


def bioes_to_bio(tags):
    """
    Convert a list of BIOES tags to BIO (IOB1, not BIO2) tags

    In IOB1, an entity normally starts with I-; a B- tag is only used
    when an entity starts immediately after another entity
    """
    new_tags = []
    in_entity = False
    for tag in tags:
        if tag == 'O':
            new_tags.append(tag)
            in_entity = False
        elif in_entity and (tag.startswith("B-") or tag.startswith("S-")):
            # this entity starts right after another entity, so a B- tag
            # is needed to keep the two entities from merging into one
            new_tags.append('B-' + tag[2:])
        else:
            new_tags.append('I-' + tag[2:])
            in_entity = True
    return new_tags
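
# A quick doctest-style sketch of the conversion (hypothetical tags):
#
#   >>> bioes_to_bio(["S-PER", "B-ORG", "I-ORG", "E-ORG", "O"])
#   ['I-PER', 'B-ORG', 'I-ORG', 'I-ORG', 'O']
#
# The lone S-PER becomes I-PER since nothing precedes it, while the ORG
# entity keeps a B- tag so it stays separate from the PER entity.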


def convert_bioes_to_bio(base_input_path, base_output_path, short_name):
    """
    Convert BIOES files back to BIO (not BIO2)

    Useful for preparing datasets for CoreNLP, which doesn't do great
    with the more highly split classes
    """
    for shard in SHARDS:
        input_filename = os.path.join(base_input_path, '%s.%s.bioes' % (short_name, shard))
        output_filename = os.path.join(base_output_path, '%s.%s.bio' % (short_name, shard))

        input_sentences = read_tsv(input_filename, text_column=0, annotation_column=1)
        new_sentences = []
        for sentence in input_sentences:
            tags = [x[1] for x in sentence]
            tags = bioes_to_bio(tags)
            sentence = [(x[0], y) for x, y in zip(sentence, tags)]
            new_sentences.append(sentence)
        write_sentences(output_filename, new_sentences)


def convert_bio_to_json(base_input_path, base_output_path, short_name, suffix="bio", shard_names=SHARDS, shards=SHARDS):
    """
    Convert BIO files to json

    It can often be convenient to put the intermediate BIO files in
    the same directory as the output files, in which case you can pass
    in the same path for both base_input_path and base_output_path.

    This will also rewrite BIOES files as json
    """
    for input_shard, output_shard in zip(shard_names, shards):
        input_filename = os.path.join(base_input_path, '%s.%s.%s' % (short_name, input_shard, suffix))
        if not os.path.exists(input_filename):
            alt_filename = os.path.join(base_input_path, '%s.%s' % (input_shard, suffix))
            if os.path.exists(alt_filename):
                input_filename = alt_filename
            else:
                raise FileNotFoundError('Cannot find %s component of %s in %s or %s' % (output_shard, short_name, input_filename, alt_filename))
        output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, output_shard))
        print("Converting %s to %s" % (input_filename, output_filename))
        prepare_ner_file.process_dataset(input_filename, output_filename)
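
# Typical usage (hypothetical paths and dataset name): with
# en_sample.train.bio, en_sample.dev.bio, and en_sample.test.bio in one
# directory, this writes en_sample.train.json etc alongside them:
#
#   convert_bio_to_json("data/ner", "data/ner", "en_sample")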


def get_tags(datasets):
    """
    Return the set of tags used in these datasets

    datasets is expected to be train, dev, test but could be any list
    """
    tags = set()
    for dataset in datasets:
        for sentence in dataset:
            for word, tag in sentence:
                tags.add(tag)
    return tags


def write_sentences(output_filename, dataset):
    """
    Write exactly one output file worth of dataset
    """
    os.makedirs(os.path.split(output_filename)[0], exist_ok=True)
    with open(output_filename, "w", encoding="utf-8") as fout:
        for sent_idx, sentence in enumerate(dataset):
            for word_idx, word in enumerate(sentence):
                # rows may carry extra columns (eg from keep_all_columns);
                # only the word and tag columns are written
                if len(word) > 2:
                    word = word[:2]
                # convert list rows to tuples so the %-format unpacks them
                word = tuple(word)
                try:
                    fout.write("%s\t%s\n" % word)
                except TypeError:
                    raise TypeError("Unable to process sentence %d word %d of file %s" % (sent_idx, word_idx, output_filename))
            fout.write("\n")


def write_dataset(datasets, output_dir, short_name, suffix="bio", shard_names=SHARDS, shards=SHARDS):
    """
    Write all three pieces of a dataset to output_dir

    datasets should be 3 lists: train, dev, test
    each list should be a list of sentences
    each sentence is a list of pairs: word, tag

    after writing to .bio files, the files will be converted to .json
    """
    for shard, dataset in zip(shard_names, datasets):
        output_filename = os.path.join(output_dir, "%s.%s.%s" % (short_name, shard, suffix))
        write_sentences(output_filename, dataset)

    convert_bio_to_json(output_dir, output_dir, short_name, suffix, shard_names=shard_names, shards=shards)
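
# A minimal sketch (hypothetical data and paths): writes foo.train.bio,
# foo.dev.bio, and foo.test.bio, then converts each shard to json:
#
#   train = [[("John", "B-PER"), ("works", "O")]]
#   dev = [[("Paris", "B-LOC")]]
#   test = [[("IBM", "B-ORG")]]
#   write_dataset([train, dev, test], "data/ner", "foo")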


def write_multitag_json(output_filename, dataset):
    """
    Write a dataset of (text, ner, multi_ner) words as a json file
    """
    json_dataset = []
    for sentence in dataset:
        json_sentence = []
        for word in sentence:
            word = {'text': word[0],
                    'ner': word[1],
                    'multi_ner': word[2]}
            json_sentence.append(word)
        json_dataset.append(json_sentence)
    with open(output_filename, 'w', encoding='utf-8') as fout:
        json.dump(json_dataset, fout, indent=2)
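
# Each word becomes a json object such as (hypothetical values):
#
#   {"text": "John", "ner": "B-PER", "multi_ner": ["B-PER", "O"]}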


def write_multitag_dataset(datasets, output_dir, short_name, suffix="bio", shard_names=SHARDS, shards=SHARDS):
    """
    Write each shard of a multitag dataset, both as .bio files and as json
    """
    for shard, dataset in zip(shard_names, datasets):
        output_filename = os.path.join(output_dir, "%s.%s.%s" % (short_name, shard, suffix))
        write_sentences(output_filename, dataset)

    for shard, dataset in zip(shard_names, datasets):
        output_filename = os.path.join(output_dir, "%s.%s.json" % (short_name, shard))
        write_multitag_json(output_filename, dataset)


def read_tsv(filename, text_column, annotation_column, remap_tag_fn=None, remap_line=None, skip_comments=True, keep_broken_tags=False, keep_all_columns=False, separator="\t", zip_filename=None):
    """
    Read sentences from a TSV file

    Returns a list of list of (word, tag)

    If keep_broken_tags==True, then None is returned for a missing tag.
    Otherwise, an IndexError is thrown
    """
    if zip_filename is not None:
        with zipfile.ZipFile(zip_filename) as zin:
            with zin.open(filename) as fin:
                fin = io.TextIOWrapper(fin, encoding='utf-8')
                lines = fin.readlines()
    else:
        with open(filename, encoding="utf-8") as fin:
            lines = fin.readlines()

    lines = [x.strip() for x in lines]

    sentences = []
    current_sentence = []
    for line_idx, line in enumerate(lines):
        if not line:
            if current_sentence:
                sentences.append(current_sentence)
                current_sentence = []
            continue
        if skip_comments and line.startswith("#"):
            continue

        if remap_line is not None:
            line = remap_line(line)
        pieces = line.split(separator)
        try:
            word = pieces[text_column]
        except IndexError as e:
            raise IndexError("Filename %s: could not find word index %d at line %d |%s|" % (filename, text_column, line_idx, line)) from e
        if word == '\x96':
            # \x96 is a mis-encoded windows-1252 en dash which shows up
            # as a broken standalone token in some datasets; skip it
            continue
        try:
            tag = pieces[annotation_column]
        except IndexError as e:
            if keep_broken_tags:
                tag = None
            else:
                raise IndexError("Filename %s: could not find tag index %d at line %d |%s|" % (filename, annotation_column, line_idx, line)) from e
        if remap_tag_fn is not None:
            tag = remap_tag_fn(tag)

        if keep_all_columns:
            pieces[annotation_column] = tag
            current_sentence.append(pieces)
        else:
            current_sentence.append((word, tag))

    if current_sentence:
        sentences.append(current_sentence)

    return sentences
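
# Sketch of the expected file layout (hypothetical contents): blank
# lines separate sentences, and text_column / annotation_column select
# the word and tag columns:
#
#   John<TAB>B-PER
#   works<TAB>O
#
#   Paris<TAB>B-LOC
#
#   >>> read_tsv("sample.tsv", text_column=0, annotation_column=1)
#   [[('John', 'B-PER'), ('works', 'O')], [('Paris', 'B-LOC')]]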


def random_shuffle_directory(input_dir, output_dir, short_name):
    input_files = os.listdir(input_dir)
    input_files = sorted(input_files)
    random_shuffle_files(input_dir, input_files, output_dir, short_name)


def random_shuffle_files(input_dir, input_files, output_dir, short_name):
    """
    Shuffle the files into different chunks based on their filename

    The first piece of the filename, split by ".", is used as a random seed.

    This will make it so that adding new files or using a different
    annotation scheme (assuming that's encoded in pieces of the
    filename) won't change the distribution of the files
    """
    input_keys = {}
    for f in input_files:
        seed = f.split(".")[0]
        if seed in input_keys:
            raise ValueError("Multiple files with the same prefix: %s and %s" % (input_keys[seed], f))
        input_keys[seed] = f
    assert len(input_keys) == len(input_files)

    train_files = []
    dev_files = []
    test_files = []

    for filename in input_files:
        seed = filename.split(".")[0]
        # a fixed suffix is appended to keep the seed strings, and
        # therefore each file's shard assignment, stable regardless of
        # the actual file extension
        seed = seed + ".txt.4class.tsv"
        random.seed(seed, 2)
        location = random.random()
        if location < 0.7:
            train_files.append(filename)
        elif location < 0.8:
            dev_files.append(filename)
        else:
            test_files.append(filename)

    print("Train files: %d Dev files: %d Test files: %d" % (len(train_files), len(dev_files), len(test_files)))
    assert len(train_files) + len(dev_files) + len(test_files) == len(input_files)

    file_lists = [train_files, dev_files, test_files]
    datasets = []
    for files in file_lists:
        dataset = []
        for filename in files:
            dataset.extend(read_tsv(os.path.join(input_dir, filename), 0, 1))
        datasets.append(dataset)

    write_dataset(datasets, output_dir, short_name)
    return len(train_files), len(dev_files), len(test_files)
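
# A minimal sketch of the assignment rule used above (hypothetical
# filename prefix); the same prefix always lands in the same shard:
#
#   random.seed("af_0001" + ".txt.4class.tsv", 2)
#   location = random.random()
#   # location < 0.7 -> train, < 0.8 -> dev, otherwise test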


def random_shuffle_by_prefixes(input_dir, output_dir, short_name, prefix_map):
    """
    Shuffle the files in input_dir into shards, one division at a time

    prefix_map maps a division name to the filename prefixes it covers;
    each division is shuffled separately with random_shuffle_files and
    the divisions are then combined into a single output dataset
    """
    input_files = os.listdir(input_dir)
    input_files = sorted(input_files)

    file_divisions = defaultdict(list)
    for filename in input_files:
        # for/else: if no prefix in a division matches, try the next
        # division; if no division matches at all, raise an error
        for division in prefix_map.keys():
            for prefix in prefix_map[division]:
                if filename.startswith(prefix):
                    break
            else:
                continue
            break
        else:
            raise ValueError("Could not assign %s to any of the divisions in the prefix_map" % filename)

        file_divisions[division].append(filename)

    num_train_files = 0
    num_dev_files = 0
    num_test_files = 0
    for division in file_divisions.keys():
        print()
        print("Processing %d files from %s" % (len(file_divisions[division]), division))
        d_train, d_dev, d_test = random_shuffle_files(input_dir, file_divisions[division], output_dir, "%s-%s" % (short_name, division))
        num_train_files += d_train
        num_dev_files += d_dev
        num_test_files += d_test

    print()
    print("After shuffling: Train files: %d Dev files: %d Test files: %d" % (num_train_files, num_dev_files, num_test_files))
    dataset_divisions = ["%s-%s" % (short_name, division) for division in file_divisions]
    combine_dataset(output_dir, output_dir, dataset_divisions, short_name)
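
# Example prefix_map (hypothetical): files starting with af_ go to the
# "africa" division and files starting with cn_ to "asia"; each division
# is shuffled independently and then recombined under short_name:
#
#   prefix_map = {"africa": ["af_"], "asia": ["cn_"]}
#   random_shuffle_by_prefixes("raw", "data/ner", "en_sample", prefix_map)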


def combine_dataset(input_dir, output_dir, input_datasets, output_dataset):
    """
    Combine the json shards of several datasets into one dataset

    Note that only the text and ner fields of each word are kept
    """
    datasets = []
    for shard in SHARDS:
        full_dataset = []
        for input_dataset in input_datasets:
            input_filename = "%s.%s.json" % (input_dataset, shard)
            input_path = os.path.join(input_dir, input_filename)
            with open(input_path, encoding="utf-8") as fin:
                dataset = json.load(fin)
            converted = [[(word['text'], word['ner']) for word in sentence] for sentence in dataset]
            full_dataset.extend(converted)
        datasets.append(full_dataset)
    write_dataset(datasets, output_dir, output_dataset)
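
# Eg, to merge two shuffled divisions back into a single dataset
# (hypothetical names):
#
#   combine_dataset("data/ner", "data/ner", ["foo-africa", "foo-asia"], "foo")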


def read_prefix_file(destination_file):
    """
    Read a prefix file such as the one for the Worldwide dataset

    the format should be

    africa:
    af_
    ...

    asia:
    cn_
    ...
    """
    destination = None
    known_prefixes = set()
    prefixes = []

    prefix_map = {}
    with open(destination_file, encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            if line.startswith("#"):
                continue
            if not line:
                continue
            if line.endswith(":"):
                if destination is not None:
                    prefix_map[destination] = prefixes
                prefixes = []
                destination = line[:-1].strip().lower().replace(" ", "_")
            else:
                if not destination:
                    raise RuntimeError("Found a prefix before the first label was assigned when reading %s" % destination_file)
                prefixes.append(line)
                if line in known_prefixes:
                    raise RuntimeError("Found the same prefix twice! %s" % line)
                known_prefixes.add(line)

    if destination and prefixes:
        prefix_map[destination] = prefixes

    return prefix_map
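
# With a file matching the docstring's example, this would return
# something like (hypothetical contents):
#
#   >>> read_prefix_file("destinations.txt")
#   {'africa': ['af_', ...], 'asia': ['cn_', ...]}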


def read_json_entities(filename):
    """
    Read entities from a file, return a list of (text, label)

    Should work on both BIOES and BIO
    """
    with open(filename, encoding="utf-8") as fin:
        doc = Document(json.load(fin))

    return list_doc_entities(doc)


def list_doc_entities(doc):
    """
    Return a list of (text, label)

    Should work on both BIOES and BIO
    """
    entities = []
    for sentence in doc.sentences:
        current_entity = []
        previous_label = None
        for token in sentence.tokens:
            if token.ner == 'O' or token.ner.startswith("E-"):
                # an O or E- tag closes the current entity, if any
                if token.ner.startswith("E-"):
                    current_entity.append(token.text)
                    previous_label = token.ner[2:]
                if current_entity:
                    assert previous_label is not None
                    entities.append((current_entity, previous_label))
                    current_entity = []
                    previous_label = None
            elif token.ner.startswith("I-"):
                # an I- tag of a different type closes the previous entity
                if previous_label is not None and previous_label != 'O' and previous_label != token.ner[2:]:
                    if current_entity:
                        assert previous_label is not None
                        entities.append((current_entity, previous_label))
                        current_entity = []
                previous_label = token.ner[2:]
                current_entity.append(token.text)
            elif token.ner.startswith("B-") or token.ner.startswith("S-"):
                if current_entity:
                    assert previous_label is not None
                    entities.append((current_entity, previous_label))
                    current_entity = []
                    previous_label = None
                current_entity.append(token.text)
                previous_label = token.ner[2:]
                if token.ner.startswith("S-"):
                    # a single word entity is complete immediately
                    assert previous_label is not None
                    entities.append((current_entity, previous_label))
                    current_entity = []
                    previous_label = None
            else:
                raise RuntimeError("Expected BIO(ES) format in the json file!")
        if current_entity:
            assert previous_label is not None
            entities.append((current_entity, previous_label))
    entities = [(tuple(x[0]), x[1]) for x in entities]
    return entities
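
# For example, a document tagged (hypothetical):
#
#   John B-PER / Smith E-PER / visited O / IBM S-ORG
#
# yields [(('John', 'Smith'), 'PER'), (('IBM',), 'ORG')]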


def combine_files(output_filename, *input_filenames):
    """
    Combine multiple NER json files into one NER file
    """
    doc = []

    for filename in input_filenames:
        with open(filename, encoding="utf-8") as fin:
            new_doc = json.load(fin)
            doc.extend(new_doc)

    with open(output_filename, "w", encoding="utf-8") as fout:
        json.dump(doc, fout, indent=2)