""" Utils for the processing of NER datasets These can be invoked from either the specific dataset scripts or the entire prepare_ner_dataset.py script """ from collections import defaultdict import io import json import os import random import zipfile from stanza.models.common.doc import Document import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file SHARDS = ('train', 'dev', 'test') def bioes_to_bio(tags): new_tags = [] in_entity = False for tag in tags: if tag == 'O': new_tags.append(tag) in_entity = False elif in_entity and (tag.startswith("B-") or tag.startswith("S-")): # TODO: does the tag have to match the previous tag? # eg, does B-LOC B-PER in BIOES need a B-PER or is I-PER sufficient? new_tags.append('B-' + tag[2:]) else: new_tags.append('I-' + tag[2:]) in_entity = True return new_tags def convert_bioes_to_bio(base_input_path, base_output_path, short_name): """ Convert BIOES files back to BIO (not BIO2) Useful for preparing datasets for CoreNLP, which doesn't do great with the more highly split classes """ for shard in SHARDS: input_filename = os.path.join(base_input_path, '%s.%s.bioes' % (short_name, shard)) output_filename = os.path.join(base_output_path, '%s.%s.bio' % (short_name, shard)) input_sentences = read_tsv(input_filename, text_column=0, annotation_column=1) new_sentences = [] for sentence in input_sentences: tags = [x[1] for x in sentence] tags = bioes_to_bio(tags) sentence = [(x[0], y) for x, y in zip(sentence, tags)] new_sentences.append(sentence) write_sentences(output_filename, new_sentences) def convert_bio_to_json(base_input_path, base_output_path, short_name, suffix="bio", shard_names=SHARDS, shards=SHARDS): """ Convert BIO files to json It can often be convenient to put the intermediate BIO files in the same directory as the output files, in which case you can pass in same path for both base_input_path and base_output_path. This also will rewrite a BIOES as json """ for input_shard, output_shard in zip(shard_names, shards): input_filename = os.path.join(base_input_path, '%s.%s.%s' % (short_name, input_shard, suffix)) if not os.path.exists(input_filename): alt_filename = os.path.join(base_input_path, '%s.%s' % (input_shard, suffix)) if os.path.exists(alt_filename): input_filename = alt_filename else: raise FileNotFoundError('Cannot find %s component of %s in %s or %s' % (output_shard, short_name, input_filename, alt_filename)) output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, output_shard)) print("Converting %s to %s" % (input_filename, output_filename)) prepare_ner_file.process_dataset(input_filename, output_filename) def get_tags(datasets): """ return the set of tags used in these datasets datasets is expected to be train, dev, test but could be any list """ tags = set() for dataset in datasets: for sentence in dataset: for word, tag in sentence: tags.add(tag) return tags def write_sentences(output_filename, dataset): """ Write exactly one output file worth of dataset """ os.makedirs(os.path.split(output_filename)[0], exist_ok=True) with open(output_filename, "w", encoding="utf-8") as fout: for sent_idx, sentence in enumerate(dataset): for word_idx, word in enumerate(sentence): if len(word) > 2: word = word[:2] try: fout.write("%s\t%s\n" % word) except TypeError: raise TypeError("Unable to process sentence %d word %d of file %s" % (sent_idx, word_idx, output_filename)) fout.write("\n") def write_dataset(datasets, output_dir, short_name, suffix="bio", shard_names=SHARDS, shards=SHARDS): """ write all three pieces of a dataset to output_dir datasets should be 3 lists: train, dev, test each list should be a list of sentences each sentence is a list of pairs: word, tag after writing to .bio files, the files will be converted to .json """ for shard, dataset in zip(shard_names, datasets): output_filename = os.path.join(output_dir, "%s.%s.%s" % (short_name, shard, suffix)) write_sentences(output_filename, dataset) convert_bio_to_json(output_dir, output_dir, short_name, suffix, shard_names=shard_names, shards=shards) def write_multitag_json(output_filename, dataset): json_dataset = [] for sentence in dataset: json_sentence = [] for word in sentence: word = {'text': word[0], 'ner': word[1], 'multi_ner': word[2]} json_sentence.append(word) json_dataset.append(json_sentence) with open(output_filename, 'w', encoding='utf-8') as fout: json.dump(json_dataset, fout, indent=2) def write_multitag_dataset(datasets, output_dir, short_name, suffix="bio", shard_names=SHARDS, shards=SHARDS): for shard, dataset in zip(shard_names, datasets): output_filename = os.path.join(output_dir, "%s.%s.%s" % (short_name, shard, suffix)) write_sentences(output_filename, dataset) for shard, dataset in zip(shard_names, datasets): output_filename = os.path.join(output_dir, "%s.%s.json" % (short_name, shard)) write_multitag_json(output_filename, dataset) def read_tsv(filename, text_column, annotation_column, remap_tag_fn=None, remap_line=None, skip_comments=True, keep_broken_tags=False, keep_all_columns=False, separator="\t", zip_filename=None): """ Read sentences from a TSV file Returns a list of list of (word, tag) If keep_broken_tags==True, then None is returned for a missing. Otherwise, an IndexError is thrown """ if zip_filename is not None: with zipfile.ZipFile(zip_filename) as zin: with zin.open(filename) as fin: fin = io.TextIOWrapper(fin, encoding='utf-8') lines = fin.readlines() else: with open(filename, encoding="utf-8") as fin: lines = fin.readlines() lines = [x.strip() for x in lines] sentences = [] current_sentence = [] for line_idx, line in enumerate(lines): if not line: if current_sentence: sentences.append(current_sentence) current_sentence = [] continue if skip_comments and line.startswith("#"): continue if remap_line is not None: line = remap_line(line) pieces = line.split(separator) try: word = pieces[text_column] except IndexError as e: raise IndexError("Filename %s: could not find word index %d at line %d |%s|" % (filename, text_column, line_idx, line)) from e if word == '\x96': # this happens in GermEval2014 for some reason continue try: tag = pieces[annotation_column] except IndexError as e: if keep_broken_tags: tag = None else: raise IndexError("Filename %s: could not find tag index %d at line %d |%s|" % (filename, annotation_column, line_idx, line)) from e if remap_tag_fn is not None: tag = remap_tag_fn(tag) if keep_all_columns: pieces[annotation_column] = tag current_sentence.append(pieces) else: current_sentence.append((word, tag)) if current_sentence: sentences.append(current_sentence) return sentences def random_shuffle_directory(input_dir, output_dir, short_name): input_files = os.listdir(input_dir) input_files = sorted(input_files) random_shuffle_files(input_dir, input_files, output_dir, short_name) def random_shuffle_files(input_dir, input_files, output_dir, short_name): """ Shuffle the files into different chunks based on their filename The first piece of the filename, split by ".", is used as a random seed. This will make it so that adding new files or using a different annotation scheme (assuming that's encoding in pieces of the filename) won't change the distibution of the files """ input_keys = {} for f in input_files: seed = f.split(".")[0] if seed in input_keys: raise ValueError("Multiple files with the same prefix: %s and %s" % (input_keys[seed], f)) input_keys[seed] = f assert len(input_keys) == len(input_files) train_files = [] dev_files = [] test_files = [] for filename in input_files: seed = filename.split(".")[0] # "salt" the filenames when using as a seed # definitely not because of a dumb bug in the original implementation seed = seed + ".txt.4class.tsv" random.seed(seed, 2) location = random.random() if location < 0.7: train_files.append(filename) elif location < 0.8: dev_files.append(filename) else: test_files.append(filename) print("Train files: %d Dev files: %d Test files: %d" % (len(train_files), len(dev_files), len(test_files))) assert len(train_files) + len(dev_files) + len(test_files) == len(input_files) file_lists = [train_files, dev_files, test_files] datasets = [] for files in file_lists: dataset = [] for filename in files: dataset.extend(read_tsv(os.path.join(input_dir, filename), 0, 1)) datasets.append(dataset) write_dataset(datasets, output_dir, short_name) return len(train_files), len(dev_files), len(test_files) def random_shuffle_by_prefixes(input_dir, output_dir, short_name, prefix_map): input_files = os.listdir(input_dir) input_files = sorted(input_files) file_divisions = defaultdict(list) for filename in input_files: for division in prefix_map.keys(): for prefix in prefix_map[division]: if filename.startswith(prefix): break else: # for/else is intentional continue break else: # yes, stop asking raise ValueError("Could not assign %s to any of the divisions in the prefix_map" % filename) #print("Assigning %s to %s because of %s" % (filename, division, prefix)) file_divisions[division].append(filename) num_train_files = 0 num_dev_files = 0 num_test_files = 0 for division in file_divisions.keys(): print() print("Processing %d files from %s" % (len(file_divisions[division]), division)) d_train, d_dev, d_test = random_shuffle_files(input_dir, file_divisions[division], output_dir, "%s-%s" % (short_name, division)) num_train_files += d_train num_dev_files += d_dev num_test_files += d_test print() print("After shuffling: Train files: %d Dev files: %d Test files: %d" % (num_train_files, num_dev_files, num_test_files)) dataset_divisions = ["%s-%s" % (short_name, division) for division in file_divisions] combine_dataset(output_dir, output_dir, dataset_divisions, short_name) def combine_dataset(input_dir, output_dir, input_datasets, output_dataset): datasets = [] for shard in SHARDS: full_dataset = [] for input_dataset in input_datasets: input_filename = "%s.%s.json" % (input_dataset, shard) input_path = os.path.join(input_dir, input_filename) with open(input_path, encoding="utf-8") as fin: dataset = json.load(fin) converted = [[(word['text'], word['ner']) for word in sentence] for sentence in dataset] full_dataset.extend(converted) datasets.append(full_dataset) write_dataset(datasets, output_dir, output_dataset) def read_prefix_file(destination_file): """ Read a prefix file such as the one for the Worldwide dataset the format should be africa: af_ ... asia: cn_ ... """ destination = None known_prefixes = set() prefixes = [] prefix_map = {} with open(destination_file, encoding="utf-8") as fin: for line in fin: line = line.strip() if line.startswith("#"): continue if not line: continue if line.endswith(":"): if destination is not None: prefix_map[destination] = prefixes prefixes = [] destination = line[:-1].strip().lower().replace(" ", "_") else: if not destination: raise RuntimeError("Found a prefix before the first label was assigned when reading %s" % destination_file) prefixes.append(line) if line in known_prefixes: raise RuntimeError("Found the same prefix twice! %s" % line) known_prefixes.add(line) if destination and prefixes: prefix_map[destination] = prefixes return prefix_map def read_json_entities(filename): """ Read entities from a file, return a list of (text, label) Should work on both BIOES and BIO """ with open(filename) as fin: doc = Document(json.load(fin)) return list_doc_entities(doc) def list_doc_entities(doc): """ Return a list of (text, label) Should work on both BIOES and BIO """ entities = [] for sentence in doc.sentences: current_entity = [] previous_label = None for token in sentence.tokens: if token.ner == 'O' or token.ner.startswith("E-"): if token.ner.startswith("E-"): current_entity.append(token.text) if current_entity: assert previous_label is not None entities.append((current_entity, previous_label)) current_entity = [] previous_label = None elif token.ner.startswith("I-"): if previous_label is not None and previous_label != 'O' and previous_label != token.ner[2:]: if current_entity: assert previous_label is not None entities.append((current_entity, previous_label)) current_entity = [] previous_label = token.ner[2:] current_entity.append(token.text) elif token.ner.startswith("B-") or token.ner.startswith("S-"): if current_entity: assert previous_label is not None entities.append((current_entity, previous_label)) current_entity = [] previous_label = None current_entity.append(token.text) previous_label = token.ner[2:] if token.ner.startswith("S-"): assert previous_label is not None entities.append(current_entity) current_entity = [] previous_label = None else: raise RuntimeError("Expected BIO(ES) format in the json file!") previous_label = token.ner[2:] if current_entity: assert previous_label is not None entities.append((current_entity, previous_label)) entities = [(tuple(x[0]), x[1]) for x in entities] return entities def combine_files(output_filename, *input_filenames): """ Combine multiple NER json files into one NER file """ doc = [] for filename in input_filenames: with open(filename) as fin: new_doc = json.load(fin) doc.extend(new_doc) with open(output_filename, "w") as fout: json.dump(doc, fout, indent=2)