bowphs committed on
Commit
986094c
·
verified ·
1 Parent(s): 495f002

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. stanza/stanza/models/classifiers/constituency_classifier.py +96 -0
  2. stanza/stanza/models/classifiers/data.py +169 -0
  3. stanza/stanza/models/coref/bert.py +69 -0
  4. stanza/stanza/models/langid/__init__.py +0 -0
  5. stanza/stanza/models/langid/data.py +134 -0
  6. stanza/stanza/models/langid/model.py +126 -0
  7. stanza/stanza/models/lemma_classifier/__init__.py +0 -0
  8. stanza/stanza/models/ner/model.py +278 -0
  9. stanza/stanza/models/pos/scorer.py +22 -0
  10. stanza/stanza/models/pos/vocab.py +71 -0
  11. stanza/stanza/pipeline/demo/stanza-brat.js +1316 -0
  12. stanza/stanza/pipeline/external/corenlp_converter_depparse.py +29 -0
  13. stanza/stanza/pipeline/external/jieba.py +71 -0
  14. stanza/stanza/pipeline/external/sudachipy.py +84 -0
  15. stanza/stanza/utils/charlm/oscar_to_text.py +78 -0
  16. stanza/stanza/utils/constituency/__init__.py +0 -0
  17. stanza/stanza/utils/constituency/grep_test_logs.py +24 -0
  18. stanza/stanza/utils/datasets/constituency/build_silver_dataset.py +117 -0
  19. stanza/stanza/utils/datasets/constituency/convert_cintil.py +80 -0
  20. stanza/stanza/utils/datasets/constituency/count_common_words.py +12 -0
  21. stanza/stanza/utils/datasets/constituency/prepare_con_dataset.py +594 -0
  22. stanza/stanza/utils/datasets/constituency/silver_variance.py +108 -0
  23. stanza/stanza/utils/datasets/coref/convert_hindi.py +170 -0
  24. stanza/stanza/utils/datasets/ner/compare_entities.py +38 -0
  25. stanza/stanza/utils/datasets/ner/conll_to_iob.py +59 -0
  26. stanza/stanza/utils/datasets/ner/convert_bn_daffodil.py +123 -0
  27. stanza/stanza/utils/datasets/ner/convert_en_conll03.py +42 -0
  28. stanza/stanza/utils/datasets/ner/convert_he_iahlt.py +108 -0
  29. stanza/stanza/utils/datasets/ner/convert_lst20.py +74 -0
  30. stanza/stanza/utils/datasets/ner/convert_mr_l3cube.py +54 -0
  31. stanza/stanza/utils/datasets/ner/convert_nner22.py +70 -0
  32. stanza/stanza/utils/datasets/ner/convert_ontonotes.py +58 -0
  33. stanza/stanza/utils/datasets/ner/json_to_bio.py +43 -0
  34. stanza/stanza/utils/datasets/ner/misc_to_date.py +77 -0
  35. stanza/stanza/utils/datasets/ner/preprocess_wikiner.py +37 -0
  36. stanza/stanza/utils/datasets/ner/simplify_en_worldwide.py +152 -0
  37. stanza/stanza/utils/datasets/ner/simplify_ontonotes_to_worldwide.py +118 -0
  38. stanza/stanza/utils/datasets/ner/split_wikiner.py +104 -0
  39. stanza/stanza/utils/datasets/ner/suc_conll_to_iob.py +72 -0
  40. stanza/stanza/utils/datasets/pos/__init__.py +0 -0
  41. stanza/stanza/utils/datasets/pos/convert_trees_to_pos.py +94 -0
  42. stanza/stanza/utils/datasets/prepare_tokenizer_data.py +151 -0
  43. stanza/stanza/utils/datasets/prepare_tokenizer_treebank.py +1396 -0
  44. stanza/stanza/utils/datasets/pretrain/__init__.py +0 -0
  45. stanza/stanza/utils/datasets/tokenization/__init__.py +0 -0
  46. stanza/stanza/utils/datasets/tokenization/convert_vi_vlsp.py +155 -0
  47. stanza/stanza/utils/ner/spacy_ner_tag_dataset.py +138 -0
  48. stanza/stanza/utils/training/__init__.py +0 -0
  49. stanza/stanza/utils/training/remove_constituency_optimizer.py +77 -0
  50. stanza/stanza/utils/visualization/dependency_visualization.py +108 -0
stanza/stanza/models/classifiers/constituency_classifier.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A classifier that uses a constituency parser for the base embeddings
3
+ """
4
+
5
+ import dataclasses
6
+ import logging
7
+ from types import SimpleNamespace
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ from stanza.models.classifiers.base_classifier import BaseClassifier
14
+ from stanza.models.classifiers.config import ConstituencyConfig
15
+ from stanza.models.classifiers.data import SentimentDatum
16
+ from stanza.models.classifiers.utils import ModelType, build_output_layers
17
+
18
+ from stanza.models.common.utils import split_into_batches, sort_with_indices, unsort
19
+
20
+ logger = logging.getLogger('stanza')
21
+ tlogger = logging.getLogger('stanza.classifiers.trainer')
22
+
23
class ConstituencyClassifier(BaseClassifier):
    """
    A classifier which embeds text with a constituency tree embedding,
    max-pools the per-node embeddings, and classifies with FC layers.
    """
    def __init__(self, tree_embedding, labels, args):
        """
        tree_embedding: module exposing output_size, embed_trees(), get_norms(), get_params()
        labels: list of output label names
        args: namespace of training options (see ConstituencyConfig fields)
        """
        super(ConstituencyClassifier, self).__init__()
        self.labels = labels
        # we build a separate config out of the args so that we can easily save it in torch
        self.config = ConstituencyConfig(fc_shapes = args.fc_shapes,
                                         dropout = args.dropout,
                                         num_classes = len(labels),
                                         constituency_backprop = args.constituency_backprop,
                                         constituency_batch_norm = args.constituency_batch_norm,
                                         constituency_node_attn = args.constituency_node_attn,
                                         constituency_top_layer = args.constituency_top_layer,
                                         constituency_all_words = args.constituency_all_words,
                                         model_type = ModelType.CONSTITUENCY)

        self.tree_embedding = tree_embedding

        self.fc_layers = build_output_layers(self.tree_embedding.output_size, self.config.fc_shapes, self.config.num_classes)
        self.dropout = nn.Dropout(self.config.dropout)

    def is_unsaved_module(self, name):
        # nothing here is skipped at save time; the tree embedding
        # handles its own parameter saving via get_params()
        return False

    def log_configuration(self):
        """Log the settings which affect how the model is built / used."""
        tlogger.info("Backprop into parser: %s", self.config.constituency_backprop)
        tlogger.info("Batch norm: %s", self.config.constituency_batch_norm)
        tlogger.info("Word positions used: %s", "all words" if self.config.constituency_all_words else "start and end words")
        tlogger.info("Attention over nodes: %s", self.config.constituency_node_attn)
        tlogger.info("Intermediate layers: %s", self.config.fc_shapes)

    def log_norms(self):
        """Log the norm of each trainable parameter - useful when debugging training."""
        lines = ["NORMS FOR MODEL PARAMETERS"]
        lines.extend(["tree_embedding." + x for x in self.tree_embedding.get_norms()])
        for name, param in self.named_parameters():
            if param.requires_grad and not name.startswith('tree_embedding.'):
                lines.append("%s %.6g" % (name, torch.norm(param).item()))
        logger.info("\n".join(lines))


    def forward(self, inputs):
        """
        Classify a batch of trees (or SentimentDatum objects holding trees).

        Returns unnormalized class scores, one row per input.
        """
        inputs = [x.constituency if isinstance(x, SentimentDatum) else x for x in inputs]

        embedding = self.tree_embedding.embed_trees(inputs)
        # max-pool over the nodes of each tree to get one vector per tree
        previous_layer = torch.stack([torch.max(x, dim=0)[0] for x in embedding], dim=0)
        previous_layer = self.dropout(previous_layer)
        for fc in self.fc_layers[:-1]:
            # gelu instead of relu: relu caused many neurons to die
            previous_layer = self.dropout(F.gelu(fc(previous_layer)))
        out = self.fc_layers[-1](previous_layer)
        return out

    def get_params(self, skip_modules=True):
        """Return a dict of the state needed to rebuild and reload this model."""
        model_state = self.state_dict()
        # skip all of the constituency parameters here -
        # we will add them by calling the model's get_params()
        skipped = [k for k in model_state.keys() if k.startswith("tree_embedding.")]
        for k in skipped:
            del model_state[k]

        tree_embedding = self.tree_embedding.get_params(skip_modules)

        config = dataclasses.asdict(self.config)
        # the enum doesn't round trip through asdict; save the name instead
        config['model_type'] = config['model_type'].name

        params = {
            'model': model_state,
            'tree_embedding': tree_embedding,
            'config': config,
            'labels': self.labels,
        }
        return params

    def extract_sentences(self, doc):
        """Extract the inputs this model needs (the parse trees) from a Document."""
        return [sentence.constituency for sentence in doc.sentences]
stanza/stanza/models/classifiers/data.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Stanza models classifier data functions."""
2
+
3
+ import collections
4
+ from collections import namedtuple
5
+ import logging
6
+ import json
7
+ import random
8
+ import re
9
+ from typing import List
10
+
11
+ from stanza.models.classifiers.utils import WVType
12
+ from stanza.models.common.vocab import PAD, PAD_ID, UNK, UNK_ID
13
+ import stanza.models.constituency.tree_reader as tree_reader
14
+
15
+ logger = logging.getLogger('stanza')
16
+
17
class SentimentDatum:
    """A single classification example.

    Holds a sentiment label, the tokenized text, and optionally the
    constituency parse of the text.
    """
    def __init__(self, sentiment, text, constituency=None):
        self.sentiment = sentiment
        self.text = text
        self.constituency = constituency

    def __eq__(self, other):
        if self is other:
            return True
        if not isinstance(other, SentimentDatum):
            return False
        return self.sentiment == other.sentiment and self.text == other.text and self.constituency == other.constituency

    def __repr__(self):
        # added for easier debugging of datasets in the REPL / logs
        return "%s(%s)" % (type(self).__name__, self._asdict())

    def __str__(self):
        return str(self._asdict())

    def _asdict(self):
        """Dict view of the datum, mirroring namedtuple._asdict; trees are stringified."""
        if self.constituency is None:
            return {'sentiment': self.sentiment, 'text': self.text}
        else:
            return {'sentiment': self.sentiment, 'text': self.text, 'constituency': str(self.constituency)}
38
+
39
def update_text(sentence: List[str], wordvec_type: WVType) -> List[str]:
    """
    Normalize a whitespace-tokenized sentence into the token forms the
    word vectors expect.
    """
    # the stanford sentiment dataset contains many stray - and / characters;
    # split on them and drop the empty fragments, flattening as we go
    tokens = [piece for token in sentence for piece in token.split("-") if piece]
    tokens = [piece for token in tokens for piece in token.split("/") if piece]
    tokens = [token.strip() for token in tokens]
    tokens = [token for token in tokens if token]
    if not tokens:
        # splitting removed everything - keep a placeholder token
        tokens = ["-"]
    # all of our current word vectors are lowercased
    tokens = [token.lower() for token in tokens]
    if wordvec_type == WVType.WORD2VEC:
        return tokens
    if wordvec_type == WVType.GOOGLE:
        # google vectors replace digits with '#', except for bare 0 and 1
        normalized = []
        for token in tokens:
            if token not in ('0', '1'):
                token = re.sub('[0-9]', '#', token)
            normalized.append(token)
        return normalized
    if wordvec_type == WVType.FASTTEXT:
        return tokens
    if wordvec_type == WVType.OTHER:
        return tokens
    raise ValueError("Unknown wordvec_type {}".format(wordvec_type))
70
+
71
+
72
def read_dataset(dataset, wordvec_type: WVType, min_len: int) -> List[SentimentDatum]:
    """
    Read one or more JSON files into a list of SentimentDatum.

    dataset: a filename, or comma separated list of filenames, each a JSON
      list of {"sentiment": ..., "text": ..., "constituency": ...} objects
      (constituency is optional)
    wordvec_type: passed to update_text to normalize the tokens
    min_len: if truthy, drop items with fewer than min_len tokens
    """
    lines = []
    for filename in str(dataset).split(","):
        with open(filename, encoding="utf-8") as fin:
            new_lines = json.load(fin)
        # sentiment is forced to str so labels compare consistently
        new_lines = [(str(x['sentiment']), x['text'], x.get('constituency', None)) for x in new_lines]
        lines.extend(new_lines)
    # TODO: maybe do this processing later, once the model is built.
    # then move the processing into the model so we can use
    # overloading to potentially make future model types
    lines = [SentimentDatum(x[0], update_text(x[1], wordvec_type), tree_reader.read_trees(x[2])[0] if x[2] else None) for x in lines]
    if min_len:
        lines = [x for x in lines if len(x.text) >= min_len]
    return lines
90
+
91
def dataset_labels(dataset):
    """
    Return the sorted list of label names occurring in the dataset.

    Purely numeric label sets are sorted by numeric value so that,
    for example, "2" sorts before "10".
    """
    label_set = {item.sentiment for item in dataset}
    if all(re.match("^[0-9]+$", label) for label in label_set):
        # integer labels: sort numerically rather than lexically
        return [str(value) for value in sorted(int(label) for label in label_set)]
    return sorted(label_set)
104
+
105
def dataset_vocab(dataset):
    """
    Build a word vocab (list of strings) from the dataset's tokens.

    PAD and UNK are placed first, at the indices PAD_ID and UNK_ID expect.
    NOTE(review): the remaining words come from iterating a set, so their
    order is not deterministic across runs - presumably the vocab is saved
    with any trained model; confirm before relying on index stability.
    """
    vocab = set()
    for line in dataset:
        for word in line.text:
            vocab.add(word)
    vocab = [PAD, UNK] + list(vocab)
    # sanity check that the special tokens landed at their expected ids
    if vocab[PAD_ID] != PAD or vocab[UNK_ID] != UNK:
        raise ValueError("Unexpected values for PAD and UNK!")
    return vocab
114
+
115
def sort_dataset_by_len(dataset, keep_index=False):
    """
    Bucket the dataset by text length.

    Returns an OrderedDict mapping length -> list of items of that
    length, with keys in increasing length order.  If keep_index is
    set, each entry is (item, original_index) instead of item.
    """
    buckets = collections.OrderedDict()
    for length in sorted({len(item.text) for item in dataset}):
        buckets[length] = []
    for original_index, item in enumerate(dataset):
        entry = (item, original_index) if keep_index else item
        buckets[len(item.text)].append(entry)
    return buckets
131
+
132
def shuffle_dataset(sorted_dataset, batch_size, batch_single_item):
    """
    Shuffle a length-bucketed dataset into batches of similar lengths.

    Items are shuffled within each length bucket, then grouped into
    batches of at most batch_size items.  Items whose text length is at
    least batch_single_item (when that limit is positive) each get a
    batch of their own.  The batches are returned in random order.
    """
    flattened = []
    for bucket in sorted_dataset.values():
        shuffled = list(bucket)
        random.shuffle(shuffled)
        flattened.extend(shuffled)

    batches = []
    current = []
    for item in flattened:
        if batch_single_item > 0 and len(item.text) >= batch_single_item:
            # overly long items are trained as singleton batches
            batches.append([item])
            continue
        current.append(item)
        if len(current) >= batch_size:
            batches.append(current)
            current = []
    if current:
        batches.append(current)

    random.shuffle(batches)
    return batches
156
+
157
+
158
def check_labels(labels, dataset):
    """
    Verify every label in the dataset is one the model already knows.

    Strictly speaking, unknown labels could be tolerated by treating the
    model as always wrong on them, but failing fast is a useful sanity
    check that the datasets match.
    """
    unknown = [label for label in dataset_labels(dataset) if label not in labels]
    if unknown:
        raise RuntimeError('Dataset contains labels which the model does not know about:' + str(unknown))
169
+
stanza/stanza/models/coref/bert.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Functions related to BERT or similar models"""
2
+
3
+ import logging
4
+ from typing import List, Tuple
5
+
6
+ import numpy as np # type: ignore
7
+ from transformers import AutoModel, AutoTokenizer # type: ignore
8
+
9
+ from stanza.models.coref.config import Config
10
+ from stanza.models.coref.const import Doc
11
+
12
+
13
+ logger = logging.getLogger('stanza')
14
+
15
def get_subwords_batches(doc: Doc,
                         config: Config,
                         tok: AutoTokenizer
                         ) -> np.ndarray:
    """
    Turns a list of subwords to a list of lists of subword indices
    of max length == batch_size (or shorter, as batch boundaries
    should match sentence boundaries). Each batch is enclosed in cls and sep
    special tokens and padded with the pad token up to the window size.

    Returns:
        batches of bert tokens [n_batches, batch_size]
    """
    batch_size = config.bert_window_size - 2  # to save space for CLS and SEP

    subwords: List[str] = doc["subwords"]
    subwords_batches = []
    start, end = 0, 0

    while end < len(subwords):
        # to prevent the case where a batch_size step forward
        # doesn't capture more than 1 sentence, we will just cut
        # that sequence
        prev_end = end
        end = min(end + batch_size, len(subwords))

        # Move back till we hit a sentence end
        if end < len(subwords):
            sent_id = doc["sent_id"][doc["word_id"][end]]
            while end and doc["sent_id"][doc["word_id"][end - 1]] == sent_id:
                end -= 1

        # this occurs IFF there was no sentence end found throughout
        # the forward scan; this means that our sentence was waay too
        # long (i.e. longer than the max length of the transformer.
        #
        # if so, we give up and just chop the sentence off at the max length
        # that was given
        if end == prev_end:
            end = min(end + batch_size, len(subwords))

        length = end - start
        # some tokenizers have no CLS/SEP tokens; fall back to EOS
        # (use "is None" - identity, not equality - for the None check)
        if tok.cls_token is None or tok.sep_token is None:
            batch = [tok.eos_token] + subwords[start:end] + [tok.eos_token]
        else:
            batch = [tok.cls_token] + subwords[start:end] + [tok.sep_token]

        # Padding to desired length
        batch += [tok.pad_token] * (batch_size - length)

        subwords_batches.append([tok.convert_tokens_to_ids(token)
                                 for token in batch])
        start += length

    return np.array(subwords_batches)
stanza/stanza/models/langid/__init__.py ADDED
File without changes
stanza/stanza/models/langid/data.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ import torch
4
+
5
+
6
class DataLoader:
    """
    Class for loading language id data and providing batches

    Attempt to recreate data pre-processing from: https://github.com/AU-DIS/LSTM_langid

    Uses methods from: https://github.com/AU-DIS/LSTM_langid/blob/main/src/language_datasets.py

    Data format is same as LSTM_langid
    """

    def __init__(self, device=None):
        # all of these are populated by load_data()
        self.batches = None
        self.batches_iter = None
        self.tag_to_idx = None
        self.idx_to_tag = None
        self.lang_weights = None
        self.device = device

    def load_data(self, batch_size, data_files, char_index, tag_index, randomize=False, randomize_range=(5,20),
                  max_length=None):
        """
        Load sequence data and labels, calculate weights for weighted cross entropy loss.
        Data is stored in a file, 1 example per line
        Example: {"text": "Hello world.", "label": "en"}

        batch_size: maximum number of examples per batch
        data_files: iterable of file paths, one JSON example per line
        char_index: map char -> id, must contain "UNK"
        tag_index: map label -> id; a copy is extended with unseen labels
        randomize: split each text into random-length pieces (see randomize_data)
        max_length: optionally truncate each text
        """

        # set up examples from data files
        examples = []
        for data_file in data_files:
            # use a context manager + explicit encoding: the original code
            # left the file handle open and used the platform default encoding
            with open(data_file, encoding="utf-8") as fin:
                examples += [x for x in fin.read().split("\n") if x.strip()]
        random.shuffle(examples)
        examples = [json.loads(x) for x in examples]

        # add additional labels in this data set to tag index
        tag_index = dict(tag_index)
        new_labels = set([x["label"] for x in examples]) - set(tag_index.keys())
        for new_label in new_labels:
            tag_index[new_label] = len(tag_index)
        self.tag_to_idx = tag_index
        self.idx_to_tag = [i[1] for i in sorted([(v,k) for k,v in self.tag_to_idx.items()])]

        # set up lang counts used for weights for cross entropy loss
        lang_counts = [0 for _ in tag_index]

        # optionally limit text to max length
        if max_length is not None:
            examples = [{"text": x["text"][:max_length], "label": x["label"]} for x in examples]

        # randomize data
        if randomize:
            split_examples = []
            for example in examples:
                sequence = example["text"]
                label = example["label"]
                sequences = DataLoader.randomize_data([sequence], upper_lim=randomize_range[1],
                                                      lower_lim=randomize_range[0])
                split_examples += [{"text": seq, "label": label} for seq in sequences]
            examples = split_examples
            random.shuffle(examples)

        # break into equal length batches so no padding is needed
        batch_lengths = {}
        for example in examples:
            sequence = example["text"]
            label = example["label"]
            if len(sequence) not in batch_lengths:
                batch_lengths[len(sequence)] = []
            sequence_as_list = [char_index.get(c, char_index["UNK"]) for c in list(sequence)]
            batch_lengths[len(sequence)].append((sequence_as_list, tag_index[label]))
            lang_counts[tag_index[label]] += 1
        for length in batch_lengths:
            random.shuffle(batch_lengths[length])

        # create final set of batches
        batches = []
        for length in batch_lengths:
            for sublist in [batch_lengths[length][i:i + batch_size] for i in
                            range(0, len(batch_lengths[length]), batch_size)]:
                batches.append(sublist)

        self.batches = [self.build_batch_tensors(batch) for batch in batches]

        # set up lang weights
        most_frequent = max(lang_counts)
        # set to 0.0 if lang_count is 0 or most_frequent/lang_count otherwise
        lang_counts = [(most_frequent * x)/(max(1, x) ** 2) for x in lang_counts]
        self.lang_weights = torch.tensor(lang_counts, device=self.device, dtype=torch.float)

        # shuffle batches to mix up lengths
        random.shuffle(self.batches)
        self.batches_iter = iter(self.batches)

    @staticmethod
    def randomize_data(sentences, upper_lim=20, lower_lim=5):
        """
        Takes the original data and creates random length examples with length between upper limit and lower limit
        From LSTM_langid: https://github.com/AU-DIS/LSTM_langid/blob/main/src/language_datasets.py
        """

        new_data = []
        for sentence in sentences:
            remaining = sentence
            while lower_lim < len(remaining):
                lim = random.randint(lower_lim, upper_lim)
                m = min(len(remaining), lim)
                new_sentence = remaining[:m]
                new_data.append(new_sentence)
                # resume at the next word boundary after the cut
                split = remaining[m:].split(" ", 1)
                if len(split) <= 1:
                    break
                remaining = split[1]
        random.shuffle(new_data)
        return new_data

    def build_batch_tensors(self, batch):
        """
        Helper to turn batches into tensors

        batch: list of (sequence_of_char_ids, label_id) pairs, all the same length
        """

        batch_tensors = dict()
        batch_tensors["sentences"] = torch.tensor([s[0] for s in batch], device=self.device, dtype=torch.long)
        batch_tensors["targets"] = torch.tensor([s[1] for s in batch], device=self.device, dtype=torch.long)

        return batch_tensors

    def next(self):
        """Return the next batch; raises StopIteration when exhausted."""
        return next(self.batches_iter)
134
+
stanza/stanza/models/langid/model.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
class LangIDBiLSTM(nn.Module):
    """
    Multi-layer BiLSTM model for language detection. A recreation of "A reproduction of Apple's bi-directional LSTM models
    for language identification in short strings." (Toftrup et al 2021)

    Arxiv: https://arxiv.org/abs/2102.06282
    GitHub: https://github.com/AU-DIS/LSTM_langid

    This class is similar to https://github.com/AU-DIS/LSTM_langid/blob/main/src/LSTMLID.py
    """

    def __init__(self, char_to_idx, tag_to_idx, num_layers, embedding_dim, hidden_dim, batch_size=64, weights=None,
                 dropout=0.0, lang_subset=None):
        """
        char_to_idx: map char -> embedding index; must contain "<PAD>"
        tag_to_idx: map language tag -> output index
        weights: optional per-language weights for the training loss
        lang_subset: optional collection of tags; predictions are restricted to it
        """
        super(LangIDBiLSTM, self).__init__()
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.char_to_idx = char_to_idx
        self.vocab_size = len(char_to_idx)
        self.tag_to_idx = tag_to_idx
        # invert tag_to_idx into a list indexed by tag id
        self.idx_to_tag = [i[1] for i in sorted([(v,k) for k,v in self.tag_to_idx.items()])]
        self.lang_subset = lang_subset
        self.padding_idx = char_to_idx["<PAD>"]
        self.tagset_size = len(tag_to_idx)
        self.batch_size = batch_size
        self.loss_train = nn.CrossEntropyLoss(weight=weights)
        self.dropout_prob = dropout

        # embeddings for chars
        self.char_embeds = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embedding_dim,
            padding_idx=self.padding_idx
        )

        # the bidirectional LSTM
        self.lstm = nn.LSTM(
            self.embedding_dim,
            self.hidden_dim,
            num_layers=self.num_layers,
            bidirectional=True,
            batch_first=True
        )

        # convert output to tag space
        self.hidden_to_tag = nn.Linear(
            self.hidden_dim * 2,
            self.tagset_size
        )

        # dropout layer
        self.dropout = nn.Dropout(p=self.dropout_prob)

    def build_lang_mask(self, device):
        """
        Build language mask if a lang subset is specified (e.g. ["en", "fr"])

        The mask will be added to the results to set the prediction scores of illegal languages to -inf

        NOTE(review): lang_mask is a plain attribute, not a registered buffer,
        so Module.to(device) will not move it - this must be called with the
        final device, as load() does.
        """
        if self.lang_subset:
            lang_mask_list = [0.0 if lang in self.lang_subset else -float('inf') for lang in self.idx_to_tag]
            self.lang_mask = torch.tensor(lang_mask_list, device=device, dtype=torch.float)
        else:
            self.lang_mask = torch.zeros(len(self.idx_to_tag), device=device, dtype=torch.float)

    def loss(self, Y_hat, Y):
        # weighted cross entropy over the per-language scores
        return self.loss_train(Y_hat, Y)

    def forward(self, x):
        # embed input
        x = self.char_embeds(x)

        # run through LSTM
        x, _ = self.lstm(x)

        # run through linear layer
        x = self.hidden_to_tag(x)

        # sum character outputs for each sequence
        x = torch.sum(x, dim=1)

        return x

    def prediction_scores(self, x):
        """Return the predicted tag index for each sequence in the batch"""
        # these are unnormalized scores rather than probabilities,
        # but the argmax is the same either way
        prediction_probs = self(x)
        if self.lang_subset:
            prediction_batch_size = prediction_probs.size()[0]
            # adding -inf removes languages outside the subset from contention
            batch_mask = torch.stack([self.lang_mask for _ in range(prediction_batch_size)])
            prediction_probs = prediction_probs + batch_mask
        return torch.argmax(prediction_probs, dim=1)

    def save(self, path):
        """ Save a model at path """
        # save the constructor arguments alongside the weights so the
        # model can be rebuilt in load() without external config
        checkpoint = {
            "char_to_idx": self.char_to_idx,
            "tag_to_idx": self.tag_to_idx,
            "num_layers": self.num_layers,
            "embedding_dim": self.embedding_dim,
            "hidden_dim": self.hidden_dim,
            "model_state_dict": self.state_dict()
        }
        torch.save(checkpoint, path)

    @classmethod
    def load(cls, path, device=None, batch_size=64, lang_subset=None):
        """ Load a serialized model located at path """
        if path is None:
            raise FileNotFoundError("Trying to load langid model, but path not specified! Try --load_name")
        if not os.path.exists(path):
            raise FileNotFoundError("Trying to load langid model from path which does not exist: %s" % path)
        # load on CPU first, then move the whole model to the target device
        checkpoint = torch.load(path, map_location=torch.device("cpu"), weights_only=True)
        # the loss weights were serialized as part of the state dict
        weights = checkpoint["model_state_dict"]["loss_train.weight"]
        model = cls(checkpoint["char_to_idx"], checkpoint["tag_to_idx"], checkpoint["num_layers"],
                    checkpoint["embedding_dim"], checkpoint["hidden_dim"], batch_size=batch_size, weights=weights,
                    lang_subset=lang_subset)
        model.load_state_dict(checkpoint["model_state_dict"])
        model = model.to(device)
        # rebuild the mask on the final device (see build_lang_mask)
        model.build_lang_mask(device)
        return model
+
stanza/stanza/models/lemma_classifier/__init__.py ADDED
File without changes
stanza/stanza/models/ner/model.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pack_sequence, pad_sequence, PackedSequence
9
+
10
+ from stanza.models.common.data import map_to_ids, get_long_tensor
11
+ from stanza.models.common.exceptions import ForwardCharlmNotFoundError, BackwardCharlmNotFoundError
12
+ from stanza.models.common.packed_lstm import PackedLSTM
13
+ from stanza.models.common.dropout import WordDropout, LockedDropout
14
+ from stanza.models.common.char_model import CharacterModel, CharacterLanguageModel
15
+ from stanza.models.common.crf import CRFLoss
16
+ from stanza.models.common.foundation_cache import load_bert
17
+ from stanza.models.common.utils import attach_bert_model
18
+ from stanza.models.common.vocab import PAD_ID, UNK_ID, EMPTY_ID
19
+ from stanza.models.common.bert_embedding import extract_bert_embeddings
20
+
21
+ logger = logging.getLogger('stanza')
22
+
23
+ # this gets created in two places in trainer
24
+ # in both places, pass in the bert model & tokenizer
25
+ class NERTagger(nn.Module):
26
+ def __init__(self, args, vocab, emb_matrix=None, foundation_cache=None, bert_model=None, bert_tokenizer=None, force_bert_saved=False, peft_name=None):
27
+ super().__init__()
28
+
29
+ self.vocab = vocab
30
+ self.args = args
31
+ self.unsaved_modules = []
32
+
33
+ # input layers
34
+ input_size = 0
35
+ if self.args['word_emb_dim'] > 0:
36
+ emb_finetune = self.args.get('emb_finetune', True)
37
+
38
+ # load pretrained embeddings if specified
39
+ word_emb = nn.Embedding(len(self.vocab['word']), self.args['word_emb_dim'], PAD_ID)
40
+ # if a model trained with no 'delta' vocab is loaded, and
41
+ # emb_finetune is off, any resaving of the model will need
42
+ # the updated vectors. this is accounted for in load()
43
+ if not emb_finetune or 'delta' in self.vocab:
44
+ # if emb_finetune is off
45
+ # or if the delta embedding is present
46
+ # then we won't fine tune the original embedding
47
+ self.add_unsaved_module('word_emb', word_emb)
48
+ self.word_emb.weight.detach_()
49
+ else:
50
+ self.word_emb = word_emb
51
+ if emb_matrix is not None:
52
+ self.init_emb(emb_matrix)
53
+
54
+ # TODO: allow for expansion of delta embedding if new
55
+ # training data has new words in it?
56
+ self.delta_emb = None
57
+ if 'delta' in self.vocab:
58
+ # zero inits seems to work better
59
+ # note that the gradient will flow to the bottom and then adjust the 0 weights
60
+ # as opposed to a 0 matrix cutting off the gradient if higher up in the model
61
+ self.delta_emb = nn.Embedding(len(self.vocab['delta']), self.args['word_emb_dim'], PAD_ID)
62
+ nn.init.zeros_(self.delta_emb.weight)
63
+ # if the model was trained with a delta embedding, but emb_finetune is off now,
64
+ # then we will detach the delta embedding
65
+ if not emb_finetune:
66
+ self.delta_emb.weight.detach_()
67
+
68
+ input_size += self.args['word_emb_dim']
69
+
70
+ self.peft_name = peft_name
71
+ attach_bert_model(self, bert_model, bert_tokenizer, self.args.get('use_peft', False), force_bert_saved)
72
+ if self.args.get('bert_model', None):
73
+ # TODO: refactor bert_hidden_layers between the different models
74
+ if args.get('bert_hidden_layers', False):
75
+ # The average will be offset by 1/N so that the default zeros
76
+ # represents an average of the N layers
77
+ self.bert_layer_mix = nn.Linear(args['bert_hidden_layers'], 1, bias=False)
78
+ nn.init.zeros_(self.bert_layer_mix.weight)
79
+ else:
80
+ # an average of layers 2, 3, 4 will be used
81
+ # (for historic reasons)
82
+ self.bert_layer_mix = None
83
+ input_size += self.bert_model.config.hidden_size
84
+
85
+ if self.args['char'] and self.args['char_emb_dim'] > 0:
86
+ if self.args['charlm']:
87
+ if args['charlm_forward_file'] is None or not os.path.exists(args['charlm_forward_file']):
88
+ raise ForwardCharlmNotFoundError('Could not find forward character model: {} Please specify with --charlm_forward_file'.format(args['charlm_forward_file']), args['charlm_forward_file'])
89
+ if args['charlm_backward_file'] is None or not os.path.exists(args['charlm_backward_file']):
90
+ raise BackwardCharlmNotFoundError('Could not find backward character model: {} Please specify with --charlm_backward_file'.format(args['charlm_backward_file']), args['charlm_backward_file'])
91
+ self.add_unsaved_module('charmodel_forward', CharacterLanguageModel.load(args['charlm_forward_file'], finetune=False))
92
+ self.add_unsaved_module('charmodel_backward', CharacterLanguageModel.load(args['charlm_backward_file'], finetune=False))
93
+ input_size += self.charmodel_forward.hidden_dim() + self.charmodel_backward.hidden_dim()
94
+ else:
95
+ self.charmodel = CharacterModel(args, vocab, bidirectional=True, attention=False)
96
+ input_size += self.args['char_hidden_dim'] * 2
97
+
98
+ # optionally add a input transformation layer
99
+ if self.args.get('input_transform', False):
100
+ self.input_transform = nn.Linear(input_size, input_size)
101
+ else:
102
+ self.input_transform = None
103
+
104
+ # recurrent layers
105
+ self.taggerlstm = PackedLSTM(input_size, self.args['hidden_dim'], self.args['num_layers'], batch_first=True, \
106
+ bidirectional=True, dropout=0 if self.args['num_layers'] == 1 else self.args['dropout'])
107
+ # self.drop_replacement = nn.Parameter(torch.randn(input_size) / np.sqrt(input_size))
108
+ self.drop_replacement = None
109
+ self.taggerlstm_h_init = nn.Parameter(torch.zeros(2 * self.args['num_layers'], 1, self.args['hidden_dim']), requires_grad=False)
110
+ self.taggerlstm_c_init = nn.Parameter(torch.zeros(2 * self.args['num_layers'], 1, self.args['hidden_dim']), requires_grad=False)
111
+
112
+ # tag classifier
113
+ tag_lengths = self.vocab['tag'].lens()
114
+ self.num_output_layers = len(tag_lengths)
115
+ if self.args.get('connect_output_layers'):
116
+ tag_clfs = [nn.Linear(self.args['hidden_dim']*2, tag_lengths[0])]
117
+ for prev_length, next_length in zip(tag_lengths[:-1], tag_lengths[1:]):
118
+ tag_clfs.append(nn.Linear(self.args['hidden_dim']*2 + prev_length, next_length))
119
+ self.tag_clfs = nn.ModuleList(tag_clfs)
120
+ else:
121
+ self.tag_clfs = nn.ModuleList([nn.Linear(self.args['hidden_dim']*2, num_tag) for num_tag in tag_lengths])
122
+ for tag_clf in self.tag_clfs:
123
+ tag_clf.bias.data.zero_()
124
+ self.crits = nn.ModuleList([CRFLoss(num_tag) for num_tag in tag_lengths])
125
+
126
+ self.drop = nn.Dropout(args['dropout'])
127
+ self.worddrop = WordDropout(args['word_dropout'])
128
+ self.lockeddrop = LockedDropout(args['locked_dropout'])
129
+
130
+ def init_emb(self, emb_matrix):
131
+ if isinstance(emb_matrix, np.ndarray):
132
+ emb_matrix = torch.from_numpy(emb_matrix)
133
+ vocab_size = len(self.vocab['word'])
134
+ dim = self.args['word_emb_dim']
135
+ assert emb_matrix.size() == (vocab_size, dim), \
136
+ "Input embedding matrix must match size: {} x {}, found {}".format(vocab_size, dim, emb_matrix.size())
137
+ self.word_emb.weight.data.copy_(emb_matrix)
138
+
139
+ def add_unsaved_module(self, name, module):
140
+ self.unsaved_modules += [name]
141
+ setattr(self, name, module)
142
+
143
+ def log_norms(self):
144
+ lines = ["NORMS FOR MODEL PARAMTERS"]
145
+ for name, param in self.named_parameters():
146
+ if param.requires_grad and name.split(".")[0] not in ('charmodel_forward', 'charmodel_backward'):
147
+ lines.append(" %s %.6g" % (name, torch.norm(param).item()))
148
+ logger.info("\n".join(lines))
149
+
150
    def forward(self, sentences, wordchars, wordchars_mask, tags, word_orig_idx, sentlens, wordlens, chars, charoffsets, charlens, char_orig_idx):
        """Compute the tagging loss and per-tag logits for one batch.

        Builds the per-word input by concatenating every enabled feature
        source (static word embeddings plus optional additive delta
        embeddings, transformer embeddings, character representations),
        runs the bidirectional tagger LSTM, and applies one linear
        classifier + CRF per output tag vocabulary.

        Returns (loss, logits, trans): summed CRF loss, a list of padded
        logit tensors (one per classifier), and the matching list of CRF
        transition scores.

        NOTE(review): `word_mask` and the `pad` helper both depend on the
        word-embedding branch having run (word_emb_dim > 0) -- confirm the
        model is never configured without word embeddings.
        """
        device = next(self.parameters()).device

        # pack along the batch using the per-sentence lengths
        def pack(x):
            return pack_padded_sequence(x, sentlens, batch_first=True)

        inputs = []
        batch_size = len(sentences)

        if self.args['word_emb_dim'] > 0:
            #extract static embeddings
            static_words, word_mask = self.extract_static_embeddings(self.args, sentences, self.vocab['word'])

            word_mask = word_mask.to(device)
            static_words = static_words.to(device)

            word_static_emb = self.word_emb(static_words)

            if 'delta' in self.vocab and self.delta_emb is not None:
                # masks should be the same
                delta_words, _ = self.extract_static_embeddings(self.args, sentences, self.vocab['delta'])
                delta_words = delta_words.to(device)
                # unclear whether to treat words in the main embedding
                # but not in delta as unknown
                # simple heuristic though - treating them as not
                # unknown keeps existing models the same when
                # separating models into the base WV and delta WV
                # also, note that at training time, words like this
                # did not show up in the training data, but are
                # not exactly UNK, so it makes sense
                delta_unk_mask = torch.eq(delta_words, UNK_ID)
                static_unk_mask = torch.not_equal(static_words, UNK_ID)
                # UNK in delta but known in the base vocab -> remap to PAD
                unk_mask = delta_unk_mask * static_unk_mask
                delta_words[unk_mask] = PAD_ID

                # delta embeddings are additive on top of the static ones
                delta_emb = self.delta_emb(delta_words)
                word_static_emb = word_static_emb + delta_emb

            word_emb = pack(word_static_emb)
            inputs += [word_emb]

        if self.bert_model is not None:
            device = next(self.parameters()).device
            processed_bert = extract_bert_embeddings(self.args['bert_model'], self.bert_tokenizer, self.bert_model, sentences, device, keep_endpoints=False,
                                                     num_layers=self.bert_layer_mix.in_features if self.bert_layer_mix is not None else None,
                                                     detach=not self.args.get('bert_finetune', False),
                                                     peft_name=self.peft_name)
            if self.bert_layer_mix is not None:
                # use a linear layer to weighted average the embedding dynamically
                processed_bert = [self.bert_layer_mix(feature).squeeze(2) + feature.sum(axis=2) / self.bert_layer_mix.in_features for feature in processed_bert]

            processed_bert = pad_sequence(processed_bert, batch_first=True)
            inputs += [pack(processed_bert)]

        # unpad by reusing the batch sizes of the packed word embeddings
        def pad(x):
            return pad_packed_sequence(PackedSequence(x, word_emb.batch_sizes), batch_first=True)[0]

        if self.args['char'] and self.args['char_emb_dim'] > 0:
            if self.args.get('charlm', None):
                # pretrained character language models, one per direction
                char_reps_forward = self.charmodel_forward.get_representation(chars[0], charoffsets[0], charlens, char_orig_idx)
                char_reps_forward = PackedSequence(char_reps_forward.data, char_reps_forward.batch_sizes)
                char_reps_backward = self.charmodel_backward.get_representation(chars[1], charoffsets[1], charlens, char_orig_idx)
                char_reps_backward = PackedSequence(char_reps_backward.data, char_reps_backward.batch_sizes)
                inputs += [char_reps_forward, char_reps_backward]
            else:
                # character model trained jointly with the tagger
                char_reps = self.charmodel(wordchars, wordchars_mask, word_orig_idx, sentlens, wordlens)
                char_reps = PackedSequence(char_reps.data, char_reps.batch_sizes)
                inputs += [char_reps]

        # concatenate all feature sources, then run the dropout stack;
        # locked dropout operates on padded tensors, hence the pad/pack dance
        lstm_inputs = torch.cat([x.data for x in inputs], 1)
        if self.args['word_dropout'] > 0:
            lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
        lstm_inputs = self.drop(lstm_inputs)
        lstm_inputs = pad(lstm_inputs)
        lstm_inputs = self.lockeddrop(lstm_inputs)
        lstm_inputs = pack(lstm_inputs).data

        if self.input_transform:
            lstm_inputs = self.input_transform(lstm_inputs)

        lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)
        lstm_outputs, _ = self.taggerlstm(lstm_inputs, sentlens, hx=(\
                self.taggerlstm_h_init.expand(2 * self.args['num_layers'], batch_size, self.args['hidden_dim']).contiguous(), \
                self.taggerlstm_c_init.expand(2 * self.args['num_layers'], batch_size, self.args['hidden_dim']).contiguous()))
        lstm_outputs = lstm_outputs.data


        # prediction layer
        lstm_outputs = self.drop(lstm_outputs)
        lstm_outputs = pad(lstm_outputs)
        lstm_outputs = self.lockeddrop(lstm_outputs)
        lstm_outputs = pack(lstm_outputs).data

        loss = 0
        logits = []
        trans = []
        for idx, (tag_clf, crit) in enumerate(zip(self.tag_clfs, self.crits)):
            if not self.args.get('connect_output_layers') or idx == 0:
                next_logits = pad(tag_clf(lstm_outputs)).contiguous()
            else:
                # here we pack the output of the previous round, then append it
                packed_logits = pack(next_logits).data
                input_logits = torch.cat([lstm_outputs, packed_logits], axis=1)
                next_logits = pad(tag_clf(input_logits)).contiguous()
            # the tag_mask lets us avoid backprop on a blank tag
            tag_mask = torch.eq(tags[:, :, idx], EMPTY_ID)
            next_loss, next_trans = crit(next_logits, torch.bitwise_or(tag_mask, word_mask), tags[:, :, idx])
            loss = loss + next_loss
            logits.append(next_logits)
            trans.append(next_trans)

        return loss, logits, trans
262
+
263
+ @staticmethod
264
+ def extract_static_embeddings(args, sents, vocab):
265
+ processed = []
266
+ if args.get('lowercase', True): # handle word case
267
+ case = lambda x: x.lower()
268
+ else:
269
+ case = lambda x: x
270
+ for idx, sent in enumerate(sents):
271
+ processed_sent = [vocab.map([case(w) for w in sent])]
272
+ processed.append(processed_sent[0])
273
+
274
+ words = get_long_tensor(processed, len(sents))
275
+ words_mask = torch.eq(words, PAD_ID)
276
+
277
+ return words, words_mask
278
+
stanza/stanza/models/pos/scorer.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utils and wrappers for scoring taggers.
3
+ """
4
+ import logging
5
+
6
+ from stanza.models.common.utils import ud_scores
7
+
8
+ logger = logging.getLogger('stanza')
9
+
10
def score(system_conllu_file, gold_conllu_file, verbose=True, eval_type='AllTags'):
    """Score a system CoNLL-U file against gold with the UD scorer.

    Returns (precision, recall, f1) for `eval_type`; when verbose, also
    logs a tab-separated table of UPOS/XPOS/UFeats/AllTags F1 scores.
    """
    evaluation = ud_scores(gold_conllu_file, system_conllu_file)
    result = evaluation[eval_type]
    precision, recall, f1 = result.precision, result.recall, result.f1
    if verbose:
        columns = ('UPOS', 'XPOS', 'UFeats', 'AllTags')
        scores = [evaluation[column].f1 * 100 for column in columns]
        logger.info("UPOS\tXPOS\tUFeats\tAllTags")
        logger.info("{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}".format(*scores))
    return precision, recall, f1
22
+
stanza/stanza/models/pos/vocab.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter, OrderedDict
2
+
3
+ from stanza.models.common.vocab import BaseVocab, BaseMultiVocab, CharVocab
4
+ from stanza.models.common.vocab import CompositeVocab, VOCAB_PREFIX, EMPTY, EMPTY_ID
5
+
6
class WordVocab(BaseVocab):
    """Word-level vocabulary with a configurable set of ignored units.

    Units listed in `ignore` are collapsed onto the EMPTY id both when
    mapping units to ids and when building the vocabulary; EMPTY is then
    rendered back as '_'.
    """
    def __init__(self, data=None, lang="", idx=0, cutoff=0, lower=False, ignore=None):
        # ignore must be in place before super().__init__ builds the vocab
        self.ignore = [] if ignore is None else ignore
        super().__init__(data, lang=lang, idx=idx, cutoff=cutoff, lower=lower)
        self.state_attrs += ['ignore']

    def id2unit(self, id):
        # ignored units were folded into EMPTY, so show EMPTY as '_'
        if len(self.ignore) > 0 and id == EMPTY_ID:
            return '_'
        return super().id2unit(id)

    def unit2id(self, unit):
        if len(self.ignore) > 0 and unit in self.ignore:
            return self._unit2id[EMPTY]
        return super().unit2id(unit)

    def build_vocab(self):
        if self.lower:
            units = (w[self.idx].lower() for sent in self.data for w in sent)
        else:
            units = (w[self.idx] for sent in self.data for w in sent)
        counter = Counter(units)
        # drop rare units and anything explicitly ignored
        for unit in [u for u in counter if counter[u] < self.cutoff or u in self.ignore]:
            del counter[unit]

        # most frequent first; the stable sort keeps first-seen order for ties
        self._id2unit = VOCAB_PREFIX + sorted(counter, key=counter.get, reverse=True)
        self._unit2id = {w: i for i, w in enumerate(self._id2unit)}

    def __str__(self):
        return "<{}: {}>".format(type(self), ",".join("|%s|" % x for x in self._id2unit))
38
+
39
class XPOSVocab(CompositeVocab):
    """Composite vocabulary for language-specific XPOS tags.

    Defaults to sep="" and keyed=False; the splitting semantics live in
    CompositeVocab -- presumably each character of the tag becomes one
    positionally-indexed sub-vocabulary (confirm there).
    """
    def __init__(self, data=None, lang="", idx=0, sep="", keyed=False):
        super().__init__(data, lang, idx=idx, sep=sep, keyed=keyed)
42
+
43
class FeatureVocab(CompositeVocab):
    """Composite vocabulary for morphological feature strings.

    Defaults to sep="|" and keyed=True, matching UFeats syntax such as
    "Case=Nom|Number=Sing"; splitting/keying is implemented in
    CompositeVocab -- presumably parts are indexed by the key before '='
    (confirm there).
    """
    def __init__(self, data=None, lang="", idx=0, sep="|", keyed=True):
        super().__init__(data, lang, idx=idx, sep=sep, keyed=keyed)
46
+
47
class MultiVocab(BaseMultiVocab):
    """Collection of named vocabs that records each vocab's class on save,
    so the right class can be reconstructed at load time."""

    def state_dict(self):
        """ Also save a vocab name to class name mapping in state dict. """
        state = OrderedDict()
        key2class = OrderedDict()
        for k, v in self._vocabs.items():
            state[k] = v.state_dict()
            key2class[k] = type(v).__name__
        state['_key2class'] = key2class
        return state

    @classmethod
    def load_state_dict(cls, state_dict):
        """Rebuild a MultiVocab, reconstructing each vocab with its saved class.

        Fix: no longer pops '_key2class' out of the caller's dict -- the
        input state_dict is left unmodified.
        """
        class_dict = {'CharVocab': CharVocab,
                      'WordVocab': WordVocab,
                      'XPOSVocab': XPOSVocab,
                      'FeatureVocab': FeatureVocab}
        assert '_key2class' in state_dict, "Cannot find class name mapping in state dict!"
        key2class = state_dict['_key2class']
        new = cls()
        for k, v in state_dict.items():
            if k == '_key2class':
                continue
            classname = key2class[k]
            new[k] = class_dict[classname].load_state_dict(v)
        return new
70
+ return new
71
+
stanza/stanza/pipeline/demo/stanza-brat.js ADDED
@@ -0,0 +1,1316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Takes Stanford CoreNLP JSON output (var data = ... in data.js)
2
+ // and uses brat to render everything.
3
+
4
+ //var serverAddress = 'http://localhost:5000';
5
+
6
+ // Load Brat libraries
7
+ var bratLocation = 'https://nlp.stanford.edu/js/brat/';
8
+ head.js(
9
+ // External libraries
10
+ bratLocation + '/client/lib/jquery.svg.min.js',
11
+ bratLocation + '/client/lib/jquery.svgdom.min.js',
12
+
13
+ // brat helper modules
14
+ bratLocation + '/client/src/configuration.js',
15
+ bratLocation + '/client/src/util.js',
16
+ bratLocation + '/client/src/annotation_log.js',
17
+ bratLocation + '/client/lib/webfont.js',
18
+
19
+ // brat modules
20
+ bratLocation + '/client/src/dispatcher.js',
21
+ bratLocation + '/client/src/url_monitor.js',
22
+ bratLocation + '/client/src/visualizer.js',
23
+
24
+ // parse viewer
25
+ './stanza-parseviewer.js'
26
+ );
27
+
28
+ // Uses Dagre (https://github.com/cpettitt/dagre) for constituency parse
29
+ // visualization. It works better than the brat visualization.
30
var useDagre = true;  // prefer the Dagre renderer over brat for constituency parses
var currentQuery = 'The quick brown fox jumped over the lazy dog.';  // default demo input
var currentSentences = '';  // sentences of the last rendered annotation (assigned in render)
var currentText = '';  // reconstructed text of the last rendered annotation (assigned in render)
34
+
35
+ // ----------------------------------------------------------------------------
36
+ // HELPERS
37
+ // ----------------------------------------------------------------------------
38
+
39
/**
 * Polyfill String.prototype.startsWith for browsers that lack it.
 */
if (typeof String.prototype.startsWith !== 'function') {
  String.prototype.startsWith = function (prefix) {
    // prefix at position 0 <=> the string starts with it
    return this.indexOf(prefix) === 0;
  };
}
48
+
49
// True when `value` parses as a number whose float value survives 32-bit
// integer truncation unchanged (the `| 0` trick), i.e. a small integer.
function isInt(value) {
  if (isNaN(value)) {
    return false;
  }
  var parsed = parseFloat(value);
  return (parsed | 0) === parsed;
}
52
+
53
/**
 * A reverse map of PTB escape tokens to their original gloss,
 * e.g. '-LRB-' back to '(' and PTB-style quote tokens back to '"'.
 */
var tokensMap = {
  '-LRB-': '(',
  '-RRB-': ')',
  '-LSB-': '[',
  '-RSB-': ']',
  '-LCB-': '{',
  '-RCB-': '}',
  '``': '"',
  '\'\'': '"',
};
66
+
67
/**
 * Map a PTB part-of-speech tag to its visualization color.
 * Rules are tested in priority order; the first match wins, and
 * anything unmatched (or a null tag) falls back to grey.
 */
function posColor(posTag) {
  if (posTag === null) {
    return '#E3E3E3';
  }
  var rules = [
    [function (t) { return t.startsWith('N'); }, '#A4BCED'],
    [function (t) { return t.startsWith('V') || t.startsWith('M'); }, '#ADF6A2'],
    [function (t) { return t.startsWith('P'); }, '#CCDAF6'],
    [function (t) { return t.startsWith('I'); }, '#FFE8BE'],
    [function (t) { return t.startsWith('R') || t.startsWith('W'); }, '#FFFDA8'],
    [function (t) { return t.startsWith('D') || t === 'CD'; }, '#CCADF6'],
    [function (t) { return t.startsWith('J'); }, '#FFFDA8'],
    [function (t) { return t.startsWith('T'); }, '#FFE8BE'],
    [function (t) { return t.startsWith('E') || t.startsWith('S'); }, '#E4CBF6'],
    [function (t) { return t.startsWith('CC'); }, '#FFFFFF'],
    [function (t) { return t === 'LS' || t === 'FW'; }, '#FFFFFF']
  ];
  for (var i = 0; i < rules.length; i++) {
    if (rules[i][0](posTag)) {
      return rules[i][1];
    }
  }
  return '#E3E3E3';
}
100
+
101
/**
 * Map a Universal POS tag to its visualization color.
 * Rules are tested in priority order; the first match wins, and
 * anything unmatched (or a null tag) falls back to grey.
 */
function uposColor(posTag) {
  if (posTag === null) {
    return '#E3E3E3';
  }
  var rules = [
    [function (t) { return t === 'NOUN' || t === 'PROPN'; }, '#A4BCED'],
    [function (t) { return t.startsWith('V') || t === 'AUX'; }, '#ADF6A2'],
    [function (t) { return t === 'PART'; }, '#CCDAF6'],
    [function (t) { return t === 'ADP'; }, '#FFE8BE'],
    [function (t) { return t === 'ADV' || t.startsWith('PRON'); }, '#FFFDA8'],
    [function (t) { return t === 'NUM' || t === 'DET'; }, '#CCADF6'],
    [function (t) { return t === 'ADJ'; }, '#FFFDA8'],
    [function (t) { return t.startsWith('E') || t.startsWith('S'); }, '#E4CBF6'],
    [function (t) { return t.startsWith('CC'); }, '#FFFFFF'],
    [function (t) { return t === 'X' || t === 'FW'; }, '#FFFFFF']
  ];
  for (var i = 0; i < rules.length; i++) {
    if (rules[i][0](posTag)) {
      return rules[i][1];
    }
  }
  return '#E3E3E3';
}
132
+
133
/**
 * Map a named-entity tag to its visualization color.
 * Fix: the LOC branch used loose equality (==) where every other
 * branch uses strict equality; made consistent with ===.
 */
function nerColor(nerTag) {
  if (nerTag === null) {
    return '#E3E3E3';
  } else if (nerTag === 'PERSON' || nerTag === 'PER') {
    return '#FFCCAA';
  } else if (nerTag === 'ORGANIZATION' || nerTag === 'ORG') {
    return '#8FB2FF';
  } else if (nerTag === 'MISC') {
    return '#F1F447';
  } else if (nerTag === 'LOCATION' || nerTag === 'LOC') {
    return '#95DFFF';
  } else if (nerTag === 'DATE' || nerTag === 'TIME' || nerTag === 'SET') {
    return '#9AFFE6';
  } else if (nerTag === 'MONEY') {
    return '#FFFFFF';
  } else if (nerTag === 'PERCENT') {
    return '#FFA22B';
  } else {
    return '#E3E3E3';
  }
}
158
+
159
+
160
/**
 * Map a sentiment label to its visualization color; unknown labels
 * fall back to grey.
 */
function sentimentColor(sentiment) {
  var colorBySentiment = {
    'VERY POSITIVE': '#00FF00',
    'POSITIVE': '#7FFF00',
    'NEUTRAL': '#FFFF00',
    'NEGATIVE': '#FF7F00',
    'VERY NEGATIVE': '#FF0000'
  };
  // own-property check so prototype keys (e.g. "toString") still
  // fall through to the default, matching the original if-chain
  if (Object.prototype.hasOwnProperty.call(colorBySentiment, sentiment)) {
    return colorBySentiment[sentiment];
  }
  return '#E3E3E3';
}
179
+
180
+
181
/**
 * Get a list of annotators, from the annotator option input.
 * Always starts with "tokenize,ssplit", then appends the value of every
 * selected option of the #annotators element, comma-separated.
 */
function annotators() {
  var annotators = "tokenize,ssplit";
  $('#annotators').find('option:selected').each(function () {
    annotators += "," + $(this).val();
  });
  return annotators;
}
191
+
192
/**
 * Current local time formatted as YYYY-MM-DDThh:mm:ss.
 */
function date() {
  // zero-pad a field to two digits
  var pad2 = function (n) {
    return n < 10 ? '0' + n : n;
  };
  var now = new Date();
  return now.getFullYear() +
    '-' + pad2(now.getMonth() + 1) +
    '-' + pad2(now.getDate()) +
    'T' + pad2(now.getHours()) +
    ':' + pad2(now.getMinutes()) +
    ':' + pad2(now.getSeconds());
}
208
+
209
+
210
+ //-----------------------------------------------------------------------------
211
+ // Constituency parser
212
+ //-----------------------------------------------------------------------------
213
// Converts bracketed PTB-style parse strings ("(S (NP ...) ...)") into
// linked tree objects whose nodes carry token index ranges, so they can
// be correlated with the tokens of the annotated sentence.
function ConstituencyParseProcessor() {
  // Recursively consume the token stream, building nested arrays: one
  // array per parenthesized group. `list` is the group being accumulated;
  // note `input` is mutated (shift) across the recursive calls.
  var parenthesize = function (input, list) {
    if (list === undefined) {
      return parenthesize(input, []);
    } else {
      var token = input.shift();
      if (token === undefined) {
        return list.pop();
      } else if (token === "(") {
        list.push(parenthesize(input, []));
        return parenthesize(input, list);
      } else if (token === ")") {
        return list;
      } else {
        return parenthesize(input, list.concat(token));
      }
    }
  };

  // Turn nested arrays into tree nodes: [label, "word"] becomes a
  // terminal; longer arrays become internal nodes whose children get a
  // backlink to their parent.
  var toTree = function (list) {
    if (list.length === 2 && typeof list[1] === 'string') {
      return {label: list[0], text: list[1], isTerminal: true};
    } else if (list.length >= 2) {
      var label = list.shift();
      var node = {label: label};
      var rest = list.map(function (x) {
        var t = toTree(x);
        if (typeof t === 'object') {
          t.parent = node;
        }
        return t;
      });
      node.children = rest;
      return node;
    } else {
      return list;
    }
  };

  // Depth-first walk assigning each node its covered token span
  // [tokenStart, tokenEnd); terminals also get the token object itself.
  // Returns the index just past the last token consumed.
  var indexTree = function (tree, tokens, index) {
    index = index || 0;
    if (tree.isTerminal) {
      tree.token = tokens[index];
      tree.tokenIndex = index;
      tree.tokenStart = index;
      tree.tokenEnd = index + 1;
      return index + 1;
    } else if (tree.children) {
      tree.tokenStart = index;
      for (var i = 0; i < tree.children.length; i++) {
        var child = tree.children[i];
        index = indexTree(child, tokens, index);
      }
      tree.tokenEnd = index;
    }
    return index;
  };

  // Split a parse string into "(", ")" and label/word tokens, protecting
  // spaces inside double-quoted segments with a placeholder so they
  // survive the whitespace split.
  var tokenize = function (input) {
    return input.split('"')
      .map(function (x, i) {
        if (i % 2 === 0) { // not in string
          return x.replace(/\(/g, ' ( ')
            .replace(/\)/g, ' ) ');
        } else { // in string
          return x.replace(/ /g, "!whitespace!");
        }
      })
      .join('"')
      .trim()
      .split(/\s+/)
      .map(function (x) {
        return x.replace(/!whitespace!/g, " ");
      });
  };

  // Full pipeline: parse string -> token stream -> nested arrays ->
  // indexed tree. Returns undefined if the input did not parse to a list.
  var convertParseStringToTree = function (input, tokens) {
    var p = parenthesize(tokenize(input));
    if (Array.isArray(p)) {
      var tree = toTree(p);
      // Correlate tree with tokens
      indexTree(tree, tokens);
      return tree;
    }
  };

  // Attach a parseTree to every sentence of the annotation that carries
  // a bracketed `parse` string.
  this.process = function(annotation) {
    for (var i = 0; i < annotation.sentences.length; i++) {
      var s = annotation.sentences[i];
      if (s.parse) {
        s.parseTree = convertParseStringToTree(s.parse, s.tokens);
      }
    }
  }
}
308
+
309
+ // ----------------------------------------------------------------------------
310
+ // RENDER
311
+ // ----------------------------------------------------------------------------
312
+
313
+ /**
314
+ * Render a given JSON data structure
315
+ */
316
+ function render(data, reverse) {
317
+ // Tweak arguments
318
+ if (typeof reverse !== 'boolean') {
319
+ reverse = false;
320
+ }
321
+
322
+ // Error checks
323
+ if (typeof data.sentences === 'undefined') { return; }
324
+
325
+ /**
326
+ * Register an entity type (a tag) for Brat
327
+ */
328
+ var entityTypesSet = {};
329
+ var entityTypes = [];
330
+ function addEntityType(name, type, coarseType) {
331
+ if (typeof coarseType === "undefined") {
332
+ coarseType = type;
333
+ }
334
+ // Don't add duplicates
335
+ if (entityTypesSet[type]) return;
336
+ entityTypesSet[type] = true;
337
+ // Get the color of the entity type
338
+ color = '#ffccaa';
339
+ if (name === 'POS') {
340
+ color = posColor(type);
341
+ } else if (name === 'UPOS') {
342
+ color = uposColor(type);
343
+ } else if (name === 'NER') {
344
+ color = nerColor(coarseType);
345
+ } else if (name === 'NNER') {
346
+ color = nerColor(coarseType);
347
+ } else if (name === 'COREF') {
348
+ color = '#FFE000';
349
+ } else if (name === 'ENTITY') {
350
+ color = posColor('NN');
351
+ } else if (name === 'RELATION') {
352
+ color = posColor('VB');
353
+ } else if (name === 'LEMMA') {
354
+ color = '#FFFFFF';
355
+ } else if (name === 'SENTIMENT') {
356
+ color = sentimentColor(type);
357
+ } else if (name === 'LINK') {
358
+ color = '#FFFFFF';
359
+ } else if (name === 'KBP_ENTITY') {
360
+ color = '#FFFFFF';
361
+ }
362
+ // Register the type
363
+ entityTypes.push({
364
+ type: type,
365
+ labels : [type],
366
+ bgColor: color,
367
+ borderColor: 'darken'
368
+ });
369
+ }
370
+
371
+ /**
372
+ * Register a relation type (an arc) for Brat
373
+ */
374
+ var relationTypesSet = {};
375
+ var relationTypes = [];
376
+ function addRelationType(type, symmetricEdge) {
377
+ // Prevent adding duplicates
378
+ if (relationTypesSet[type]) return;
379
+ relationTypesSet[type] = true;
380
+ // Default arguments
381
+ if (typeof symmetricEdge === 'undefined') { symmetricEdge = false; }
382
+ // Add the type
383
+ relationTypes.push({
384
+ type: type,
385
+ labels: [type],
386
+ dashArray: (symmetricEdge ? '3,3' : undefined),
387
+ arrowHead: (symmetricEdge ? 'none' : undefined),
388
+ });
389
+ }
390
+
391
+ //
392
+ // Construct text of annotation
393
+ //
394
+ currentText = []; // GLOBAL
395
+ currentSentences = data.sentences; // GLOBAL
396
+ data.sentences.forEach(function(sentence) {
397
+ for (var i = 0; i < sentence.tokens.length; ++i) {
398
+ var token = sentence.tokens[i];
399
+ var word = token.word;
400
+ if (!(typeof tokensMap[word] === "undefined")) {
401
+ word = tokensMap[word];
402
+ }
403
+ if (i > 0) { currentText.push(' '); }
404
+ token.characterOffsetBegin = currentText.length;
405
+ for (var j = 0; j < word.length; ++j) {
406
+ currentText.push(word[j]);
407
+ }
408
+ token.characterOffsetEnd = currentText.length;
409
+ }
410
+ currentText.push('\n');
411
+ });
412
+ currentText = currentText.join('');
413
+
414
+ //
415
+ // Shared variables
416
+ // These are what we'll render in BRAT
417
+ //
418
+ // (pos)
419
+ var posEntities = [];
420
+ // (upos)
421
+ var uposEntities = [];
422
+ // (lemma)
423
+ var lemmaEntities = [];
424
+ // (ner)
425
+ var nerEntities = [];
426
+ var nerEntitiesNormalized = [];
427
+ // (sentiment)
428
+ var sentimentEntities = [];
429
+ // (entitylinking)
430
+ var linkEntities = [];
431
+ // (dependencies)
432
+ var depsRelations = [];
433
+ var deps2Relations = [];
434
+ // (openie)
435
+ var openieEntities = [];
436
+ var openieEntitiesSet = {};
437
+ var openieRelations = [];
438
+ var openieRelationsSet = {};
439
+ // (kbp)
440
+ var kbpEntities = [];
441
+ var kbpEntitiesSet = [];
442
+ var kbpRelations = [];
443
+ var kbpRelationsSet = [];
444
+
445
+ var cparseEntities = [];
446
+ var cparseRelations = [];
447
+
448
+ //
449
+ // Loop over sentences.
450
+ // This fills in the variables above.
451
+ //
452
+ for (var sentI = 0; sentI < data.sentences.length; ++sentI) {
453
+ var sentence = data.sentences[sentI];
454
+ var index = sentence.index;
455
+ var tokens = sentence.tokens;
456
+ var deps = sentence['basicDependencies'];
457
+ var deps2 = sentence['enhancedPlusPlusDependencies'];
458
+ var parseTree = sentence['parseTree'];
459
+
460
+ // POS tags
461
+ /**
462
+ * Generate a POS tagged token id
463
+ */
464
+ function posID(i) {
465
+ return 'POS_' + sentI + '_' + i;
466
+ }
467
+ var noXPOS = true;
468
+ if (tokens.length > 0 && typeof tokens[0].pos !== 'undefined' && tokens[0].pos !== null) {
469
+ noXPOS = false;
470
+ for (var i = 0; i < tokens.length; i++) {
471
+ var token = tokens[i];
472
+ var pos = token.pos;
473
+ var begin = parseInt(token.characterOffsetBegin);
474
+ var end = parseInt(token.characterOffsetEnd);
475
+ addEntityType('POS', pos);
476
+ posEntities.push([posID(i), pos, [[begin, end]]]);
477
+ }
478
+ }
479
+
480
+ // Universal POS tags
481
+ /**
482
+ * Generate a POS tagged token id
483
+ */
484
+ function uposID(i) {
485
+ return 'UPOS_' + sentI + '_' + i;
486
+ }
487
+ if (tokens.length > 0 && typeof tokens[0].upos !== 'undefined') {
488
+ for (var i = 0; i < tokens.length; i++) {
489
+ var token = tokens[i];
490
+ var upos = token.upos;
491
+ var begin = parseInt(token.characterOffsetBegin);
492
+ var end = parseInt(token.characterOffsetEnd);
493
+ addEntityType('UPOS', upos);
494
+ uposEntities.push([uposID(i), upos, [[begin, end]]]);
495
+ }
496
+ }
497
+
498
+ // Constituency parse
499
+ // Carries the same assumption as NER
500
+ if (parseTree && !useDagre) {
501
+ var parseEntities = [];
502
+ var parseRels = [];
503
+ function processParseTree(tree, index) {
504
+ tree.visitIndex = index;
505
+ index++;
506
+ if (tree.isTerminal) {
507
+ parseEntities[tree.visitIndex] = uposEntities[tree.tokenIndex];
508
+ return index;
509
+ } else if (tree.children) {
510
+ addEntityType('PARSENODE', tree.label);
511
+ parseEntities[tree.visitIndex] =
512
+ ['PARSENODE_' + sentI + '_' + tree.visitIndex, tree.label,
513
+ [[tokens[tree.tokenStart].characterOffsetBegin, tokens[tree.tokenEnd-1].characterOffsetEnd]]];
514
+ var parentEnt = parseEntities[tree.visitIndex];
515
+ for (var i = 0; i < tree.children.length; i++) {
516
+ var child = tree.children[i];
517
+ index = processParseTree(child, index);
518
+ var childEnt = parseEntities[child.visitIndex];
519
+ addRelationType('pc');
520
+ parseRels.push(['PARSEEDGE_' + sentI + '_' + parseRels.length, 'pc', [['parent', parentEnt[0]], ['child', childEnt[0]]]]);
521
+ }
522
+ }
523
+ return index;
524
+ }
525
+ processParseTree(parseTree, 0);
526
+ cparseEntities = cparseEntities.concat(cparseEntities, parseEntities);
527
+ cparseRelations = cparseRelations.concat(parseRels);
528
+ }
529
+
530
// Dependency parsing
/**
 * Convert one dependency list (CoreNLP JSON format) into Brat relation
 * tuples.  Root edges (governor index 0 in the JSON) are skipped because
 * they have no source entity.
 */
function processDeps(name, deps) {
  var rels = [];
  // Relation format: [${ID}, ${TYPE}, [[${ARGNAME}, ${TARGET}], ...]]
  deps.forEach(function(dep, depI) {
    var gov = dep.governor - 1;   // 1-based in the JSON, 0-based here
    var dpd = dep.dependent - 1;
    if (gov === -1) return;       // skip the root edge
    addRelationType(dep.dep);
    rels.push([name + '_' + sentI + '_' + depI, dep.dep,
               [['governor', uposID(gov)], ['dependent', uposID(dpd)]]]);
  });
  return rels;
}
// Actually add the dependencies
if (typeof deps !== 'undefined') {
  depsRelations = depsRelations.concat(processDeps('dep', deps));
}
if (typeof deps2 !== 'undefined') {
  deps2Relations = deps2Relations.concat(processDeps('dep2', deps2));
}
554
+
555
// Lemmas
if (tokens.length > 0 && typeof tokens[0].lemma !== 'undefined') {
  for (var i = 0; i < tokens.length; i++) {
    // NOTE(review): keep the name `token` — later sections in this function
    // (apparently accidentally) still reference it.
    var token = tokens[i];
    addEntityType('LEMMA', token.lemma);
    lemmaEntities.push(['LEMMA_' + sentI + '_' + i, token.lemma,
                        [[parseInt(token.characterOffsetBegin),
                          parseInt(token.characterOffsetEnd)]]]);
  }
}
566
+
567
// NER tags
// Assumption: a contiguous run of the same non-O tag is a single entity
// NOTE(review): `noNER` is reset to true for every sentence, so only the
// last sentence decides the "NER not available" message — confirm intent.
var noNER = true;
if (tokens.some(function(token) { return token.ner; })) {
  noNER = false;
  for (var i = 0; i < tokens.length; i++) {
    var ner = tokens[i].ner || 'O';
    var normalizedNER = tokens[i].normalizedNER;
    if (typeof normalizedNER === "undefined") {
      normalizedNER = ner;  // no normalization available; reuse the raw tag
    }
    if (ner === 'O') continue;
    // Extend j to the end of the run of identical tags.
    var j = i;
    while (j < tokens.length - 1 && tokens[j+1].ner === ner) j++;
    addEntityType('NER', ner, ner);
    nerEntities.push(['NER_' + sentI + '_' + i, ner,
                      [[tokens[i].characterOffsetBegin, tokens[j].characterOffsetEnd]]]);
    if (ner !== normalizedNER) {
      // Also show the normalized form (e.g. resolved dates) as a second tag.
      addEntityType('NNER', normalizedNER, ner);
      nerEntities.push(['NNER_' + sentI + '_' + i, normalizedNER,
                        [[tokens[i].characterOffsetBegin, tokens[j].characterOffsetEnd]]]);
    }
    i = j;  // skip past the run we just consumed
  }
}
591
+
592
// Sentiment
if (typeof sentence.sentiment !== "undefined") {
  // e.g. "Verypositive" -> "VERY POSITIVE"
  var sentimentLabel = sentence.sentiment.toUpperCase().replace("VERY", "VERY ");
  addEntityType('SENTIMENT', sentimentLabel);
  // One entity spanning the whole sentence.
  sentimentEntities.push(['SENTIMENT_' + sentI, sentimentLabel,
      [[tokens[0].characterOffsetBegin, tokens[tokens.length - 1].characterOffsetEnd]]]);
}
599
+
600
// Entity Links
// Same contiguity assumption as for NER above
if (tokens.length > 0) {
  for (var i = 0; i < tokens.length; i++) {
    var link = tokens[i].entitylink;
    if (typeof link === 'undefined' || link === 'O') continue;
    // Extend j to the end of the run sharing this link.
    var j = i;
    while (j < tokens.length - 1 && tokens[j+1].entitylink === link) j++;
    addEntityType('LINK', link);
    linkEntities.push(['LINK_' + sentI + '_' + i, link,
                       [[tokens[i].characterOffsetBegin, tokens[j].characterOffsetEnd]]]);
    i = j;
  }
}
613
+
614
// Open IE
// Helper Functions
/** Stable entity id for a token span within this sentence. */
function openieID(span) {
  return 'OPENIEENTITY' + '_' + sentI + '_' + span[0] + '_' + span[1];
}
/** Register a span as an OpenIE entity (deduplicated per sentence). */
function addEntity(span, role) {
  if (openieEntitiesSet[[sentI, span, role]]) return;
  openieEntitiesSet[[sentI, span, role]] = true;
  openieEntities.push([openieID(span), role,
                       [[tokens[span[0]].characterOffsetBegin,
                         tokens[span[1] - 1].characterOffsetEnd ]] ]);
}
/** Register a relation edge between two OpenIE spans (deduplicated). */
function addRelation(gov, dep, role) {
  if (openieRelationsSet[[sentI, gov, dep, role]]) return;
  openieRelationsSet[[sentI, gov, dep, role]] = true;
  openieRelations.push(['OPENIESUBJREL_' + sentI + '_' + gov[0] + '_' + gov[1] + '_' + dep[0] + '_' + dep[1],
                        role,
                        [['governor', openieID(gov)],
                         ['dependent', openieID(dep)] ] ]);
}
// Render OpenIE
if (typeof sentence.openie !== 'undefined') {
  // Register the entities + relations we'll need
  addEntityType('ENTITY', 'Entity');
  addEntityType('RELATION', 'Relation');
  addRelationType('subject');
  addRelationType('object');
  // Loop over triples
  for (var i = 0; i < sentence.openie.length; ++i) {
    var subjectSpan = sentence.openie[i].subjectSpan;
    var relationSpan = sentence.openie[i].relationSpan;
    var objectSpan = sentence.openie[i].objectSpan;
    if (parseInt(relationSpan[0]) < 0 || parseInt(relationSpan[1]) < 0) {
      continue; // This is a phantom relation
    }
    // FIX: removed a dead `var begin = parseInt(token.characterOffsetBegin);`
    // line here — `begin` was never used, and `token` was a stale leftover
    // from the lemma loop (could throw when no lemmas were rendered).
    // Add the entities
    addEntity(subjectSpan, 'Entity');
    addEntity(relationSpan, 'Relation');
    addEntity(objectSpan, 'Entity');
    // Add the relations
    addRelation(relationSpan, subjectSpan, 'subject');
    addRelation(relationSpan, objectSpan, 'object');
  }
} // End OpenIE block
663
+
664
+
665
//
// KBP
//
// Helper Functions
/** Stable entity id for a KBP token span within this sentence. */
function kbpEntity(span) {
  return 'KBPENTITY' + '_' + sentI + '_' + span[0] + '_' + span[1];
}
/** Register a span as a KBP entity (deduplicated per sentence). */
function addKBPEntity(span, role) {
  if (kbpEntitiesSet[[sentI, span, role]]) return;
  kbpEntitiesSet[[sentI, span, role]] = true;
  kbpEntities.push([kbpEntity(span), role,
                    [[tokens[span[0]].characterOffsetBegin,
                      tokens[span[1] - 1].characterOffsetEnd ]] ]);
}
/** Register a KBP relation edge between two spans (deduplicated). */
function addKBPRelation(gov, dep, role) {
  if (kbpRelationsSet[[sentI, gov, dep, role]]) return;
  kbpRelationsSet[[sentI, gov, dep, role]] = true;
  kbpRelations.push(['KBPRELATION_' + sentI + '_' + gov[0] + '_' + gov[1] + '_' + dep[0] + '_' + dep[1],
                     role,
                     [['governor', kbpEntity(gov)],
                      ['dependent', kbpEntity(dep)] ] ]);
}
/**
 * First wikidict entity link found inside a token span, or the generic
 * 'Entity' label if none of the tokens carries a link.
 * (Factored out of two identical subject/object loops.)
 */
function entityLinkForSpan(span) {
  for (var k = span[0]; k < span[1]; ++k) {
    if (typeof tokens[k] !== 'undefined' &&
        typeof tokens[k].entitylink !== 'undefined' &&
        tokens[k].entitylink != 'O') {
      return tokens[k].entitylink;
    }
  }
  return 'Entity';
}
if (typeof sentence.kbp !== 'undefined') {
  // Register the relation types we'll need
  addRelationType('subject');
  addRelationType('object');
  // Loop over triples
  for (var i = 0; i < sentence.kbp.length; ++i) {
    var subjectSpan = sentence.kbp[i].subjectSpan;
    var subjectLink = entityLinkForSpan(subjectSpan);
    addEntityType('KBP_ENTITY', subjectLink);
    var objectSpan = sentence.kbp[i].objectSpan;
    var objectLink = entityLinkForSpan(objectSpan);
    addEntityType('KBP_ENTITY', objectLink);
    var relation = sentence.kbp[i].relation;
    // FIX: removed a dead `var begin = parseInt(token.characterOffsetBegin);`
    // line — unused, and `token` was a stale leftover from the lemma loop.
    // Add the entities
    addKBPEntity(subjectSpan, subjectLink);
    addKBPEntity(objectSpan, objectLink);
    // Add the relations
    addKBPRelation(subjectSpan, objectSpan, relation);
  }
} // End KBP block
728
+
729
+ } // End sentence loop
730
+
731
//
// Coreference
//
var corefEntities = [];
var corefRelations = [];
if (typeof data.corefs !== 'undefined') {
  addRelationType('coref', true);
  addEntityType('COREF', 'Mention');
  Object.keys(data.corefs).forEach(function(clusterId) {
    var chain = data.corefs[clusterId];
    if (chain.length <= 1) return;  // singleton clusters are not drawn
    var prevId = null;
    for (var i = 0; i < chain.length; ++i) {
      var mention = chain[i];
      var id = 'COREF' + mention.id;
      var sentTokens = data.sentences[mention.sentNum - 1].tokens;
      corefEntities.push([id, 'Mention',
                          [[sentTokens[mention.startIndex - 1].characterOffsetBegin,
                            sentTokens[mention.endIndex - 2].characterOffsetEnd ]] ]);
      if (prevId !== null) {
        // Chain each mention to the previous one in the cluster.
        corefRelations.push([prevId + '_' + mention.id,
                             'coref',
                             [['governor', prevId],
                              ['dependent', id] ] ]);
      }
      prevId = id;
    }
  });
} // End coreference block
761
+
762
+ //
763
+ // Actually render the elements
764
+ //
765
+
766
/**
 * Render a set of entities / relations into the div with the given id,
 * if that div exists on the page.  For right-to-left languages the
 * character offsets (and the text itself) are mirrored so Brat displays
 * them in logical order.
 */
function embed(container, entities, relations, reverse) {
  var text = currentText;
  if (reverse) {
    var len = currentText.length;
    // Mirror each [begin, end) offset pair around the text midpoint.
    entities.forEach(function(ent) {
      var offsets = ent[2][0];
      var mirroredEnd = len - offsets[0];
      offsets[0] = len - offsets[1];
      offsets[1] = mirroredEnd;
    });
    text = text.split("").reverse().join("");
  }
  if ($('#' + container).length > 0) {
    Util.embed(container,
               {entity_types: entityTypes, relation_types: relationTypes},
               {text: text, entities: entities, relations: relations}
    );
  }
}

/** Show a plain-text "not available" message in the given div. */
function reportna(container, text) {
  $('#' + container).text(text);
}
793
+
794
// Render each annotation
head.ready(function() {
  if (noXPOS) {
    reportna('pos', 'XPOS is not available for this language at this time.')
  } else {
    embed('pos', posEntities);
  }
  embed('upos', uposEntities);
  embed('lemma', lemmaEntities);
  if (noNER) {
    reportna('ner', 'NER is not available for this language at this time.')
  } else {
    embed('ner', nerEntities);
  }
  embed('entities', linkEntities);
  if (!useDagre) {
    // Brat-based constituency rendering (fallback when dagre is disabled).
    embed('parse', cparseEntities, cparseRelations);
  }
  embed('deps', uposEntities, depsRelations);
  embed('deps2', posEntities, deps2Relations);
  embed('coref', corefEntities, corefRelations);
  embed('openie', openieEntities, openieRelations);
  embed('kbp', kbpEntities, kbpRelations);
  embed('sentiment', sentimentEntities);

  // Constituency parse rendered with d3/dagre-d3 rather than Brat.
  if ($('#parse').length > 0 && useDagre) {
    var viewer = new ParseViewer({ selector: '#parse' });
    viewer.showAnnotation(data);
    $('#parse').addClass('svg').css('display', 'block');
  }
});
827
+
828
+ } // End render function
829
+
830
+
831
/**
 * Render a TokensRegex response into the '#tokensregex' div as Brat
 * entities: one 'match' entity per match, plus one entity per named
 * ($name) or numbered capture group.
 */
function renderTokensregex(data) {
  var entityTypesSet = {};
  var entityTypes = [];
  /** Register an entity type (a tag) for Brat, deduplicated by name. */
  function addEntityType(type, color) {
    if (entityTypesSet[type]) return;
    entityTypesSet[type] = true;
    if (typeof color === 'undefined') {
      color = '#ADF6A2';
    }
    entityTypes.push({
      type: type,
      labels : [type],
      bgColor: color,
      borderColor: 'darken'
    });
  }

  var entities = [];
  for (var sentI = 0; sentI < data.sentences.length; ++sentI) {
    var tokens = currentSentences[sentI].tokens;
    for (var matchI = 0; matchI < data.sentences[sentI].length; ++matchI) {
      var match = data.sentences[sentI][matchI];
      // Add capture groups ($name or numeric keys of the match object)
      for (var groupName in match) {  // FIX: `groupName` was an implicit global
        if (groupName.startsWith("$") || isInt(groupName)) {
          addEntityType(groupName, '#FFFDA8');
          var begin = parseInt(tokens[match[groupName].begin].characterOffsetBegin);
          var end = parseInt(tokens[match[groupName].end - 1].characterOffsetEnd);
          entities.push(['TOK_' + sentI + '_' + matchI + '_' + groupName,
                         groupName,
                         [[begin, end]]]);
        }
      }
      // Add the overall match
      addEntityType('match', '#ADF6A2');
      var mBegin = parseInt(tokens[match.begin].characterOffsetBegin);
      var mEnd = parseInt(tokens[match.end - 1].characterOffsetEnd);
      entities.push(['TOK_' + sentI + '_' + matchI + '_match',
                     'match',
                     [[mBegin, mEnd]]]);
    }
  }

  Util.embed('tokensregex',
             {entity_types: entityTypes, relation_types: []},
             {text: currentText, entities: entities, relations: []}
  );
} // END renderTokensregex()
888
+
889
+
890
/**
 * Render a Semgrex response: one 'match' entity per match, one entity per
 * named node, and a dashed 'semgrex' relation from the match to each node.
 */
function renderSemgrex(data) {
  var entityTypesSet = {};
  var entityTypes = [];
  /** Register an entity type (a tag) for Brat, deduplicated by name. */
  function addEntityType(type, color) {
    if (entityTypesSet[type]) return;
    entityTypesSet[type] = true;
    if (typeof color === 'undefined') {
      color = '#ADF6A2';
    }
    entityTypes.push({
      type: type,
      labels : [type],
      bgColor: color,
      borderColor: 'darken'
    });
  }

  // NOTE(review): assigned without `var` in the original too — this
  // overwrites an outer/global `relationTypes`.  Confirm that is intended
  // before localizing it.
  relationTypes = [{
    type: 'semgrex',
    labels: ['-'],
    dashArray: '3,3',
    arrowHead: 'none',
  }];

  var entities = [];
  var relations = [];

  for (var sentI = 0; sentI < data.sentences.length; ++sentI) {
    var tokens = currentSentences[sentI].tokens;
    for (var matchI = 0; matchI < data.sentences[sentI].length; ++matchI) {
      var match = data.sentences[sentI][matchI];
      // Add the overall match
      addEntityType('match', '#ADF6A2');
      var begin = parseInt(tokens[match.begin].characterOffsetBegin);
      var end = parseInt(tokens[match.end - 1].characterOffsetEnd);
      entities.push(['SEM_' + sentI + '_' + matchI + '_match',
                     'match',
                     [[begin, end]]]);

      // Add named groups
      for (var groupName in match) {  // FIX: `groupName` was an implicit global
        if (groupName.startsWith("$") || isInt(groupName)) {
          // (add node)
          var group = match[groupName];  // FIX: `group` was an implicit global
          groupName = groupName.substring(1);  // strip the leading '$'
          addEntityType(groupName, '#FFFDA8');
          var gBegin = parseInt(tokens[group.begin].characterOffsetBegin);
          var gEnd = parseInt(tokens[group.end - 1].characterOffsetEnd);
          entities.push(['SEM_' + sentI + '_' + matchI + '_' + groupName,
                         groupName,
                         [[gBegin, gEnd]]]);

          // (add relation linking the node back to its match)
          relations.push(['SEMGREX_' + sentI + '_' + matchI + '_' + groupName,
                          'semgrex',
                          [['governor', 'SEM_' + sentI + '_' + matchI + '_match'],
                           ['dependent', 'SEM_' + sentI + '_' + matchI + '_' + groupName] ] ]);
        }
      }
    }
  }

  Util.embed('semgrex',
             {entity_types: entityTypes, relation_types: relationTypes},
             {text: currentText, entities: entities, relations: relations}
  );
} // END renderSemgrex
967
+
968
/**
 * Render a Tregex response as pretty-printed JSON.
 */
function renderTregex(data) {
  var container = $('#tregex');
  container.empty();
  // FIX: render via .text() so server-controlled content is HTML-escaped
  // instead of being concatenated into raw markup (XSS hardening).
  $('<pre/>').text(JSON.stringify(data, null, 4)).appendTo(container);
} // END renderTregex
975
+
976
+ // ----------------------------------------------------------------------------
977
+ // MAIN
978
+ // ----------------------------------------------------------------------------
979
+
980
+ /**
981
+ * MAIN()
982
+ *
983
+ * The entry point of the page
984
+ */
985
+ $(document).ready(function() {
986
+ // Some initial styling
987
+ $('.chosen-select').chosen();
988
+ $('.chosen-container').css('width', '100%');
989
+
990
+
991
// Language-specific changes
$('#language').on('change', function() {
  var lang = $('#language').val();
  // Right-to-left scripts get dir=rtl on the text box.
  $('#text').attr('dir', '');
  if (lang === 'ar' || lang === 'fa' || lang === 'he' || lang === 'ur') {
    $('#text').attr('dir', 'rtl');
  }
  // Per-language example placeholder text.
  var placeholders = {
    'ar': 'على سبيل المثال، قفز الثعلب البني السريع فوق الكلب الكسول.',
    'en': 'e.g., The quick brown fox jumped over the lazy dog.',
    'zh': '例如,快速的棕色狐狸跳过了懒惰的狗。',
    'zh-Hant': '例如,快速的棕色狐狸跳過了懶惰的狗。',
    'fr': 'Par exemple, le renard brun rapide a sauté sur le chien paresseux.',
    'de': 'Z. B. sprang der schnelle braune Fuchs über den faulen Hund.',
    'es': 'Por ejemplo, el rápido zorro marrón saltó sobre el perro perezoso.',
    'ur': 'میرا نام علی ہے'
  };
  var placeholder = placeholders.hasOwnProperty(lang)
      ? placeholders[lang]
      : 'Unknown language for placeholder query: ' + lang;
  $('#text').attr('placeholder', placeholder);
});
1020
+
1021
// Submit on shift-enter
$('#text').keydown(function (event) {
  // Swallow shift+enter on keydown so no newline is inserted ...
  if (event.keyCode == 13 && event.shiftKey) {
    event.preventDefault();
    return false;
  }
});
$('#text').keyup(function (event) {
  // ... and trigger the submit once the key is released.
  if (event.keyCode == 13 && event.shiftKey) {
    $('#submit').click();
    event.stopPropagation();
    return false;
  }
});
1039
+
1040
// Submit on clicking the 'submit' button
// FIX: the handler now declares `event` as a parameter; the original took
// no parameter and relied on the non-standard global `window.event` for the
// preventDefault/stopPropagation calls at the bottom (broken outside
// IE/Chrome).
$('#submit').click(function(event) {
  // Default queries per language, used when the text box is empty.
  var defaultQueries = {
    'ar': 'قفز الثعلب البني السريع فوق الكلب الكسول.',
    'en': 'The quick brown fox jumped over the lazy dog.',
    'zh': '快速的棕色狐狸跳过了懒惰的狗。',
    'zh-Hant': '快速的棕色狐狸跳過了懶惰的狗。',
    'fr': 'Le renard brun rapide a sauté sur le chien paresseux.',
    'de': 'Sprang der schnelle braune Fuchs über den faulen Hund.',
    'es': 'El rápido zorro marrón saltó sobre el perro perezoso.',
    'ur': 'میرا نام علی ہے'
  };
  // Get the text to annotate
  currentQuery = $('#text').val();
  if (currentQuery.trim() == '') {
    var lang = $('#language').val();
    currentQuery = defaultQueries.hasOwnProperty(lang)
        ? defaultQueries[lang]
        : 'Unknown language for default query: ' + lang;
    $('#text').val(currentQuery);
  }
  // Update the UI
  $('#submit').prop('disabled', true);
  $('#annotations').hide();
  $('#patterns_row').hide();
  $('#loading').show();

  // Run query
  $.ajax({
    type: 'POST',
    url: serverAddress + '?properties=' + encodeURIComponent(
        '{"annotators": "' + annotators() + '", "date": "' + date() + '"}') +
        '&pipelineLanguage=' + encodeURIComponent($('#language').val()),
    data: encodeURIComponent(currentQuery), // jQuery doesn't automatically URI encode strings
    dataType: 'json',
    contentType: "application/x-www-form-urlencoded;charset=UTF-8",
    responseType: "application/json",
    success: function(data) {
      $('#submit').prop('disabled', false);
      if (typeof data === 'undefined' || data.sentences == undefined) {
        alert("Failed to reach server!");
      } else {
        // Process constituency parse
        var constituencyParseProcessor = new ConstituencyParseProcessor();
        constituencyParseProcessor.process(data);
        // Empty divs
        $('#annotations').empty();
        /**
         * Append an annotation div for `annotator` iff it was requested and
         * the response actually contains data under `selector` (top-level,
         * per-sentence, or on at least one token of some sentence).
         */
        function createAnnotationDiv(id, annotator, selector, label) {
          // (make sure we requested that element)
          if (annotators().split(",").indexOf(annotator) < 0) {
            return;
          }
          // (make sure the data contains that element)
          var ok = false;  // FIX: `ok` was an implicit global
          if (typeof data[selector] !== 'undefined') {
            ok = true;
          } else if (typeof data.sentences !== 'undefined' && data.sentences.length > 0) {
            if (typeof data.sentences[0][selector] !== 'undefined') {
              ok = true;
            } else if (typeof data.sentences[0].tokens != 'undefined' && data.sentences[0].tokens.length > 0) {
              // at least one token of any sentence carries the field
              ok = data.sentences.some(function(sentence) {
                return sentence.tokens.some(function(token) {
                  return typeof token[selector] !== 'undefined';
                });
              });
            }
          }
          // (render the element)
          if (ok) {
            $('#annotations').append('<h4 class="red">' + label + ':</h4> <div id="' + id + '"></div>');
          }
        }
        // (create the divs)
        //                   div id   annotator   field_in_data        label
        createAnnotationDiv('pos',   'pos',      'pos',               'Part-of-Speech (XPOS)'   );
        createAnnotationDiv('upos',  'upos',     'upos',              'Universal Part-of-Speech');
        createAnnotationDiv('lemma', 'lemma',    'lemma',             'Lemmas'                  );
        createAnnotationDiv('ner',   'ner',      'ner',               'Named Entity Recognition');
        createAnnotationDiv('deps',  'depparse', 'basicDependencies', 'Universal Dependencies'  );
        createAnnotationDiv('parse', 'parse',    'parseTree',         'Constituency Parse'      );
        // (deps2 / openie / coref / entitylink / kbp / sentiment divs remain
        // disabled, as in the original source)
        // Update UI
        $('#loading').hide();
        $('.corenlp_error').remove(); // Clear error messages
        $('#annotations').show();
        // Render (RTL languages are mirrored for display)
        var reverse = ($('#language').val() === 'ar' || $('#language').val() === 'fa' || $('#language').val() === 'he' || $('#language').val() === 'ur');
        render(data, reverse);
      }
    },
    error: function(data) {
      DATA = data;  // NOTE(review): global debug hook, kept intentionally
      var alertDiv = $('<div/>').addClass('alert').addClass('alert-danger').addClass('alert-dismissible').addClass('corenlp_error').attr('role', 'alert');
      var button = $('<button type="button" class="close" data-dismiss="alert" aria-label="Close"><span aria-hidden="true">&times;</span></button>');
      var message = $('<span/>').text(data.responseText);
      button.appendTo(alertDiv);
      message.appendTo(alertDiv);
      $('#loading').hide();
      alertDiv.appendTo($('#errors'));
      $('#submit').prop('disabled', false);
    }
  });
  event.preventDefault();
  event.stopPropagation();
  return false;
});
1161
+
1162
+
1163
// Support passing parameters on page launch, via window.location.hash parameters.
// Example: http://localhost:9000/#text=foo%20bar&annotators=pos,lemma,ner
(function() {
  var params = {};
  window.location.hash.slice(1).split("&").forEach(function(pair) {
    var kv = pair.split("=");
    if (kv.length === 2) {
      params[kv[0]] = kv[1];
    }
  });
  if (params.text) {
    $('#text').val(decodeURIComponent(params.text));
  }
  if (params.annotators) {
    // De-select everything, then select exactly the requested annotators.
    $('#annotators').find('option').each(function() {
      $(this).prop('selected', false);
    });
    params.annotators.split(",").forEach(function(a) {
      $('#annotators').find('option[value="' + a + '"]').prop('selected', true);
    });
    // Refresh the Chosen widget to reflect the new selection.
    $('#annotators').trigger('chosen:updated');
  }
  if (params.text || params.annotators) {
    // Finally, auto-submit when launched with parameters.
    $('#submit').click();
  }
})();
1198
+
1199
+
1200
$('#form_tokensregex').submit(function (e) {
  // Don't actually submit the form
  e.preventDefault();
  // Fall back to an example pattern when the search box is empty.
  if ($('#tokensregex_search').val().trim() == '') {
    $('#tokensregex_search').val('(?$foxtype [{pos:JJ}]+ ) fox');
  }
  var pattern = $('#tokensregex_search').val();
  // Remove existing annotation
  $('#tokensregex').remove();
  // Make ajax call
  $.ajax({
    type: 'POST',
    // NOTE(review): String.replace with a string pattern only escapes the
    // FIRST '&'/'+' occurrence — confirm whether that is sufficient here.
    url: serverAddress + '/tokensregex?pattern=' + encodeURIComponent(
        pattern.replace("&", "\\&").replace('+', '\\+')) +
        '&properties=' + encodeURIComponent(
        '{"annotators": "' + annotators() + '", "date": "' + date() + '"}') +
        '&pipelineLanguage=' + encodeURIComponent($('#language').val()),
    data: encodeURIComponent(currentQuery),
    success: function(data) {
      $('.tokensregex_error').remove(); // Clear error messages
      $('<div id="tokensregex" class="pattern_brat"/>').appendTo($('#div_tokensregex'));
      renderTokensregex(data);
    },
    error: function(data) {
      var alertDiv = $('<div/>').addClass('alert').addClass('alert-danger').addClass('alert-dismissible').addClass('tokensregex_error').attr('role', 'alert');
      $('<button type="button" class="close" data-dismiss="alert" aria-label="Close"><span aria-hidden="true">&times;</span></button>').appendTo(alertDiv);
      $('<span/>').text(data.responseText).appendTo(alertDiv);
      alertDiv.appendTo($('#div_tokensregex'));
    }
  });
});
1234
+
1235
+
1236
$('#form_semgrex').submit(function (e) {
  // Don't actually submit the form
  e.preventDefault();
  // Fall back to an example pattern when the search box is empty.
  if ($('#semgrex_search').val().trim() == '') {
    $('#semgrex_search').val('{pos:/VB.*/} >nsubj {}=subject >/nmod:.*/ {}=prep_phrase');
  }
  var pattern = $('#semgrex_search').val();
  // Remove existing annotation
  $('#semgrex').remove();
  // Semgrex needs dependencies, so make sure depparse is requested.
  var requiredAnnotators = annotators().split(',');
  if (requiredAnnotators.indexOf('depparse') < 0) {
    requiredAnnotators.push('depparse');
  }
  // Make ajax call
  $.ajax({
    type: 'POST',
    // NOTE(review): String.replace with a string pattern only escapes the
    // FIRST '&'/'+' occurrence — confirm whether that is sufficient here.
    url: serverAddress + '/semgrex?pattern=' + encodeURIComponent(
        pattern.replace("&", "\\&").replace('+', '\\+')) +
        '&properties=' + encodeURIComponent(
        '{"annotators": "' + requiredAnnotators.join(',') + '", "date": "' + date() + '"}') +
        '&pipelineLanguage=' + encodeURIComponent($('#language').val()),
    data: encodeURIComponent(currentQuery),
    success: function(data) {
      $('.semgrex_error').remove(); // Clear error messages
      $('<div id="semgrex" class="pattern_brat"/>').appendTo($('#div_semgrex'));
      renderSemgrex(data);
    },
    error: function(data) {
      var alertDiv = $('<div/>').addClass('alert').addClass('alert-danger').addClass('alert-dismissible').addClass('semgrex_error').attr('role', 'alert');
      $('<button type="button" class="close" data-dismiss="alert" aria-label="Close"><span aria-hidden="true">&times;</span></button>').appendTo(alertDiv);
      $('<span/>').text(data.responseText).appendTo(alertDiv);
      alertDiv.appendTo($('#div_semgrex'));
    }
  });
});
1275
+
1276
$('#form_tregex').submit(function (e) {
  // Don't actually submit the form
  e.preventDefault();
  // Fall back to an example pattern when the search box is empty.
  if ($('#tregex_search').val().trim() == '') {
    $('#tregex_search').val('NP < NN=animal');
  }
  var pattern = $('#tregex_search').val();
  // Remove existing annotation
  $('#tregex').remove();
  // Tregex needs constituency trees, so make sure parse is requested.
  var requiredAnnotators = annotators().split(',');
  if (requiredAnnotators.indexOf('parse') < 0) {
    requiredAnnotators.push('parse');
  }
  // Make ajax call
  $.ajax({
    type: 'POST',
    // NOTE(review): String.replace with a string pattern only escapes the
    // FIRST '&'/'+' occurrence — confirm whether that is sufficient here.
    url: serverAddress + '/tregex?pattern=' + encodeURIComponent(
        pattern.replace("&", "\\&").replace('+', '\\+')) +
        '&properties=' + encodeURIComponent(
        '{"annotators": "' + requiredAnnotators.join(',') + '", "date": "' + date() + '"}') +
        '&pipelineLanguage=' + encodeURIComponent($('#language').val()),
    data: encodeURIComponent(currentQuery),
    success: function(data) {
      $('.tregex_error').remove(); // Clear error messages
      $('<div id="tregex" class="pattern_brat"/>').appendTo($('#div_tregex'));
      renderTregex(data);
    },
    error: function(data) {
      var alertDiv = $('<div/>').addClass('alert').addClass('alert-danger').addClass('alert-dismissible').addClass('tregex_error').attr('role', 'alert');
      $('<button type="button" class="close" data-dismiss="alert" aria-label="Close"><span aria-hidden="true">&times;</span></button>').appendTo(alertDiv);
      $('<span/>').text(data.responseText).appendTo(alertDiv);
      alertDiv.appendTo($('#div_tregex'));
    }
  });
});
1315
+
1316
+ });
stanza/stanza/pipeline/external/corenlp_converter_depparse.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A depparse processor which converts constituency trees using CoreNLP
3
+ """
4
+
5
+ from stanza.pipeline._constants import TOKENIZE, CONSTITUENCY, DEPPARSE
6
+ from stanza.pipeline.processor import ProcessorVariant, register_processor_variant
7
+ from stanza.server.dependency_converter import DependencyConverter
8
+
9
@register_processor_variant(DEPPARSE, 'converter')
class ConverterDepparse(ProcessorVariant):
    """Depparse variant that derives dependencies from constituency trees
    using CoreNLP's converter.  English only.
    """

    # Processors which must run before this variant.
    REQUIRES_DEFAULT = {TOKENIZE, CONSTITUENCY}

    def __init__(self, config):
        if config['lang'] != 'en':
            raise ValueError("Constituency to dependency converter only works for English")

        # TODO: get classpath from config
        # TODO: close this when finished?
        # a more involved approach would be to turn the Pipeline into
        # a context with __enter__ and __exit__
        # __exit__ would try to free all resources, although some
        # might linger such as GPU allocations
        # maybe it isn't worth even trying to clean things up on account of that
        self.converter = DependencyConverter(classpath="$CLASSPATH")
        self.converter.open_pipe()

    def process(self, document):
        # Delegate wholesale to the CoreNLP converter pipe.
        return self.converter.process(document)
stanza/stanza/pipeline/external/jieba.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Processors related to Jieba in the pipeline.
3
+ """
4
+
5
+ import re
6
+
7
+ from stanza.models.common import doc
8
+ from stanza.pipeline._constants import TOKENIZE
9
+ from stanza.pipeline.processor import ProcessorVariant, register_processor_variant
10
+
11
def check_jieba():
    """Verify that the Jieba tokenizer package is importable.

    Returns True when the import succeeds; otherwise raises ImportError
    with installation instructions.
    """
    try:
        import jieba  # noqa: F401 -- availability probe only
    except ImportError:
        raise ImportError(
            "Jieba is used but not installed on your machine. Go to https://pypi.org/project/jieba/ for installation instructions."
        )
    return True
22
+
23
@register_processor_variant(TOKENIZE, 'jieba')
class JiebaTokenizer(ProcessorVariant):
    def __init__(self, config):
        """Build a Jieba-backed tokenizer variant.

        Jieba only does word segmentation, so sentence splitting here is
        rule-based on terminal punctuation.
        """
        if config['lang'] not in ['zh', 'zh-hans', 'zh-hant']:
            raise Exception("Jieba tokenizer is currently only allowed in Chinese (simplified or traditional) pipelines.")

        check_jieba()
        import jieba
        self.nlp = jieba
        self.no_ssplit = config.get('no_ssplit', False)

    def process(self, document):
        """Segment the document with Jieba and wrap the result in a Doc."""
        text = document.text if isinstance(document, doc.Document) else document
        if not isinstance(text, str):
            raise Exception("Must supply a string or Stanza Document object to the Jieba tokenizer.")

        sentences = []
        sentence = []
        char_pos = 0
        for piece in self.nlp.cut(text, cut_all=False):
            if re.match(r'\s+', piece):
                # Whitespace pieces only advance the character offset.
                char_pos += len(piece)
                continue

            sentence.append({
                doc.TEXT: piece,
                doc.MISC: f"{doc.START_CHAR}={char_pos}|{doc.END_CHAR}={char_pos + len(piece)}",
            })
            char_pos += len(piece)

            # End the sentence at CJK / ASCII terminal punctuation.
            if not self.no_ssplit and piece in ['。', '!', '?', '!', '?']:
                sentences.append(sentence)
                sentence = []

        if len(sentence) > 0:
            sentences.append(sentence)

        return doc.Document(sentences, text)
stanza/stanza/pipeline/external/sudachipy.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Processors related to SudachiPy in the pipeline.
3
+
4
+ GitHub Home: https://github.com/WorksApplications/SudachiPy
5
+ """
6
+
7
+ import re
8
+
9
+ from stanza.models.common import doc
10
+ from stanza.pipeline._constants import TOKENIZE
11
+ from stanza.pipeline.processor import ProcessorVariant, register_processor_variant
12
+
13
def check_sudachipy():
    """Verify that the SudachiPy tokenizer dependencies can be imported.

    Returns True on success.  Raises an ImportError carrying installation
    instructions when either of the two required packages is missing.
    """
    try:
        import sudachipy  # noqa: F401
        import sudachidict_core  # noqa: F401
    except ImportError:
        message = (
            "Both sudachipy and sudachidict_core libraries are required. "
            "Try install them with `pip install sudachipy sudachidict_core`. "
            "Go to https://github.com/WorksApplications/SudachiPy for more information."
        )
        raise ImportError(message)
    return True
27
+
28
@register_processor_variant(TOKENIZE, 'sudachipy')
class SudachiPyTokenizer(ProcessorVariant):
    """Tokenizer variant which delegates Japanese word segmentation to SudachiPy."""

    def __init__(self, config):
        """ Construct a SudachiPy-based tokenizer.

        Sentence splitting is done afterwards by matching end-of-sentence
        punctuation tokens, not by a statistical segmenter.

        Raises if the pipeline language is not Japanese.
        """
        if config['lang'] != 'ja':
            raise Exception("SudachiPy tokenizer is only allowed in Japanese pipelines.")

        # raises a descriptive ImportError if sudachipy is not installed
        check_sudachipy()
        from sudachipy import tokenizer
        from sudachipy import dictionary

        # builds a tokenizer from the default (core) dictionary
        self.tokenizer = dictionary.Dictionary().create()
        # when True, the whole document is kept as a single sentence
        self.no_ssplit = config.get('no_ssplit', False)

    def process(self, document):
        """ Tokenize a document with the SudachiPy tokenizer and wrap the results into a Doc object.

        Accepts either a Stanza Document (its .text is used) or a plain string.
        Returns a new Document whose tokens carry start/end character offsets
        into the original text in their MISC field.
        """
        if isinstance(document, doc.Document):
            text = document.text
        else:
            text = document
        if not isinstance(text, str):
            raise Exception("Must supply a string or Stanza Document object to the SudachiPy tokenizer.")

        # we use the default sudachipy tokenization mode (i.e., mode C)
        # more config needs to be added to support other modes

        tokens = self.tokenizer.tokenize(text)

        sentences = []
        current_sentence = []
        for token in tokens:
            token_text = token.surface()
            # by default sudachipy will output whitespace as a token
            # we need to skip these tokens to be consistent with other tokenizers
            if token_text.isspace():
                continue
            # SudachiPy reports character offsets directly, so no manual
            # offset accounting is needed here
            start = token.begin()
            end = token.end()

            token_entry = {
                doc.TEXT: token_text,
                doc.MISC: f"{doc.START_CHAR}={start}|{doc.END_CHAR}={end}"
            }
            current_sentence.append(token_entry)

            # end the sentence on Japanese or ASCII sentence-final punctuation
            if not self.no_ssplit and token_text in ['。', '!', '?', '!', '?']:
                sentences.append(current_sentence)
                current_sentence = []

        # flush any trailing partial sentence
        if len(current_sentence) > 0:
            sentences.append(current_sentence)

        return doc.Document(sentences, text)
stanza/stanza/utils/charlm/oscar_to_text.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Turns an Oscar 2022 jsonl file to text
3
+
4
+ YOU DO NOT NEED THIS if you use the oscar extractor which reads from
5
+ HuggingFace, dump_oscar.py
6
+
7
+ to run:
8
+ python3 -m stanza.utils.charlm.oscar_to_text <path> ...
9
+
10
+ each path can be a file or a directory with multiple .jsonl files in it
11
+ """
12
+
13
+ import argparse
14
+ import glob
15
+ import json
16
+ import lzma
17
+ import os
18
+ import sys
19
+ from stanza.models.common.utils import open_read_text
20
+
21
def extract_file(output_directory, input_filename, use_xz):
    """Convert one Oscar .jsonl file to a plain text file.

    output_directory: where to put the result; None writes next to the input
    input_filename: the .jsonl (possibly compressed) file to read
    use_xz: when True the output is xz-compressed and named .txt.xz
    """
    print("Extracting %s" % input_filename)
    source_dir, base_name = os.path.split(input_filename)
    if output_directory is None:
        output_directory = source_dir

    # replace the trailing ".jsonl..." portion of the name with ".txt"
    suffix_pos = base_name.rfind(".jsonl")
    if suffix_pos >= 0:
        base_name = base_name[:suffix_pos]
    base_name = base_name + ".txt"

    if use_xz:
        base_name += ".xz"
        open_file = lambda path: lzma.open(path, "wt", encoding="utf-8")
    else:
        open_file = lambda path: open(path, "w", encoding="utf-8")

    destination = os.path.join(output_directory, base_name)
    print("Writing content to %s" % destination)
    # one json record per input line; documents are separated by blank lines
    with open_read_text(input_filename) as fin, open_file(destination) as fout:
        for line in fin:
            record = json.loads(line)
            fout.write(record['content'])
            fout.write("\n\n")
49
+
50
def parse_args():
    """Build the command line parser for the Oscar extractor and parse sys.argv."""
    parser = argparse.ArgumentParser()
    parser.add_argument("filenames", nargs="+", help="Filenames or directories to process")
    parser.add_argument("--output", default=None, help="Output directory for saving files. If None, will write to the original directory")
    parser.add_argument("--no_xz", default=True, dest="xz", action="store_false", help="Don't use xz to compress the output files")
    return parser.parse_args()
57
+
58
def main():
    """
    Go through each of the given filenames or directories, convert json to .txt.xz
    """
    args = parse_args()
    if args.output is not None:
        os.makedirs(args.output, exist_ok=True)

    for path in args.filenames:
        if os.path.isfile(path):
            extract_file(args.output, path, args.xz)
        elif os.path.isdir(path):
            # directories are expanded to every jsonl-looking file inside
            candidates = glob.glob(os.path.join(path, "*jsonl*"))
            found = sorted(x for x in candidates if os.path.isfile(x))
            print("Found %d files:" % len(found))
            if found:
                print(" %s" % "\n ".join(found))
            for json_filename in found:
                extract_file(args.output, json_filename, args.xz)

if __name__ == "__main__":
    main()
stanza/stanza/utils/constituency/__init__.py ADDED
File without changes
stanza/stanza/utils/constituency/grep_test_logs.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Average the test F1 scores reported in a set of parser training logs."""

import subprocess
import sys

log_files = sys.argv[1:]

score_sum = 0.0
score_count = 0

for log_file in log_files:
    # ask grep for the line which reports the test F1 score
    completed = subprocess.run(["grep", "F1 score.*test.*", log_file],
                               stdout=subprocess.PIPE, encoding="utf-8")
    matched = completed.stdout.strip()
    if not matched:
        print("{}: no result".format(log_file))
        continue

    # the score is the final whitespace-separated token of the matched text
    f1 = float(matched.split()[-1])
    print("{}: {}".format(log_file, f1))
    score_sum += f1
    score_count += 1

if score_count > 0:
    print("Avg: {}".format(score_sum / score_count))
stanza/stanza/utils/datasets/constituency/build_silver_dataset.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Given two ensembles and a tokenized file, output the trees for which those ensembles agree and report how many of the sub-models agree on those trees.
3
+
4
+ For example:
5
+
6
+ python3 -m stanza.utils.datasets.constituency.build_silver_dataset --tokenized_file /u/nlp/data/constituency-parser/italian/2024_wiki_tokenization/it_wiki_tokenized_AA.txt --lang it --output_file asdf.out --e1 saved_models/constituency/it_vit_electra_100?_top_constituency.pt --e2 saved_models/constituency/it_vit_electra_100?_constituency.pt
7
+
8
+ for i in `echo f g h i j k l m n o p q r s t`; do nlprun -d a6000 "python3 -m stanza.utils.datasets.constituency.build_silver_dataset --tokenized_file /u/nlp/data/constituency-parser/italian/2024_wiki_tokenization/it_wiki_tok_6M_a$i.txt --lang it --output_file /u/nlp/data/constituency-parser/italian/2024_it_vit_electra/a$i.trees --e1 saved_models/constituency/it_vit_electra_100?_top_constituency.pt --e2 saved_models/constituency/it_vit_electra_100?_constituency.pt" -o /u/nlp/data/constituency-parser/italian/2024_it_vit_electra/a$i.out; done
9
+
10
+ for i in `echo a b c d`; do nlprun -d a6000 "python3 -m stanza.utils.datasets.constituency.build_silver_dataset --tokenized_file /u/nlp/data/constituency-parser/english/en_wiki_2023/shuf_1M.a$i --lang en --output_file /u/nlp/data/constituency-parser/english/2024_en_ptb3_electra/forward_a$i.trees --e1 saved_models/constituency/en_ptb3_electra-large_100?_in_constituency.pt --e2 saved_models/constituency/en_ptb3_electra-large_100?_top_constituency.pt" -o /u/nlp/data/constituency-parser/english/2024_en_ptb3_electra/forward_a$i.out; done
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+
16
+ import logging
17
+
18
+ from stanza.models.common import utils
19
+ from stanza.models.common.foundation_cache import FoundationCache
20
+ from stanza.models.constituency import retagging
21
+ from stanza.models.constituency import text_processing
22
+ from stanza.models.constituency import tree_reader
23
+ from stanza.models.constituency.ensemble import Ensemble
24
+ from stanza.utils.get_tqdm import get_tqdm
25
+
26
+ tqdm = get_tqdm()
27
+
28
+ logger = logging.getLogger('stanza.constituency.trainer')
29
+
30
def parse_args(args=None):
    """Build and process the command line arguments for the silver dataset builder.

    args: optional list of argument strings to parse instead of sys.argv;
      previously this parameter was accepted but silently ignored.

    Returns the parsed arguments as a dict, after retagging postprocessing.
    """
    parser = argparse.ArgumentParser(description="Script that uses multiple ensembles to find trees where both ensembles agree")

    # exactly one of the two input sources must be supplied
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument('--tokenized_file', type=str, default=None, help='Input file of tokenized text for parsing with parse_text.')
    input_group.add_argument('--tree_file', type=str, default=None, help='Input file of already parsed text for reparsing with parse_text.')
    parser.add_argument('--output_file', type=str, default=None, help='Where to put the output file')

    parser.add_argument('--charlm_forward_file', type=str, default=None, help="Exact path to use for forward charlm")
    parser.add_argument('--charlm_backward_file', type=str, default=None, help="Exact path to use for backward charlm")
    parser.add_argument('--wordvec_pretrain_file', type=str, default=None, help='Exact name of the pretrain file to read')

    utils.add_device_args(parser)

    parser.add_argument('--lang', default='en', help='Language to use')

    parser.add_argument('--eval_batch_size', type=int, default=50, help='How many trees to batch when running eval')
    parser.add_argument('--e1', type=str, nargs='+', default=None, help="Which model(s) to load in the first ensemble")
    parser.add_argument('--e2', type=str, nargs='+', default=None, help="Which model(s) to load in the second ensemble")

    parser.add_argument('--mode', default='predict', choices=['parse_text', 'predict'])

    # another option would be to include the tree idx in each entry in an existing saved file
    # the processing could then pick up at exactly the last known idx
    parser.add_argument('--start_tree', type=int, default=0, help='Where to start... most useful if the previous incarnation crashed')
    parser.add_argument('--end_tree', type=int, default=None, help='Where to end. If unset, will process to the end of the file')

    retagging.add_retag_args(parser)

    # fixed: the `args` parameter used to be dropped on the floor, so callers
    # could not parse a programmatic argument list.  parse_args(None) still
    # reads sys.argv, so default behavior is unchanged.
    args = vars(parser.parse_args(args))

    retagging.postprocess_args(args)
    # this script never generates extra candidate parses
    args['num_generate'] = 0

    return args
65
+
66
def main():
    """Parse text with two ensembles and write out the trees they agree on.

    For each chunk of input sentences, both ensembles parse the chunk and the
    trees where the two ensembles' top predictions are identical are kept.
    Each kept tree is then re-parsed by every individual sub-model, and the
    number of sub-models which reproduce it exactly is recorded.  The output
    is one JSON object per line: {"tree": ..., "count": ...}
    """
    args = parse_args()
    utils.log_training_args(args, logger, name="ensemble")

    retag_pipeline = retagging.build_retag_pipeline(args)
    # reuse the retag pipeline's cache when available so pretrains and
    # charlms are only loaded once for both ensembles
    foundation_cache = retag_pipeline[0].foundation_cache if retag_pipeline else FoundationCache()

    logger.info("Building ensemble #1 out of %s", args['e1'])
    e1 = Ensemble(args, filenames=args['e1'], foundation_cache=foundation_cache)
    e1.to(args.get('device', None))
    logger.info("Building ensemble #2 out of %s", args['e2'])
    e2 = Ensemble(args, filenames=args['e2'], foundation_cache=foundation_cache)
    e2.to(args.get('device', None))

    # exactly one of tokenized_file / tree_file is set - they form a
    # required mutually exclusive group in parse_args
    if args['tokenized_file']:
        tokenized_sentences = text_processing.read_tokenized_file(args['tokenized_file'])
    elif args['tree_file']:
        treebank = tree_reader.read_treebank(args['tree_file'])
        tokenized_sentences = [x.leaf_labels() for x in treebank]
        if args['lang'] == 'vi':
            # Vietnamese trees join multisyllabic words with _; undo that here
            tokenized_sentences = [[x.replace("_", " ") for x in sentence] for sentence in tokenized_sentences]
    logger.info("Read %d tokenized sentences", len(tokenized_sentences))

    all_models = e1.models + e2.models

    # process in fixed-size chunks so partial results are flushed as we go
    chunk_size = 1000
    with open(args['output_file'], 'w', encoding='utf-8') as fout:
        end_tree = len(tokenized_sentences) if args['end_tree'] is None else args['end_tree']
        for chunk_start in tqdm(range(args['start_tree'], end_tree, chunk_size)):
            chunk = tokenized_sentences[chunk_start:chunk_start+chunk_size]
            logger.info("Processing trees %d to %d", chunk_start, chunk_start+len(chunk))
            parsed1 = text_processing.parse_tokenized_sentences(args, e1, retag_pipeline, chunk)
            parsed1 = [x.predictions[0].tree for x in parsed1]
            parsed2 = text_processing.parse_tokenized_sentences(args, e2, retag_pipeline, chunk)
            parsed2 = [x.predictions[0].tree for x in parsed2]
            # keep only the trees where the two ensembles' best parses agree
            matching = [t for t, t2 in zip(parsed1, parsed2) if t == t2]
            logger.info("%d trees matched", len(matching))
            # count how many individual sub-models reproduce each kept tree
            model_counts = [0] * len(matching)
            for model in all_models:
                model_chunk = model.parse_sentences_no_grad(iter(matching), model.build_batch_from_trees, args['eval_batch_size'], model.predict)
                model_chunk = [x.predictions[0].tree for x in model_chunk]
                for idx, (t1, t2) in enumerate(zip(matching, model_chunk)):
                    if t1 == t2:
                        model_counts[idx] += 1
            for count, tree in zip(model_counts, matching):
                line = {"tree": "%s" % tree, "count": count}
                fout.write(json.dumps(line))
                fout.write("\n")


if __name__ == '__main__':
    main()
stanza/stanza/utils/datasets/constituency/convert_cintil.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import xml.etree.ElementTree as ET
2
+
3
+ from stanza.models.constituency import tree_reader
4
+ from stanza.utils.datasets.constituency import utils
5
+
6
def read_xml_file(input_filename):
    """
    Convert the CINTIL xml file to id & text

    Returns a list of tuples: (id, text)

    Raises ValueError when the corpus node is missing, when an unexpected
    tag is encountered, or when a sentence lacks its id, raw, or tree node.
    """
    NS = "{http://nlx.di.fc.ul.pt}"
    with open(input_filename, encoding="utf-8") as fin:
        dataset = ET.parse(fin)
    dataset = dataset.getroot()
    corpus = dataset.find(NS + "corpus")
    # fixed: the check used to be `if not corpus:`, but Element truthiness
    # is False for an element with no children, so a present-but-empty
    # corpus would be misreported as missing.  find() returns None when
    # the tag really is absent.
    if corpus is None:
        raise ValueError("Unexpected dataset structure : no 'corpus'")
    trees = []
    for sentence in corpus:
        if sentence.tag != NS + "sentence":
            raise ValueError("Unexpected sentence tag: {}".format(sentence.tag))
        id_node = None
        raw_node = None
        # fixed: this was previously misspelled "tree_nodde", so a sentence
        # missing its <tree> either crashed with UnboundLocalError (first
        # sentence) or silently reused the previous sentence's tree
        tree_node = None
        for node in sentence:
            if node.tag == NS + 'id':
                id_node = node
            elif node.tag == NS + 'raw':
                raw_node = node
            elif node.tag == NS + 'tree':
                tree_node = node
            else:
                raise ValueError("Unexpected tag in sentence {}: {}".format(sentence, node.tag))
        if id_node is None or raw_node is None or tree_node is None:
            raise ValueError("Missing node in sentence {}".format(sentence))
        tree_id = "".join(id_node.itertext())
        tree_text = "".join(tree_node.itertext())
        trees.append((tree_id, tree_text))
    return trees
40
+
41
def convert_cintil_treebank(input_filename, train_size=0.8, dev_size=0.1):
    """
    Convert the CINTIL trees into train/dev/test treebanks.

    dev_size is the size for splitting train & dev
    """
    raw_trees = read_xml_file(input_filename)

    synthetic_trees = []
    natural_trees = []
    for tree_id, tree_text in raw_trees:
        if tree_text.find(" _") >= 0:
            raise ValueError("Unexpected underscore")
        # normalize the bracket text before handing it to the tree reader
        tree_text = tree_text.replace("_)", ")")
        tree_text = tree_text.replace("(A (", "(A' (")
        # trees don't have ROOT, but we typically use a ROOT label at the top
        tree_text = "(ROOT %s)" % tree_text
        parsed = tree_reader.read_trees(tree_text)
        if len(parsed) != 1:
            raise ValueError("Unexpectedly found %d trees in %s" % (len(parsed), tree_id))
        tree = parsed[0]
        # ids starting with aTSTS mark machine generated sentences
        if tree_id.startswith("aTSTS"):
            synthetic_trees.append(tree)
        elif tree_id.find("TSTS") >= 0:
            raise ValueError("Unexpected TSTS")
        else:
            natural_trees.append(tree)

    print("Read %d synthetic trees" % len(synthetic_trees))
    print("Read %d natural trees" % len(natural_trees))
    # only the natural trees are split; the synthetic trees all go to train
    train_trees, dev_trees, test_trees = utils.split_treebank(natural_trees, train_size, dev_size)
    print("Split %d trees into %d train %d dev %d test" % (len(natural_trees), len(train_trees), len(dev_trees), len(test_trees)))
    train_trees = synthetic_trees + train_trees
    print("Total lengths %d train %d dev %d test" % (len(train_trees), len(dev_trees), len(test_trees)))
    return train_trees, dev_trees, test_trees
74
+
75
+
76
def main():
    """Convert the CINTIL treebank from its default checkout location."""
    convert_cintil_treebank("extern_data/constituency/portuguese/CINTIL/CINTIL-Treebank.xml")

if __name__ == '__main__':
    main()
stanza/stanza/utils/datasets/constituency/count_common_words.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Print the 100 most common leaf words in a constituency tree file.

Usage: python3 count_common_words.py treebank_file
"""

import sys

from collections import Counter

from stanza.models.constituency import parse_tree
from stanza.models.constituency import tree_reader

word_counter = Counter()
# accumulate the leaf (word) counts from each tree as it is read
count_words = lambda x: word_counter.update(x.leaf_labels())

tree_reader.read_tree_file(sys.argv[1], tree_callback=count_words)
# most_common(100) returns the same list as most_common()[:100] without
# building and slicing the fully sorted list
print(word_counter.most_common(100))
stanza/stanza/utils/datasets/constituency/prepare_con_dataset.py ADDED
@@ -0,0 +1,594 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Converts raw data files from their original format (dataset dependent) into PTB trees.
2
+
3
+ The operation of this script depends heavily on the dataset in question.
4
+ The common result is that the data files go to data/constituency and are in PTB format.
5
+
6
+ da_arboretum
7
+ Ekhard Bick
8
+ Arboretum, a Hybrid Treebank for Danish
9
+ https://www.researchgate.net/publication/251202293_Arboretum_a_Hybrid_Treebank_for_Danish
10
+ Available here for a license fee:
11
+ http://catalog.elra.info/en-us/repository/browse/ELRA-W0084/
12
+ Internal to Stanford, please contact Chris Manning and/or John Bauer
13
+ The file processed is the tiger xml, although there are some edits
14
+ needed in order to make it functional for our parser
15
+ The treebank comes as a tar.gz file, W0084.tar.gz
16
+ untar this file in $CONSTITUENCY_BASE/danish
17
+ then move the extracted folder to "arboretum"
18
+ $CONSTITUENCY_BASE/danish/W0084/... becomes
19
+ $CONSTITUENCY_BASE/danish/arboretum/...
20
+
21
+ en_ptb3-revised is an updated version of PTB with NML and stuff
22
+ put LDC2015T13 in $CONSTITUENCY_BASE/english
23
+ the directory name may look like LDC2015T13_eng_news_txt_tbnk-ptb_revised
24
+ python3 -m stanza.utils.datasets.constituency.prepare_con_dataset en_ptb3-revised
25
+
26
+ All this needs to do is concatenate the various pieces
27
+
28
+ @article{ptb_revised,
29
+ title= {Penn Treebank Revised: English News Text Treebank LDC2015T13},
30
+ journal= {},
31
+ author= {Ann Bies and Justin Mott and Colin Warner},
32
+ year= {2015},
33
+ url= {https://doi.org/10.35111/xpjy-at91},
34
+ doi= {10.35111/xpjy-at91},
35
+ isbn= {1-58563-724-6},
36
+ dcmi= {text},
37
+ languages= {english},
38
+ language= {english},
39
+ ldc= {LDC2015T13},
40
+ }
41
+
42
+ id_icon
43
+ ICON: Building a Large-Scale Benchmark Constituency Treebank
44
+ for the Indonesian Language
45
+ Ee Suan Lim, Wei Qi Leong, Ngan Thanh Nguyen, Dea Adhista,
46
+ Wei Ming Kng, William Chandra Tjhi, Ayu Purwarianti
47
+ https://aclanthology.org/2023.tlt-1.5.pdf
48
+ Available at https://github.com/aisingapore/seacorenlp-data
49
+ git clone the repo in $CONSTITUENCY_BASE/seacorenlp
50
+ so there is now a directory
51
+ $CONSTITUENCY_BASE/seacorenlp/seacorenlp-data
52
+ python3 -m stanza.utils.datasets.constituency.prepare_con_dataset id_icon
53
+
54
+ it_turin
55
+ A combination of Evalita competition from 2011 and the ParTUT trees
56
+ More information is available in convert_it_turin
57
+
58
+ it_vit
59
+ The original for the VIT UD Dataset
60
+ The UD version has a lot of corrections, so we try to apply those as much as possible
61
+ In fact, we applied some corrections of our own back to UD based on this treebank.
62
+ The first version which had those corrections is UD 2.10
63
+ Versions of UD before that won't work
64
+ Hopefully versions after that work
65
+ Set UDBASE to a path such that $UDBASE/UD_Italian-VIT is the UD version
66
+ The constituency labels are generally not very understandable, unfortunately
67
+ Some documentation is available here:
68
+ https://core.ac.uk/download/pdf/223148096.pdf
69
+ https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.423.5538&rep=rep1&type=pdf
70
+ Available from ELRA:
71
+ http://catalog.elra.info/en-us/repository/browse/ELRA-W0040/
72
+
73
+ ja_alt
74
+ Asian Language Treebank produced a treebank for Japanese:
75
+ Ye Kyaw Thu, Win Pa Pa, Masao Utiyama, Andrew Finch, Eiichiro Sumita
76
+ Introducing the Asian Language Treebank
77
+ http://www.lrec-conf.org/proceedings/lrec2016/pdf/435_Paper.pdf
78
+ Download
79
+ https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/Japanese-ALT-20210218.zip
80
+ unzip this in $CONSTITUENCY_BASE/japanese
81
+ this should create a directory $CONSTITUENCY_BASE/japanese/Japanese-ALT-20210218
82
+ In this directory, also download the following:
83
+ https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/URL-train.txt
84
+ https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/URL-dev.txt
85
+ https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/URL-test.txt
86
+ In particular, there are two files with a bunch of bracketed parses,
87
+ Japanese-ALT-Draft.txt and Japanese-ALT-Reviewed.txt
88
+ The first word of each of these lines is SNT.80188.1 or something like that
89
+ This correlates with the three URL-... files, telling us whether the
90
+ sentence belongs in train/dev/test
91
+ python3 -m stanza.utils.datasets.constituency.prepare_con_dataset ja_alt
92
+
93
+ pt_cintil
94
+ CINTIL treebank for Portuguese, available at ELRA:
95
+ https://catalogue.elra.info/en-us/repository/browse/ELRA-W0055/
96
+ It can also be obtained from here:
97
+ https://hdl.handle.net/21.11129/0000-000B-D2FE-A
98
+ Produced at U Lisbon
99
+ António Branco; João Silva; Francisco Costa; Sérgio Castro
100
+ CINTIL TreeBank Handbook: Design options for the representation of syntactic constituency
101
+ Silva, João; António Branco; Sérgio Castro; Ruben Reis
102
+ Out-of-the-Box Robust Parsing of Portuguese
103
+ https://portulanclarin.net/repository/extradocs/CINTIL-Treebank.pdf
104
+ http://www.di.fc.ul.pt/~ahb/pubs/2011bBrancoSilvaCostaEtAl.pdf
105
+ If at Stanford, ask John Bauer or Chris Manning for the data
106
+ Otherwise, purchase it from ELRA or find it elsewhere if possible
107
+ Either way, unzip it in
108
+ $CONSTITUENCY_BASE/portuguese to the CINTIL directory
109
+ so for example, the final result might be
110
+ extern_data/constituency/portuguese/CINTIL/CINTIL-Treebank.xml
111
+ python3 -m stanza.utils.datasets.constituency.prepare_con_dataset pt_cintil
112
+
113
+ tr_starlang
114
+ A dataset in three parts from the Starlang group in Turkey:
115
+ Neslihan Kara, Büşra Marşan, et al
116
+ Creating A Syntactically Felicitous Constituency Treebank For Turkish
117
+ https://ieeexplore.ieee.org/document/9259873
118
+ git clone the following three repos
119
+ https://github.com/olcaytaner/TurkishAnnotatedTreeBank-15
120
+ https://github.com/olcaytaner/TurkishAnnotatedTreeBank2-15
121
+ https://github.com/olcaytaner/TurkishAnnotatedTreeBank2-20
122
+ Put them in
123
+ $CONSTITUENCY_BASE/turkish
124
+ python3 -m stanza.utils.datasets.constituency.prepare_con_dataset tr_starlang
125
+
126
+ vlsp09 is the 2009 constituency treebank:
127
+ Nguyen Phuong Thai, Vu Xuan Luong, Nguyen Thi Minh Huyen, Nguyen Van Hiep, Le Hong Phuong
128
+ Building a Large Syntactically-Annotated Corpus of Vietnamese
129
+ Proceedings of The Third Linguistic Annotation Workshop
130
+ In conjunction with ACL-IJCNLP 2009, Suntec City, Singapore, 2009
131
+ This can be obtained by contacting vlsp.resources@gmail.com
132
+
133
+ vlsp22 is the 2022 constituency treebank from the VLSP bakeoff
134
+ there is an official test set as well
135
+ you may be able to obtain both of these by contacting vlsp.resources@gmail.com
136
+ NGUYEN Thi Minh Huyen, HA My Linh, VU Xuan Luong, PHAN Thi Hue,
137
+ LE Van Cuong, NGUYEN Thi Luong, NGO The Quyen
138
+ VLSP 2022 Challenge: Vietnamese Constituency Parsing
139
+ to appear in Journal of Computer Science and Cybernetics.
140
+
141
+ vlsp23 is the 2023 update to the constituency treebank from the VLSP bakeoff
142
+ the vlsp22 code also works for the new dataset,
143
+ although some effort may be needed to update the tags
144
+ As of late 2024, the test set is available on request at vlsp.resources@gmail.com
145
+ Organize the directory
146
+ $CONSTITUENCY_BASE/vietnamese/VLSP_2023
147
+ $CONSTITUENCY_BASE/vietnamese/VLSP_2023/Trainingset
148
+ $CONSTITUENCY_BASE/vietnamese/VLSP_2023/test
149
+
150
+ zh_ctb-51 is the 5.1 version of CTB
151
+ put LDC2005T01U01_ChineseTreebank5.1 in $CONSTITUENCY_BASE/chinese
152
+ python3 -m stanza.utils.datasets.constituency.prepare_con_dataset zh_ctb-51
153
+
154
+ @article{xue_xia_chiou_palmer_2005,
155
+ title={The Penn Chinese TreeBank: Phrase structure annotation of a large corpus},
156
+ volume={11},
157
+ DOI={10.1017/S135132490400364X},
158
+ number={2},
159
+ journal={Natural Language Engineering},
160
+ publisher={Cambridge University Press},
161
+ author={XUE, NAIWEN and XIA, FEI and CHIOU, FU-DONG and PALMER, MARTA},
162
+ year={2005},
163
+ pages={207–238}}
164
+
165
+ zh_ctb-51b is the same dataset, but using a smaller dev/test set
166
+ in our experiments, this is substantially easier
167
+
168
+ zh_ctb-90 is the 9.0 version of CTB
169
+ put LDC2016T13 in $CONSTITUENCY_BASE/chinese
170
+ python3 -m stanza.utils.datasets.constituency.prepare_con_dataset zh_ctb-90
171
+
172
+ the splits used are the ones from the file docs/ctb9.0-file-list.txt
173
+ included in the CTB 9.0 release
174
+
175
+ SPMRL adds several treebanks
176
+ https://www.spmrl.org/
177
+ https://www.spmrl.org/sancl-posters2014.html
178
+ Currently only German is converted, the German version being a
179
+ version of the Tiger Treebank
180
+ python3 -m stanza.utils.datasets.constituency.prepare_con_dataset de_spmrl
181
+
182
+ en_mctb is a multidomain test set covering five domains other than newswire
183
+ https://github.com/RingoS/multi-domain-parsing-analysis
184
+ Challenges to Open-Domain Constituency Parsing
185
+
186
+ @inproceedings{yang-etal-2022-challenges,
187
+ title = "Challenges to Open-Domain Constituency Parsing",
188
+ author = "Yang, Sen and
189
+ Cui, Leyang and
190
+ Ning, Ruoxi and
191
+ Wu, Di and
192
+ Zhang, Yue",
193
+ booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
194
+ month = may,
195
+ year = "2022",
196
+ address = "Dublin, Ireland",
197
+ publisher = "Association for Computational Linguistics",
198
+ url = "https://aclanthology.org/2022.findings-acl.11",
199
+ doi = "10.18653/v1/2022.findings-acl.11",
200
+ pages = "112--127",
201
+ }
202
+
203
+ This conversion replaces the top bracket from top -> ROOT and puts an extra S
204
+ bracket on any roots with more than one node.
205
+ """
206
+
207
+ import argparse
208
+ import os
209
+ import random
210
+ import sys
211
+ import tempfile
212
+
213
+ from tqdm import tqdm
214
+
215
+ from stanza.models.constituency import parse_tree
216
+ import stanza.utils.default_paths as default_paths
217
+ from stanza.models.constituency import tree_reader
218
+ from stanza.models.constituency.parse_tree import Tree
219
+ from stanza.server import tsurgeon
220
+ from stanza.utils.datasets.common import UnknownDatasetError
221
+ from stanza.utils.datasets.constituency import utils
222
+ from stanza.utils.datasets.constituency.convert_alt import convert_alt
223
+ from stanza.utils.datasets.constituency.convert_arboretum import convert_tiger_treebank
224
+ from stanza.utils.datasets.constituency.convert_cintil import convert_cintil_treebank
225
+ import stanza.utils.datasets.constituency.convert_ctb as convert_ctb
226
+ from stanza.utils.datasets.constituency.convert_it_turin import convert_it_turin
227
+ from stanza.utils.datasets.constituency.convert_it_vit import convert_it_vit
228
+ from stanza.utils.datasets.constituency.convert_spmrl import convert_spmrl
229
+ from stanza.utils.datasets.constituency.convert_starlang import read_starlang
230
+ from stanza.utils.datasets.constituency.utils import SHARDS, write_dataset
231
+ import stanza.utils.datasets.constituency.vtb_convert as vtb_convert
232
+ import stanza.utils.datasets.constituency.vtb_split as vtb_split
233
+
234
def process_it_turin(paths, dataset_name, *args):
    """
    Convert the it_turin dataset.

    Reads from the italian subdirectory of CONSTITUENCY_BASE and writes
    the converted files to CONSTITUENCY_DATA_DIR.
    """
    assert dataset_name == 'it_turin'
    convert_it_turin(os.path.join(paths["CONSTITUENCY_BASE"], "italian"),
                     paths["CONSTITUENCY_DATA_DIR"])
242
+
243
def process_it_vit(paths, dataset_name, *args):
    """Convert the it_vit dataset.

    Needs at least UD 2.11 or this will not work; in the meantime,
    the git version of VIT will suffice.
    """
    assert dataset_name == 'it_vit'
    convert_it_vit(paths, dataset_name)
248
+
249
def process_vlsp09(paths, dataset_name, *args):
    """
    Processes the VLSP 2009 dataset, discarding or fixing trees when needed
    """
    assert dataset_name == 'vi_vlsp09'
    raw_dir = os.path.join(paths["CONSTITUENCY_BASE"], "vietnamese",
                           "VietTreebank_VLSP_SP73", "Kho ngu lieu 10000 cay cu phap")
    # convert into a scratch directory, then split into train/dev/test
    with tempfile.TemporaryDirectory() as scratch_dir:
        vtb_convert.convert_dir(raw_dir, scratch_dir)
        vtb_split.split_files(scratch_dir, paths["CONSTITUENCY_DATA_DIR"], dataset_name)
258
+
259
def process_vlsp21(paths, dataset_name, *args):
    """
    Processes the VLSP 2021 dataset, which is just a single file
    """
    assert dataset_name == 'vi_vlsp21'
    vlsp_file = os.path.join(paths["CONSTITUENCY_BASE"], "vietnamese", "VLSP_2021", "VTB_VLSP21_tree.txt")
    if not os.path.exists(vlsp_file):
        raise FileNotFoundError("Could not find the 2021 dataset in the expected location of {} - CONSTITUENCY_BASE == {}".format(vlsp_file, paths["CONSTITUENCY_BASE"]))

    with tempfile.TemporaryDirectory() as scratch_dir:
        vtb_convert.convert_files([vlsp_file], scratch_dir)
        # This produces a 0 length test set, just as a placeholder until the actual test set is released
        vtb_split.split_files(scratch_dir, paths["CONSTITUENCY_DATA_DIR"], dataset_name, train_size=0.9, dev_size=0.1)

    _, _, test_file = vtb_split.create_paths(paths["CONSTITUENCY_DATA_DIR"], dataset_name)
    # create an empty test file - currently we don't have actual test data for VLSP 21
    with open(test_file, "w"):
        pass
275
+
276
def process_vlsp22(paths, dataset_name, *args):
    """
    Processes the VLSP 2022 dataset, which is four separate files for some reason

    Also handles vi_vlsp23, which shares the same layout but uses an
    updated tagset and keeps its test files in a sibling "test" directory.
    """
    assert dataset_name == 'vi_vlsp22' or dataset_name == 'vi_vlsp23'

    # per-dataset defaults: where the data lives and which tagset to expect
    if dataset_name == 'vi_vlsp22':
        default_subdir = 'VLSP_2022'
        default_make_test_split = False
        updated_tagset = False
    elif dataset_name == 'vi_vlsp23':
        default_subdir = os.path.join('VLSP_2023', 'Trainingdataset')
        default_make_test_split = False
        updated_tagset = True

    parser = argparse.ArgumentParser()
    parser.add_argument('--subdir', default=default_subdir, type=str, help='Where to find the data - allows for using previous versions, if needed')
    parser.add_argument('--no_convert_brackets', default=True, action='store_false', dest='convert_brackets', help="Don't convert the VLSP parens RKBT & LKBT to PTB parens")
    parser.add_argument('--n_splits', default=None, type=int, help='Split the data into this many pieces. Relevant as there is no set training/dev split, so this allows for N models on N different dev sets')
    parser.add_argument('--test_split', default=default_make_test_split, action='store_true', help='Split 1/10th of the data as a test split as well. Useful for experimental results. Less relevant since there is now an official test set')
    parser.add_argument('--no_test_split', dest='test_split', action='store_false', help='Split 1/10th of the data as a test split as well. Useful for experimental results. Less relevant since there is now an official test set')
    parser.add_argument('--seed', default=1234, type=int, help='Random seed to use when splitting')
    # *args arrives as a single tuple wrapping the remaining argv list
    args = parser.parse_args(args=list(*args))

    # --subdir may be a full path, or a directory name under CONSTITUENCY_BASE
    if os.path.exists(args.subdir):
        vlsp_dir = args.subdir
    else:
        vlsp_dir = os.path.join(paths["CONSTITUENCY_BASE"], "vietnamese", args.subdir)
        if not os.path.exists(vlsp_dir):
            raise FileNotFoundError("Could not find the {} dataset in the expected location of {} - CONSTITUENCY_BASE == {}".format(dataset_name, vlsp_dir, paths["CONSTITUENCY_BASE"]))
    # training files are the ones named "file..." (skipping any zips)
    vlsp_files = os.listdir(vlsp_dir)
    vlsp_train_files = [os.path.join(vlsp_dir, x) for x in vlsp_files if x.startswith("file") and not x.endswith(".zip")]
    vlsp_train_files.sort()

    # the test files live in different places for the two releases
    if dataset_name == 'vi_vlsp22':
        vlsp_test_files = [os.path.join(vlsp_dir, x) for x in vlsp_files if x.startswith("private") and not x.endswith(".zip")]
    elif dataset_name == 'vi_vlsp23':
        vlsp_test_dir = os.path.abspath(os.path.join(vlsp_dir, os.pardir, "test"))
        vlsp_test_files = os.listdir(vlsp_test_dir)
        vlsp_test_files = [os.path.join(vlsp_test_dir, x) for x in vlsp_test_files if x.endswith(".csv")]

    if len(vlsp_train_files) == 0:
        raise FileNotFoundError("No train files (files starting with 'file') found in {}".format(vlsp_dir))
    if not args.test_split and len(vlsp_test_files) == 0:
        raise FileNotFoundError("No test files found in {}".format(vlsp_dir))
    print("Loading training files from {}".format(vlsp_dir))
    print("Procesing training files:\n {}".format("\n ".join(vlsp_train_files)))
    with tempfile.TemporaryDirectory() as train_output_path:
        vtb_convert.convert_files(vlsp_train_files, train_output_path, verbose=True, fix_errors=True, convert_brackets=args.convert_brackets, updated_tagset=updated_tagset)
        # This produces a 0 length test set, just as a placeholder until the actual test set is released
        if args.n_splits:
            # N rotations: each rotation uses a different 1/N slice as its dev set
            test_size = 0.1 if args.test_split else 0.0
            dev_size = (1.0 - test_size) / args.n_splits
            train_size = 1.0 - test_size - dev_size
            for rotation in range(args.n_splits):
                # there is a shuffle inside the split routine,
                # so we need to reset the random seed each time
                random.seed(args.seed)
                rotation_name = "%s-%d-%d" % (dataset_name, rotation, args.n_splits)
                if args.test_split:
                    rotation_name = rotation_name + "t"
                vtb_split.split_files(train_output_path, paths["CONSTITUENCY_DATA_DIR"], rotation_name, train_size=train_size, dev_size=dev_size, rotation=(rotation, args.n_splits))
        else:
            test_size = 0.1 if args.test_split else 0.0
            dev_size = 0.1
            train_size = 1.0 - test_size - dev_size
            if args.test_split:
                # "t" suffix marks a dataset carved with a held-out test slice
                dataset_name = dataset_name + "t"
            vtb_split.split_files(train_output_path, paths["CONSTITUENCY_DATA_DIR"], dataset_name, train_size=train_size, dev_size=dev_size)

    if not args.test_split:
        # convert the official test files; train_size=0, dev_size=0 routes
        # everything into the test shard
        print("Procesing test files:\n {}".format("\n ".join(vlsp_test_files)))
        with tempfile.TemporaryDirectory() as test_output_path:
            vtb_convert.convert_files(vlsp_test_files, test_output_path, verbose=True, fix_errors=True, convert_brackets=args.convert_brackets, updated_tagset=updated_tagset)
            if args.n_splits:
                for rotation in range(args.n_splits):
                    rotation_name = "%s-%d-%d" % (dataset_name, rotation, args.n_splits)
                    vtb_split.split_files(test_output_path, paths["CONSTITUENCY_DATA_DIR"], rotation_name, train_size=0, dev_size=0)
            else:
                vtb_split.split_files(test_output_path, paths["CONSTITUENCY_DATA_DIR"], dataset_name, train_size=0, dev_size=0)
    if not args.test_split and not args.n_splits and dataset_name == 'vi_vlsp23':
        # extra copy of the 2023 test set which keeps the sentence ids
        print("Procesing test files and keeping ids:\n {}".format("\n ".join(vlsp_test_files)))
        with tempfile.TemporaryDirectory() as test_output_path:
            vtb_convert.convert_files(vlsp_test_files, test_output_path, verbose=True, fix_errors=True, convert_brackets=args.convert_brackets, updated_tagset=updated_tagset, write_ids=True)
            vtb_split.split_files(test_output_path, paths["CONSTITUENCY_DATA_DIR"], dataset_name + "-ids", train_size=0, dev_size=0)
361
+
362
def process_arboretum(paths, dataset_name, *args):
    """
    Processes the Danish dataset, Arboretum
    """
    assert dataset_name == 'da_arboretum'

    tiger_file = os.path.join(paths["CONSTITUENCY_BASE"], "danish", "arboretum", "arboretum.tiger", "arboretum.tiger")
    if not os.path.exists(tiger_file):
        raise FileNotFoundError("Unable to find input file for Arboretum. Expected in {}".format(tiger_file))

    treebank = convert_tiger_treebank(tiger_file)
    splits = utils.split_treebank(treebank, 0.8, 0.1)
    output_dir = paths["CONSTITUENCY_DATA_DIR"]

    # keep a copy of the whole treebank alongside the train/dev/test splits
    full_output = os.path.join(output_dir, "%s.mrg" % dataset_name)
    print("Writing {} trees to {}".format(len(treebank), full_output))
    parse_tree.Tree.write_treebank(treebank, full_output)

    write_dataset(splits, output_dir, dataset_name)
381
+
382
+
383
def process_starlang(paths, dataset_name, *args):
    """
    Convert the Turkish Starlang dataset to brackets
    """
    assert dataset_name == 'tr_starlang'

    # the three pieces the treebank is distributed in
    pieces = ["TurkishAnnotatedTreeBank-15",
              "TurkishAnnotatedTreeBank2-15",
              "TurkishAnnotatedTreeBank2-20"]
    chunk_paths = [os.path.join(paths["CONSTITUENCY_BASE"], "turkish", piece) for piece in pieces]

    write_dataset(read_starlang(chunk_paths), paths["CONSTITUENCY_DATA_DIR"], dataset_name)
398
+
399
def process_ja_alt(paths, dataset_name, *args):
    """
    Convert and split the ALT dataset

    TODO: could theoretically extend this to MY or any other similar dataset from ALT
    """
    lang, source = dataset_name.split("_", 1)
    assert lang == 'ja'
    assert source == 'alt'

    alt_dir = os.path.join(paths["CONSTITUENCY_BASE"], "japanese", "Japanese-ALT-20210218")
    input_files = [os.path.join(alt_dir, piece)
                   for piece in ("Japanese-ALT-Draft.txt", "Japanese-ALT-Reviewed.txt")]
    # the URL-* files define which sentences go to which shard
    split_files = [os.path.join(alt_dir, "URL-%s.txt" % shard) for shard in SHARDS]
    output_files = [os.path.join(paths["CONSTITUENCY_DATA_DIR"], "%s_%s.mrg" % (dataset_name, shard))
                    for shard in SHARDS]
    convert_alt(input_files, split_files, output_files)
416
+
417
def process_pt_cintil(paths, dataset_name, *args):
    """
    Convert and split the PT Cintil dataset
    """
    lang, source = dataset_name.split("_", 1)
    assert lang == 'pt'
    assert source == 'cintil'

    xml_file = os.path.join(paths["CONSTITUENCY_BASE"], "portuguese", "CINTIL", "CINTIL-Treebank.xml")
    datasets = convert_cintil_treebank(xml_file)
    write_dataset(datasets, paths["CONSTITUENCY_DATA_DIR"], dataset_name)
430
+
431
def process_id_icon(paths, dataset_name, *args):
    """Convert the Indonesian ICON constituency data, which is already split into shards."""
    lang, source = dataset_name.split("_", 1)
    assert lang == 'id'
    assert source == 'icon'

    icon_dir = os.path.join(paths["CONSTITUENCY_BASE"], "seacorenlp", "seacorenlp-data", "id", "constituency")
    datasets = []
    for shard_file in ("train.txt", "dev.txt", "test.txt"):
        shard_trees = tree_reader.read_tree_file(os.path.join(icon_dir, shard_file))
        # wrap each tree in a ROOT node
        datasets.append([Tree("ROOT", tree) for tree in shard_trees])

    write_dataset(datasets, paths["CONSTITUENCY_DATA_DIR"], dataset_name)
446
+
447
def process_ctb_51(paths, dataset_name, *args):
    """Convert the Chinese Treebank 5.1 dataset.

    Reads the bracketed trees from the LDC2005T01U01 release and writes
    the converted splits to CONSTITUENCY_DATA_DIR.
    """
    lang, source = dataset_name.split("_", 1)
    assert lang == 'zh-hans'
    assert source == 'ctb-51'

    input_dir = os.path.join(paths["CONSTITUENCY_BASE"], "chinese", "LDC2005T01U01_ChineseTreebank5.1", "bracketed")
    output_dir = paths["CONSTITUENCY_DATA_DIR"]
    # fail early with a clear message, consistent with process_ctb_51b
    if not os.path.exists(input_dir):
        raise FileNotFoundError("CTB 5.1 location not found: %s" % input_dir)
    convert_ctb.convert_ctb(input_dir, output_dir, dataset_name, convert_ctb.Version.V51)
455
+
456
def process_ctb_51b(paths, dataset_name, *args):
    """Convert the Chinese Treebank 5.1 dataset using convert_ctb.Version.V51b."""
    lang, source = dataset_name.split("_", 1)
    assert lang == 'zh-hans'
    assert source == 'ctb-51b'

    input_dir = os.path.join(paths["CONSTITUENCY_BASE"], "chinese", "LDC2005T01U01_ChineseTreebank5.1", "bracketed")
    if not os.path.exists(input_dir):
        raise FileNotFoundError("CTB 5.1 location not found: %s" % input_dir)
    print("Loading trees from %s" % input_dir)
    convert_ctb.convert_ctb(input_dir, paths["CONSTITUENCY_DATA_DIR"], dataset_name, convert_ctb.Version.V51b)
467
+
468
def process_ctb_90(paths, dataset_name, *args):
    """Convert the Chinese Treebank 9.0 dataset.

    Reads the bracketed trees from the LDC2016T13 release and writes
    the converted splits to CONSTITUENCY_DATA_DIR.
    """
    lang, source = dataset_name.split("_", 1)
    assert lang == 'zh-hans'
    assert source == 'ctb-90'

    input_dir = os.path.join(paths["CONSTITUENCY_BASE"], "chinese", "LDC2016T13", "ctb9.0", "data", "bracketed")
    output_dir = paths["CONSTITUENCY_DATA_DIR"]
    # fail early with a clear message, consistent with process_ctb_51b
    if not os.path.exists(input_dir):
        raise FileNotFoundError("CTB 9.0 location not found: %s" % input_dir)
    convert_ctb.convert_ctb(input_dir, output_dir, dataset_name, convert_ctb.Version.V90)
476
+
477
+
478
def process_ptb3_revised(paths, dataset_name, *args):
    """
    Convert the LDC2015T13 revised PTB release.

    Sections 02-21 become train, 22 dev, 23 test.
    """
    input_dir = os.path.join(paths["CONSTITUENCY_BASE"], "english", "LDC2015T13_eng_news_txt_tbnk-ptb_revised")
    if not os.path.exists(input_dir):
        # fall back to the bare LDC catalog number as the directory name
        backup_input_dir = os.path.join(paths["CONSTITUENCY_BASE"], "english", "LDC2015T13")
        if not os.path.exists(backup_input_dir):
            raise FileNotFoundError("Could not find ptb3-revised in either %s or %s" % (input_dir, backup_input_dir))
        input_dir = backup_input_dir

    bracket_dir = os.path.join(input_dir, "data", "penntree")
    output_dir = paths["CONSTITUENCY_DATA_DIR"]

    # compensate for a weird mislabeling in the original dataset
    label_map = {"ADJ-PRD": "ADJP-PRD"}

    # sections 02..21 are the training data
    train_trees = []
    for i in tqdm(range(2, 22)):
        new_trees = tree_reader.read_directory(os.path.join(bracket_dir, "%02d" % i))
        new_trees = [t.remap_constituent_labels(label_map) for t in new_trees]
        train_trees.extend(new_trees)

    # tsurgeon pattern which relocates a sentence-final punctuation node
    # that is a direct child of the root into the root's first child
    move_tregex = "_ROOT_ <1 __=home <2 /^[.]$/=move"
    move_tsurgeon = "move move >-1 home"

    print("Moving sentence final punctuation if necessary")
    with tsurgeon.Tsurgeon() as tsurgeon_processor:
        train_trees = [tsurgeon_processor.process(tree, move_tregex, move_tsurgeon)[0] for tree in tqdm(train_trees)]

    # section 22 is dev
    dev_trees = tree_reader.read_directory(os.path.join(bracket_dir, "22"))
    dev_trees = [t.remap_constituent_labels(label_map) for t in dev_trees]

    # section 23 is test
    test_trees = tree_reader.read_directory(os.path.join(bracket_dir, "23"))
    test_trees = [t.remap_constituent_labels(label_map) for t in test_trees]
    print("Read %d train trees, %d dev trees, and %d test trees" % (len(train_trees), len(dev_trees), len(test_trees)))
    datasets = [train_trees, dev_trees, test_trees]
    write_dataset(datasets, output_dir, dataset_name)
513
+
514
def process_en_mctb(paths, dataset_name, *args):
    """
    Converts the following blocks:

    dialogue.cleaned.txt forum.cleaned.txt law.cleaned.txt literature.cleaned.txt review.cleaned.txt
    """
    base_path = os.path.join(paths["CONSTITUENCY_BASE"], "english", "multi-domain-parsing-analysis", "data", "MCTB_en")
    if not os.path.exists(base_path):
        raise FileNotFoundError("Please download multi-domain-parsing-analysis to %s" % base_path)

    def tree_callback(tree):
        # put a ROOT node on top; trees with multiple children
        # get an intermediate S layer first
        if len(tree.children) > 1:
            return parse_tree.Tree("ROOT", [parse_tree.Tree("S", tree.children)])
        return parse_tree.Tree("ROOT", tree.children)

    for domain_file in ("dialogue.cleaned.txt", "forum.cleaned.txt", "law.cleaned.txt", "literature.cleaned.txt", "review.cleaned.txt"):
        trees = tree_reader.read_tree_file(os.path.join(base_path, domain_file), tree_callback=tree_callback)
        print("%d trees in %s" % (len(trees), domain_file))
        # each domain becomes its own test file
        out_name = "%s-%s_test.mrg" % (dataset_name, domain_file.split(".")[0])
        out_name = os.path.join(paths["CONSTITUENCY_DATA_DIR"], out_name)
        print("Writing trees to %s" % out_name)
        parse_tree.Tree.write_treebank(trees, out_name)
537
+
538
def process_spmrl(paths, dataset_name, *args):
    """Convert an SPMRL dataset - currently only the German treebank is handled."""
    if dataset_name != 'de_spmrl':
        raise ValueError("SPMRL dataset %s currently not supported" % dataset_name)

    spmrl_dir = os.path.join(paths["CONSTITUENCY_BASE"], "spmrl", "SPMRL_SHARED_2014", "GERMAN_SPMRL", "gold", "ptb")
    convert_spmrl(spmrl_dir, paths["CONSTITUENCY_DATA_DIR"], dataset_name)
546
+
547
# Dispatch table from dataset name (<language>_<source>) to the function
# which knows how to convert it.  main() looks handlers up here.
DATASET_MAPPING = {
    'da_arboretum': process_arboretum,

    'de_spmrl': process_spmrl,

    'en_ptb3-revised': process_ptb3_revised,
    'en_mctb': process_en_mctb,

    'id_icon': process_id_icon,

    'it_turin': process_it_turin,
    'it_vit': process_it_vit,

    'ja_alt': process_ja_alt,

    'pt_cintil': process_pt_cintil,

    'tr_starlang': process_starlang,

    'vi_vlsp09': process_vlsp09,
    'vi_vlsp21': process_vlsp21,
    'vi_vlsp22': process_vlsp22,
    'vi_vlsp23': process_vlsp22, # options allow for this

    'zh-hans_ctb-51': process_ctb_51,
    'zh-hans_ctb-51b': process_ctb_51b,
    'zh-hans_ctb-90': process_ctb_90,
}
575
+
576
def main(dataset_name, *args):
    """Convert a single constituency dataset, dispatching on its name."""
    paths = default_paths.get_default_paths()

    random.seed(1234)

    handler = DATASET_MAPPING.get(dataset_name)
    if handler is None:
        raise UnknownDatasetError(dataset_name, f"dataset {dataset_name} currently not handled by prepare_con_dataset")
    handler(paths, dataset_name, *args)
585
+
586
if __name__ == '__main__':
    # with no arguments, just list which datasets this script can prepare
    if len(sys.argv) == 1:
        print("Known datasets:")
        for key in DATASET_MAPPING:
            print(" %s" % key)
    else:
        # first argument picks the dataset; the rest are passed through
        # to its handler as a single list
        main(sys.argv[1], sys.argv[2:])
593
+
594
+
stanza/stanza/utils/datasets/constituency/silver_variance.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Use the concepts in "Dataset Cartography" and "Mind Your Outliers" to find trees with the least variance over a training run
3
+
4
+ https://arxiv.org/pdf/2009.10795.pdf
5
+ https://arxiv.org/abs/2107.02331
6
+
7
+ The idea here is that high variance trees are more likely to be wrong in the first place. Using this will filter a silver dataset to have better trees.
8
+
9
+ for example:
10
+
11
+ nlprun -d a6000 -p high "export CLASSPATH=/sailhome/horatio/CoreNLP/classes:/sailhome/horatio/CoreNLP/lib/*:$CLASSPATH; python3 stanza/utils/datasets/constituency/silver_variance.py --eval_file /u/nlp/data/constituency-parser/italian/2024_it_vit_electra/it_silver_0.mrg saved_models/constituency/it_vit.top.each.silver0.constituency_0*0.pt --output_file filtered_silver0.mrg" -o filter.out
12
+ """
13
+
14
+ import argparse
15
+
16
+ import logging
17
+
18
+ import numpy
19
+
20
+ from stanza.models.common import utils
21
+ from stanza.models.common.foundation_cache import FoundationCache
22
+ from stanza.models.constituency import retagging
23
+ from stanza.models.constituency import tree_reader
24
+ from stanza.models.constituency.parser_training import run_dev_set
25
+ from stanza.models.constituency.trainer import Trainer
26
+ from stanza.models.constituency.utils import retag_trees
27
+ from stanza.server.parser_eval import EvaluateParser
28
+ from stanza.utils.get_tqdm import get_tqdm
29
+
30
+ tqdm = get_tqdm()
31
+
32
+ logger = logging.getLogger('stanza.constituency.trainer')
33
+
34
def parse_args(args=None):
    """Build and parse the command line options for the variance filter.

    Returns the options as a dict (via vars()), after retagging
    postprocessing has been applied.
    """
    parser = argparse.ArgumentParser(description="Script to filter trees by how much variance they show over multiple checkpoints of a parser training run.")

    parser.add_argument('--eval_file', type=str, default=None, help='Input file for data loader.')
    parser.add_argument('--output_file', type=str, default=None, help='Output file after sorting by variance.')

    # model resources - exact paths rather than auto-discovered
    parser.add_argument('--charlm_forward_file', type=str, default=None, help="Exact path to use for forward charlm")
    parser.add_argument('--charlm_backward_file', type=str, default=None, help="Exact path to use for backward charlm")
    parser.add_argument('--wordvec_pretrain_file', type=str, default=None, help='Exact name of the pretrain file to read')

    utils.add_device_args(parser)

    # TODO: use the training scripts to pick the charlm & pretrain if needed
    parser.add_argument('--lang', default='it', help='Language to use')

    parser.add_argument('--eval_batch_size', type=int, default=50, help='How many trees to batch when running eval')
    parser.add_argument('models', type=str, nargs='+', default=None, help="Which model(s) to load")

    parser.add_argument('--keep', type=float, default=0.5, help="How many trees to keep after sorting by variance")
    parser.add_argument('--reverse', default=False, action='store_true', help='Actually, keep the high variance trees')

    retagging.add_retag_args(parser)

    args = vars(parser.parse_args())

    retagging.postprocess_args(args)

    return args
62
+
63
def main():
    """Score each tree with every model checkpoint, then write out the
    trees whose per-tree F1 varies the least across checkpoints
    (or the most, with --reverse)."""
    args = parse_args()
    retag_pipeline = retagging.build_retag_pipeline(args)
    # reuse the retag pipeline's cache, if any, so shared resources are loaded once
    foundation_cache = retag_pipeline[0].foundation_cache if retag_pipeline else FoundationCache()

    print("Analyzing with the following models:\n " + "\n ".join(args['models']))

    treebank = tree_reader.read_treebank(args['eval_file'])
    logger.info("Read %d trees for analysis", len(treebank))

    # one row per model: list of per-tree F1 scores
    f1_history = []
    retagged_treebank = None

    # evaluate in fixed-size chunks rather than the whole treebank at once
    chunk_size = 5000
    with EvaluateParser() as evaluator:
        for model_filename in args['models']:
            print("Starting processing with %s" % model_filename)
            trainer = Trainer.load(model_filename, args=args, foundation_cache=foundation_cache)
            # retag only once, using the settings stored in the first model loaded
            if retag_pipeline is not None and retagged_treebank is None:
                retag_method = trainer.model.args['retag_method']
                retag_xpos = trainer.model.args['retag_xpos']
                logger.info("Retagging trees using the %s tags from the %s package...", retag_method, args['retag_package'])
                retagged_treebank = retag_trees(treebank, retag_pipeline, retag_xpos)
                logger.info("Retagging finished")

            current_history = []
            for chunk_start in range(0, len(treebank), chunk_size):
                chunk = treebank[chunk_start:chunk_start+chunk_size]
                retagged_chunk = retagged_treebank[chunk_start:chunk_start+chunk_size] if retagged_treebank else None
                f1, kbestF1, treeF1 = run_dev_set(trainer.model, retagged_chunk, chunk, args, evaluator)
                current_history.extend(treeF1)

            f1_history.append(current_history)

    # rows = models, columns = trees; variance of each tree's F1 across models
    f1_history = numpy.array(f1_history)
    f1_variance = numpy.var(f1_history, axis=0)
    # ascending by default, so the lowest-variance trees come first
    f1_sorted = sorted([(x, idx) for idx, x in enumerate(f1_variance)], reverse=args['reverse'])

    num_keep = int(len(f1_sorted) * args['keep'])
    with open(args['output_file'], "w", encoding="utf-8") as fout:
        for _, idx in f1_sorted[:num_keep]:
            fout.write(str(treebank[idx]))
            fout.write("\n")

if __name__ == "__main__":
    main()
stanza/stanza/utils/datasets/coref/convert_hindi.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ from operator import itemgetter
4
+ import os
5
+
6
+ import stanza
7
+
8
+ from stanza.utils.default_paths import get_default_paths
9
+ from stanza.utils.get_tqdm import get_tqdm
10
+ from stanza.utils.datasets.coref.utils import process_document
11
+
12
+ tqdm = get_tqdm()
13
+
14
def flatten_spans(coref_spans):
    """Tag each span with its cluster index and flatten into one sorted list.

    Input: a list of clusters, each a list of [start, end] word spans, e.g.
      [[[38, 39], [42, 43], ...], [[60, 68], ...]]
    Output: a flat list of [cluster_idx, start, end] triples ordered by the
    start word index, e.g.
      [[0, 38, 39], [0, 42, 43], ..., [1, 60, 68], ...]
    """
    tagged = [[cluster_idx, start, end]
              for cluster_idx, cluster in enumerate(coref_spans)
              for start, end in cluster]
    tagged.sort(key=itemgetter(1))
    return tagged
32
+
33
def remove_nulls(coref_spans, sentences):
    """Drop the "" and "NULL" words from the sentences, reindexing the spans.

    Every word index inside the spans is shifted down by the number of
    removed words that precede it; a removed word itself maps to the index
    of the next kept word.

    So, we might get something like
    [[0, 2], [31, 33], [134, 136], [161, 162]]
    ->
    [[0, 2], [30, 32], [129, 131], [155, 156]]
    """
    # old_to_new[i] = new index for old global word index i
    old_to_new = []
    kept_count = 0
    new_sentences = []
    for sentence in sentences:
        kept_words = []
        for word in sentence:
            old_to_new.append(kept_count)
            if word not in ('', 'NULL'):
                kept_words.append(word)
                kept_count += 1
        new_sentences.append(kept_words)

    new_spans = [[[old_to_new[idx] for idx in span] for span in mention]
                 for mention in coref_spans]
    return new_spans, new_sentences
65
+
66
def arrange_spans_by_sentence(coref_spans, sentences):
    """Regroup a flat, start-sorted list of [cluster, start, end] spans into
    one list per sentence, converting word indices to be sentence-local.

    Assumes coref_spans is sorted by start index (see flatten_spans).
    """
    per_sentence = []
    sentence_start = 0
    next_span = 0
    for sentence in sentences:
        sentence_end = sentence_start + len(sentence)
        spans_here = []
        # consume every span which starts inside this sentence
        while next_span < len(coref_spans) and coref_spans[next_span][1] < sentence_end:
            cluster, start, end = coref_spans[next_span]
            spans_here.append([cluster, start - sentence_start, end - sentence_start])
            next_span += 1
        per_sentence.append(spans_here)
        sentence_start = sentence_end
    return per_sentence
81
+
82
def convert_dataset_section(pipe, section, use_cconj_heads):
    """
    Reprocess the original data into a format compatible with previous conversion utilities

    - remove blank and NULL words
    - rearrange the spans into spans per sentence instead of a list of indices for each span
    - process the document using a Hindi pipeline
    """
    converted = []
    for doc in tqdm(section):
        spans, sentences = remove_nulls(doc['clusters'], doc['sentences'])
        spans = arrange_spans_by_sentence(flatten_spans(spans), sentences)
        # this dataset has no part ids, so pass an empty one
        converted.append(process_document(pipe, doc['doc_key'], "", sentences, spans,
                                          doc['speakers'], use_cconj_heads=use_cconj_heads))
    return converted
106
+
107
def remove_nulls_dataset_section(section):
    """Strip the blank/"NULL" words from each document, updating its
    sentences and clusters in place, and return the updated documents."""
    updated = []
    for doc in section:
        clusters, sentences = remove_nulls(doc['clusters'], doc['sentences'])
        doc['sentences'] = sentences
        doc['clusters'] = clusters
        updated.append(doc)
    return updated
117
+
118
+
119
def read_json_file(filename):
    """Read a jsonlines file: one JSON document per non-blank line."""
    docs = []
    with open(filename, encoding="utf-8") as fin:
        for line in fin:
            if not line.strip():
                continue
            docs.append(json.loads(line))
    return docs
128
+
129
def write_json_file(output_filename, converted_section):
    """Write the whole section to output_filename as indented JSON."""
    text = json.dumps(converted_section, indent=2)
    with open(output_filename, "w", encoding="utf-8") as fout:
        fout.write(text)
132
+
133
def main():
    """Convert the Hindi coref data.

    By default, runs the full conversion using a Hindi stanza pipeline.
    With --remove_nulls, only rewrites the input files with blank and
    NULL tokens removed.
    """
    parser = argparse.ArgumentParser(
        prog='Convert Hindi Coref Data',
    )
    parser.add_argument('--no_use_cconj_heads', dest='use_cconj_heads', action='store_false', help="Don't use the conjunction-aware transformation")
    parser.add_argument('--remove_nulls', action='store_true', help="The only action is to remove the NULLs and blank tokens")
    args = parser.parse_args()

    paths = get_default_paths()
    coref_input_path = paths["COREF_BASE"]
    hindi_base_path = os.path.join(coref_input_path, "hindi", "dataset")

    sections = ("train", "dev", "test")
    if args.remove_nulls:
        # write the cleaned docs next to the originals, one JSON doc per line
        for section in sections:
            input_filename = os.path.join(hindi_base_path, "%s.hindi.jsonlines" % section)
            dataset = read_json_file(input_filename)
            dataset = remove_nulls_dataset_section(dataset)
            output_filename = os.path.join(hindi_base_path, "hi_deeph.%s.nonulls.json" % section)
            with open(output_filename, "w", encoding="utf-8") as fout:
                for doc in dataset:
                    json.dump(doc, fout, ensure_ascii=False)
                    fout.write("\n")
    else:
        # full conversion, reprocessing each doc with a Hindi pipeline
        pipe = stanza.Pipeline("hi", processors="tokenize,pos,lemma,depparse", package="default_accurate", tokenize_pretokenized=True)

        os.makedirs(paths["COREF_DATA_DIR"], exist_ok=True)

        for section in sections:
            input_filename = os.path.join(hindi_base_path, "%s.hindi.jsonlines" % section)
            dataset = read_json_file(input_filename)

            output_filename = os.path.join(paths["COREF_DATA_DIR"], "hi_deeph.%s.json" % section)
            converted_section = convert_dataset_section(pipe, dataset, use_cconj_heads=args.use_cconj_heads)
            write_json_file(output_filename, converted_section)

if __name__ == '__main__':
    main()
stanza/stanza/utils/datasets/ner/compare_entities.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Report the fraction of NER entities in one file which are present in another.
3
+
4
+ Purpose: show the coverage of one file on another, such as reporting
5
+ the number of entities in one dataset on another
6
+ """
7
+
8
+
9
+ import argparse
10
+
11
+ from stanza.utils.datasets.ner.utils import read_json_entities
12
+
13
def parse_args():
    """Parse the command line: lists of "train" and "test" NER files."""
    parser = argparse.ArgumentParser(description="Report the coverage of one NER file on another.")
    parser.add_argument('--train', type=str, nargs="+", required=True, help='File to use to collect the known entities (not necessarily train).')
    parser.add_argument('--test', type=str, nargs="+", required=True, help='File for which we want to know the ratio of known entities')
    return parser.parse_args()
19
+
20
def report_known_entities(train_file, test_file):
    """Print the fraction of test_file entities whose text occurs in train_file."""
    known = {entity[0] for entity in read_json_entities(train_file)}
    test_entities = read_json_entities(test_file)
    covered = sum(1 for entity in test_entities if entity[0] in known)
    print(train_file, test_file, covered / len(test_entities))
27
+
28
def main():
    """Report coverage for every (train, test) file combination."""
    args = parse_args()

    for train_idx, train_file in enumerate(args.train):
        # blank line between the result groups for different train files
        if train_idx:
            print()
        for test_file in args.test:
            report_known_entities(train_file, test_file)

if __name__ == '__main__':
    main()
+ main()
stanza/stanza/utils/datasets/ner/conll_to_iob.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Process a conll file into BIO
3
+
4
+ Includes the ability to process a file from a text file
5
+ or a text file within a zip
6
+
7
+ Main program extracts a piece of the zip file from the Danish DDT dataset
8
+ """
9
+
10
+ import io
11
+ import zipfile
12
+ from zipfile import ZipFile
13
+ from stanza.utils.conll import CoNLL
14
+
15
def process_conll(input_file, output_file, zip_file=None, conversion=None, attr_prefix="name", allow_empty=False):
    """Convert a conllu file into two-column word/tag BIO format.

    input_file: which conllu file (or piece of the zip) to read
    output_file: where to write the result
    zip_file: optional zip archive which contains input_file
    conversion: None, a dict remapping entity labels, or a callable applied to the whole tag
    attr_prefix: which attribute to get from the misc field
    allow_empty: if True, tokens missing the attribute become O instead of raising
    """
    if not attr_prefix.endswith("="):
        attr_prefix = attr_prefix + "="

    doc = CoNLL.conll2doc(input_file=input_file, zip_file=zip_file)

    with open(output_file, "w", encoding="utf-8") as fout:
        for sentence_idx, sentence in enumerate(doc.sentences):
            for token_idx, token in enumerate(sentence.tokens):
                # pull the NER tag out of the | separated misc attributes
                ner = None
                for attr in token.misc.split("|"):
                    if attr.startswith(attr_prefix):
                        ner = attr.split("=", 1)[1]
                        break
                if ner is None:
                    if not allow_empty:
                        raise ValueError("Could not find ner tag in document {}, sentence {}, token {}".format(input_file, sentence_idx, token_idx))
                    ner = "O"

                if ner != "O" and conversion is not None:
                    if isinstance(conversion, dict):
                        # dicts remap just the label, keeping the B-/I- part
                        bio, label = ner.split("-", 1)
                        label = conversion.get(label, label)
                        ner = "%s-%s" % (bio, label)
                    else:
                        ner = conversion(ner)
                fout.write("%s\t%s\n" % (token.text, ner))
            fout.write("\n")
+
55
def main():
    """Extract the training portion of the Danish DDT zip into BIO format."""
    process_conll(zip_file="extern_data/ner/da_ddt/ddt.zip", input_file="ddt.train.conllu", output_file="data/ner/da_ddt.train.bio")

if __name__ == '__main__':
    main()
stanza/stanza/utils/datasets/ner/convert_bn_daffodil.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Convert a Bengali NER dataset to our internal .json format
3
+
4
+ The dataset is here:
5
+
6
+ https://github.com/Rifat1493/Bengali-NER/tree/master/Input
7
+ """
8
+
9
+ import argparse
10
+ import os
11
+ import random
12
+ import tempfile
13
+
14
+ from stanza.utils.datasets.ner.utils import read_tsv, write_dataset
15
+
16
def redo_time_tags(sentences):
    """
    Replace runs of bare TIM tags with B-TIM, I-TIM.

    A brief use of Google Translate suggests the time phrases are
    generally one phrase, so a consecutive run becomes one entity
    rather than a sequence of B-TIM tags.
    """
    converted = []
    for sentence in sentences:
        rebuilt = []
        in_time = False
        for word, tag in sentence:
            if tag != 'TIM':
                in_time = False
                rebuilt.append((word, tag))
            else:
                rebuilt.append((word, "I-TIM" if in_time else "B-TIM"))
                in_time = True
        converted.append(rebuilt)
    return converted
41
+
42
def strip_words(dataset):
    """Strip surrounding whitespace and BOM characters from every word."""
    cleaned = []
    for sentence in dataset:
        cleaned.append([(word.strip().replace('\ufeff', ''), tag) for word, tag in sentence])
    return cleaned
44
+
45
def filter_blank_words(train_file, train_filtered_file):
    """
    Copy train_file to train_filtered_file, dropping blank words.

    As of July 2022, this dataset has blank words with O labels, so any
    line consisting of nothing but the tag 'O' is removed.
    """
    with open(train_file, encoding="utf-8") as fin, \
         open(train_filtered_file, "w", encoding="utf-8") as fout:
        for line in fin:
            if line.strip() != 'O':
                fout.write(line)
57
+
58
def filter_broken_tags(train_sentences):
    """Drop every sentence which contains a word with a missing (None) tag."""
    return [sentence for sentence in train_sentences
            if all(tag is not None for _, tag in sentence)]
63
+
64
def filter_bad_words(train_sentences):
    """
    Not bad words like poop, but characters that don't exist

    These characters look like n and l in emacs, but they are really
    0xF06C and 0xF06E
    """
    # NOTE(review): the tuple below should hold the private-use characters
    # U+F06C and U+F06E (they may render as blank); confirm they were not
    # lost in a copy/paste, or this filter silently becomes a no-op
    return [[x for x in sentence if not x[0] in ("", "")] for sentence in train_sentences]
72
+
73
def read_datasets(in_directory):
    """
    Reads & splits the train data, reads the test data

    There is no validation data, so we split the training data into
    two pieces and use the smaller piece as the dev set

    Also performed is a conversion of TIM -> B-TIM, I-TIM
    """
    # fixed seed so we always get the same shuffle & split
    random.seed(1234)

    def clean(sentences):
        # shared postprocessing pipeline for both splits
        sentences = filter_broken_tags(sentences)
        sentences = filter_bad_words(sentences)
        sentences = redo_time_tags(sentences)
        return strip_words(sentences)

    train_file = os.path.join(in_directory, "Input", "train_data.txt")
    with tempfile.TemporaryDirectory() as tempdir:
        train_filtered_file = os.path.join(tempdir, "train.txt")
        filter_blank_words(train_file, train_filtered_file)
        train_sentences = clean(read_tsv(train_filtered_file, text_column=0, annotation_column=1, keep_broken_tags=True))

    test_file = os.path.join(in_directory, "Input", "test_data.txt")
    test_sentences = clean(read_tsv(test_file, text_column=0, annotation_column=1, keep_broken_tags=True))

    # carve the last 10% of a shuffled train set off as the dev set
    random.shuffle(train_sentences)
    split_len = len(train_sentences) * 9 // 10
    dev_sentences = train_sentences[split_len:]
    train_sentences = train_sentences[:split_len]

    return (train_sentences, dev_sentences, test_sentences)
109
+
110
def convert_dataset(in_directory, out_directory):
    """Read the daffodil splits with read_datasets, then write them back out."""
    write_dataset(read_datasets(in_directory), out_directory, "bn_daffodil")
116
+
117
if __name__ == '__main__':
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--input_path', type=str, default="/home/john/extern_data/ner/bangla/Bengali-NER", help="Where to find the files")
    argparser.add_argument('--output_path', type=str, default="/home/john/stanza/data/ner", help="Where to output the results")
    cli_args = argparser.parse_args()

    convert_dataset(cli_args.input_path, cli_args.output_path)
stanza/stanza/utils/datasets/ner/convert_en_conll03.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Downloads (if necessary) conll03 from Huggingface, then converts it to Stanza .json
3
+
4
+ Some online sources for CoNLL 2003 require multiple pieces, but it is currently hosted on HF:
5
+ https://huggingface.co/datasets/conll2003
6
+ """
7
+
8
+ import os
9
+
10
+ from stanza.utils.default_paths import get_default_paths
11
+ from stanza.utils.datasets.ner.utils import write_dataset
12
+
13
TAG_TO_ID = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
# inverse map: numeric HF label id -> BIO tag string
ID_TO_TAG = {tag_id: tag for tag, tag_id in TAG_TO_ID.items()}

def convert_dataset_section(section):
    """Turn one HF split into a list of sentences of (word, tag) pairs."""
    return [list(zip(item['tokens'], (ID_TO_TAG[tag_id] for tag_id in item['ner_tags'])))
            for item in section]
23
+
24
def process_dataset(short_name, conll_path, ner_output_path):
    """Download conll2003 from HF (cached in conll_path) and write Stanza .json files."""
    try:
        from datasets import load_dataset
    except ImportError as e:
        raise ImportError("Please install the datasets package to process CoNLL03 with Stanza")

    dataset = load_dataset('conll2003', cache_dir=conll_path)
    sections = (dataset['train'], dataset['validation'], dataset['test'])
    converted = [convert_dataset_section(section) for section in sections]
    write_dataset(converted, ner_output_path, short_name)
33
+
34
def main():
    """Convert en_conll03 using the default Stanza paths."""
    paths = get_default_paths()
    cache_dir = os.path.join(paths['NERBASE'], "english", "en_conll03")
    process_dataset("en_conll03", cache_dir, paths['NER_DATA_DIR'])

if __name__ == '__main__':
    main()
+ main()
stanza/stanza/utils/datasets/ner/convert_he_iahlt.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import os
3
+ import re
4
+
5
+ from stanza.utils.conll import CoNLL
6
+ import stanza.utils.default_paths as default_paths
7
+ from stanza.utils.datasets.ner.utils import write_dataset
8
+
9
def output_entities(sentence):
    """Print the raw Entity= annotation (if any) for each word of a sentence."""
    for word in sentence.words:
        if word.misc is None:
            continue

        for attribute in word.misc.split("|"):
            if attribute.startswith("Entity="):
                print(" " + attribute.split("=", maxsplit=1)[1])
                break
21
+
22
def extract_single_sentence(sentence):
    """
    Convert one conllu sentence to a list of (text, BIO tag) pairs.

    Entities are marked in the MISC column with Entity= annotations in a
    bracket notation, e.g. Entity=(Person...); a stack of currently open
    entity types is maintained while walking the words.  Nested entities
    are flattened to the outermost one.  Raises AssertionError when the
    annotation is inconsistent (mismatched open/close brackets).
    """
    current_entity = []   # stack of currently open entity types, outermost first
    words = []
    for word in sentence.words:
        text = word.text
        misc = word.misc
        if misc is None:
            pieces = []
        else:
            pieces = misc.split("|")

        closes = []           # entity types closed at this word
        first_entity = False  # True if the outermost entity opened exactly here
        for piece in pieces:
            if piece.startswith("Entity="):
                entity = piece.split("=", maxsplit=1)[1]
                # tokenize into '(', ')' and entity-type strings
                entity_pieces = re.split(r"([()])", entity)
                entity_pieces = [x for x in entity_pieces if x] # remove blanks from re.split
                entity_idx = 0
                while entity_idx < len(entity_pieces):
                    if entity_pieces[entity_idx] == '(':
                        assert len(entity_pieces) > entity_idx + 1, "Opening an unspecified entity"
                        if len(current_entity) == 0:
                            first_entity = True
                        current_entity.append(entity_pieces[entity_idx + 1])
                        entity_idx += 2
                    elif entity_pieces[entity_idx] == ')':
                        assert entity_idx != 0, "Closing an unspecified entity"
                        closes.append(entity_pieces[entity_idx-1])
                        entity_idx += 1
                    else:
                        # the entities themselves get added or removed via the ()
                        entity_idx += 1

        # tag the word with the outermost open entity, if any
        if len(current_entity) == 0:
            entity = 'O'
        else:
            entity = current_entity[0]
            entity = "B-" + entity if first_entity else "I-" + entity
        words.append((text, entity))

        # closes apply after the word is tagged, so a one-word entity works
        assert len(current_entity) >= len(closes), "Too many closes for the current open entities"
        for close_entity in closes:
            # TODO: check the close is closing the right thing
            assert close_entity == current_entity[-1], "Closed the wrong entity: %s vs %s" % (close_entity, current_entity[-1])
            current_entity = current_entity[:-1]
    return words
69
+
70
def extract_sentences(doc):
    """
    Convert every sentence of a conllu doc to (text, tag) pairs.

    Sentences whose entity annotation is inconsistent (an assert fires in
    extract_single_sentence) are reported and skipped.
    """
    converted = []
    for sentence in doc.sentences:
        try:
            converted.append(extract_single_sentence(sentence))
        except AssertionError as e:
            print("Skipping sentence %s ... %s" % (sentence.sent_id, str(e)))
            output_entities(sentence)

    return converted
81
+
82
def convert_iahlt(udbase, output_dir, short_name):
    """Read both Hebrew IAHLT UD treebanks and write a combined NER dataset."""
    shards = ("train", "dev", "test")
    sources = [("UD_Hebrew-IAHLTwiki", "he_iahltwiki-ud-%s.conllu"),
               ("UD_Hebrew-IAHLTknesset", "he_iahltknesset-ud-%s.conllu")]
    datasets = defaultdict(list)

    for ud_dataset, base_filename in sources:
        for shard in shards:
            filename = os.path.join(udbase, ud_dataset, base_filename % shard)
            sentences = extract_sentences(CoNLL.conll2doc(filename))
            print("Read %d sentences from %s" % (len(sentences), filename))
            datasets[shard].extend(sentences)

    write_dataset([datasets[shard] for shard in shards], output_dir, short_name)
99
+
100
def main():
    """Convert he_iahlt using the default Stanza paths."""
    paths = default_paths.get_default_paths()
    convert_iahlt(paths["UDBASE_GIT"], paths["NER_DATA_DIR"], "he_iahlt")

if __name__ == '__main__':
    main()
stanza/stanza/utils/datasets/ner/convert_lst20.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Converts the Thai LST20 dataset to a format usable by Stanza's NER model
3
+
4
+ The dataset in the original format has a few tag errors which we
5
+ automatically fix (or at worst cover up)
6
+ """
7
+
8
+ import os
9
+
10
+ from stanza.utils.datasets.ner.utils import convert_bio_to_json
11
+
12
def convert_lst20(paths, short_name, include_space_char=True):
    """
    Convert the Thai LST20 corpus to .bio files, then to Stanza .json.

    paths: the Stanza default paths dict (uses NERBASE and NER_DATA_DIR)
    short_name: must be "th_lst20"
    include_space_char: if False, '_' whitespace tokens are dropped and
      the dataset is written under th_lst20_no_ws instead
    """
    assert short_name == "th_lst20"
    SHARDS = ("train", "eval", "test")
    BASE_OUTPUT_PATH = paths["NER_DATA_DIR"]

    # each shard lives in its own subdirectory of LST20_Corpus
    input_split = [(os.path.join(paths["NERBASE"], "thai", "LST20_Corpus", x), x) for x in SHARDS]

    if not include_space_char:
        short_name = short_name + "_no_ws"

    for input_folder, split_type in input_split:
        # the data files all start with 'T'
        text_list = [text for text in os.listdir(input_folder) if text[0] == 'T']

        # stanza's name for the eval shard is "dev"
        if split_type == "eval":
            split_type = "dev"

        output_path = os.path.join(BASE_OUTPUT_PATH, "%s.%s.bio" % (short_name, split_type))
        print(output_path)

        with open(output_path, 'w', encoding='utf-8') as fout:
            for text in text_list:
                # NOTE(review): lst is never used - looks like leftover code
                lst = []
                with open(os.path.join(input_folder, text), 'r', encoding='utf-8') as fin:
                    lines = fin.readlines()

                for line_idx, line in enumerate(lines):
                    x = line.strip().split('\t')
                    if len(x) > 1:
                        # '_' tokens represent whitespace in LST20
                        if x[0] == '_' and not include_space_char:
                            continue
                        else:
                            word, tag = x[0], x[2]

                            # repair a handful of typo tags present in the raw data
                            if tag == "MEA_BI":
                                tag = "B_MEA"
                            if tag == "OBRN_B":
                                tag = "B_BRN"
                            if tag == "ORG_I":
                                tag = "I_ORG"
                            if tag == "PER_I":
                                tag = "I_PER"
                            if tag == "LOC_I":
                                tag = "I_LOC"
                            # a bare "B" tag: recover the class from the next
                            # line when it continues the same entity
                            if tag == "B" and line_idx + 1 < len(lines):
                                x_next = lines[line_idx+1].strip().split('\t')
                                if len(x_next) > 1:
                                    tag_next = x_next[2]
                                    if "I_" in tag_next or "E_" in tag_next:
                                        tag = tag + tag_next[1:]
                                    else:
                                        tag = "O"
                                else:
                                    tag = "O"
                            # normalize B_XXX -> B-XXX
                            if "_" in tag:
                                tag = tag.replace("_", "-")
                            # drop tags which are not part of the standard tag set
                            # NOTE(review): tag == "__" can never be true here,
                            # since the replace above already turned "__" into "--"
                            if "ABB" in tag or tag == "DDEM" or tag == "I" or tag == "__":
                                tag = "O"

                            fout.write('{}\t{}'.format(word, tag))
                            fout.write('\n')
                    else:
                        fout.write('\n')
    convert_bio_to_json(BASE_OUTPUT_PATH, BASE_OUTPUT_PATH, short_name)
stanza/stanza/utils/datasets/ner/convert_mr_l3cube.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reads one piece of the MR L3Cube dataset
3
+
4
+ The dataset is structured as a long list of words already in IOB format
5
+ The sentences have an ID which changes when a new sentence starts
6
+ The tags are labeled BNEM instead of B-NEM, so we update that.
7
+ (Could theoretically remap the tags to names more typical of other datasets as well)
8
+ """
9
+
10
def convert(input_file):
    """
    Converts one file of the dataset

    Return: a list of list of pairs, (text, tag)
    """
    with open(input_file, encoding="utf-8") as fin:
        lines = fin.readlines()

    sentences = []
    current_sentence = []
    prev_sent_id = None
    for idx, line in enumerate(lines):
        # first line of each of the segments is the header
        if idx == 0:
            continue

        stripped = line.strip()
        if not stripped:
            continue
        pieces = stripped.split("\t")
        if len(pieces) != 3:
            raise ValueError("Unexpected number of pieces at line %d of %s" % (idx, input_file))

        text, ner, sent_id = pieces
        # ner symbols are written as BNEM, BNED, etc in this dataset
        if ner != 'O':
            ner = "%s-%s" % (ner[0], ner[1:])

        if not prev_sent_id:
            prev_sent_id = sent_id
        if sent_id != prev_sent_id:
            # the sentence id changed, so the previous sentence is complete
            prev_sent_id = sent_id
            if not current_sentence:
                raise ValueError("This should not happen!")
            sentences.append(current_sentence)
            current_sentence = []

        current_sentence.append((text, ner))

    if current_sentence:
        sentences.append(current_sentence)

    print("Read %d sentences in %d lines from %s" % (len(sentences), len(lines), input_file))
    return sentences
stanza/stanza/utils/datasets/ner/convert_nner22.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Converts the Thai NNER22 dataset to a format usable by Stanza's NER model
3
+
4
+ The dataset is already written in json format, so we will convert into a compatible json format.
5
+
6
+ The dataset in the original format has nested NER format which we will only extract the first layer
7
+ of NER tag and write it in the format accepted by current Stanza model
8
+ """
9
+
10
+ import os
11
+ import logging
12
+ import json
13
+
14
def convert_nner22(paths, short_name, include_space_char=True):
    """
    Convert the Thai NNER22 (scb-nner-th-2022) json files to Stanza NER json.

    Only the outermost layer of the nested annotations is kept.

    paths: the Stanza default paths dict (uses NERBASE and NER_DATA_DIR)
    short_name: must be "th_nner22"
    include_space_char: only changes the output short name to th_nner22_no_ws;
      the tokens themselves are copied unchanged
    """
    assert short_name == "th_nner22"
    SHARDS = ("train", "dev", "test")
    BASE_INPUT_PATH = os.path.join(paths["NERBASE"], "thai", "Thai-NNER", "data", "scb-nner-th-2022", "postproc")

    if not include_space_char:
        short_name = short_name + "_no_ws"

    for shard in SHARDS:
        input_path = os.path.join(BASE_INPUT_PATH, "%s.json" % (shard))
        output_path = os.path.join(paths["NER_DATA_DIR"], "%s.%s.json" % (short_name, shard))

        logging.info("Output path for %s split at %s" % (shard, output_path))

        # bugfix: use a context manager so the input handle is closed
        # (previously json.load(open(input_path)) leaked the file object)
        with open(input_path, encoding="utf-8") as fin:
            data = json.load(fin)

        documents = []

        for i in range(len(data)):
            token, entities = data[i]["tokens"], data[i]["entities"]

            token_length, sofar = len(token), 0
            document, ner_dict = [], {}

            for entity in entities:
                start, stop = entity["span"]

                # keep only entities extending past everything seen so far,
                # ie the outermost layer of the nested annotation
                if stop > sofar:
                    ner = entity["entity_type"].upper()
                    sofar = stop

                    for j in range(start, stop):
                        if j == start:
                            ner_tag = "B-" + ner
                        elif j == stop - 1:
                            ner_tag = "E-" + ner
                        else:
                            ner_tag = "I-" + ner

                        ner_dict[j] = (ner_tag, token[j])

            for k in range(token_length):
                dict_add = {}

                if k not in ner_dict:
                    dict_add["ner"], dict_add["text"] = "O", token[k]
                else:
                    dict_add["ner"], dict_add["text"] = ner_dict[k]

                document.append(dict_add)

            documents.append(document)

        # bugfix: explicit encoding for the output file (json.dump writes
        # ASCII by default, but an explicit encoding is still safer)
        with open(output_path, "w", encoding="utf-8") as outfile:
            json.dump(documents, outfile, indent=1)

        logging.info("%s.%s.json file successfully created" % (short_name, shard))
stanza/stanza/utils/datasets/ner/convert_ontonotes.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Downloads (if necessary) conll03 from Huggingface, then converts it to Stanza .json
3
+
4
+ Some online sources for CoNLL 2003 require multiple pieces, but it is currently hosted on HF:
5
+ https://huggingface.co/datasets/conll2003
6
+ """
7
+
8
+ import os
9
+
10
+ from stanza.utils.default_paths import get_default_paths
11
+ from stanza.utils.datasets.ner.utils import write_dataset
12
+
13
# index in this list is the HF numeric label id
ID_TO_TAG = ["O", "B-PERSON", "I-PERSON", "B-NORP", "I-NORP", "B-FAC", "I-FAC", "B-ORG", "I-ORG", "B-GPE", "I-GPE", "B-LOC", "I-LOC", "B-PRODUCT", "I-PRODUCT", "B-DATE", "I-DATE", "B-TIME", "I-TIME", "B-PERCENT", "I-PERCENT", "B-MONEY", "I-MONEY", "B-QUANTITY", "I-QUANTITY", "B-ORDINAL", "I-ORDINAL", "B-CARDINAL", "I-CARDINAL", "B-EVENT", "I-EVENT", "B-WORK_OF_ART", "I-WORK_OF_ART", "B-LAW", "I-LAW", "B-LANGUAGE", "I-LANGUAGE",]

def convert_dataset_section(config_name, section):
    """Flatten one HF OntoNotes split into sentences of (word, tag) pairs."""
    sentences = []
    for doc in section:
        # the nt_ sentences (New Testament) in the HF version of OntoNotes
        # have blank named_entities, even though there was no original .name file
        # that corresponded with these annotations
        if config_name.startswith("english") and doc['document_id'].startswith("pt/nt"):
            continue
        sentences.extend(list(zip(sent['words'], (ID_TO_TAG[i] for i in sent['named_entities'])))
                         for sent in doc['sentences'])
    return sentences
28
+
29
def process_dataset(short_name, conll_path, ner_output_path):
    """
    Download (if needed) conll2012_ontonotesv5 from HF and convert to Stanza .json

    short_name: one of en_ontonotes, zh_ontonotes, zh-hans_ontonotes, ar_ontonotes
    conll_path: HF cache directory for the download
    ner_output_path: directory where the .json files are written
    """
    try:
        from datasets import load_dataset
    except ImportError as e:
        # bugfix: the message previously said "CoNLL03",
        # copied verbatim from convert_en_conll03
        raise ImportError("Please install the datasets package to process OntoNotes with Stanza")

    if short_name == 'en_ontonotes':
        # there is an english_v12, but it is filled with junk annotations
        # for example, near the end:
        # And John_O, I realize
        config_name = 'english_v4'
    elif short_name in ('zh_ontonotes', 'zh-hans_ontonotes'):
        config_name = 'chinese_v4'
    elif short_name == 'ar_ontonotes':
        config_name = 'arabic_v4'
    else:
        raise ValueError("Unknown short name for downloading ontonotes: %s" % short_name)
    dataset = load_dataset("conll2012_ontonotesv5", config_name, cache_dir=conll_path)
    datasets = [convert_dataset_section(config_name, x) for x in [dataset['train'], dataset['validation'], dataset['test']]]
    write_dataset(datasets, ner_output_path, short_name)
49
+
50
def main():
    """Convert en_ontonotes using the default Stanza paths."""
    paths = get_default_paths()
    cache_dir = os.path.join(paths['NERBASE'], "english", "en_ontonotes")
    process_dataset("en_ontonotes", cache_dir, paths['NER_DATA_DIR'])

if __name__ == '__main__':
    main()
+ main()
stanza/stanza/utils/datasets/ner/json_to_bio.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ If you want to convert .json back to .bio for some reason, this will do it for you
3
+ """
4
+
5
+ import argparse
6
+ import json
7
+ import os
8
+ from stanza.models.common.doc import Document
9
+ from stanza.models.ner.utils import process_tags
10
+ from stanza.utils.default_paths import get_default_paths
11
+
12
def convert_json_to_bio(input_filename, output_filename):
    """Read one Stanza NER .json file and write it back out as a .bio file (bioes tags)."""
    with open(input_filename, encoding="utf-8") as fin:
        doc = Document(json.load(fin))
    sentences = process_tags([[(token.text, token.ner) for token in sentence.tokens]
                              for sentence in doc.sentences],
                             "bioes")
    with open(output_filename, "w", encoding="utf-8") as fout:
        for sentence in sentences:
            for pair in sentence:
                fout.write("%s\t%s\n" % pair)
            fout.write("\n")
22
+
23
def main(args=None):
    """Convert a single .json file, or all three shards of a dataset, to .bio."""
    ner_data_dir = get_default_paths()['NER_DATA_DIR']
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_filename', type=str, default="data/ner/en_foreign-4class.test.json", help='Convert an individual file')
    parser.add_argument('--input_dir', type=str, default=ner_data_dir, help='Which directory to find the dataset, if using --input_dataset')
    parser.add_argument('--input_dataset', type=str, help='Convert an entire dataset')
    parser.add_argument('--output_suffix', type=str, default='bioes', help='suffix for output filenames')
    args = parser.parse_args(args)

    if args.input_dataset:
        inputs = [os.path.join(args.input_dir, "%s.%s.json" % (args.input_dataset, shard))
                  for shard in ("train", "dev", "test")]
    else:
        inputs = [args.input_filename]

    for input_filename in inputs:
        output_filename = "%s.%s" % (os.path.splitext(input_filename)[0], args.output_suffix)
        print("%s -> %s" % (input_filename, output_filename))
        convert_json_to_bio(input_filename, output_filename)

if __name__ == '__main__':
    main()
stanza/stanza/utils/datasets/ner/misc_to_date.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # for the Worldwide dataset, automatically switch the Misc tags to Date when Stanza Ontonotes thinks it's a Date
2
+ # this keeps our annotation scheme for dates (eg, not "3 months ago") while hopefully switching them all to Date
3
+ #
4
+ # maybe some got missed
5
+ # also, there are a few with some nested entities. printed out warnings and edited those by hand
6
+ #
7
+ # just need to run this with the Worldwide dataset in the ner path
8
+ # it will automatically convert as many as it can
9
+
10
+ import os
11
+
12
+ from tqdm import tqdm
13
+
14
+ import stanza
15
+ from stanza.utils.datasets.ner.utils import read_tsv
16
+ from stanza.utils.default_paths import get_default_paths
17
+
18
paths = get_default_paths()
BASE_PATH = os.path.join(paths["NERBASE"], "en_foreign")
input_dir = os.path.join(BASE_PATH, "en-foreign-newswire")

# pretokenized so the tags returned by the tagger line up 1-1 with the tsv tokens
pipe = stanza.Pipeline("en", processors="tokenize,ner", tokenize_pretokenized=True, package={"ner": "ontonotes_bert"})

filenames = []

def ner_tags(pipe, sentence):
    # tag one pretokenized sentence, returning one NER tag per token
    doc = pipe([sentence])
    tags = [token.ner for sentence in doc.sentences for token in sentence.tokens]
    return tags

# collect every file under the ...REVIEW subdirectories
for root, dirs, files in os.walk(input_dir):
    if root[-6:] == "REVIEW":
        batch_files = os.listdir(root)
        for filename in batch_files:
            file_path = os.path.join(root, filename)
            filenames.append(file_path)

# rewrite each file in place, switching Misc (or other non-O) labels to Date
# wherever the OntoNotes tagger believes there is a DATE entity
for filename in tqdm(filenames):
    try:
        data = read_tsv(filename, text_column=0, annotation_column=1, skip_comments=False, keep_all_columns=True)

        with open(filename, 'w', encoding='utf-8') as fout:
            warned_file = False
            for sentence in data: # segments delimited by spaces, effectively sentences
                tokens = [x[0] for x in sentence]
                labels = [x[1] for x in sentence]

                # only sentences containing a Misc label need retagging
                if any(x.endswith("Misc") for x in labels):
                    stanza_tags = ner_tags(pipe, tokens)
                    in_date = False
                    for i, stanza_tag in enumerate(stanza_tags):
                        if stanza_tag[2:] == "DATE" and labels[i] != "O":
                            if len(sentence[i]) > 2:
                                # extra columns indicate a nested entity annotation
                                if not warned_file:
                                    print("Warning: file %s has nested tags being altered" % filename)
                                    warned_file = True
                            # put DATE tags where Stanza thinks there are DATEs
                            # as long as we already had a MISC (or something else, I suppose)
                            if in_date and not stanza_tag[0].startswith("B") and not stanza_tag[0].startswith("S"):
                                sentence[i][1] = "I-Date"
                            else:
                                sentence[i][1] = "B-Date"
                            in_date = True
                        elif in_date:
                            # make sure new tags start with B- instead of I-
                            # honestly it's not clear if, in these cases,
                            # we should be switching the following tags to
                            # DATE as well. will have to experiment some
                            in_date = False
                            if labels[i].startswith("I-"):
                                sentence[i][1] = "B-" + labels[i][2:]
                # write the (possibly edited) sentence back out
                for word in sentence:
                    fout.write("\t".join(word))
                    fout.write("\n")
                fout.write("\n")
    except AssertionError:
        # read_tsv asserts on malformed files; report and move on
        print("Could not process %s" % filename)
stanza/stanza/utils/datasets/ner/preprocess_wikiner.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Converts the WikiNER data format to a format usable by our processing tools
3
+
4
+ python preprocess_wikiner input output
5
+ """
6
+
7
+ import sys
8
+
9
+ def preprocess_wikiner(input_file, output_file, encoding="utf-8"):
10
+ with open(input_file, encoding=encoding) as fin:
11
+ with open(output_file, "w", encoding="utf-8") as fout:
12
+ for line in fin:
13
+ line = line.strip()
14
+ if not line:
15
+ fout.write("-DOCSTART- O\n")
16
+ fout.write("\n")
17
+ continue
18
+
19
+ words = line.split()
20
+ for word in words:
21
+ pieces = word.split("|")
22
+ text = pieces[0]
23
+ tag = pieces[-1]
24
+ # some words look like Daniel_Bernoulli|I-PER
25
+ # but the original .pl conversion script didn't take that into account
26
+ subtext = text.split("_")
27
+ if tag.startswith("B-") and len(subtext) > 1:
28
+ fout.write("{} {}\n".format(subtext[0], tag))
29
+ for chunk in subtext[1:]:
30
+ fout.write("{} I-{}\n".format(chunk, tag[2:]))
31
+ else:
32
+ for chunk in subtext:
33
+ fout.write("{} {}\n".format(chunk, tag))
34
+ fout.write("\n")
35
+
36
+ if __name__ == '__main__':
37
+ preprocess_wikiner(sys.argv[1], sys.argv[2])
stanza/stanza/utils/datasets/ner/simplify_en_worldwide.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import tempfile
4
+
5
+ import stanza
6
+ from stanza.utils.default_paths import get_default_paths
7
+ from stanza.utils.datasets.ner.utils import read_tsv
8
+ from stanza.utils.get_tqdm import get_tqdm
9
+
10
+ tqdm = get_tqdm()
11
+
12
PUNCTUATION = """!"#%&'()*+, -./:;<=>?@[\\]^_`{|}~"""
MONEY_WORDS = {"million", "billion", "trillion", "millions", "billions", "trillions", "hundred", "hundreds",
               "lakh", "crore", # south asian english
               "tens", "of", "ten", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "couple"}

# Doesn't include Money but this case is handled explicitly for processing
LABEL_TRANSLATION = {
    "Date": None,
    "Misc": "MISC",
    "Product": "MISC",
    "NORP": "MISC",
    "Facility": "LOC",
    "Location": "LOC",
    "Person": "PER",
    "Organization": "ORG",
}

def isfloat(num):
    """Return True if num parses as a float."""
    try:
        float(num)
    except ValueError:
        return False
    return True


def process_label(line, is_start=False):
    """
    Convert one (token, label, ...) line from the Worldwide scheme to 4-class conll labels.

    Product / NORP / Misc collapse into MISC, Facility into LOC, and
    Date labels are deleted entirely.  Money keeps only the currency
    symbols as MISC: numbers, magnitude words and punctuation inside a
    Money span are dropped to O.

    is_start signals that this line begins a new tag; it is needed when
    the previous line was the start of a MONEY tag but was removed as a
    non-symbol, so the first surviving symbol token must be promoted to
    a B- tag even if it was originally I-MONEY.

    Returns [] for an empty line, otherwise [token, new_label, is_start].
    """
    if not line:
        return []
    token, full_label = line[0], line[1]
    position = full_label[:2]
    label_name = full_label[2:]

    if label_name == "Money":
        if token.lower() in MONEY_WORDS or token in PUNCTUATION or isfloat(token):
            # remove this tag; the next surviving token restarts the span
            label_name = "O"
            is_start = True
            position = ""
        else:
            # keep the money tag, as MISC
            label_name = "MISC"
            if is_start:
                position = "B-"
                is_start = False

    elif not label_name or label_name == "O":
        pass
    elif label_name in LABEL_TRANSLATION:
        label_name = LABEL_TRANSLATION[label_name]
        if label_name is None:
            # Dates are deleted entirely
            position = ""
            label_name = "O"
            is_start = False
    else:
        raise ValueError("Oops, missed a label: %s" % label_name)
    return [token, position + label_name, is_start]
92
+
93
+
94
def write_new_file(save_dir, input_path, old_file, simplify):
    """
    Rewrite one raw tsv file into save_dir.

    If simplify is True, every label line is run through process_label
    (4-class output, filename gets a .4class.tsv suffix); otherwise the
    first two columns are copied through unchanged.

    starts_b threads the "next kept Money token starts a new span" state
    from one line to the next via process_label's third return value.
    """
    starts_b = False
    with open(input_path, "r+", encoding="utf-8") as iob:
        new_filename = (os.path.splitext(old_file)[0] + ".4class.tsv") if simplify else old_file
        with open(os.path.join(save_dir, new_filename), 'w', encoding='utf-8') as fout:
            for i, line in enumerate(iob):
                if i == 0 or i == 1: # skip over the URL and subsequent space line.
                    continue
                line = line.strip()
                if not line:
                    # sentence separator: preserved as a blank line
                    fout.write("\n")
                    continue
                label = line.split("\t")
                if simplify:
                    try:
                        edited = process_label(label, is_start=starts_b) # processed label line labels
                    except ValueError as e:
                        raise ValueError("Error in %s at line %d" % (input_path, i)) from e
                    assert edited
                    # last element of edited is the carried-over is_start flag
                    starts_b = edited[-1]
                    fout.write("\t".join(edited[:-1]))
                    fout.write("\n")
                else:
                    fout.write("%s\t%s\n" % (label[0], label[1]))
118
+
119
+
120
def copy_and_simplify(base_path, simplify):
    """
    Walk the raw en-worldwide-newswire tree and rewrite each REVIEW file.

    simplify=True maps the Worldwide labels down to 4 conll classes
    (output under <base_path>/4class); otherwise the labels are copied
    as-is (output under <base_path>/9class).
    """
    # bugfix: removed a TemporaryDirectory which was created (and
    # redundantly makedirs'd) but never actually used for anything
    input_dir = os.path.join(base_path, "en-worldwide-newswire")
    final_dir = os.path.join(base_path, "4class" if simplify else "9class")
    os.makedirs(final_dir, exist_ok=True)
    for root, dirs, files in os.walk(input_dir):
        if root[-6:] == "REVIEW":
            for filename in os.listdir(root):
                file_path = os.path.join(root, filename)
                write_new_file(final_dir, file_path, filename, simplify)
133
+
134
def main(args=None):
    """Command line driver for simplifying the Worldwide dataset."""
    BASE_PATH = "C:\\Users\\SystemAdmin\\PycharmProjects\\General Code\\stanza source code"
    if not os.path.exists(BASE_PATH):
        BASE_PATH = os.path.join(get_default_paths()["NERBASE"], "en_worldwide")

    parser = argparse.ArgumentParser()
    parser.add_argument('--base_path', type=str, default=BASE_PATH, help="Where to find the raw data")
    parser.add_argument('--simplify', default=False, action='store_true', help='Simplify to 4 classes... otherwise, keep all classes')
    parser.add_argument('--no_simplify', dest='simplify', action='store_false', help="Don't simplify to 4 classes")
    args = parser.parse_args(args=args)

    copy_and_simplify(args.base_path, args.simplify)

if __name__ == '__main__':
    main()
+ main()
150
+
151
+
152
+
stanza/stanza/utils/datasets/ner/simplify_ontonotes_to_worldwide.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simplify an existing ner json with the OntoNotes 18 class scheme to the Worldwide scheme
3
+
4
+ Simplified classes used in the Worldwide dataset are:
5
+
6
+ Date
7
+ Facility
8
+ Location
9
+ Misc
10
+ Money
11
+ NORP
12
+ Organization
13
+ Person
14
+ Product
15
+
16
+ vs OntoNotes classes:
17
+
18
+ CARDINAL
19
+ DATE
20
+ EVENT
21
+ FAC
22
+ GPE
23
+ LANGUAGE
24
+ LAW
25
+ LOC
26
+ MONEY
27
+ NORP
28
+ ORDINAL
29
+ ORG
30
+ PERCENT
31
+ PERSON
32
+ PRODUCT
33
+ QUANTITY
34
+ TIME
35
+ WORK_OF_ART
36
+ """
37
+
38
+ import argparse
39
+ import glob
40
+ import json
41
+ import os
42
+
43
+ from stanza.utils.default_paths import get_default_paths
44
+
45
WORLDWIDE_ENTITY_MAPPING = {
    # OntoNotes classes with no Worldwide equivalent are dropped entirely
    "CARDINAL": None,
    "ORDINAL": None,
    "PERCENT": None,
    "QUANTITY": None,
    "TIME": None,

    # OntoNotes class -> Worldwide class
    "DATE": "Date",
    "EVENT": "Misc",
    "FAC": "Facility",
    "GPE": "Location",
    "LANGUAGE": "NORP",
    "LAW": "Misc",
    "LOC": "Location",
    "MONEY": "Money",
    "NORP": "NORP",
    "ORG": "Organization",
    "PERSON": "Person",
    "PRODUCT": "Product",
    "WORK_OF_ART": "Misc",

    # identity map in case this is called on the Worldwide half of the tags
    "Date": "Date",
    "Facility": "Facility",
    "Location": "Location",
    "Misc": "Misc",
    "Money": "Money",
    "Organization": "Organization",
    "Person": "Person",
    "Product": "Product",
}

def simplify_ontonotes_to_worldwide(entity):
    """Map a single BIO tag such as B-GPE to the Worldwide scheme (e.g. B-Location).

    Empty or O tags are returned as "O"; classes that the Worldwide scheme
    drops (CARDINAL etc) also become "O".  Raises ValueError on a class
    missing from WORLDWIDE_ENTITY_MAPPING.
    """
    if not entity or entity == "O":
        return "O"

    prefix, entity_class = entity.split("-", maxsplit=1)

    if entity_class not in WORLDWIDE_ENTITY_MAPPING:
        raise ValueError("Unhandled entity: %s" % entity_class)
    mapped = WORLDWIDE_ENTITY_MAPPING[entity_class]
    if not mapped:
        return "O"
    return prefix + "-" + mapped
+
89
def convert_file(in_file, out_file):
    """Read a stanza NER json file, remap its tags to the Worldwide scheme, write it back out.

    in_file: path to the json file (list of sentences, each a list of word dicts)
    out_file: path to write the converted json
    Words with no 'ner' key are left untouched.
    """
    # bugfix: the read previously used the platform default encoding while the
    # write forced utf-8; read with utf-8 as well for consistency
    with open(in_file, encoding="utf-8") as fin:
        gold_doc = json.load(fin)

    for sentence in gold_doc:
        for word in sentence:
            if 'ner' not in word:
                continue
            word['ner'] = simplify_ontonotes_to_worldwide(word['ner'])

    with open(out_file, "w", encoding="utf-8") as fout:
        json.dump(gold_doc, fout, indent=2)
+
102
def main():
    """Convert every shard of one NER dataset to the Worldwide label scheme."""
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--input_dataset', type=str, default='en_ontonotes', help='which files to convert')
    parser.add_argument('--output_dataset', type=str, default='en_ontonotes-8class', help='which files to write out')
    parser.add_argument('--ner_data_dir', type=str, default=get_default_paths()["NER_DATA_DIR"], help='which directory has the data')
    args = parser.parse_args()

    # each input file keeps its extension but gets the output dataset prefix
    for input_file in glob.glob(os.path.join(args.ner_data_dir, args.input_dataset + ".*")):
        extension = os.path.split(input_file)[1][len(args.input_dataset):]
        output_file = os.path.join(args.ner_data_dir, args.output_dataset + extension)
        print("Converting %s to %s" % (input_file, output_file))
        convert_file(input_file, output_file)


if __name__ == '__main__':
    main()
stanza/stanza/utils/datasets/ner/split_wikiner.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Preprocess the WikiNER dataset, by
3
+ 1) normalizing tags;
4
+ 2) split into train (70%), dev (15%), test (15%) datasets.
5
+ """
6
+
7
+ import os
8
+ import random
9
+ import warnings
10
+ from collections import Counter
11
+
12
+ def read_sentences(filename, encoding):
13
+ sents = []
14
+ cache = []
15
+ skipped = 0
16
+ skip = False
17
+ with open(filename, encoding=encoding) as infile:
18
+ for i, line in enumerate(infile):
19
+ line = line.rstrip()
20
+ if len(line) == 0:
21
+ if len(cache) > 0:
22
+ if not skip:
23
+ sents.append(cache)
24
+ else:
25
+ skipped += 1
26
+ skip = False
27
+ cache = []
28
+ continue
29
+ array = line.split()
30
+ if len(array) != 2:
31
+ skip = True
32
+ warnings.warn("Format error at line {}: {}".format(i+1, line))
33
+ continue
34
+ w, t = array
35
+ cache.append([w, t])
36
+ if len(cache) > 0:
37
+ if not skip:
38
+ sents.append(cache)
39
+ else:
40
+ skipped += 1
41
+ cache = []
42
+ print("Skipped {} examples due to formatting issues.".format(skipped))
43
+ return sents
44
+
45
def write_sentences_to_file(sents, filename):
    """Write sentences (lists of [word, tag] pairs) to filename.

    One token per line, word and tag separated by a tab, with a blank line
    after each sentence.
    """
    # bugfix: the status message previously did not interpolate the filename
    print(f"Writing {len(sents)} sentences to {filename}")
    with open(filename, 'w') as outfile:
        for sent in sents:
            for pair in sent:
                print(f"{pair[0]}\t{pair[1]}", file=outfile)
            print("", file=outfile)
+
53
def remap_labels(sents, remap):
    """Return a copy of sents with each tag replaced via the remap dict.

    Tags missing from remap are kept unchanged; words are never altered.
    The input structure (list of sentences of [word, tag] pairs) is preserved.
    """
    return [[[pair[0], remap.get(pair[1], pair[1])] for pair in sentence]
            for sentence in sents]
61
+
62
def split_wikiner(directory, *in_filenames, encoding="utf-8", prefix="", suffix="bio", remap=None, shuffle=True, train_fraction=0.7, dev_fraction=0.15, test_section=True):
    """Split one or more word/tag files into train/dev(/test) sections.

    directory: where to write the output files
    in_filenames: input files, concatenated before splitting
    encoding: encoding used to read the input files
    prefix/suffix: output files are named [prefix.]{train,dev,test}.suffix
    remap: optional dict of tag -> replacement tag
    shuffle: shuffle sentences (fixed seed, reproducible) before splitting
    test_section: if False, everything after train goes into dev and no test file is written
    Raises ValueError if train_fraction + dev_fraction exceeds 1.
    """
    # fixed seed so results do not depend on how many datasets are processed at once
    random.seed(1234)

    sents = []
    for filename in in_filenames:
        new_sents = read_sentences(filename, encoding)
        # bugfix: the per-file message previously did not interpolate the filename
        print(f"{len(new_sents)} sentences read from {filename}.")
        sents.extend(new_sents)

    if remap:
        sents = remap_labels(sents, remap)

    # split
    num = len(sents)
    train_num = int(num*train_fraction)
    if test_section:
        dev_num = int(num*dev_fraction)
        if train_fraction + dev_fraction > 1.0:
            # bugfix: this format string previously had three placeholders for two arguments,
            # which raised IndexError instead of the intended ValueError
            raise ValueError("Train and dev fractions added up to more than 1: {} {}".format(train_fraction, dev_fraction))
    else:
        dev_num = num - train_num

    if shuffle:
        random.shuffle(sents)
    train_sents = sents[:train_num]
    dev_sents = sents[train_num:train_num+dev_num]
    if test_section:
        test_sents = sents[train_num+dev_num:]
        batches = [train_sents, dev_sents, test_sents]
        filenames = [f'train.{suffix}', f'dev.{suffix}', f'test.{suffix}']
    else:
        batches = [train_sents, dev_sents]
        filenames = [f'train.{suffix}', f'dev.{suffix}']

    if prefix:
        filenames = ['%s.%s' % (prefix, f) for f in filenames]
    for batch, filename in zip(batches, filenames):
        write_sentences_to_file(batch, os.path.join(directory, filename))
100
+
101
if __name__ == "__main__":
    # simple manual invocation: split the raw wp2 file, writing to the current directory
    split_wikiner(".", 'raw/wp2.txt')
stanza/stanza/utils/datasets/ner/suc_conll_to_iob.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Process the licensed version of SUC3 to BIO
3
+
4
+ The main program processes the expected location, or you can pass in a
5
+ specific zip or filename to read
6
+ """
7
+
8
+ from io import TextIOWrapper
9
+ from zipfile import ZipFile
10
+
11
def extract(infile, outfile):
    """
    Convert an open SUC3 conll file to two-column BIO and write it to outfile.

    Assumes both files are already open (this allows you to pass in a
    zipfile reader, for example).

    The SUC3 format is like conll, but with the NER tag pieces in columns
    10 and 11.  Returns the number of sentences written.
    """
    sentences = []
    current = []
    for idx, raw in enumerate(infile.readlines()):
        raw = raw.strip()
        if not raw:
            # blank line closes the sentence currently being read
            if current:
                sentences.append(current)
                current = []
            continue

        pieces = raw.split("\t")
        if len(pieces) < 12:
            raise ValueError("Unexpected line length in the SUC3 dataset at %d" % idx)
        word = pieces[1]
        # column 10 is the BIO prefix (or O), column 11 the entity class
        if pieces[10] == 'O':
            current.append((word, "O"))
        else:
            current.append((word, "%s-%s" % (pieces[10], pieces[11])))
    if current:
        sentences.append(current)

    for sentence in sentences:
        for token in sentence:
            outfile.write("%s\t%s\n" % token)
        outfile.write("\n")

    return len(sentences)

def extract_from_zip(zip_filename, in_filename, out_filename):
    """
    Process a single file from SUC3

    zip_filename: path to SUC3.0.zip
    in_filename: which piece to read
    out_filename: where to write the result
    Returns the number of sentences processed.
    """
    with ZipFile(zip_filename) as zin, zin.open(in_filename) as fin, open(out_filename, "w") as fout:
        num = extract(TextIOWrapper(fin, encoding="utf-8"), fout)
    print("Processed %d sentences from %s:%s to %s" % (num, zip_filename, in_filename, out_filename))
    return num
+
63
def process_suc3(zip_filename, short_name, out_dir):
    """Extract the train/dev/test conll pieces of SUC3.0 into out_dir as BIO files."""
    for shard in ("train", "dev", "test"):
        extract_from_zip(zip_filename,
                         "SUC3.0/corpus/conll/suc-%s.conll" % shard,
                         "%s/%s.%s.bio" % (out_dir, short_name, shard))
+
68
def main():
    """Process the SUC3 zip from its default location into data/ner."""
    # bugfix: process_suc3 takes (zip_filename, short_name, out_dir), but the
    # short_name argument was previously missing, making this call a TypeError
    process_suc3("extern_data/ner/sv_suc3/SUC3.0.zip", "sv_suc3", "data/ner")

if __name__ == '__main__':
    main()
stanza/stanza/utils/datasets/pos/__init__.py ADDED
File without changes
stanza/stanza/utils/datasets/pos/convert_trees_to_pos.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Turns a constituency treebank into a POS dataset with the tags as the upos column
3
+
4
+ The constituency treebank first has to be converted from the original
5
+ data to PTB style trees. This script converts trees from the
6
+ CONSTITUENCY_DATA_DIR folder to a conllu dataset in the POS_DATA_DIR folder.
7
+
8
+ Note that this doesn't pay any attention to whether or not the tags actually are upos.
9
+ Also not possible: using this for tokenization.
10
+
11
+ TODO: upgrade the POS model to handle xpos datasets with no upos, then make upos/xpos an option here
12
+
13
+ To run this:
14
+ python3 stanza/utils/training/run_pos.py vi_vlsp22
15
+
16
+ """
17
+
18
+ import argparse
19
+ import os
20
+ import shutil
21
+ import sys
22
+
23
+ from stanza.models.constituency import tree_reader
24
+ import stanza.utils.default_paths as default_paths
25
+ from stanza.utils.get_tqdm import get_tqdm
26
+
27
+ tqdm = get_tqdm()
28
+
29
+ SHARDS = ("train", "dev", "test")
30
+
31
def convert_file(in_file, out_file, upos):
    """Convert one file of constituency trees to a fake-dependency conllu file.

    Each preterminal tag is written as the upos column (if upos is True) or
    the xpos column otherwise.  Dependencies are faked: word 1 attaches to
    root, every other word to the previous word with relation "dep".
    """
    print("Reading %s" % in_file)
    trees = tree_reader.read_tree_file(in_file)
    print("Writing %s" % out_file)
    with open(out_file, "w") as fout:
        for tree in tqdm(trees):
            tree = tree.simplify_labels()
            fout.write("# text = %s\n" % " ".join(tree.leaf_labels()))

            for word_idx, preterminal in enumerate(tree.yield_preterminals()):
                tag = preterminal.label
                columns = [
                    "%d" % (word_idx + 1),          # 1-based word index
                    preterminal.children[0].label,  # word
                    "_",                            # lemma unknown
                    tag if upos else "_",           # upos: tag goes here if requested
                    "_" if upos else tag,           # xpos otherwise
                    "_",                            # no features
                    "%d" % word_idx,                # fake head: previous word (0 = root)
                    "root" if word_idx == 0 else "dep",
                    "_",                            # no enhanced deps
                    "_",                            # no misc
                ]
                fout.write("\t".join(columns))
                fout.write("\n")
            fout.write("\n")
63
+
64
def convert_treebank(short_name, upos, output_name, paths):
    """Convert the train/dev/test .mrg files of a treebank into POS conllu files.

    Reads from CONSTITUENCY_DATA_DIR, writes .in.conllu files to POS_DATA_DIR
    and copies each to an identical .gold.conllu.  Raises FileNotFoundError
    if any expected input file is missing.
    """
    con_dir = paths["CONSTITUENCY_DATA_DIR"]
    in_files = [os.path.join(con_dir, "%s_%s.mrg" % (short_name, shard)) for shard in SHARDS]
    for in_file in in_files:
        if not os.path.exists(in_file):
            raise FileNotFoundError("Cannot find expected datafile %s" % in_file)

    pos_dir = paths["POS_DATA_DIR"]
    os.makedirs(pos_dir, exist_ok=True)
    if output_name is None:
        output_name = short_name
    out_files = [os.path.join(pos_dir, "%s.%s.in.conllu" % (output_name, shard)) for shard in SHARDS]
    gold_files = [os.path.join(pos_dir, "%s.%s.gold.conllu" % (output_name, shard)) for shard in SHARDS]

    for in_file, out_file in zip(in_files, out_files):
        convert_file(in_file, out_file, upos)
    # gold files are plain copies of the converted output
    for out_file, gold_file in zip(out_files, gold_files):
        shutil.copy2(out_file, gold_file)
83
+
84
if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("dataset", help="Which dataset to process from trees to POS")
    arg_parser.add_argument("--upos", action="store_true", default=False, help="Store tags on the UPOS")
    arg_parser.add_argument("--xpos", dest="upos", action="store_false", help="Store tags on the XPOS")
    arg_parser.add_argument("--output_name", default=None, help="What name to give the output dataset. If blank, will use the dataset arg")
    parsed_args = arg_parser.parse_args()

    convert_treebank(parsed_args.dataset, parsed_args.upos, parsed_args.output_name,
                     default_paths.get_default_paths())
stanza/stanza/utils/datasets/prepare_tokenizer_data.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import re
5
+ import sys
6
+
7
+ from collections import Counter
8
+
9
+ """
10
+ Data is output in 4 files:
11
+
12
+ a file containing the mwt information
13
+ a file containing the words and sentences in conllu format
14
+ a file containing the raw text of each paragraph
15
+ a file of 0,1,2 indicating word break or sentence break on a character level for the raw text
16
+ 1: end of word
17
+ 2: end of sentence
18
+ """
19
+
20
# a paragraph break is two newlines separated only by whitespace
PARAGRAPH_BREAK = re.compile(r'\n\s*\n')

def is_para_break(index, text):
    """Detect a paragraph break starting at text[index].

    Returns (found, length) where length is the number of characters the
    break sequence spans (0 when no break is found).
    """
    if text[index] != '\n':
        return False, 0
    matched = PARAGRAPH_BREAK.match(text, index)
    if matched:
        return True, len(matched.group(0))
    return False, 0
30
+
31
def find_next_word(index, text, word, output):
    """
    Locate the next word in the text. In case a paragraph break is found, also write paragraph break to labels.

    index: character offset into text where the scan starts
    text: the raw document text
    word: the next token (from the conllu file) expected to appear in text
    output: file-like object the tokenizer labels are written to

    Returns (new_index, word_sofar): the offset just past the consumed
    characters, and the raw text consumed while matching word (possibly
    including leading whitespace).

    Raises AssertionError if the raw text and the token disagree.
    """
    idx = 0            # position within word
    word_sofar = ''    # raw text consumed so far
    while index < len(text) and idx < len(word):
        para_break, break_len = is_para_break(index, text)
        if para_break:
            # multiple newlines found, paragraph break
            if len(word_sofar) > 0:
                # only whitespace may be left unconsumed at a paragraph boundary
                assert re.match(r'^\s+$', word_sofar), 'Found non-empty string at the end of a paragraph that doesn\'t match any token: |{}|'.format(word_sofar)
                word_sofar = ''

            output.write('\n\n')
            # -1 compensates for the unconditional index += 1 at the bottom of the loop
            index += break_len - 1
        elif re.match(r'^\s$', text[index]) and not re.match(r'^\s$', word[idx]):
            # whitespace found, and whitespace is not part of a word
            word_sofar += text[index]
        else:
            # non-whitespace char, or a whitespace char that's part of a word
            word_sofar += text[index]
            # newlines in the raw text correspond to spaces in the conllu token
            assert text[index].replace('\n', ' ') == word[idx], "Character mismatch: raw text contains |%s| but the next word is |%s|." % (word_sofar, word)
            idx += 1
        index += 1
    return index, word_sofar
57
+
58
def main(args):
    """Convert a plaintext file plus its matching conllu file into tokenizer labels.

    Writes one character-level label per character of the raw text:
      0 = inside a token, 1 = end of token, 2 = end of sentence,
      3 = end of a multi-word token, 4 = end of an MWT that also ends a sentence
    (paragraph breaks are emitted as blank lines).  MWT expansions are
    collected and written as json (or printed when no -m file is given).

    args: list of command line arguments (plaintext_file, conllu_file, -o, -m)
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('plaintext_file', type=str, help="Plaintext file containing the raw input")
    parser.add_argument('conllu_file', type=str, help="CoNLL-U file containing tokens and sentence breaks")
    parser.add_argument('-o', '--output', default=None, type=str, help="Output file name; output to the console if not specified (the default)")
    parser.add_argument('-m', '--mwt_output', default=None, type=str, help="Output file name for MWT expansions; output to the console if not specified (the default)")

    args = parser.parse_args(args=args)

    with open(args.plaintext_file, 'r', encoding='utf-8') as f:
        text = ''.join(f.readlines())

    if args.output is None:
        output = sys.stdout
    else:
        outdir = os.path.split(args.output)[0]
        # bugfix: os.makedirs("") raises FileNotFoundError when the output
        # path has no directory component, so only create a dir if one exists
        if outdir:
            os.makedirs(outdir, exist_ok=True)
        output = open(args.output, 'w')

    index = 0  # character offset in rawtext

    mwt_expansions = []
    with open(args.conllu_file, 'r', encoding='utf-8') as f:
        buf = ''           # pending labels for the most recent token
        mwtbegin = 0       # word id range of the MWT currently being expanded
        mwtend = -1
        expanded = []      # expansion words collected for the current MWT
        lastmwt = None     # surface form of the current MWT (robustness: start defined)
        last_comments = ""
        for line in f:
            line = line.strip()
            if len(line):
                if line[0] == "#":
                    # comment; just remember the first one for error reporting
                    if len(last_comments) == 0:
                        last_comments = line
                    continue

                line = line.split('\t')
                if '.' in line[0]:
                    # the tokenizer doesn't deal with ellipsis
                    continue

                word = line[1]
                if '-' in line[0]:
                    # multiword token
                    mwtbegin, mwtend = [int(x) for x in line[0].split('-')]
                    lastmwt = word
                    expanded = []
                elif mwtbegin <= int(line[0]) < mwtend:
                    # interior word of an MWT: collect, emit nothing
                    expanded += [word]
                    continue
                elif int(line[0]) == mwtend:
                    # final word of an MWT: record the full expansion
                    expanded += [word]
                    expanded = [x.lower() for x in expanded] # evaluation doesn't care about case
                    mwt_expansions += [(lastmwt, tuple(expanded))]
                    if lastmwt[0].islower() and not expanded[0][0].islower():
                        print('Sentence ID with potential wrong MWT expansion: ', last_comments, file=sys.stderr)
                    mwtbegin = 0
                    mwtend = -1
                    lastmwt = None
                    continue

                if len(buf):
                    output.write(buf)
                index, word_found = find_next_word(index, text, word, output)
                # '3' marks the end of a multi-word token, '1' a plain token end
                buf = '0' * (len(word_found)-1) + ('1' if '-' not in line[0] else '3')
            else:
                # sentence break found
                if len(buf):
                    assert int(buf[-1]) >= 1
                    # bump the final label (1->2, 3->4) to also mark the sentence end
                    output.write(buf[:-1] + '{}'.format(int(buf[-1]) + 1))
                    buf = ''

                last_comments = ''

    status_line = ""
    if args.output:
        output.close()
        status_line = 'Tokenizer labels written to %s\n ' % args.output

    mwts = Counter(mwt_expansions)
    if args.mwt_output is None:
        print('MWTs:', mwts)
    else:
        with open(args.mwt_output, 'w') as f:
            json.dump(list(mwts.items()), f, indent=2)

        status_line = status_line + '{} unique MWTs found in data. MWTs written to {}'.format(len(mwts), args.mwt_output)
    print(status_line)
stanza/stanza/utils/datasets/prepare_tokenizer_treebank.py ADDED
@@ -0,0 +1,1396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Prepares train, dev, test for a treebank
3
+
4
+ For example, do
5
+ python -m stanza.utils.datasets.prepare_tokenizer_treebank TREEBANK
6
+ such as
7
+ python -m stanza.utils.datasets.prepare_tokenizer_treebank UD_English-EWT
8
+
9
+ and it will prepare each of train, dev, test
10
+
11
+ There are macros for preparing all of the UD treebanks at once:
12
+ python -m stanza.utils.datasets.prepare_tokenizer_treebank ud_all
13
+ python -m stanza.utils.datasets.prepare_tokenizer_treebank all_ud
14
+ Both are present because I kept forgetting which was the correct one
15
+
16
+ There are a few special case handlings of treebanks in this file:
17
+ - all Vietnamese treebanks have special post-processing to handle
18
+ some of the difficult spacing issues in Vietnamese text
19
+ - treebanks with train and test but no dev split have the
20
+ train data randomly split into two pieces
21
+ - however, instead of splitting very tiny treebanks, we skip those
22
+ """
23
+
24
+ import argparse
25
+ import glob
26
+ import io
27
+ import os
28
+ import random
29
+ import re
30
+ import tempfile
31
+ import zipfile
32
+
33
+ from collections import Counter
34
+
35
+ from stanza.models.common.constant import treebank_to_short_name
36
+ import stanza.utils.datasets.common as common
37
+ from stanza.utils.datasets.common import read_sentences_from_conllu, write_sentences_to_conllu, write_sentences_to_file, INT_RE, MWT_RE, MWT_OR_COPY_RE
38
+ import stanza.utils.datasets.tokenization.convert_ml_cochin as convert_ml_cochin
39
+ import stanza.utils.datasets.tokenization.convert_my_alt as convert_my_alt
40
+ import stanza.utils.datasets.tokenization.convert_vi_vlsp as convert_vi_vlsp
41
+ import stanza.utils.datasets.tokenization.convert_th_best as convert_th_best
42
+ import stanza.utils.datasets.tokenization.convert_th_lst20 as convert_th_lst20
43
+ import stanza.utils.datasets.tokenization.convert_th_orchid as convert_th_orchid
44
+
45
def copy_conllu_file(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_name):
    """Copy one conllu file from the tokenizer directory to dest_dir.

    Goes through a read/write cycle instead of shutil.copyfile so that
    sentence-level manipulations can be slotted in here if ever needed
    (for example, adding fake dependencies - TODO: still needed?).
    """
    source_path = f"{tokenizer_dir}/{short_name}.{tokenizer_file}.conllu"
    dest_path = f"{dest_dir}/{short_name}.{dest_file}.conllu"

    print("Copying from %s to %s" % (source_path, dest_path))
    sentences = read_sentences_from_conllu(source_path)
    write_sentences_to_conllu(dest_path, sentences)
+
55
def copy_conllu_treebank(treebank, model_type, paths, dest_dir, postprocess=None, augment=True):
    """
    This utility method copies only the conllu files to the given destination directory.

    Both POS, lemma, and depparse annotators need this.

    postprocess: optional callable with the same signature as copy_conllu_file,
    applied when copying the train/dev/test gold files (defaults to a plain copy)
    """
    os.makedirs(dest_dir, exist_ok=True)

    short_name = treebank_to_short_name(treebank)

    with tempfile.TemporaryDirectory() as tokenizer_dir:
        # shallow copy so the caller's paths dict is not mutated
        paths = dict(paths)
        paths["TOKENIZE_DATA_DIR"] = tokenizer_dir

        # first we process the tokenization data
        args = argparse.Namespace()
        args.augment = augment
        args.prepare_labels = False
        process_treebank(treebank, model_type, paths, args)

        # (dest_dir was already created above; the second makedirs was redundant)

        if postprocess is None:
            postprocess = copy_conllu_file

        # now we copy the processed conllu data files
        postprocess(tokenizer_dir, "train.gold", dest_dir, "train.in", short_name)
        postprocess(tokenizer_dir, "dev.gold", dest_dir, "dev.in", short_name)
        postprocess(tokenizer_dir, "test.gold", dest_dir, "test.in", short_name)
        # POS and depparse build their own gold files elsewhere
        if model_type is not common.ModelType.POS and model_type is not common.ModelType.DEPPARSE:
            copy_conllu_file(dest_dir, "dev.in", dest_dir, "dev.gold", short_name)
            copy_conllu_file(dest_dir, "test.in", dest_dir, "test.gold", short_name)
+
89
def split_train_file(treebank, train_input_conllu, train_output_conllu, dev_output_conllu):
    """Randomly split a train conllu file into train and dev pieces.

    The dev fraction is XV_RATIO; the shuffle uses a fixed seed so the split
    is reproducible no matter how many treebanks are processed at once.
    Returns True on success; asserts that at least one dev sentence exists.
    """
    # set the seed for each data file so that the results are the same
    # regardless of how many treebanks are processed at once
    random.seed(1234)

    # read and shuffle conllu data
    sents = read_sentences_from_conllu(train_input_conllu)
    random.shuffle(sents)
    n_dev = int(len(sents) * XV_RATIO)
    assert n_dev >= 1, "Dev sentence number less than one."
    n_train = len(sents) - n_dev

    # split conllu data: the first n_dev shuffled sentences become dev
    dev_sents, train_sents = sents[:n_dev], sents[n_dev:]
    print("Train/dev split not present. Randomly splitting train file from %s to %s and %s" % (train_input_conllu, train_output_conllu, dev_output_conllu))
    print(f"{len(sents)} total sentences found: {n_train} in train, {n_dev} in dev")

    # write conllu
    write_sentences_to_conllu(train_output_conllu, train_sents)
    write_sentences_to_conllu(dev_output_conllu, dev_sents)

    return True
+
113
+
114
def has_space_after_no(piece):
    """Return True if this misc-column value carries a SpaceAfter=No annotation."""
    if not piece or piece == "_":
        return False
    # annotations are |-separated; an exact-match check avoids false hits
    # on values like SpaceAfter=Nope
    return "SpaceAfter=No" in piece.split("|")
+
122
+
123
def remove_space_after_no(piece, fail_if_missing=True):
    """
    Removes a SpaceAfter=No annotation from a single piece of a single word.
    In other words, given a list of conll lines, first call split("\t"), then call this on the -1 column

    Raises ValueError if the annotation is absent and fail_if_missing is set;
    otherwise the piece is returned unchanged in that case.
    """
    # |SpaceAfter is in UD_Romanian-Nonstandard... seems fitting
    if piece in ("SpaceAfter=No", "|SpaceAfter=No"):
        return "_"
    if piece.startswith("SpaceAfter=No|"):
        return piece.replace("SpaceAfter=No|", "")
    if piece.find("|SpaceAfter=No") > 0:
        return piece.replace("|SpaceAfter=No", "")
    if fail_if_missing:
        raise ValueError("Could not find SpaceAfter=No in the given notes field")
    return piece
+
139
def add_space_after_no(piece, fail_if_found=True):
    """Add a SpaceAfter=No annotation to one misc-column value.

    Raises ValueError if the annotation is already present and fail_if_found is set.
    """
    if piece == '_':
        return "SpaceAfter=No"
    if fail_if_found and has_space_after_no(piece):
        raise ValueError("Given notes field already contained SpaceAfter=No")
    return piece + "|SpaceAfter=No"
+
148
+
149
def augment_arabic_padt(sents, ratio=0.05):
    """
    Basic Arabic tokenizer gets the trailing punctuation wrong if there is a blank space.

    Reason seems to be that there are almost no examples of "text ." in the dataset.
    This function augments the Arabic-PADT dataset with a few such examples.
    TODO: it may very well be that a lot of tokeners have this problem.

    Also, there are a few examples in UD2.7 which are apparently
    headlines where there is a ' . ' in the middle of the text.
    According to an Arabic speaking labmate, the sentences are
    headlines which could be reasonably split into two items. Having
    them as one item is quite confusing and possibly incorrect, but
    such is life.

    sents: list of sentences, each a list of conllu lines (comments first)
    ratio: fraction of eligible sentences to augment
    Returns the original sentences plus the augmented copies.
    """
    new_sents = []
    for sentence in sents:
        if len(sentence) < 4:
            raise ValueError("Read a surprisingly short sentence")
        # figure out which comment line holds "# text", based on which
        # header comments precede it
        text_line = None
        if sentence[0].startswith("# newdoc") and sentence[3].startswith("# text"):
            text_line = 3
        elif sentence[0].startswith("# newpar") and sentence[2].startswith("# text"):
            text_line = 2
        elif sentence[0].startswith("# sent_id") and sentence[1].startswith("# text"):
            text_line = 1
        else:
            raise ValueError("Could not find text line in %s" % sentence[0].split()[-1])

        # for some reason performance starts dropping quickly at higher numbers
        if random.random() > ratio:
            continue

        # only augment when: the text ends in one sentence-final punct char,
        # the char before it is neither punct nor a space, the second-to-last
        # conllu word has SpaceAfter=No, and the final token is that single char
        if (sentence[text_line][-1] in ('.', '؟', '?', '!') and
            sentence[text_line][-2] not in ('.', '؟', '?', '!', ' ') and
            has_space_after_no(sentence[-2].split()[-1]) and
            len(sentence[-1].split()[1]) == 1):
            new_sent = list(sentence)
            # insert a space before the final punctuation in the text line
            new_sent[text_line] = new_sent[text_line][:-1] + ' ' + new_sent[text_line][-1]
            # and drop SpaceAfter=No from the word before the punctuation
            pieces = sentence[-2].split("\t")
            pieces[-1] = remove_space_after_no(pieces[-1])
            new_sent[-2] = "\t".join(pieces)
            assert new_sent != sentence
            new_sents.append(new_sent)
    return sents + new_sents
194
+
195
+
196
def augment_telugu(sents):
    """
    Add a few sentences with modified punctuation to Telugu_MTG

    The Telugu-MTG dataset has punctuation separated from the text in
    almost all cases, which makes the tokenizer not learn how to
    process that correctly.

    All of the Telugu sentences end with their sentence final
    punctuation being separated. Furthermore, all commas are
    separated. We change that on some subset of the sentences to
    make the tools more generalizable on wild text.

    sents: list of sentences, each a list of conllu lines where line 1 is
    "# text" and line 2 is "# translit"
    Returns the original sentences plus the augmented copies.
    """
    new_sents = []
    for sentence in sents:
        if not sentence[1].startswith("# text"):
            raise ValueError("Expected the second line of %s to start with # text" % sentence[0])
        if not sentence[2].startswith("# translit"):
            raise ValueError("Expected the second line of %s to start with # translit" % sentence[0])
        # skip ellipses and sentences without final punctuation
        if sentence[1].endswith(". . .") or sentence[1][-1] not in ('.', '?', '!'):
            continue
        # sanity check: all te_mtg sentences are expected to end with
        # " <punct>"; fail loudly if the data stops looking like that
        if sentence[1][-1] in ('.', '?', '!') and sentence[1][-2] != ' ' and sentence[1][-3:] != ' ..' and sentence[1][-4:] != ' ...':
            raise ValueError("Sentence %s does not end with space-punctuation, which is against our assumptions for the te_mtg treebank. Please check the augment method to see if it is still needed" % sentence[0])
        # ~10% of sentences: glue the final punctuation onto the last word
        if random.random() < 0.1:
            new_sentence = list(sentence)
            # drop the space before the final punct in text and translit lines
            new_sentence[1] = new_sentence[1][:-2] + new_sentence[1][-1]
            new_sentence[2] = new_sentence[2][:-2] + new_sentence[2][-1]
            # and mark the second-to-last word as having no trailing space
            new_sentence[-2] = new_sentence[-2] + "|SpaceAfter=No"
            new_sents.append(new_sentence)
        # ~10% of sentences containing a comma: glue the comma to the word before it
        if sentence[1].find(",") > 1 and random.random() < 0.1:
            new_sentence = list(sentence)
            index = sentence[1].find(",")
            new_sentence[1] = sentence[1][:index-1] + sentence[1][index:]
            index = sentence[1].find(",")
            new_sentence[2] = sentence[2][:index-1] + sentence[2][index:]
            for idx, word in enumerate(new_sentence):
                if idx < 4:
                    # skip sent_id, text, transliteration, and the first word
                    continue
                if word.split("\t")[1] == ',':
                    # mark the word before the comma as SpaceAfter=No
                    new_sentence[idx-1] = new_sentence[idx-1] + "|SpaceAfter=No"
                    break
            new_sents.append(new_sentence)
    return sents + new_sents
+
241
COMMA_SEPARATED_RE = re.compile(" ([a-zA-Z]+)[,] ([a-zA-Z]+) ")
def augment_comma_separations(sents, ratio=0.03):
    """Find some fraction of the sentences which match "asdf, zzzz" and squish them to "asdf,zzzz"

    This leaves the tokens and all of the other data the same.  The
    only change made is to change SpaceAfter=No for the "," token and
    adjust the #text line, with the assumption that the conllu->txt
    conversion will correctly handle this change.

    This was particularly an issue for Spanish-AnCora, but it's
    reasonable to think it could happen to any dataset.  Currently
    this just operates on commas and ascii letters to avoid
    accidentally squishing anything that shouldn't be squished.

    UD_Spanish-AnCora 2.7 had a problem is with this sentence:
    # orig_file_sentence 143#5
    In this sentence, there was a comma smashed next to a token.

    Fixing just this one sentence is not sufficient to tokenize
    "asdf,zzzz" as desired, so we also augment by some fraction where
    we have squished "asdf, zzzz" into "asdf,zzzz".

    This exact example was later fixed in UD 2.8, but it should still
    potentially be useful for compensating for typos.

    :param sents: list of sentences, each a list of conllu lines
    :param ratio: fraction of matching sentences to squish
    :return: sents plus the newly squished copies
    """
    new_sents = []
    for sentence in sents:
        for text_idx, text_line in enumerate(sentence):
            # look for the line that starts with "# text".
            # keep going until we find it, or silently ignore it
            # if the dataset isn't in that format
            if text_line.startswith("# text"):
                break
        else:
            continue

        match = COMMA_SEPARATED_RE.search(sentence[text_idx])
        if match and random.random() < ratio:
            # locate the three consecutive tokens "w1 , w2" that
            # produced the regex match on the text line
            for idx, word in enumerate(sentence):
                if word.startswith("#"):
                    continue
                # find() doesn't work because we wind up finding substrings
                if word.split("\t")[1] != match.group(1):
                    continue
                if sentence[idx+1].split("\t")[1] != ',':
                    continue
                if sentence[idx+2].split("\t")[1] != match.group(2):
                    continue
                break
            if idx == len(sentence) - 1:
                # this can happen with MWTs.  we may actually just
                # want to skip MWTs anyway, so no big deal
                continue
            # now idx+1 should be the line with the comma in it
            comma = sentence[idx+1]
            pieces = comma.split("\t")
            assert pieces[1] == ','
            # mark the comma as attached to the following word
            pieces[-1] = add_space_after_no(pieces[-1])
            comma = "\t".join(pieces)
            new_sent = sentence[:idx+1] + [comma] + sentence[idx+2:]

            # rewrite "w1, w2" as "w1,w2" in the # text line
            text_offset = sentence[text_idx].find(match.group(1) + ", " + match.group(2))
            text_len = len(match.group(1) + ", " + match.group(2))
            new_text = sentence[text_idx][:text_offset] + match.group(1) + "," + match.group(2) + sentence[text_idx][text_offset+text_len:]
            new_sent[text_idx] = new_text

            new_sents.append(new_sent)

    print("Added %d new sentences with asdf, zzzz -> asdf,zzzz" % len(new_sents))

    return sents + new_sents
312
+
313
def augment_move_comma(sents, ratio=0.02):
    """
    Move the comma from after a word to before the next word some fraction of the time

    We look for this exact pattern:
      w1, w2
    and replace it with
      w1 ,w2

    The idea is that this is a relatively common typo, but the tool
    won't learn how to tokenize it without some help.

    Note that this modification replaces the original text.

    :param sents: list of sentences, each a list of conllu lines
    :param ratio: fraction of sentences considered for the swap
    :return: a new list of sentences, some of them modified
    """
    new_sents = []
    num_operations = 0
    for sentence in sents:
        # most sentences are passed through untouched
        if random.random() > ratio:
            new_sents.append(sentence)
            continue

        # hunt for a comma attached to the previous word
        # (prev word has SpaceAfter=No, comma itself does not)
        found = False
        for word_idx, word in enumerate(sentence):
            if word.startswith("#"):
                continue
            if word_idx == 0 or word_idx >= len(sentence) - 2:
                continue
            pieces = word.split("\t")
            if pieces[1] == ',' and not has_space_after_no(pieces[-1]):
                # found a comma with a space after it
                prev_word = sentence[word_idx-1]
                if not has_space_after_no(prev_word.split("\t")[-1]):
                    # unfortunately, the previous word also had a
                    # space after it.  does not fit what we are
                    # looking for
                    continue
                # also, want to skip instances near MWT or copy nodes,
                # since those are harder to rearrange
                next_word = sentence[word_idx+1]
                if MWT_OR_COPY_RE.match(next_word.split("\t")[0]):
                    continue
                if MWT_OR_COPY_RE.match(prev_word.split("\t")[0]):
                    continue
                # at this point, the previous word has no space and the comma does
                found = True
                break

        if not found:
            new_sents.append(sentence)
            continue

        new_sentence = list(sentence)

        # attach the comma to the following word...
        pieces = new_sentence[word_idx].split("\t")
        pieces[-1] = add_space_after_no(pieces[-1])
        new_sentence[word_idx] = "\t".join(pieces)

        # ...and detach it from the previous word
        pieces = new_sentence[word_idx-1].split("\t")
        prev_word = pieces[1]
        pieces[-1] = remove_space_after_no(pieces[-1])
        new_sentence[word_idx-1] = "\t".join(pieces)

        next_word = new_sentence[word_idx+1].split("\t")[1]

        for text_idx, text_line in enumerate(sentence):
            # look for the line that starts with "# text".
            # keep going until we find it, or silently ignore it
            # if the dataset isn't in that format
            if text_line.startswith("# text"):
                old_chunk = prev_word + ", " + next_word
                new_chunk = prev_word + " ," + next_word
                # NOTE: word_idx is reused here as a character offset
                word_idx = text_line.find(old_chunk)
                if word_idx < 0:
                    raise RuntimeError("Unexpected #text line which did not contain the original text to be modified. Looking for\n" + old_chunk + "\n" + text_line)
                new_text_line = text_line[:word_idx] + new_chunk + text_line[word_idx+len(old_chunk):]
                new_sentence[text_idx] = new_text_line
                break

        new_sents.append(new_sentence)
        num_operations = num_operations + 1

    print("Swapped 'w1, w2' for 'w1 ,w2' %d times" % num_operations)
    return new_sents
396
+
397
def augment_apos(sents):
    """
    Teach the tokenizer about the unicode apostrophe.

    If there are no instances of ’ in the dataset, but there are instances of ',
    we replace some fraction of ' with ’ so that the tokenizer will recognize it.

    # TODO: we could do it the other way around as well
    """
    saw_unicode_apos = False
    saw_ascii_apos = False
    for sent_idx, sent in enumerate(sents):
        if len(sent) == 0:
            raise AssertionError("Got a blank sentence in position %d!" % sent_idx)
        # only the first "# text" comment of each sentence is inspected
        text_line = next((line for line in sent if line.startswith("# text")), None)
        if text_line is None:
            raise ValueError("Cannot find '# text' in sentences %d. First line: %s" % (sent_idx, sent[0]))
        if "'" in text_line:
            saw_ascii_apos = True
        if "’" in text_line:
            saw_unicode_apos = True

    # nothing to do if ’ already occurs, or if there are no ' at all
    if saw_unicode_apos or not saw_ascii_apos:
        return sents

    updated_sents = []
    for sent in sents:
        # only ~5% of the sentences are converted
        if random.random() > 0.05:
            updated_sents.append(sent)
            continue
        converted = []
        for line in sent:
            if line.startswith("# text"):
                converted.append(line.replace("'", "’"))
            elif line.startswith("#"):
                converted.append(line)
            else:
                fields = line.split("\t")
                fields[1] = fields[1].replace("'", "’")
                converted.append("\t".join(fields))
        updated_sents.append(converted)

    return updated_sents
441
+
442
def augment_ellipses(sents):
    """
    Replaces a fraction of '...' with '…'

    Only fires when the corpus contains ascii ellipses but no unicode
    ones; otherwise the input is returned unchanged.
    """
    tokens = [line.split("\t")[1]
              for sent in sents
              for line in sent
              if not line.startswith("#")]
    saw_ascii_ellipses = '...' in tokens
    saw_unicode_ellipses = '…' in tokens

    if saw_unicode_ellipses or not saw_ascii_ellipses:
        return sents

    updated_sents = []
    num_updated = 0
    for sent in sents:
        # only ~10% of the sentences are converted
        if random.random() > 0.1:
            updated_sents.append(sent)
            continue
        converted = []
        changed = False
        for line in sent:
            if line.startswith("#"):
                converted.append(line)
                continue
            fields = line.split("\t")
            if fields[1] == '...':
                fields[1] = '…'
                changed = True
            converted.append("\t".join(fields))
        updated_sents.append(converted)
        if changed:
            num_updated = num_updated + 1

    print("Changed %d sentences to use fancy unicode ellipses" % num_updated)
    return updated_sents
485
+
486
# https://en.wikipedia.org/wiki/Quotation_mark
QUOTES = ['"', '“', '”', '«', '»', '「', '」', '《', '》', '„', '″']
QUOTES_RE = re.compile("(.?)[" + "".join(QUOTES) + "](.+)[" + "".join(QUOTES) + "](.?)")
# Danish does '«' the other way around from most European languages
START_QUOTES = ['"', '“', '”', '«', '»', '「', '《', '„', '„', '″']
END_QUOTES = ['"', '“', '”', '»', '«', '」', '》', '”', '“', '″']

def augment_quotes(sents, ratio=0.15):
    """
    Go through the sentences and replace a fraction of sentences with alternate quotes

    Only sentences containing exactly two quote tokens are rewritten,
    which avoids having to work out which quotes pair up.

    TODO: for certain languages we may want to make some language-specific changes
    eg Danish, don't add «...»
    """
    assert len(START_QUOTES) == len(END_QUOTES)

    counts = Counter()
    updated_sents = []
    for sent in sents:
        if random.random() > ratio:
            updated_sents.append(sent)
            continue

        # only sentences with exactly 2 quotes are rewritten - this is
        # for convenience, so we never need to match up quote pairs
        quote_tokens = [x for x in sent
                        if not x.startswith("#") and x.split("\t")[1] in QUOTES]
        if len(quote_tokens) != 2:
            updated_sents.append(sent)
            continue

        # choose a replacement pair of quotes from the candidates
        choice = random.choice(range(len(START_QUOTES)))
        open_quote = START_QUOTES[choice]
        close_quote = END_QUOTES[choice]
        counts[open_quote + close_quote] = counts[open_quote + close_quote] + 1

        rewritten = []
        seen_open = False
        for line in sent:
            if line.startswith("#"):
                rewritten.append(line)
                continue
            fields = line.split("\t")
            if fields[1] in QUOTES:
                # Note that we don't change the lemma.  Presumably it's
                # set to the correct lemma for a quote for this treebank
                fields[1] = close_quote if seen_open else open_quote
                seen_open = True
                rewritten.append("\t".join(fields))
            else:
                rewritten.append(line)

        # mirror the change in the "# text" comment, if there is one
        for text_idx, text_line in enumerate(rewritten):
            if text_line.startswith("# text"):
                replacement = "\\1%s\\2%s\\3" % (open_quote, close_quote)
                rewritten[text_idx] = QUOTES_RE.sub(replacement, text_line)

        updated_sents.append(rewritten)

    print("Augmented {} quotes: {}".format(sum(counts.values()), counts))
    return updated_sents
556
+
557
def find_text_idx(sentence):
    """
    Return the index of the # text line or -1
    """
    matches = (position for position, line in enumerate(sentence)
               if line.startswith("# text"))
    return next(matches, -1)
565
+
566
DIGIT_RE = re.compile("[0-9]")

def change_indices(line, delta):
    """
    Adjust all indices in the given sentence by delta.  Useful when removing a word, for example

    Handles MWT range ids ("3-4"), copy-node ids ("3.1"), and plain
    integer ids; also shifts the head column (6) and the first entry
    of the enhanced dependencies column (8).

    :param line: one conllu line (comments are returned unchanged)
    :param delta: integer offset added to every index
    :return: the adjusted line
    """
    # comment lines carry no indices
    if line.startswith("#"):
        return line

    pieces = line.split("\t")
    # MWT range line such as "3-4": shift both endpoints; range lines
    # have no head or deps columns to update, so return immediately
    if MWT_RE.match(pieces[0]):
        indices = pieces[0].split("-")
        pieces[0] = "%d-%d" % (int(indices[0]) + delta, int(indices[1]) + delta)
        line = "\t".join(pieces)
        return line

    # copy node such as "3.1": shift only the integer part
    if MWT_OR_COPY_RE.match(pieces[0]):
        index_pieces = pieces[0].split(".", maxsplit=1)
        pieces[0] = "%d.%s" % (int(index_pieces[0]) + delta, index_pieces[1])
    elif not INT_RE.match(pieces[0]):
        raise NotImplementedError("Unknown index type: %s" % pieces[0])
    else:
        pieces[0] = str(int(pieces[0]) + delta)
    if pieces[6] != '_':
        # copy nodes don't have basic dependencies in the es_ancora treebank
        dep = int(pieces[6])
        # head 0 is the root and must stay 0
        if dep != 0:
            pieces[6] = str(int(dep) + delta)
    if pieces[8] != '_':
        # enhanced deps of the form "head:rel"; only a single dep is supported
        dep_pieces = pieces[8].split(":", maxsplit=1)
        if DIGIT_RE.search(dep_pieces[1]):
            raise NotImplementedError("Need to handle multiple additional deps:\n%s" % line)
        if int(dep_pieces[0]) != 0:
            pieces[8] = str(int(dep_pieces[0]) + delta) + ":" + dep_pieces[1]
    line = "\t".join(pieces)
    return line
602
+
603
def augment_initial_punct(sents, ratio=0.20):
    """
    If a sentence starts with certain punct marks, occasionally use the same sentence without the initial punct.

    Currently this just handles ¿
    This helps languages such as CA and ES where the models go awry when the initial ¿ is missing.

    :param sents: list of sentences, each a list of conllu lines
    :param ratio: fraction of sentences considered for augmentation
    :return: sents plus the copies with the leading ¿ removed
    """
    new_sents = []
    for sent in sents:
        if random.random() > ratio:
            continue

        text_idx = find_text_idx(sent)
        text_line = sent[text_idx]
        if text_line.count("¿") != 1:
            # only handle sentences with exactly one ¿
            continue

        # find the first line with actual text
        for idx, line in enumerate(sent):
            if line.startswith("#"):
                continue
            break
        if idx >= len(sent) - 1:
            raise ValueError("Unexpectedly an entire sentence is comments")
        pieces = line.split("\t")
        if pieces[1] != '¿':
            continue
        # remove the following space as well, unless ¿ was glued to the next word
        if has_space_after_no(pieces[-1]):
            replace_text = "¿"
        else:
            replace_text = "¿ "

        # drop the ¿ token and fix the # text line accordingly
        new_sent = sent[:idx] + sent[idx+1:]
        new_sent[text_idx] = text_line.replace(replace_text, "")

        # now need to update all indices
        new_sent = [change_indices(x, -1) for x in new_sent]
        new_sents.append(new_sent)

    if len(new_sents) > 0:
        print("Added %d sentences with the leading ¿ removed" % len(new_sents))

    return sents + new_sents
647
+
648
+
649
def augment_brackets(sents, ratio=0.1):
    """
    If there are no sentences with [], transform some () into []
    """
    # a single square bracket anywhere means no augmentation is needed
    for sent in sents:
        text_line = sent[find_text_idx(sent)]
        if "[" in text_line or "]" in text_line:
            # found a square bracket, so, never mind
            return sents

    new_sents = []
    for sent in sents:
        if random.random() > ratio:
            continue

        text_idx = find_text_idx(sent)
        text_line = sent[text_idx]
        if "(" not in text_line and ")" not in text_line:
            continue

        converted = list(sent)
        converted[text_idx] = text_line.replace("(", "[").replace(")", "]")
        for idx, line in enumerate(converted):
            if line.startswith("#"):
                continue
            fields = line.split("\t")
            if fields[1] == '(':
                fields[1] = '['
            elif fields[1] == ')':
                fields[1] = ']'
            converted[idx] = "\t".join(fields)
        new_sents.append(converted)

    if len(new_sents) > 0:
        print("Added %d sentences with parens replaced with square brackets" % len(new_sents))

    return sents + new_sents
688
+
689
+
690
def augment_punct(sents):
    """
    Run the full chain of punctuation augmentations.

    If there are no instances of ’ in the dataset, but there are instances of ',
    we replace some fraction of ' with ’ so that the tokenizer will recognize it.

    Also augments with ... / …
    """
    augmented = augment_apos(sents)
    for augmentation in (augment_quotes,
                         augment_move_comma,
                         augment_comma_separations,
                         augment_initial_punct,
                         augment_ellipses,
                         augment_brackets):
        augmented = augmentation(augmented)
    return augmented
706
+
707
+
708
+
709
def write_augmented_dataset(input_conllu, output_conllu, augment_function):
    """
    Read a conllu file, augment its sentences, and write the result.

    The RNG is re-seeded per data file so that the results are the
    same regardless of how many treebanks are processed at once.
    """
    random.seed(1234)

    # read conllu data, produce new sentences, and write them out
    sentences = read_sentences_from_conllu(input_conllu)
    augmented = augment_function(sentences)
    write_sentences_to_conllu(output_conllu, augmented)
721
+
722
def remove_spaces_from_sentences(sents):
    """
    Makes sure every word in the list of sentences has SpaceAfter=No.

    Returns a new list of sentences

    :param sents: list of sentences, each a list of conllu lines
    :raises ValueError: if a word has a MISC column which is neither
        "_" nor already contains SpaceAfter=No
    """
    new_sents = []
    for sentence in sents:
        new_sentence = []
        for word in sentence:
            if word.startswith("#"):
                new_sentence.append(word)
                continue
            pieces = word.split("\t")
            if pieces[-1] == "_":
                pieces[-1] = "SpaceAfter=No"
            elif pieces[-1].find("SpaceAfter=No") >= 0:
                pass
            else:
                # bug fix: this used to raise ValueError("oops"), which gave
                # no hint about which word caused the problem
                raise ValueError("Cannot set SpaceAfter=No on word with unexpected MISC column: %s" % word)
            word = "\t".join(pieces)
            new_sentence.append(word)
        new_sents.append(new_sentence)
    return new_sents
746
+
747
def remove_spaces(input_conllu, output_conllu):
    """
    Turns a dataset into something appropriate for building a segmenter.

    Every word is rewritten with SpaceAfter=No.  For example, this
    works well on the Korean datasets.
    """
    sentences = read_sentences_from_conllu(input_conllu)
    write_sentences_to_conllu(output_conllu, remove_spaces_from_sentences(sentences))
758
+
759
+
760
def build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_conllu):
    """
    Builds a combined dataset out of multiple Korean datasets.

    Currently this uses GSD and Kaist.  If a segmenter-appropriate
    dataset was requested (short_name ends with _seg), spaces are removed.

    TODO: we need to handle the difference in xpos tags somehow.
    """
    sents = []
    for treebank in ("UD_Korean-GSD", "UD_Korean-Kaist"):
        conllu = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu")
        sents.extend(read_sentences_from_conllu(conllu))

    if short_name.endswith("_seg"):
        sents = remove_spaces_from_sentences(sents)

    write_sentences_to_conllu(output_conllu, sents)
778
+
779
def build_combined_korean(udbase_dir, tokenizer_dir, short_name):
    """Build the train/dev/test splits of the combined Korean dataset."""
    for split in ("train", "dev", "test"):
        output_conllu = common.tokenizer_conllu_name(tokenizer_dir, short_name, split)
        build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, split, output_conllu)
783
+
784
def build_combined_italian_dataset(paths, model_type, dataset):
    """
    it_combined: ISDT and VIT for training (plus the Twitter treebanks
    for non-tokenizer models); dev and test come from ISDT alone.
    """
    udbase_dir = paths["UDBASE"]
    if dataset != 'train':
        istd_conllu = common.find_treebank_dataset_file("UD_Italian-ISDT", udbase_dir, dataset, "conllu")
        return read_sentences_from_conllu(istd_conllu)

    # could maybe add ParTUT, but that dataset has a slightly different xpos set
    # (no DE or I)
    # and I didn't feel like sorting through the differences
    # Note: currently these each have small changes compared with
    # the UD2.11 release.  See the issues (possibly closed by now)
    # filed by AngledLuffa on each of the treebanks for more info.
    treebanks = ["UD_Italian-ISDT", "UD_Italian-VIT"]
    if model_type is not common.ModelType.TOKENIZER:
        treebanks.extend(["UD_Italian-TWITTIRO", "UD_Italian-PoSTWITA"])
    print("Building {} dataset out of {}".format(model_type, " ".join(treebanks)))

    sents = []
    for treebank in treebanks:
        conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
        sents.extend(read_sentences_from_conllu(conllu_file))
    return sents
812
+
813
def check_gum_ready(udbase_dir):
    """Raise if UD_English-GUMReddit still has its proprietary text redacted."""
    gum_conllu = common.find_treebank_dataset_file("UD_English-GUMReddit", udbase_dir, "train", "conllu")
    if not common.mostly_underscores(gum_conllu):
        return
    raise ValueError("Cannot process UD_English-GUMReddit in its current form. There should be a download script available in the directory which will help integrate the missing proprietary values. Please run that script to update the data, then try again.")
817
+
818
def build_combined_english_dataset(paths, model_type, dataset):
    """
    en_combined is currently EWT, GUM, PUD, Pronouns, and handparsed
    """
    udbase_dir = paths["UDBASE"]
    check_gum_ready(udbase_dir)

    if dataset != 'train':
        # dev/test are EWT only
        ewt_conllu = common.find_treebank_dataset_file("UD_English-EWT", udbase_dir, dataset, "conllu")
        return read_sentences_from_conllu(ewt_conllu)

    # TODO: include more UD treebanks, possibly with xpos removed
    #   UD_English-ParTUT - xpos are different
    # also include "external" treebanks such as PTB
    # NOTE: in order to get the best results, make sure each of these treebanks have the latest edits applied
    sources = [("UD_English-EWT", "train"),
               ("UD_English-GUM", "train"),
               ("UD_English-GUMReddit", "train"),
               # PUD and Pronouns have no train split; their test data is used
               ("UD_English-PUD", "test"),
               ("UD_English-Pronouns", "test")]
    sents = []
    for treebank, split in sources:
        conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, split, "conllu", fail=True)
        new_sents = read_sentences_from_conllu(conllu_file)
        print("Read %d sentences from %s" % (len(new_sents), conllu_file))
        sents.extend(new_sents)
    return sents
848
+
849
def add_english_sentence_final_punctuation(handparsed_sentences):
    """
    Add a period to the end of a sentence with no punct at the end.

    The next-to-last word has SpaceAfter=No added as well.

    Possibly English-specific because of the xpos.  Could be upgraded
    to handle multiple languages by passing in the xpos as an argument

    :param handparsed_sentences: list of sentences, each a list of conllu lines
    :return: a new list of sentences, with "." appended where needed
    """
    new_sents = []
    for sent in handparsed_sentences:
        # track the root id and the highest word id so the new "." token
        # can be attached to the root with the next id
        root_id = None
        max_id = None
        last_punct = False
        for line in sent:
            if line.startswith("#"):
                continue
            pieces = line.split("\t")
            # skip MWT ranges and copy nodes when scanning word ids
            if MWT_OR_COPY_RE.match(pieces[0]):
                continue
            if pieces[6] == '0':
                root_id = pieces[0]
            max_id = int(pieces[0])
            # after the loop this reflects whether the LAST word is punct
            last_punct = pieces[3] == 'PUNCT'
        if not last_punct:
            new_sent = list(sent)
            # the previously-final word now has the "." attached to it
            pieces = new_sent[-1].split("\t")
            pieces[-1] = add_space_after_no(pieces[-1])
            new_sent[-1] = "\t".join(pieces)
            new_sent.append("%d\t.\t.\tPUNCT\t.\t_\t%s\tpunct\t%s:punct\t_" % (max_id+1, root_id, root_id))
            new_sents.append(new_sent)
        else:
            new_sents.append(sent)
    return new_sents
883
+
884
def build_extra_combined_french_dataset(paths, model_type, dataset):
    """
    Extra sentences we don't want augmented for French - currently, handparsed lemmas
    """
    handparsed_dir = paths["HANDPARSED_DIR"]
    # only the lemmatizer's training data gets the extra sentences
    if dataset != 'train' or model_type is not common.ModelType.LEMMA:
        return []

    handparsed_path = os.path.join(handparsed_dir, "french-lemmas", "fr_lemmas.conllu")
    handparsed_sentences = read_sentences_from_conllu(handparsed_path)
    print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
    return list(handparsed_sentences)
897
+
898
+
899
def build_extra_combined_english_dataset(paths, model_type, dataset):
    """
    Extra sentences we don't want augmented

    For training, returns the handparsed English sentences (with
    sentence-final punctuation ensured), plus the extra lemma data
    when building a lemmatizer.  Returns [] otherwise.
    """
    handparsed_dir = paths["HANDPARSED_DIR"]
    sents = []
    if dataset == 'train':
        handparsed_path = os.path.join(handparsed_dir, "english-handparsed", "english.conll")
        handparsed_sentences = read_sentences_from_conllu(handparsed_path)
        # some handparsed sentences lack final punctuation; add it
        handparsed_sentences = add_english_sentence_final_punctuation(handparsed_sentences)
        sents.extend(handparsed_sentences)
        print("Loaded %d sentences from %s" % (len(sents), handparsed_path))

        if model_type is common.ModelType.LEMMA:
            handparsed_path = os.path.join(handparsed_dir, "english-lemmas", "en_lemmas.conllu")
            handparsed_sentences = read_sentences_from_conllu(handparsed_path)
            print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
            sents.extend(handparsed_sentences)
    return sents
918
+
919
def build_extra_combined_italian_dataset(paths, model_type, dataset):
    """
    Extra data - the MWT data for Italian

    Loads the hand-retokenized multi-word file and marks each MWT line
    with SpaceAfter=No.  Returns [] for non-train datasets.

    :raises FileNotFoundError: if the italian.mwt file is missing
    :raises AssertionError: if the file's third lines are not blank-MISC MWT lines
    """
    handparsed_dir = paths["HANDPARSED_DIR"]
    if dataset != 'train':
        return []

    extra_italian = os.path.join(handparsed_dir, "italian-mwt", "italian.mwt")
    if not os.path.exists(extra_italian):
        raise FileNotFoundError("Cannot find the extra dataset 'italian.mwt' which includes various multi-words retokenized, expected {}".format(extra_italian))

    extra_sents = read_sentences_from_conllu(extra_italian)
    for sentence in extra_sents:
        # line 2 of each sentence is expected to be an MWT line with a
        # blank ("_") MISC column, which is rewritten to SpaceAfter=No
        if not sentence[2].endswith("_") or not MWT_RE.match(sentence[2]):
            # bug fix: message previously read "Has it already be modified"
            raise AssertionError("Unexpected format of the italian.mwt file. Has it already been modified to have SpaceAfter=No everywhere?")
        sentence[2] = sentence[2][:-1] + "SpaceAfter=No"
    print("Loaded %d sentences from %s" % (len(extra_sents), extra_italian))
    return extra_sents
938
+
939
def replace_semicolons(sentences):
    """
    Spanish GSD and AnCora have different standards for semicolons.

    GSD has semicolons at the end of sentences, AnCora has them in the middle as clause separators.
    Consecutive sentences in GSD do not seem to be related, so there is no combining that can be done.
    The easiest solution is to replace sentence final semicolons with "." in GSD

    :param sentences: list of sentences, each a list of conllu lines
    :return: a new list of sentences with sentence-final ";" replaced by "."
    :raises ValueError: if a sentence has no "# text" line
    """
    new_sents = []
    count = 0
    for sentence in sentences:
        for text_idx, text_line in enumerate(sentence):
            if text_line.startswith("# text"):
                break
        else:
            raise ValueError("Expected every sentence in GSD to have a # text field")
        if not text_line.endswith(";"):
            new_sents.append(sentence)
            continue
        new_sent = list(sentence)
        new_sent[text_idx] = text_line[:-1] + "."
        # assumes the final conllu line holds the ";" token
        new_sent[-1] = new_sent[-1].replace(";", ".")
        # bug fix: count was previously incremented twice per modified
        # sentence, doubling the reported number of updates
        count = count + 1
        new_sents.append(new_sent)
    print("Updated %d sentences to replace sentence-final ; with ." % count)
    return new_sents
966
+
967
def strip_column(sents, column):
    """
    Removes a specified column from the given dataset

    Particularly useful when mixing two different POS formalisms in the same tagger
    """
    def blank_field(word):
        # comment lines have no columns to blank
        if word.startswith("#"):
            return word
        fields = word.split("\t")
        fields[column] = "_"
        return "\t".join(fields)

    return [[blank_field(word) for word in sentence] for sentence in sents]
985
+
986
def strip_xpos(sents):
    """
    Blank out the xpos column (index 4) for every word in the dataset.

    Particularly useful when mixing two different POS formalisms in the same tagger
    """
    return strip_column(sents, 4)
993
+
994
def strip_feats(sents):
    """
    Blank out the feats column (index 5) for every word in the dataset.

    Particularly useful when mixing two different POS formalisms in the same tagger
    """
    return strip_column(sents, 5)
1001
+
1002
def build_combined_albanian_dataset(paths, model_type, dataset):
    """
    sq_combined is STAF as the base, with TSA added for some things

    For POS training the two treebanks are returned as separate
    documents, with TSA's xpos and feats stripped (its annotations do
    not match STAF).  For other non-depparse models the sentences are
    concatenated.  Dev/test (and depparse train) come from STAF alone.
    """
    udbase_dir = paths["UDBASE"]
    udbase_git_dir = paths["UDBASE_GIT"]
    handparsed_dir = paths["HANDPARSED_DIR"]

    treebanks = ["UD_Albanian-STAF", "UD_Albanian-TSA"]

    if dataset == 'train' and model_type == common.ModelType.POS:
        # keep the treebanks as separate documents so the tagger can
        # treat TSA's stripped columns as missing data
        documents = {}

        conllu_file = common.find_treebank_dataset_file(treebanks[0], udbase_dir, "train", "conllu", fail=True)
        new_sents = read_sentences_from_conllu(conllu_file)
        documents[treebanks[0]] = new_sents

        # we use udbase_git_dir for TSA because of an updated MWT scheme
        conllu_file = common.find_treebank_dataset_file(treebanks[1], udbase_git_dir, "test", "conllu", fail=True)
        new_sents = read_sentences_from_conllu(conllu_file)
        new_sents = strip_xpos(new_sents)
        new_sents = strip_feats(new_sents)
        documents[treebanks[1]] = new_sents

        return documents

    if dataset == 'train' and model_type is not common.ModelType.DEPPARSE:
        sents = []

        conllu_file = common.find_treebank_dataset_file(treebanks[0], udbase_dir, "train", "conllu", fail=True)
        new_sents = read_sentences_from_conllu(conllu_file)
        print("Read %d sentences from %s" % (len(new_sents), conllu_file))
        sents.extend(new_sents)

        # TSA has no train split; its test data is folded into training
        conllu_file = common.find_treebank_dataset_file(treebanks[1], udbase_git_dir, "test", "conllu", fail=True)
        new_sents = read_sentences_from_conllu(conllu_file)
        print("Read %d sentences from %s" % (len(new_sents), conllu_file))
        sents.extend(new_sents)

        return sents

    # dev/test (and depparse training) use STAF only
    conllu_file = common.find_treebank_dataset_file(treebanks[0], udbase_dir, dataset, "conllu", fail=True)
    sents = read_sentences_from_conllu(conllu_file)
    return sents
1046
+
1047
+ def build_combined_spanish_dataset(paths, model_type, dataset):
1048
+ """
1049
+ es_combined is AnCora and GSD put together
1050
+
1051
+ For POS training, we put the different datasets into a zip file so
1052
+ that we can keep the conllu files separate and remove the xpos
1053
+ from the non-AnCora training files. It is necessary to remove the
1054
+ xpos because GSD and PUD both use different xpos schemes from
1055
+ AnCora, and the tagger can use additional data files as training
1056
+ data without a specific column if that column is entirely blank
1057
+
1058
+ TODO: consider mixing in PUD?
1059
+ """
1060
+ udbase_dir = paths["UDBASE"]
1061
+ handparsed_dir = paths["HANDPARSED_DIR"]
1062
+
1063
+ treebanks = ["UD_Spanish-AnCora", "UD_Spanish-GSD"]
1064
+
1065
+ if dataset == 'train' and model_type == common.ModelType.POS:
1066
+ documents = {}
1067
+ for treebank in treebanks:
1068
+ conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
1069
+ new_sents = read_sentences_from_conllu(conllu_file)
1070
+ if not treebank.endswith("AnCora"):
1071
+ new_sents = strip_xpos(new_sents)
1072
+ documents[treebank] = new_sents
1073
+
1074
+ return documents
1075
+
1076
+ if dataset == 'train':
1077
+ sents = []
1078
+ for treebank in treebanks:
1079
+ conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
1080
+ new_sents = read_sentences_from_conllu(conllu_file)
1081
+ print("Read %d sentences from %s" % (len(new_sents), conllu_file))
1082
+ if treebank.endswith("GSD"):
1083
+ new_sents = replace_semicolons(new_sents)
1084
+ sents.extend(new_sents)
1085
+
1086
+ if model_type in (common.ModelType.TOKENIZER, common.ModelType.MWT, common.ModelType.LEMMA):
1087
+ extra_spanish = os.path.join(handparsed_dir, "spanish-mwt", "adjectives.conllu")
1088
+ if not os.path.exists(extra_spanish):
1089
+ raise FileNotFoundError("Cannot find the extra dataset 'adjectives.conllu' which includes various multi-words retokenized, expected {}".format(extra_spanish))
1090
+ extra_sents = read_sentences_from_conllu(extra_spanish)
1091
+ print("Read %d sentences from %s" % (len(extra_sents), extra_spanish))
1092
+ sents.extend(extra_sents)
1093
+ else:
1094
+ conllu_file = common.find_treebank_dataset_file("UD_Spanish-AnCora", udbase_dir, dataset, "conllu", fail=True)
1095
+ sents = read_sentences_from_conllu(conllu_file)
1096
+
1097
+ return sents
1098
+
1099
+ def build_combined_french_dataset(paths, model_type, dataset):
1100
+ udbase_dir = paths["UDBASE"]
1101
+ handparsed_dir = paths["HANDPARSED_DIR"]
1102
+ if dataset == 'train':
1103
+ train_treebanks = ["UD_French-GSD", "UD_French-ParisStories", "UD_French-Rhapsodie", "UD_French-Sequoia"]
1104
+ sents = []
1105
+ for treebank in train_treebanks:
1106
+ conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
1107
+ new_sents = read_sentences_from_conllu(conllu_file)
1108
+ print("Read %d sentences from %s" % (len(new_sents), conllu_file))
1109
+ sents.extend(new_sents)
1110
+
1111
+ extra_french = os.path.join(handparsed_dir, "french-handparsed", "handparsed_deps.conllu")
1112
+ if not os.path.exists(extra_french):
1113
+ raise FileNotFoundError("Cannot find the extra dataset 'handparsed_deps.conllu' which includes various dependency fixes, expected {}".format(extra_french))
1114
+ extra_sents = read_sentences_from_conllu(extra_french)
1115
+ print("Read %d sentences from %s" % (len(extra_sents), extra_french))
1116
+ sents.extend(extra_sents)
1117
+ else:
1118
+ gsd_conllu = common.find_treebank_dataset_file("UD_French-GSD", udbase_dir, dataset, "conllu")
1119
+ sents = read_sentences_from_conllu(gsd_conllu)
1120
+
1121
+ return sents
1122
+
1123
+ def build_combined_hebrew_dataset(paths, model_type, dataset):
1124
+ """
1125
+ Combines the IAHLT treebank with an updated form of HTB where the annotation style more closely matches IAHLT
1126
+
1127
+ Currently the updated HTB is not in UD, so you will need to clone
1128
+ git@github.com:IAHLT/UD_Hebrew.git to $UDBASE_GIT
1129
+
1130
+ dev and test sets will be those from IAHLT
1131
+ """
1132
+ udbase_dir = paths["UDBASE"]
1133
+ udbase_git_dir = paths["UDBASE_GIT"]
1134
+
1135
+ treebanks = ["UD_Hebrew-IAHLTwiki", "UD_Hebrew-IAHLTknesset"]
1136
+ if dataset == 'train':
1137
+ sents = []
1138
+ for treebank in treebanks:
1139
+ conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
1140
+ new_sents = read_sentences_from_conllu(conllu_file)
1141
+ print("Read %d sentences from %s" % (len(new_sents), conllu_file))
1142
+ sents.extend(new_sents)
1143
+
1144
+ # if/when this gets ported back to UD, switch to getting both datasets from UD
1145
+ hebrew_git_dir = os.path.join(udbase_git_dir, "UD_Hebrew")
1146
+ if not os.path.exists(hebrew_git_dir):
1147
+ raise FileNotFoundError("Please download git@github.com:IAHLT/UD_Hebrew.git to %s (based on $UDBASE_GIT)" % hebrew_git_dir)
1148
+ conllu_file = os.path.join(hebrew_git_dir, "he_htb-ud-train.conllu")
1149
+ if not os.path.exists(conllu_file):
1150
+ raise FileNotFoundError("Found %s but inexplicably there was no %s" % (hebrew_git_dir, conllu_file))
1151
+ new_sents = read_sentences_from_conllu(conllu_file)
1152
+ print("Read %d sentences from %s" % (len(new_sents), conllu_file))
1153
+ sents.extend(new_sents)
1154
+ else:
1155
+ conllu_file = common.find_treebank_dataset_file(treebanks[0], udbase_dir, dataset, "conllu", fail=True)
1156
+ sents = read_sentences_from_conllu(conllu_file)
1157
+
1158
+ return sents
1159
+
1160
+ COMBINED_FNS = {
1161
+ "en_combined": build_combined_english_dataset,
1162
+ "es_combined": build_combined_spanish_dataset,
1163
+ "fr_combined": build_combined_french_dataset,
1164
+ "he_combined": build_combined_hebrew_dataset,
1165
+ "it_combined": build_combined_italian_dataset,
1166
+ "sq_combined": build_combined_albanian_dataset,
1167
+ }
1168
+
1169
+ # some extra data for the combined models without augmenting
1170
+ COMBINED_EXTRA_FNS = {
1171
+ "en_combined": build_extra_combined_english_dataset,
1172
+ "fr_combined": build_extra_combined_french_dataset,
1173
+ "it_combined": build_extra_combined_italian_dataset,
1174
+ }
1175
+
1176
+ def build_combined_dataset(paths, short_name, model_type, augment):
1177
+ random.seed(1234)
1178
+ tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
1179
+ build_fn = COMBINED_FNS[short_name]
1180
+ extra_fn = COMBINED_EXTRA_FNS.get(short_name, None)
1181
+ for dataset in ("train", "dev", "test"):
1182
+ output_conllu = common.tokenizer_conllu_name(tokenizer_dir, short_name, dataset)
1183
+ sents = build_fn(paths, model_type, dataset)
1184
+ if isinstance(sents, dict):
1185
+ if dataset == 'train' and augment:
1186
+ for filename in list(sents.keys()):
1187
+ sents[filename] = augment_punct(sents[filename])
1188
+ output_zip = os.path.splitext(output_conllu)[0] + ".zip"
1189
+ with zipfile.ZipFile(output_zip, "w") as zout:
1190
+ for filename in list(sents.keys()):
1191
+ with zout.open(filename + ".conllu", "w") as zfout:
1192
+ with io.TextIOWrapper(zfout, encoding='utf-8', newline='') as fout:
1193
+ write_sentences_to_file(fout, sents[filename])
1194
+ else:
1195
+ if dataset == 'train' and augment:
1196
+ sents = augment_punct(sents)
1197
+ if extra_fn is not None:
1198
+ sents.extend(extra_fn(paths, model_type, dataset))
1199
+ write_sentences_to_conllu(output_conllu, sents)
1200
+
1201
+ BIO_DATASETS = ("en_craft", "en_genia", "en_mimic")
1202
+
1203
+ def build_bio_dataset(paths, udbase_dir, tokenizer_dir, handparsed_dir, short_name, model_type, augment):
1204
+ """
1205
+ Process the en bio datasets
1206
+
1207
+ Creates a dataset by combining the en_combined data with one of the bio sets
1208
+ """
1209
+ random.seed(1234)
1210
+ name, bio_dataset = short_name.split("_")
1211
+ assert name == 'en'
1212
+ for dataset in ("train", "dev", "test"):
1213
+ output_conllu = common.tokenizer_conllu_name(tokenizer_dir, short_name, dataset)
1214
+ if dataset == 'train':
1215
+ sents = build_combined_english_dataset(paths, model_type, dataset)
1216
+ if dataset == 'train' and augment:
1217
+ sents = augment_punct(sents)
1218
+ else:
1219
+ sents = []
1220
+ bio_file = os.path.join(paths["BIO_UD_DIR"], "UD_English-%s" % bio_dataset.upper(), "en_%s-ud-%s.conllu" % (bio_dataset.lower(), dataset))
1221
+ sents.extend(read_sentences_from_conllu(bio_file))
1222
+ write_sentences_to_conllu(output_conllu, sents)
1223
+
1224
+ def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment):
1225
+ """
1226
+ Build the GUM dataset by combining GUMReddit
1227
+
1228
+ It checks to make sure GUMReddit is filled out using the included script
1229
+ """
1230
+ check_gum_ready(udbase_dir)
1231
+ random.seed(1234)
1232
+
1233
+ output_conllu = common.tokenizer_conllu_name(tokenizer_dir, short_name, dataset)
1234
+
1235
+ treebanks = ["UD_English-GUM", "UD_English-GUMReddit"]
1236
+ sents = []
1237
+ for treebank in treebanks:
1238
+ conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
1239
+ sents.extend(read_sentences_from_conllu(conllu_file))
1240
+
1241
+ if dataset == 'train' and augment:
1242
+ sents = augment_punct(sents)
1243
+
1244
+ write_sentences_to_conllu(output_conllu, sents)
1245
+
1246
+ def build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, augment):
1247
+ for dataset in ("train", "dev", "test"):
1248
+ build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment)
1249
+
1250
+ def prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, dataset, augment=True, input_conllu=None, output_conllu=None):
1251
+ if input_conllu is None:
1252
+ input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
1253
+ if output_conllu is None:
1254
+ output_conllu = common.tokenizer_conllu_name(tokenizer_dir, short_name, dataset)
1255
+ print("Reading from %s and writing to %s" % (input_conllu, output_conllu))
1256
+
1257
+ if short_name == "te_mtg" and dataset == 'train' and augment:
1258
+ write_augmented_dataset(input_conllu, output_conllu, augment_telugu)
1259
+ elif short_name == "ar_padt" and dataset == 'train' and augment:
1260
+ write_augmented_dataset(input_conllu, output_conllu, augment_arabic_padt)
1261
+ elif short_name.startswith("ko_") and short_name.endswith("_seg"):
1262
+ remove_spaces(input_conllu, output_conllu)
1263
+ elif dataset == 'train' and augment:
1264
+ write_augmented_dataset(input_conllu, output_conllu, augment_punct)
1265
+ else:
1266
+ sents = read_sentences_from_conllu(input_conllu)
1267
+ write_sentences_to_conllu(output_conllu, sents)
1268
+
1269
+ def process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, augment=True):
1270
+ """
1271
+ Process a normal UD treebank with train/dev/test splits
1272
+
1273
+ SL-SSJ and other datasets with inline modifications all use this code path as well.
1274
+ """
1275
+ prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "train", augment)
1276
+ prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "dev", augment)
1277
+ prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "test", augment)
1278
+
1279
+
1280
+ XV_RATIO = 0.2
1281
+
1282
+ def process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language):
1283
+ """
1284
+ Process a UD treebank with only train/test splits
1285
+
1286
+ For example, in UD 2.7:
1287
+ UD_Buryat-BDT
1288
+ UD_Galician-TreeGal
1289
+ UD_Indonesian-CSUI
1290
+ UD_Kazakh-KTB
1291
+ UD_Kurmanji-MG
1292
+ UD_Latin-Perseus
1293
+ UD_Livvi-KKPP
1294
+ UD_North_Sami-Giella
1295
+ UD_Old_Russian-RNC
1296
+ UD_Sanskrit-Vedic
1297
+ UD_Slovenian-SST
1298
+ UD_Upper_Sorbian-UFAL
1299
+ UD_Welsh-CCG
1300
+ """
1301
+ train_input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu")
1302
+ test_input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, "test", "conllu")
1303
+
1304
+ train_output_conllu = common.tokenizer_conllu_name(tokenizer_dir, short_name, "train")
1305
+ dev_output_conllu = common.tokenizer_conllu_name(tokenizer_dir, short_name, "dev")
1306
+ test_output_conllu = common.tokenizer_conllu_name(tokenizer_dir, short_name, "test")
1307
+
1308
+ if (common.num_words_in_file(train_input_conllu) <= 1000 and
1309
+ common.num_words_in_file(test_input_conllu) > 5000):
1310
+ train_input_conllu, test_input_conllu = test_input_conllu, train_input_conllu
1311
+
1312
+ if not split_train_file(treebank=treebank,
1313
+ train_input_conllu=train_input_conllu,
1314
+ train_output_conllu=train_output_conllu,
1315
+ dev_output_conllu=dev_output_conllu):
1316
+ return
1317
+
1318
+ # the test set is already fine
1319
+ # currently we do not do any augmentation of these partial treebanks
1320
+ prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "test", augment=False, input_conllu=test_input_conllu, output_conllu=test_output_conllu)
1321
+
1322
+ def add_specific_args(parser):
1323
+ parser.add_argument('--no_augment', action='store_false', dest='augment', default=True,
1324
+ help='Augment the dataset in various ways')
1325
+ parser.add_argument('--no_prepare_labels', action='store_false', dest='prepare_labels', default=True,
1326
+ help='Prepare tokenizer and MWT labels. Expensive, but obviously necessary for training those models.')
1327
+ convert_th_lst20.add_lst20_args(parser)
1328
+
1329
+ convert_vi_vlsp.add_vlsp_args(parser)
1330
+
1331
+ def process_treebank(treebank, model_type, paths, args):
1332
+ """
1333
+ Processes a single treebank into train, dev, test parts
1334
+
1335
+ Includes processing for a few external tokenization datasets:
1336
+ vi_vlsp, th_orchid, th_best
1337
+
1338
+ Also, there is no specific mechanism for UD_Arabic-NYUAD or
1339
+ similar treebanks, which need integration with LDC datasets
1340
+ """
1341
+ udbase_dir = paths["UDBASE"]
1342
+ tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
1343
+ handparsed_dir = paths["HANDPARSED_DIR"]
1344
+
1345
+ short_name = treebank_to_short_name(treebank)
1346
+ short_language = short_name.split("_")[0]
1347
+
1348
+ os.makedirs(tokenizer_dir, exist_ok=True)
1349
+
1350
+ if short_name == "my_alt":
1351
+ convert_my_alt.convert_my_alt(paths["CONSTITUENCY_BASE"], tokenizer_dir)
1352
+ elif short_name == "vi_vlsp":
1353
+ convert_vi_vlsp.convert_vi_vlsp(paths["STANZA_EXTERN_DIR"], tokenizer_dir, args)
1354
+ elif short_name == "th_orchid":
1355
+ convert_th_orchid.main(paths["STANZA_EXTERN_DIR"], tokenizer_dir)
1356
+ elif short_name == "th_lst20":
1357
+ convert_th_lst20.convert(paths["STANZA_EXTERN_DIR"], tokenizer_dir, args)
1358
+ elif short_name == "th_best":
1359
+ convert_th_best.main(paths["STANZA_EXTERN_DIR"], tokenizer_dir)
1360
+ elif short_name == "ml_cochin":
1361
+ convert_ml_cochin.main(paths["STANZA_EXTERN_DIR"], tokenizer_dir)
1362
+ elif short_name.startswith("ko_combined"):
1363
+ build_combined_korean(udbase_dir, tokenizer_dir, short_name)
1364
+ elif short_name in COMBINED_FNS: # eg "it_combined", "en_combined", etc
1365
+ build_combined_dataset(paths, short_name, model_type, args.augment)
1366
+ elif short_name in BIO_DATASETS:
1367
+ build_bio_dataset(paths, udbase_dir, tokenizer_dir, handparsed_dir, short_name, model_type, args.augment)
1368
+ elif short_name.startswith("en_gum"):
1369
+ # we special case GUM because it should include a filled-out GUMReddit
1370
+ print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))
1371
+ build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, args.augment)
1372
+ else:
1373
+ # check that we can find the train file where we expect it
1374
+ train_conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
1375
+
1376
+ print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))
1377
+
1378
+ if not common.find_treebank_dataset_file(treebank, udbase_dir, "dev", "conllu", fail=False):
1379
+ process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language)
1380
+ else:
1381
+ process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, args.augment)
1382
+
1383
+ if model_type is common.ModelType.TOKENIZER or model_type is common.ModelType.MWT:
1384
+ if not short_name in ('th_orchid', 'th_lst20'):
1385
+ common.convert_conllu_to_txt(tokenizer_dir, short_name)
1386
+
1387
+ if args.prepare_labels:
1388
+ common.prepare_tokenizer_treebank_labels(tokenizer_dir, short_name)
1389
+
1390
+
1391
+ def main():
1392
+ common.main(process_treebank, common.ModelType.TOKENIZER, add_specific_args)
1393
+
1394
+ if __name__ == '__main__':
1395
+ main()
1396
+
stanza/stanza/utils/datasets/pretrain/__init__.py ADDED
File without changes
stanza/stanza/utils/datasets/tokenization/__init__.py ADDED
File without changes
stanza/stanza/utils/datasets/tokenization/convert_vi_vlsp.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+
4
+ punctuation_set = (',', '.', '!', '?', ')', ':', ';', '”', '…', '...')
5
+
6
+ def find_spaces(sentence):
7
+ # TODO: there are some sentences where there is only one quote,
8
+ # and some of them should be attached to the previous word instead
9
+ # of the next word. Training should work this way, though
10
+ odd_quotes = False
11
+
12
+ spaces = []
13
+ for word_idx, word in enumerate(sentence):
14
+ space = True
15
+ # Quote period at the end of a sentence needs to be attached
16
+ # to the rest of the text. Some sentences have `"... text`
17
+ # in the middle, though, so look for that
18
+ if word_idx < len(sentence) - 2 and sentence[word_idx+1] == '"':
19
+ if sentence[word_idx+2] == '.':
20
+ space = False
21
+ elif word_idx == len(sentence) - 3 and sentence[word_idx+2] == '...':
22
+ space = False
23
+ if word_idx < len(sentence) - 1:
24
+ if sentence[word_idx+1] in (',', '.', '!', '?', ')', ':', ';', '”', '…', '...','/', '%'):
25
+ space = False
26
+ if word in ('(', '“', '/'):
27
+ space = False
28
+ if word == '"':
29
+ if odd_quotes:
30
+ # already saw one quote. put this one at the end of the PREVIOUS word
31
+ # note that we know there must be at least one word already
32
+ odd_quotes = False
33
+ spaces[word_idx-1] = False
34
+ else:
35
+ odd_quotes = True
36
+ space = False
37
+ spaces.append(space)
38
+ return spaces
39
+
40
+ def add_vlsp_args(parser):
41
+ parser.add_argument('--include_pos_data', action='store_true', default=False, help='To include or not POS training dataset for tokenization training. The path to POS dataset is expected to be in the same dir with WS path. For example, extern_dir/vietnamese/VLSP2013-POS-data')
42
+ parser.add_argument('--vlsp_include_spaces', action='store_true', default=False, help='When processing vi_vlsp tokenization, include all of the spaces. Otherwise, we try to turn the text back into standard text')
43
+
44
+
45
+ def write_file(vlsp_include_spaces, output_filename, sentences, shard):
46
+ with open(output_filename, "w") as fout:
47
+ check_headlines = False
48
+ for sent_idx, sentence in enumerate(sentences):
49
+ fout.write("# sent_id = %s.%d\n" % (shard, sent_idx))
50
+ orig_text = " ".join(sentence)
51
+ #check if the previous line is a headline (no ending mark at the end) then make this sentence a new par
52
+ if check_headlines:
53
+ fout.write("# newpar id =%s.%d.1\n" % (shard, sent_idx))
54
+ check_headlines = False
55
+ if sentence[len(sentence) - 1] not in punctuation_set:
56
+ check_headlines = True
57
+
58
+ if vlsp_include_spaces:
59
+ fout.write("# text = %s\n" % orig_text)
60
+ else:
61
+ spaces = find_spaces(sentence)
62
+ full_text = ""
63
+ for word, space in zip(sentence, spaces):
64
+ # could be made more efficient, but shouldn't matter
65
+ full_text = full_text + word
66
+ if space:
67
+ full_text = full_text + " "
68
+ fout.write("# text = %s\n" % full_text)
69
+ fout.write("# orig_text = %s\n" % orig_text)
70
+ for word_idx, word in enumerate(sentence):
71
+ fake_dep = "root" if word_idx == 0 else "dep"
72
+ fout.write("%d\t%s\t%s" % ((word_idx+1), word, word))
73
+ fout.write("\t_\t_\t_")
74
+ fout.write("\t%d\t%s" % (word_idx, fake_dep))
75
+ fout.write("\t_\t")
76
+ if vlsp_include_spaces or spaces[word_idx]:
77
+ fout.write("_")
78
+ else:
79
+ fout.write("SpaceAfter=No")
80
+ fout.write("\n")
81
+ fout.write("\n")
82
+
83
+ def convert_pos_dataset(file_path):
84
+ """
85
+ This function is to process the pos dataset
86
+ """
87
+
88
+ file = open(file_path, "r")
89
+ document = file.readlines()
90
+ sentences = []
91
+ sent = []
92
+ for line in document:
93
+ if line == "\n" and len(sent)>1:
94
+ if sent not in sentences:
95
+ sentences.append(sent)
96
+ sent = []
97
+ elif line != "\n":
98
+ sent.append(line.split("\t")[0].replace("_"," ").strip())
99
+ return sentences
100
+
101
+ def convert_file(vlsp_include_spaces, input_filename, output_filename, shard, split_filename=None, split_shard=None, pos_data = None):
102
+ with open(input_filename) as fin:
103
+ lines = fin.readlines()
104
+
105
+ sentences = []
106
+ set_sentences = set()
107
+ for line in lines:
108
+ if len(line.replace("_", " ").split())>1:
109
+ words = line.split()
110
+ #one syllable lines are eliminated
111
+ if len(words) == 1 and len(words[0].split("_")) == 1:
112
+ continue
113
+ else:
114
+ words = [w.replace("_", " ") for w in words]
115
+ #only add sentences that hasn't been added before
116
+ if words not in sentences:
117
+ sentences.append(words)
118
+ set_sentences.add(' '.join(words))
119
+
120
+ if split_filename is not None:
121
+ # even this is a larger dev set than the train set
122
+ split_point = int(len(sentences) * 0.95)
123
+ #check pos_data that aren't overlapping with current VLSP WS dataset
124
+ sentences_pos = [] if pos_data is None else [sent for sent in pos_data if ' '.join(sent) not in set_sentences]
125
+ print("Added ", len(sentences_pos), " sentences from POS dataset.")
126
+ write_file(vlsp_include_spaces, output_filename, sentences[:split_point]+sentences_pos, shard)
127
+ write_file(vlsp_include_spaces, split_filename, sentences[split_point:], split_shard)
128
+ else:
129
+ write_file(vlsp_include_spaces, output_filename, sentences, shard)
130
+
131
+ def convert_vi_vlsp(extern_dir, tokenizer_dir, args):
132
+ input_path = os.path.join(extern_dir, "vietnamese", "VLSP2013-WS-data")
133
+ input_pos_path = os.path.join(extern_dir, "vietnamese", "VLSP2013-POS-data")
134
+ input_train_filename = os.path.join(input_path, "VLSP2013_WS_train_gold.txt")
135
+ input_test_filename = os.path.join(input_path, "VLSP2013_WS_test_gold.txt")
136
+
137
+ input_pos_filename = os.path.join(input_pos_path, "VLSP2013_POS_train_BI_POS_Column.txt.goldSeg")
138
+ if not os.path.exists(input_train_filename):
139
+ raise FileNotFoundError("Cannot find train set for VLSP at %s" % input_train_filename)
140
+ if not os.path.exists(input_test_filename):
141
+ raise FileNotFoundError("Cannot find test set for VLSP at %s" % input_test_filename)
142
+ pos_data = None
143
+ if args.include_pos_data:
144
+ if not os.path.exists(input_pos_filename):
145
+ raise FileNotFoundError("Cannot find pos dataset for VLSP at %s" % input_pos_filename)
146
+ else:
147
+ pos_data = convert_pos_dataset(input_pos_filename)
148
+
149
+ output_train_filename = os.path.join(tokenizer_dir, "vi_vlsp.train.gold.conllu")
150
+ output_dev_filename = os.path.join(tokenizer_dir, "vi_vlsp.dev.gold.conllu")
151
+ output_test_filename = os.path.join(tokenizer_dir, "vi_vlsp.test.gold.conllu")
152
+
153
+ convert_file(args.vlsp_include_spaces, input_train_filename, output_train_filename, "train", output_dev_filename, "dev", pos_data)
154
+ convert_file(args.vlsp_include_spaces, input_test_filename, output_test_filename, "test")
155
+
stanza/stanza/utils/ner/spacy_ner_tag_dataset.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test a spacy model on a 4 class dataset
3
+ """
4
+
5
+ import argparse
6
+ import json
7
+
8
+ import spacy
9
+ from spacy.tokens import Doc
10
+
11
+ from stanza.models.ner.utils import process_tags
12
+ from stanza.models.ner.scorer import score_by_entity, score_by_token
13
+
14
+ from stanza.utils.confusion import format_confusion
15
+ from stanza.utils.datasets.ner.simplify_ontonotes_to_worldwide import simplify_ontonotes_to_worldwide
16
+
17
+ from stanza.utils.get_tqdm import get_tqdm
18
+ tqdm = get_tqdm()
19
+
20
+ """
21
+ Simplified classes used in the Worldwide dataset are:
22
+
23
+ Date
24
+ Facility
25
+ Location
26
+ Misc
27
+ Money
28
+ NORP
29
+ Organization
30
+ Person
31
+ Product
32
+
33
+ vs OntoNotes classes:
34
+
35
+ CARDINAL
36
+ DATE
37
+ EVENT
38
+ FAC
39
+ GPE
40
+ LANGUAGE
41
+ LAW
42
+ LOC
43
+ MONEY
44
+ NORP
45
+ ORDINAL
46
+ ORG
47
+ PERCENT
48
+ PERSON
49
+ PRODUCT
50
+ QUANTITY
51
+ TIME
52
+ WORK_OF_ART
53
+ """
54
+
55
+ def test_file(eval_file, tagger, simplify):
56
+ with open(eval_file) as fin:
57
+ gold_doc = json.load(fin)
58
+ gold_doc = [[(x['text'], x['ner']) for x in sentence] for sentence in gold_doc]
59
+ gold_doc = process_tags(gold_doc, 'bioes')
60
+
61
+ if simplify:
62
+ for doc in gold_doc:
63
+ for idx, word in enumerate(doc):
64
+ if word[1] != "O":
65
+ word = [word[0], simplify_ontonotes_to_worldwide(word[1])]
66
+ doc[idx] = word
67
+
68
+ ignore_tags = "Date,DATE" if simplify else None
69
+
70
+ original_text = [[x[0] for x in gold_sentence] for gold_sentence in gold_doc]
71
+ pred_doc = []
72
+ for sentence in tqdm(original_text):
73
+ spacy_sentence = Doc(tagger.vocab, sentence)
74
+ spacy_sentence = tagger(spacy_sentence)
75
+ entities = ["O" if not token.ent_type_ else "%s-%s" % (token.ent_iob_, token.ent_type_) for token in spacy_sentence]
76
+ if simplify:
77
+ entities = [simplify_ontonotes_to_worldwide(x) for x in entities]
78
+ pred_sentence = [[token.text, entity] for token, entity in zip(spacy_sentence, entities)]
79
+ pred_doc.append(pred_sentence)
80
+
81
+ pred_doc = process_tags(pred_doc, 'bioes')
82
+ pred_tags = [[x[1] for x in sentence] for sentence in pred_doc]
83
+ gold_tags = [[x[1] for x in sentence] for sentence in gold_doc]
84
+ print("RESULTS ON: %s" % eval_file)
85
+ _, _, f_micro, _ = score_by_entity(pred_tags, gold_tags, ignore_tags=ignore_tags)
86
+ _, _, _, confusion = score_by_token(pred_tags, gold_tags, ignore_tags=ignore_tags)
87
+ print("NER token confusion matrix:\n{}".format(format_confusion(confusion, hide_blank=True, transpose=True)))
88
+ return f_micro
89
+
90
+ def main():
91
+ parser = argparse.ArgumentParser()
92
+ parser.add_argument('--ner_model', type=str, default=None, help='Which spacy model to test')
93
+ parser.add_argument('filename', type=str, nargs='*', help='which files to test')
94
+ parser.add_argument('--simplify', default=False, action='store_true', help='Simplify classes to the 8 class Worldwide model')
95
+ args = parser.parse_args()
96
+
97
+ if args.ner_model is None:
98
+ ner_models = ['en_core_web_sm', 'en_core_web_trf']
99
+ else:
100
+ ner_models = [args.ner_model]
101
+
102
+ if not args.filename:
103
+ args.filename = ["data/ner/en_ontonotes-8class.test.json",
104
+ "data/ner/en_worldwide-8class.test.json",
105
+ "data/ner/en_worldwide-8class-africa.test.json",
106
+ "data/ner/en_worldwide-8class-asia.test.json",
107
+ "data/ner/en_worldwide-8class-indigenous.test.json",
108
+ "data/ner/en_worldwide-8class-latam.test.json",
109
+ "data/ner/en_worldwide-8class-middle_east.test.json"]
110
+
111
+ print("Processing the files: %s" % ",".join(args.filename))
112
+
113
+ results = []
114
+ model_results = {}
115
+
116
+ for ner_model in ner_models:
117
+ model_results[ner_model] = []
118
+ # load tagger
119
+ print("-----------------------------")
120
+ print("Running %s" % ner_model)
121
+ print("-----------------------------")
122
+ tagger = spacy.load(ner_model, disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])
123
+
124
+ for filename in args.filename:
125
+ f_micro = test_file(filename, tagger, args.simplify)
126
+ f_micro = "%.2f" % (f_micro * 100)
127
+ results.append((ner_model, filename, f_micro))
128
+ model_results[ner_model].append(f_micro)
129
+
130
+ for result in results:
131
+ print(result)
132
+
133
+ for model in model_results.keys():
134
+ result = [model] + model_results[model]
135
+ print(" & ".join(result))
136
+
137
+ if __name__ == '__main__':
138
+ main()
stanza/stanza/utils/training/__init__.py ADDED
File without changes
stanza/stanza/utils/training/remove_constituency_optimizer.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Saved a huge, bloated model with an optimizer? Use this to remove it, greatly shrinking the model size
2
+
3
+ This tries to find reasonable defaults for word vectors and charlm
4
+ (which need to be loaded so that the model knows the matrix sizes)
5
+
6
+ so ideally all that needs to be run is
7
+
8
+ python3 stanza/utils/training/remove_constituency_optimizer.py <treebanks>
9
+ python3 stanza/utils/training/remove_constituency_optimizer.py da_arboretum ...
10
+
11
+ This can also be used to load and save models as part of an update
12
+ to the serialized format
13
+ """
14
+
15
+ import argparse
16
+ import logging
17
+ import os
18
+
19
+ from stanza.models import constituency_parser
20
+ from stanza.models.common.constant import treebank_to_short_name
21
+ from stanza.resources.default_packages import default_charlms, default_pretrains
22
+ from stanza.utils.training import common
23
+
24
+ logger = logging.getLogger('stanza')
25
+
26
+ def parse_args():
27
+ parser = argparse.ArgumentParser()
28
+ parser.add_argument('--wordvec_pretrain_file', type=str, default=None, help='Exact name of the pretrain file to read')
29
+ parser.add_argument('--charlm', default="default", type=str, help='Which charlm to run on. Will use the default charlm for this language/model if not set. Set to None to turn off charlm for languages with a default charlm')
30
+ parser.add_argument('--no_charlm', dest='charlm', action="store_const", const=None, help="Don't use a charlm, even if one is used by default for this package")
31
+
32
+ parser.add_argument('--load_dir', type=str, default="saved_models/constituency", help="Root dir for getting the models to resave.")
33
+ parser.add_argument('--save_dir', type=str, default="resaved_models/constituency", help="Root dir for resaving the models.")
34
+
35
+ parser.add_argument('treebanks', type=str, nargs='+', help='Which treebanks to run on. Use all_ud or ud_all for all UD treebanks')
36
+
37
+ args = parser.parse_args()
38
+ return args
39
+
40
+ def main():
41
+ """
42
+ For each of the models specified, load and resave the model
43
+
44
+ The resaved model will have the optimizer removed
45
+ """
46
+ args = parse_args()
47
+ os.makedirs(args.save_dir, exist_ok=True)
48
+
49
+ for treebank in args.treebanks:
50
+ logger.info("PROCESSING %s", treebank)
51
+ short_name = treebank_to_short_name(treebank)
52
+ language, dataset = short_name.split("_", maxsplit=1)
53
+ logger.info("%s: %s %s", short_name, language, dataset)
54
+
55
+ if not args.wordvec_pretrain_file:
56
+ # will throw an error if the pretrain can't be found
57
+ wordvec_pretrain = common.find_wordvec_pretrain(language, default_pretrains)
58
+ wordvec_args = ['--wordvec_pretrain_file', wordvec_pretrain]
59
+ else:
60
+ wordvec_args = []
61
+
62
+ charlm = common.choose_charlm(language, dataset, args.charlm, default_charlms, {})
63
+ charlm_args = common.build_charlm_args(language, charlm, base_args=False)
64
+
65
+ base_name = '{}_constituency.pt'.format(short_name)
66
+ load_name = os.path.join(args.load_dir, base_name)
67
+ save_name = os.path.join(args.save_dir, base_name)
68
+ resave_args = ['--mode', 'remove_optimizer',
69
+ '--load_name', load_name,
70
+ '--save_name', save_name,
71
+ '--save_dir', ".",
72
+ '--shorthand', short_name]
73
+ resave_args = resave_args + wordvec_args + charlm_args
74
+ constituency_parser.main(resave_args)
75
+
76
+ if __name__ == '__main__':
77
+ main()
stanza/stanza/utils/visualization/dependency_visualization.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Functions to visualize dependency relations in texts and Stanza documents
3
+ """
4
+
5
+ from stanza.models.common.constant import is_right_to_left
6
+ import stanza
7
+ import spacy
8
+ from spacy import displacy
9
+ from spacy.tokens import Doc
10
+
11
+
12
def visualize_doc(doc, language):
    """
    Takes in a Document and visualizes it using displacy.

    The document to visualize must be from the stanza pipeline (run with
    depparse, so that each word has a head and a deprel).

    right-to-left languages such as Arabic are displayed right-to-left
    based on the language code: the display order of the words is
    reversed while the dependency arcs stay attached to the same words.
    """
    visualization_options = {"compact": True, "bg": "#09a3d5", "color": "white", "distance": 90,
                             "font": "Source Sans Pro", "arrow_spacing": 25}
    # blank model - we don't use any of the model features, just the viz
    nlp = spacy.blank("en")
    rtl = is_right_to_left(language)
    sentences_to_visualize = []
    for sentence in doc.sentences:
        sent_len = len(sentence.words)
        # for RTL languages the display order of words is reversed;
        # dependency arcs remain intact
        display_words = list(reversed(sentence.words)) if rtl else list(sentence.words)

        def display_index(stanza_id):
            # map a 1-based Stanza word id to its 0-based display position
            # (positions count from the right for RTL rendering)
            return sent_len - stanza_id if rtl else stanza_id - 1

        words = [word.text for word in display_words]
        lemmas = [word.lemma for word in display_words]
        deps = [word.deprel for word in display_words]
        tags = [word.upos for word in display_words]
        # spaCy head indexes are formatted differently than Stanza's:
        # Stanza marks the root with head == 0, while spaCy roots point
        # at their own position
        heads = [display_index(word.id if word.head == 0 else word.head)
                 for word in display_words]
        sentences_to_visualize.append(Doc(nlp.vocab, words=words, lemmas=lemmas,
                                          heads=heads, deps=deps, pos=tags))

    for line in sentences_to_visualize:  # render all sentences through displaCy
        # If this program is NOT being run in a Jupyter notebook, replace displacy.render with displacy.serve
        # and the visualization will be hosted locally, link being provided in the program output.
        displacy.render(line, style="dep", options=visualization_options)
55
+
56
+
57
def visualize_str(text, pipeline_code, pipe):
    """
    Takes a string and visualizes it using displacy.

    The string is processed using the stanza pipeline and its
    dependencies are formatted into a spaCy doc object for easy
    visualization. Accepts valid stanza (UD) pipelines as the pipeline
    argument. Must supply the stanza pipeline code (the two-letter
    abbreviation of the language, such as 'en' for English. Must also
    supply the stanza pipeline object as the third argument.
    """
    processed = pipe(text)
    visualize_doc(processed, pipeline_code)
70
+
71
+
72
def visualize_docs(docs, lang_code):
    """
    Visualize the dependency relationships in a list of Stanza documents.

    lang_code is a language code such as 'en' for English.  Rendering is
    delegated to visualize_doc (spaCy visualizations); see that function
    for more details.
    """
    for document in docs:
        visualize_doc(document, lang_code)
81
+
82
+
83
def visualize_strings(texts, lang_code):
    """
    Visualize the dependency relationships of several raw strings.

    Loads the Stanza pipeline for the given language code (ex: 'en' for
    English) once, then runs each string through visualize_str.
    """
    pipe = stanza.Pipeline(lang_code, processors="tokenize,pos,lemma,depparse")
    for item in texts:
        visualize_str(item, lang_code, pipe)
93
+
94
+
95
def main():
    """Demo: visualize sample sentences in RTL (Arabic) and LTR (English, Chinese) languages."""
    ar_strings = ['برلين ترفض حصول شركة اميركية على رخصة تصنيع دبابة "ليوبارد" الالمانية', "هل بإمكاني مساعدتك؟",
                  "أراك في مابعد", "لحظة من فضلك"]
    en_strings = ["This is a sentence.",
                  "Barack Obama was born in Hawaii. He was elected President of the United States in 2008."]
    zh_strings = ["中国是一个很有意思的国家。"]
    # Testing with right to left language
    visualize_strings(ar_strings, "ar")
    # Testing with left to right languages
    visualize_strings(en_strings, "en")
    visualize_strings(zh_strings, "zh")

if __name__ == '__main__':
    main()