bowphs committed on
Commit
fff2f9b
·
verified ·
1 Parent(s): ba68d3c

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. stanza/stanza/models/common/__init__.py +0 -0
  2. stanza/stanza/models/common/bert_embedding.py +509 -0
  3. stanza/stanza/models/common/biaffine.py +80 -0
  4. stanza/stanza/models/common/build_short_name_to_treebank.py +78 -0
  5. stanza/stanza/models/common/char_model.py +362 -0
  6. stanza/stanza/models/common/chuliu_edmonds.py +281 -0
  7. stanza/stanza/models/common/constant.py +550 -0
  8. stanza/stanza/models/common/count_ner_coverage.py +38 -0
  9. stanza/stanza/models/common/count_pretrain_coverage.py +41 -0
  10. stanza/stanza/models/common/crf.py +149 -0
  11. stanza/stanza/models/common/data.py +155 -0
  12. stanza/stanza/models/common/doc.py +1741 -0
  13. stanza/stanza/models/common/dropout.py +75 -0
  14. stanza/stanza/models/common/exceptions.py +15 -0
  15. stanza/stanza/models/common/foundation_cache.py +148 -0
  16. stanza/stanza/models/common/hlstm.py +124 -0
  17. stanza/stanza/models/common/large_margin_loss.py +68 -0
  18. stanza/stanza/models/common/loss.py +134 -0
  19. stanza/stanza/models/common/maxout_linear.py +42 -0
  20. stanza/stanza/models/common/packed_lstm.py +105 -0
  21. stanza/stanza/models/common/peft_config.py +119 -0
  22. stanza/stanza/models/common/seq2seq_constant.py +17 -0
  23. stanza/stanza/models/common/seq2seq_model.py +364 -0
  24. stanza/stanza/models/common/seq2seq_utils.py +121 -0
  25. stanza/stanza/models/common/short_name_to_treebank.py +619 -0
  26. stanza/stanza/models/common/trainer.py +20 -0
  27. stanza/stanza/models/common/utils.py +816 -0
  28. stanza/stanza/models/common/vocab.py +298 -0
  29. stanza/stanza/models/constituency/base_model.py +532 -0
  30. stanza/stanza/models/constituency/base_trainer.py +153 -0
  31. stanza/stanza/models/constituency/ensemble.py +486 -0
  32. stanza/stanza/models/constituency/in_order_compound_oracle.py +327 -0
  33. stanza/stanza/models/constituency/in_order_oracle.py +1029 -0
  34. stanza/stanza/models/constituency/lstm_model.py +1178 -0
  35. stanza/stanza/models/constituency/parse_tree.py +591 -0
  36. stanza/stanza/models/constituency/positional_encoding.py +89 -0
  37. stanza/stanza/models/constituency/retagging.py +130 -0
  38. stanza/stanza/models/constituency/state.py +144 -0
  39. stanza/stanza/models/constituency/top_down_oracle.py +757 -0
  40. stanza/stanza/models/constituency/trainer.py +306 -0
  41. stanza/stanza/models/constituency/transformer_tree_stack.py +198 -0
  42. stanza/stanza/models/constituency/transition_sequence.py +186 -0
  43. stanza/stanza/models/constituency/tree_embedding.py +135 -0
  44. stanza/stanza/models/coref/config.py +66 -0
  45. stanza/stanza/models/coref/coref_config.toml +285 -0
  46. stanza/stanza/models/coref/dataset.py +61 -0
  47. stanza/stanza/models/coref/pairwise_encoder.py +94 -0
  48. stanza/stanza/models/coref/rough_scorer.py +61 -0
  49. stanza/stanza/models/coref/utils.py +35 -0
  50. stanza/stanza/models/depparse/model.py +265 -0
stanza/stanza/models/common/__init__.py ADDED
File without changes
stanza/stanza/models/common/bert_embedding.py ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import logging
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pack_sequence, PackedSequence
8
+
9
+ logger = logging.getLogger('stanza')
10
+
11
+ BERT_ARGS = {
12
+ "vinai/phobert-base": { "use_fast": True },
13
+ "vinai/phobert-large": { "use_fast": True },
14
+ }
15
+
16
class TextTooLongError(ValueError):
    """
    A text was too long for the underlying model (possibly BERT)

    Carries the offending line number and text so callers can report
    or skip the specific input that overflowed the model.
    """
    def __init__(self, length, max_len, line_num, text):
        message = ("Found a text of length %d (possibly after tokenizing). "
                   "Maximum handled length is %d Error occurred at line %d") % (length, max_len, line_num)
        super().__init__(message)
        self.line_num = line_num
        self.text = text
24
+
25
+
26
def update_max_length(model_name, tokenizer):
    """Force model_max_length to 512 for checkpoints known to ship without a usable limit."""
    capped_models = {
        'hf-internal-testing/tiny-bert',
        'google/muril-base-cased',
        'google/muril-large-cased',
        'airesearch/wangchanberta-base-att-spm-uncased',
        'camembert/camembert-large',
        'hfl/chinese-electra-180g-large-discriminator',
        'NYTK/electra-small-discriminator-hungarian',
    }
    if model_name in capped_models:
        tokenizer.model_max_length = 512
35
+
36
def load_tokenizer(model_name, tokenizer_kwargs=None, local_files_only=False):
    """
    Load an AutoTokenizer for the given transformer name.

    Returns None if model_name is falsy.  Any per-model defaults from
    BERT_ARGS are applied first, then tokenizer_kwargs, then
    local_files_only (which always wins).

    Raises ImportError if the transformers library is not installed.
    """
    if model_name:
        # note that use_fast is the default
        try:
            from transformers import AutoTokenizer
        except ImportError:
            raise ImportError("Please install transformers library for BERT support! Try `pip install transformers`.")
        # copy the per-model defaults: the original code mutated the shared
        # BERT_ARGS entry, leaking add_prefix_space / local_files_only /
        # caller kwargs into the module-level dict across calls
        bert_args = dict(BERT_ARGS.get(model_name, dict()))
        if not model_name.startswith("vinai/phobert"):
            bert_args["add_prefix_space"] = True
        if tokenizer_kwargs:
            bert_args.update(tokenizer_kwargs)
        bert_args['local_files_only'] = local_files_only
        bert_tokenizer = AutoTokenizer.from_pretrained(model_name, **bert_args)
        update_max_length(model_name, bert_tokenizer)
        return bert_tokenizer
    return None
53
+
54
def load_bert(model_name, tokenizer_kwargs=None, local_files_only=False):
    """
    Load a transformer model and its matching tokenizer.

    Returns (model, tokenizer), or (None, None) when no name is given.
    Raises ImportError if the transformers library is not installed.
    """
    if not model_name:
        return None, None
    # such as: "vinai/phobert-base"
    try:
        from transformers import AutoModel
    except ImportError:
        raise ImportError("Please install transformers library for BERT support! Try `pip install transformers`.")
    bert_model = AutoModel.from_pretrained(model_name, local_files_only=local_files_only)
    bert_tokenizer = load_tokenizer(model_name, tokenizer_kwargs=tokenizer_kwargs, local_files_only=local_files_only)
    return bert_model, bert_tokenizer
65
+
66
def tokenize_manual(model_name, sent, tokenizer):
    """
    Tokenize a sentence manually, using for checking long sentences and PHOBert.

    Returns (pieces, ids_with_endpoints): the string token pieces and the
    id sequence wrapped in bos/eos.
    """
    # PhoBERT expects "_" between the syllables of a word, so spaces and
    # non-breaking spaces become "_"; other models just normalize \xa0
    if model_name.startswith("vinai/phobert"):
        words = [word.replace("\xa0", "_").replace(" ", "_") for word in sent]
    else:
        words = [word.replace("\xa0", " ") for word in sent]

    # join into one string and let the transformers tokenizer split it
    pieces = tokenizer.tokenize(' '.join(words))

    # map string pieces to vocabulary ids
    piece_ids = tokenizer.convert_tokens_to_ids(pieces)

    # wrap with the sentence start / end markers
    ids_with_endpoints = [tokenizer.bos_token_id] + piece_ids + [tokenizer.eos_token_id]

    return pieces, ids_with_endpoints
86
+
87
def filter_data(model_name, data, tokenizer = None, log_level=logging.DEBUG):
    """
    Filter out the (NER, POS) data that is too long for BERT model.
    """
    if tokenizer is None:
        tokenizer = load_tokenizer(model_name)
    #eliminate all the sentences that are too long for bert model
    kept = []
    for sent in data:
        # items may be bare words or (word, tag, ...) tuples
        words = [item if isinstance(item, str) else item[0] for item in sent]
        _, token_ids = tokenize_manual(model_name, words, tokenizer)

        # require two fewer than the max to leave room for special tokens
        if len(token_ids) <= tokenizer.model_max_length - 2:
            kept.append(sent)

    logger.log(log_level, "Eliminated %d of %d datapoints because their length is over maximum size of BERT model.", (len(data)-len(kept)), len(data))

    return kept
107
+
108
def needs_length_filter(model_name):
    """
    TODO: we were lazy and didn't implement any form of length fudging for models other than bert/roberta/electra
    """
    is_unfudged_family = 'bart' in model_name or 'xlnet' in model_name
    return is_unfudged_family or model_name.startswith("vinai/phobert")
117
+
118
def cloned_feature(feature, num_layers, detach=True):
    """
    Clone & detach the feature, keeping the last N layers (or averaging -2,-3,-4 if not specified)

    averaging 3 of the last 4 layers worked well for non-VI languages
    """
    # in most cases, need to call with features.hidden_states
    # bartpho is different - it has features.decoder_hidden_states
    # feature[2] is the same for bert, but it didn't work for
    # older versions of transformers for xlnet
    if num_layers is None:
        # sum layers -4, -3, -2 and scale by 1/4 (the historical default)
        combined = torch.stack(feature[-4:-1], axis=3).sum(axis=3) / 4
    else:
        combined = torch.stack(feature[-num_layers:], axis=3)
    if not detach:
        return combined
    return combined.clone().detach()
136
+
137
def extract_bart_word_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach=True):
    """
    Handles vi-bart. May need testing before using on other bart

    https://github.com/VinAIResearch/BARTpho

    data: list of list of string (the text tokens)
    Returns one tensor of word embeddings per sentence.
    """
    processed = [] # final product, returns the list of list of word representation

    # BARTpho expects syllables of a word joined with "_"
    sentences = [" ".join([word.replace(" ", "_") for word in sentence]) for sentence in data]
    tokenized = tokenizer(sentences, return_tensors='pt', padding=True, return_attention_mask=True)
    input_ids = tokenized['input_ids'].to(device)
    attention_mask = tokenized['attention_mask'].to(device)

    # batches of 128 sentences
    for i in range(int(math.ceil(len(sentences)/128))):
        start_sentence = i * 128
        end_sentence = min(start_sentence + 128, len(sentences))
        # slice into locals: the original code overwrote input_ids /
        # attention_mask each iteration, so every batch after the first
        # sliced an already-sliced tensor and came out empty
        batch_ids = input_ids[start_sentence:end_sentence]
        batch_mask = attention_mask[start_sentence:end_sentence]

        if detach:
            with torch.no_grad():
                features = model(batch_ids, attention_mask=batch_mask, output_hidden_states=True)
                features = cloned_feature(features.decoder_hidden_states, num_layers, detach)
        else:
            features = model(batch_ids, attention_mask=batch_mask, output_hidden_states=True)
            features = cloned_feature(features.decoder_hidden_states, num_layers, detach)

        # pair each batch's features with that batch's sentences
        # (the original zipped against the head of the full dataset)
        for feature, sentence in zip(features, data[start_sentence:end_sentence]):
            # +2 for the endpoints
            feature = feature[:len(sentence)+2]
            if not keep_endpoints:
                feature = feature[1:-1]
            processed.append(feature)

    return processed
172
+
173
def extract_phobert_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach=True):
    """
    Extract transformer embeddings using a method specifically for phobert

    Since phobert doesn't have the is_split_into_words / tokenized.word_ids(batch_index=0)
    capability, we instead look for @@ to denote a continued token.
    data: list of list of string (the text tokens)

    Returns a list of tensors, one per sentence, each with one row per
    selected word piece (plus the sentence endpoints if keep_endpoints).
    Raises TextTooLongError when a tokenized sentence exceeds the model limit.
    """
    processed = [] # final product, returns the list of list of word representation
    tokenized_sents = [] # list of sentences, each is a torch tensor with start and end token
    list_tokenized = [] # list of tokenized sentences from phobert
    for idx, sent in enumerate(data):

        tokenized, tokenized_sent = tokenize_manual(model_name, sent, tokenizer)

        # save the string pieces for the offset computation further down
        list_tokenized.append(tokenized)

        if len(tokenized_sent) > tokenizer.model_max_length:
            logger.error("Invalid size, max size: %d, got %d %s", tokenizer.model_max_length, len(tokenized_sent), data[idx])
            raise TextTooLongError(len(tokenized_sent), tokenizer.model_max_length, idx, " ".join(data[idx]))

        #add to tokenized_sents
        tokenized_sents.append(torch.tensor(tokenized_sent).detach())

        # placeholder entry; replaced wholesale once features are computed
        processed_sent = []
        processed.append(processed_sent)

    # done loading bert emb

    size = len(tokenized_sents)

    #padding the inputs
    tokenized_sents_padded = torch.nn.utils.rnn.pad_sequence(tokenized_sents,batch_first=True,padding_value=tokenizer.pad_token_id)

    features = []

    # Feed into PhoBERT 128 at a time in a batch fashion. In testing, the loop was
    # run only 1 time as the batch size for the outer model was less than that
    # (30 for conparser, for example)
    for i in range(int(math.ceil(size/128))):
        padded_input = tokenized_sents_padded[128*i:128*i+128]
        start_sentence = i * 128
        end_sentence = start_sentence + padded_input.shape[0]
        # attention covers only the real tokens of each sentence, not padding
        attention_mask = torch.zeros(end_sentence - start_sentence, padded_input.shape[1], device=device)
        for sent_idx, sent in enumerate(tokenized_sents[start_sentence:end_sentence]):
            attention_mask[sent_idx, :len(sent)] = 1
        if detach:
            with torch.no_grad():
                # TODO: is the clone().detach() necessary?
                feature = model(padded_input.clone().detach().to(device), attention_mask=attention_mask, output_hidden_states=True)
                features += cloned_feature(feature.hidden_states, num_layers, detach)
        else:
            feature = model(padded_input.to(device), attention_mask=attention_mask, output_hidden_states=True)
            features += cloned_feature(feature.hidden_states, num_layers, detach)

    assert len(features)==size
    assert len(features)==len(processed)

    #process the output
    #only take the vector of the last word piece of a word/ you can do other methods such as first word piece or averaging.
    # idx2+1 compensates for the start token at the start of a sentence
    # a piece is selected exactly when the previous piece does not end in "@@"
    # (i.e. when the previous piece finished its word)
    offsets = [[idx2+1 for idx2, _ in enumerate(list_tokenized[idx]) if (idx2 > 0 and not list_tokenized[idx][idx2-1].endswith("@@")) or (idx2==0)]
               for idx, sent in enumerate(processed)]
    if keep_endpoints:
        # [0] and [-1] grab the start and end representations as well
        offsets = [[0] + off + [-1] for off in offsets]
    processed = [feature[offset] for feature, offset in zip(features, offsets)]

    # This is a list of tensors
    # Each tensor holds the representation of a sentence extracted from phobert
    return processed
245
+
246
# Tokenizers known to silently delete certain characters or whole words
# (details per entry below).
# NOTE(review): not referenced in this chunk of the file — presumably
# consulted by callers elsewhere; verify before removing.
BAD_TOKENIZERS = ('bert-base-german-cased',
                  # the dbmdz tokenizers turn one or more types of characters into empty words
                  # for example, from PoSTWITA:
                  # ewww 󾓺 — in viaggio Roma
                  # the character which may not be rendering properly is 0xFE4FA
                  # https://github.com/dbmdz/berts/issues/48
                  'dbmdz/bert-base-german-cased',
                  'dbmdz/bert-base-italian-xxl-cased',
                  'dbmdz/bert-base-italian-cased',
                  'dbmdz/electra-base-italian-xxl-cased-discriminator',
                  # each of these (perhaps using similar tokenizers?)
                  # does not digest the script-flip-mark \u200f
                  'avichr/heBERT',
                  'onlplab/alephbert-base',
                  'imvladikon/alephbertgimmel-base-512',
                  # these indonesian models fail on a sentence in the Indonesian GSD dataset:
                  # 'Tak', 'dapat', 'disangkal', 'jika', '\u200e', 'kemenangan', ...
                  # weirdly some other indonesian models (even by the same group) don't have that problem
                  'cahya/bert-base-indonesian-1.5G',
                  'indolem/indobert-base-uncased',
                  'google/muril-base-cased',
                  'l3cube-pune/marathi-roberta')
268
+
269
def fix_blank_tokens(tokenizer, data):
    """Patch bert tokenizers with missing characters

    Some tokenizers (notably the German ones listed above) reduce soft
    hyphens and other unknown characters to nothing.  A word tokenized
    entirely into such characters is vaporized: its token list holds only
    the two special tokens, and no embedding would be produced for it.

    The workaround: any word whose tokenization has length 2 (just the
    special tokens) is replaced with a plain "-".

    Even the standard Bert / Electra tokenizers can do this for one-character
    "words", so this function is always run rather than gated per model.
    """
    patched = []
    for sentence in data:
        token_lists = tokenizer(sentence, is_split_into_words=False).input_ids
        # len(ids) == 2 means only the special tokens survived: the word vanished
        fixed = [word if len(ids) > 2 else "-" for word, ids in zip(sentence, token_lists)]
        patched.append(fixed)
    return patched
291
+
292
def extract_xlnet_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach=True):
    """
    Extract word embeddings from an xlnet model.

    data: list of list of string (the text tokens)
    Returns one tensor per sentence, indexed by the last token piece of
    each word (plus endpoints unless keep_endpoints is False).
    Raises TextTooLongError when a sentence exceeds the model limit.
    """
    # using attention masks makes contextual embeddings much more useful for downstream tasks
    tokenized = tokenizer(data, is_split_into_words=True, return_offsets_mapping=False, return_attention_mask=False)
    #tokenized = tokenizer(data, padding="longest", is_split_into_words=True, return_offsets_mapping=False, return_attention_mask=True)

    # one slot per word plus two for the sentence endpoints
    list_offsets = [[None] * (len(sentence)+2) for sentence in data]
    for idx in range(len(data)):
        offsets = tokenized.word_ids(batch_index=idx)
        list_offsets[idx][0] = 0
        for pos, offset in enumerate(offsets):
            if offset is None:
                break
            # this uses the last token piece for any offset by overwriting the previous value
            # this will be one token earlier
            # we will add a <pad> to the start of each sentence for the endpoints
            list_offsets[idx][offset+1] = pos + 1
        list_offsets[idx][-1] = list_offsets[idx][-2] + 1
        # a None left over means the tokenizer vaporized a word
        if any(x is None for x in list_offsets[idx]):
            raise ValueError("OOPS, hit None when preparing to use Bert\ndata[idx]: {}\noffsets: {}\nlist_offsets[idx]: {}".format(data[idx], offsets, list_offsets[idx], tokenized))

        if len(offsets) > tokenizer.model_max_length - 2:
            logger.error("Invalid size, max size: %d, got %d %s", tokenizer.model_max_length, len(offsets), data[idx])
            raise TextTooLongError(len(offsets), tokenizer.model_max_length, idx, " ".join(data[idx]))

    features = []
    # process 128 sentences at a time
    for i in range(int(math.ceil(len(data)/128))):
        # TODO: find a suitable representation for attention masks for xlnet
        # xlnet base on WSJ:
        # sep_token_id at beginning, cls_token_id at end: 0.9441
        # bos_token_id at beginning, eos_token_id at end: 0.9463
        # bos_token_id at beginning, sep_token_id at end: 0.9459
        # bos_token_id at beginning, cls_token_id at end: 0.9457
        # bos_token_id at beginning, sep/cls at end: 0.9454
        # use the xlnet tokenization with words at end,
        # begin token is last pad, end token is sep, no mask: 0.9463
        # same, but with masks: 0.9440
        # x[:-2] drops the tokenizer's own trailing special tokens,
        # replaced here with an explicit bos ... eos wrapping
        input_ids = [[tokenizer.bos_token_id] + x[:-2] + [tokenizer.eos_token_id] for x in tokenized['input_ids'][128*i:128*i+128]]
        max_len = max(len(x) for x in input_ids)
        attention_mask = torch.zeros(len(input_ids), max_len, dtype=torch.long, device=device)
        # mark the real tokens, then right-pad each row in place to max_len
        for idx, input_row in enumerate(input_ids):
            attention_mask[idx, :len(input_row)] = 1
            if len(input_row) < max_len:
                input_row.extend([tokenizer.pad_token_id] * (max_len - len(input_row)))
        if detach:
            with torch.no_grad():
                id_tensor = torch.tensor(input_ids, device=device)
                feature = model(id_tensor, attention_mask=attention_mask, output_hidden_states=True)
                # feature[2] is the same for bert, but it didn't work for
                # older versions of transformers for xlnet
                # feature = feature[2]
                features += cloned_feature(feature.hidden_states, num_layers, detach)
        else:
            id_tensor = torch.tensor(input_ids, device=device)
            feature = model(id_tensor, attention_mask=attention_mask, output_hidden_states=True)
            # feature[2] is the same for bert, but it didn't work for
            # older versions of transformers for xlnet
            # feature = feature[2]
            features += cloned_feature(feature.hidden_states, num_layers, detach)

    processed = []
    #process the output
    if not keep_endpoints:
        #remove the bos and eos tokens
        list_offsets = [sent[1:-1] for sent in list_offsets]
    for feature, offsets in zip(features, list_offsets):
        new_sent = feature[offsets]
        processed.append(new_sent)

    return processed
361
+
362
def build_cloned_features(model, tokenizer, attention_tensor, id_tensor, num_layers, detach, device):
    """
    Extract an embedding from the given transformer for a certain attention mask and tokens range

    In the event that the tokens are longer than the max length
    supported by the model, the range is split up into overlapping
    sections and the overlapping pieces are connected. No idea if
    this is actually any good, but at least it returns something
    instead of horribly failing

    TODO: at least two upgrades are very relevant
    1) cut off some overlap at the end as well
    2) use this on the phobert, bart, and xln versions as well
    """
    # common case: everything fits in one forward pass
    if attention_tensor.shape[1] <= tokenizer.model_max_length:
        features = model(id_tensor, attention_mask=attention_tensor, output_hidden_states=True)
        features = cloned_feature(features.hidden_states, num_layers, detach)
        return features

    slices = []
    # each window advances by slice_len; prefix_len tokens of context
    # overlap with the previous window and are discarded from its output
    slice_len = max(tokenizer.model_max_length - 20, tokenizer.model_max_length // 2)
    prefix_len = tokenizer.model_max_length - slice_len
    if slice_len < 5:
        raise RuntimeError("Really tiny tokenizer!")
    remaining_attention = attention_tensor
    remaining_ids = id_tensor
    while True:
        attention_slice = remaining_attention[:, :tokenizer.model_max_length]
        id_slice = remaining_ids[:, :tokenizer.model_max_length]
        features = model(id_slice, attention_mask=attention_slice, output_hidden_states=True)
        features = cloned_feature(features.hidden_states, num_layers, detach)
        if len(slices) > 0:
            # drop the overlapping context tokens already covered by the previous slice
            features = features[:, prefix_len:, :]
        slices.append(features)
        if remaining_attention.shape[1] <= tokenizer.model_max_length:
            break
        remaining_attention = remaining_attention[:, slice_len:]
        remaining_ids = remaining_ids[:, slice_len:]
    # stitch the windows back together along the token dimension
    slices = torch.cat(slices, axis=1)
    return slices
402
+
403
+
404
def convert_to_position_list(sentence, offsets):
    """
    Convert a transformers-tokenized sentence's offsets to a list of word to position

    offsets: the word_ids() output — one word index (or None) per token piece.
    Returns a list of length len(sentence) + 2: slot 0 is the start
    endpoint, slot -1 the end endpoint, and each middle slot is the
    position of the word's final token piece (None if the word vanished).
    """
    # slot 0 and slot -1 are reserved for the sentence endpoints
    positions = [None] * (len(sentence) + 2)
    for piece_pos, word_idx in enumerate(offsets):
        if word_idx is not None:
            # later pieces of the same word overwrite earlier ones,
            # so each word ends up mapped to its last token piece
            positions[word_idx + 1] = piece_pos
    positions[0] = 0
    # walk backwards to find the last word that survived tokenization;
    # the end marker goes one position past it (backwards because the
    # final word may have been erased by the tokenizer — slot 0 is always
    # set, so this loop terminates)
    for candidate in reversed(positions[:-1]):
        if candidate is not None:
            positions[-1] = candidate + 1
            break
    return positions
425
+
426
def extract_base_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach):
    """
    Extract embeddings from a generic bert/roberta/electra style model.

    data: list of list of string (the text tokens)
    Returns one tensor of word embeddings per sentence, using the last
    token piece of each word (endpoints kept only if keep_endpoints).
    """
    #add add_prefix_space = True for RoBerTa-- error if not
    # using attention masks makes contextual embeddings much more useful for downstream tasks
    tokenized = tokenizer(data, padding="longest", is_split_into_words=True, return_offsets_mapping=False, return_attention_mask=True)
    list_offsets = []
    for idx in range(len(data)):
        converted_offsets = convert_to_position_list(data[idx], tokenized.word_ids(batch_index=idx))
        list_offsets.append(converted_offsets)

    if any(any(x is None for x in converted_offsets) for converted_offsets in list_offsets):
        # at least one of the tokens in the data is composed entirely of characters the tokenizer doesn't know about
        # one possible approach would be to retokenize only those sentences
        # however, in that case the attention mask might be of a different length,
        # as would the token ids, and it would be a pain to fix those
        # easiest to just retokenize the whole thing, hopefully a rare event
        data = fix_blank_tokens(tokenizer, data)

        tokenized = tokenizer(data, padding="longest", is_split_into_words=True, return_offsets_mapping=False, return_attention_mask=True)
        list_offsets = []
        for idx in range(len(data)):
            converted_offsets = convert_to_position_list(data[idx], tokenized.word_ids(batch_index=idx))
            list_offsets.append(converted_offsets)

        # bug fix: the original raise referenced an undefined name (offsets),
        # so reaching this branch produced a NameError instead of the
        # intended diagnostic; report the first offending sentence instead
        for idx, converted_offsets in enumerate(list_offsets):
            if any(x is None for x in converted_offsets):
                raise ValueError("OOPS, hit None when preparing to use Bert\ndata[idx]: {}\nlist_offsets[idx]: {}".format(data[idx], converted_offsets))

    features = []
    # process 128 sentences at a time
    for i in range(int(math.ceil(len(data)/128))):
        attention_tensor = torch.tensor(tokenized['attention_mask'][128*i:128*i+128], device=device)
        id_tensor = torch.tensor(tokenized['input_ids'][128*i:128*i+128], device=device)
        if detach:
            with torch.no_grad():
                features += build_cloned_features(model, tokenizer, attention_tensor, id_tensor, num_layers, detach, device)
        else:
            features += build_cloned_features(model, tokenizer, attention_tensor, id_tensor, num_layers, detach, device)

    processed = []
    #process the output
    if not keep_endpoints:
        #remove the bos and eos tokens
        list_offsets = [sent[1:-1] for sent in list_offsets]
    for feature, offsets in zip(features, list_offsets):
        new_sent = feature[offsets]
        processed.append(new_sent)

    return processed
477
+
478
def extract_bert_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers=None, detach=True, peft_name=None):
    """
    Extract transformer embeddings using a generic roberta extraction

    data: list of list of string (the text tokens)
    num_layers: how many to return. If None, the average of -2, -3, -4 is returned
    peft_name: if given, that peft adapter is activated before extraction;
        if None, any loaded adapters are disabled first

    Dispatches to a model-family specific extractor (phobert / bart /
    xlnet) and falls back to the generic bert-style extraction.
    """
    # TODO: can maybe cache this value for a model and save some time
    # TODO: too bad it isn't thread safe, but then again, who does?
    if peft_name is None:
        # NOTE(review): reads a private HF attribute to check whether any
        # peft adapters were loaded — may break on transformers upgrades
        if model._hf_peft_config_loaded:
            model.disable_adapters()
    else:
        model.enable_adapters()
        model.set_adapter(peft_name)

    # phobert has no word_ids() support, so it gets a dedicated path
    if model_name.startswith("vinai/phobert"):
        return extract_phobert_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach)

    if 'bart' in model_name:
        # this should work with "vinai/bartpho-word"
        # not sure this works with any other Bart
        return extract_bart_word_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach)

    # presumably the downstream tokenizer call wants a list — TODO confirm
    if isinstance(data, tuple):
        data = list(data)

    if "xlnet" in model_name:
        return extract_xlnet_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach)

    return extract_base_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach)
+
stanza/stanza/models/common/biaffine.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class PairwiseBilinear(nn.Module):
    ''' A bilinear module that deals with broadcasting for efficient memory usage.
    Input: tensors of sizes (N x L1 x D1) and (N x L2 x D2)
    Output: tensor of size (N x L1 x L2 x O)'''
    def __init__(self, input1_size, input2_size, output_size, bias=True):
        super().__init__()

        self.input1_size = input1_size
        self.input2_size = input2_size
        self.output_size = output_size

        # NOTE(review): torch.Tensor(...) leaves the weight UNINITIALIZED;
        # PairwiseBiaffineScorer below zeroes it after construction, so any
        # standalone user must initialize it themselves
        self.weight = nn.Parameter(torch.Tensor(input1_size, input2_size, output_size))
        # when bias=False this is the integer 0, not a Parameter
        # NOTE(review): self.bias is never applied in forward() — confirm intended
        self.bias = nn.Parameter(torch.Tensor(output_size)) if bias else 0

    def forward(self, input1, input2):
        input1_size = list(input1.size())
        input2_size = list(input2.size())
        # documents the final shape; not otherwise used below
        output_size = [input1_size[0], input1_size[1], input2_size[1], self.output_size]

        # ((N x L1) x D1) * (D1 x (D2 x O)) -> (N x L1) x (D2 x O)
        intermediate = torch.mm(input1.view(-1, input1_size[-1]), self.weight.view(-1, self.input2_size * self.output_size))
        # (N x L2 x D2) -> (N x D2 x L2)
        input2 = input2.transpose(1, 2)
        # (N x (L1 x O) x D2) * (N x D2 x L2) -> (N x (L1 x O) x L2)
        output = intermediate.view(input1_size[0], input1_size[1] * self.output_size, input2_size[2]).bmm(input2)
        # (N x (L1 x O) x L2) -> (N x L1 x L2 x O)
        output = output.view(input1_size[0], input1_size[1], self.output_size, input2_size[1]).transpose(2, 3)

        return output
34
+
35
class BiaffineScorer(nn.Module):
    """Biaffine scorer: a bilinear map over two inputs, each augmented with a constant 1 feature."""
    def __init__(self, input1_size, input2_size, output_size):
        super().__init__()
        # +1 on each side accounts for the appended constant feature
        self.W_bilin = nn.Bilinear(input1_size + 1, input2_size + 1, output_size)

        # start from an all-zero transform
        self.W_bilin.weight.data.zero_()
        self.W_bilin.bias.data.zero_()

    def forward(self, input1, input2):
        # append a column of ones along the last dimension of each input
        ones1 = input1.new_ones(*input1.size()[:-1], 1)
        ones2 = input2.new_ones(*input2.size()[:-1], 1)
        augmented1 = torch.cat([input1, ones1], len(input1.size()) - 1)
        augmented2 = torch.cat([input2, ones2], len(input2.size()) - 1)
        return self.W_bilin(augmented1, augmented2)
47
+
48
class PairwiseBiaffineScorer(nn.Module):
    """Biaffine scoring over all pairs of positions, built on PairwiseBilinear."""
    def __init__(self, input1_size, input2_size, output_size):
        super().__init__()
        # +1 for the constant bias feature appended to each input
        self.W_bilin = PairwiseBilinear(input1_size + 1, input2_size + 1, output_size)

        # PairwiseBilinear leaves its weight uninitialized; zero it here
        self.W_bilin.weight.data.zero_()
        self.W_bilin.bias.data.zero_()

    def forward(self, input1, input2):
        # append a constant 1 feature along the last dimension of each input
        last1 = len(input1.size()) - 1
        last2 = len(input2.size()) - 1
        augmented1 = torch.cat([input1, input1.new_ones(*input1.size()[:-1], 1)], last1)
        augmented2 = torch.cat([input2, input2.new_ones(*input2.size()[:-1], 1)], last2)
        return self.W_bilin(augmented1, augmented2)
60
+
61
class DeepBiaffineScorer(nn.Module):
    """Projects each input through a one-layer MLP, then scores with a (pairwise) biaffine layer."""
    def __init__(self, input1_size, input2_size, hidden_size, output_size, hidden_func=F.relu, dropout=0, pairwise=True):
        super().__init__()
        self.W1 = nn.Linear(input1_size, hidden_size)
        self.W2 = nn.Linear(input2_size, hidden_size)
        self.hidden_func = hidden_func
        # pairwise scores every (position1, position2) pair; otherwise
        # the inputs are scored position-by-position
        scorer_cls = PairwiseBiaffineScorer if pairwise else BiaffineScorer
        self.scorer = scorer_cls(hidden_size, hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input1, input2):
        hidden1 = self.dropout(self.hidden_func(self.W1(input1)))
        hidden2 = self.dropout(self.hidden_func(self.W2(input2)))
        return self.scorer(hidden1, hidden2)
75
+
76
if __name__ == "__main__":
    # smoke test: score two random inputs with a freshly built scorer
    # NOTE(review): the pairwise scorer's forward indexes a third dimension,
    # but these inputs are 2-D — confirm this demo still runs as intended
    x1 = torch.randn(3,4)
    x2 = torch.randn(3,5)
    scorer = DeepBiaffineScorer(4, 5, 6, 7)
    print(scorer(x1, x2))
stanza/stanza/models/common/build_short_name_to_treebank.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import glob
import os

from stanza.models.common.constant import treebank_to_short_name, UnknownLanguageError, treebank_special_cases
from stanza.utils import default_paths

# Autogenerates short_name_to_treebank.py from the UD treebank directories
# found under $UDBASE.  Runs as a flat script when executed.
paths = default_paths.get_default_paths()
udbase = paths["UDBASE"]

directories = glob.glob(udbase + "/UD_*")
directories.sort()

output_name = os.path.join(os.path.split(__file__)[0], "short_name_to_treebank.py")
ud_names = [os.path.split(ud_path)[1] for ud_path in directories]
short_names = []

# check that all languages are known in the language map
# use that language map to come up with a shortname for these treebanks
for directory, ud_name in zip(directories, ud_names):
    try:
        short_names.append(treebank_to_short_name(ud_name))
    except UnknownLanguageError as e:
        raise UnknownLanguageError("Could not find language short name for dataset %s, path %s" % (ud_name, directory)) from e

# Norwegian and Chinese treebanks are ambiguous (NN vs NB; simplified vs
# traditional) and must be resolved by hand in treebank_special_cases
for directory, ud_name in zip(directories, ud_names):
    if ud_name.startswith("UD_Norwegian"):
        if ud_name not in treebank_special_cases:
            raise ValueError("Please figure out if dataset %s is NN or NB, then add to treebank_special_cases" % ud_name)
    if ud_name.startswith("UD_Chinese"):
        if ud_name not in treebank_special_cases:
            # bug fix: this message used to be a copy-paste of the Norwegian
            # "NN or NB" message, which makes no sense for Chinese
            raise ValueError("Please figure out if dataset %s is simplified or traditional Chinese, then add to treebank_special_cases" % ud_name)

# column-align the generated dict entries for readability
max_len = max(len(x) for x in short_names) + 8
line_format = " %-" + str(max_len) + "s '%s',\n"


print("Writing to %s" % output_name)
with open(output_name, "w") as fout:
    fout.write("# This module is autogenerated by build_short_name_to_treebank.py\n")
    fout.write("# Please do not edit\n")
    fout.write("\n")
    fout.write("SHORT_NAMES = {\n")
    for short_name, ud_name in zip(short_names, ud_names):
        fout.write(line_format % ("'" + short_name + "':", ud_name))

        # also register alternate language codes which map to the same treebank:
        # zh <-> zh-hans/zh-hant and nb_bokmaal <-> no_bokmaal
        if short_name.startswith("zh_"):
            short_name = "zh-hans_" + short_name[3:]
            fout.write(line_format % ("'" + short_name + "':", ud_name))
        elif short_name.startswith("zh-hans_") or short_name.startswith("zh-hant_"):
            short_name = "zh_" + short_name[8:]
            fout.write(line_format % ("'" + short_name + "':", ud_name))
        elif short_name == 'nb_bokmaal':
            short_name = 'no_bokmaal'
            fout.write(line_format % ("'" + short_name + "':", ud_name))

    fout.write("}\n")

    fout.write("""

def short_name_to_treebank(short_name):
    return SHORT_NAMES[short_name]


""")

    max_len = max(len(x) for x in ud_names) + 5
    line_format = " %-" + str(max_len) + "s '%s',\n"
    fout.write("CANONICAL_NAMES = {\n")
    for ud_name in ud_names:
        fout.write(line_format % ("'" + ud_name.lower() + "':", ud_name))
    fout.write("}\n")
    fout.write("""

def canonical_treebank_name(ud_name):
    if ud_name in SHORT_NAMES:
        return SHORT_NAMES[ud_name]
    return CANONICAL_NAMES.get(ud_name.lower(), ud_name)
""")
stanza/stanza/models/common/char_model.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Based on
3
+
4
+ @inproceedings{akbik-etal-2018-contextual,
5
+ title = "Contextual String Embeddings for Sequence Labeling",
6
+ author = "Akbik, Alan and
7
+ Blythe, Duncan and
8
+ Vollgraf, Roland",
9
+ booktitle = "Proceedings of the 27th International Conference on Computational Linguistics",
10
+ month = aug,
11
+ year = "2018",
12
+ address = "Santa Fe, New Mexico, USA",
13
+ publisher = "Association for Computational Linguistics",
14
+ url = "https://aclanthology.org/C18-1139",
15
+ pages = "1638--1649",
16
+ }
17
+ """
18
+
19
+ from collections import Counter
20
+ from operator import itemgetter
21
+ import os
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+ from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence, pack_padded_sequence, PackedSequence
26
+
27
+ from stanza.models.common.data import get_long_tensor
28
+ from stanza.models.common.packed_lstm import PackedLSTM
29
+ from stanza.models.common.utils import open_read_text, tensor_unsort, unsort
30
+ from stanza.models.common.dropout import SequenceUnitDropout
31
+ from stanza.models.common.vocab import UNK_ID, CharVocab
32
+
33
class CharacterModel(nn.Module):
    """Character LSTM producing one vector per word.

    Characters of each word are embedded and run through a (possibly
    bidirectional) LSTM; the word representation is either an
    attention-weighted sum over the character states or the final hidden
    states, depending on `attention`.
    """
    def __init__(self, args, vocab, pad=False, bidirectional=False, attention=True):
        super().__init__()
        self.args = args
        self.pad = pad          # if True, forward() returns a padded tensor rather than a PackedSequence
        self.num_dir = 2 if bidirectional else 1
        self.attn = attention

        # char embeddings
        self.char_emb = nn.Embedding(len(vocab['char']), self.args['char_emb_dim'], padding_idx=0)
        if self.attn:
            # one scalar attention score per character state; zero-initialized
            # so attention starts out uniform (sigmoid(0) everywhere)
            self.char_attn = nn.Linear(self.num_dir * self.args['char_hidden_dim'], 1, bias=False)
            self.char_attn.weight.data.zero_()

        # modules
        self.charlstm = PackedLSTM(self.args['char_emb_dim'], self.args['char_hidden_dim'], self.args['char_num_layers'], batch_first=True, \
                dropout=0 if self.args['char_num_layers'] == 1 else args['dropout'], rec_dropout = self.args['char_rec_dropout'], bidirectional=bidirectional)
        # learned initial LSTM states, expanded to the batch size in forward()
        self.charlstm_h_init = nn.Parameter(torch.zeros(self.num_dir * self.args['char_num_layers'], 1, self.args['char_hidden_dim']))
        self.charlstm_c_init = nn.Parameter(torch.zeros(self.num_dir * self.args['char_num_layers'], 1, self.args['char_hidden_dim']))

        self.dropout = nn.Dropout(args['dropout'])

    def forward(self, chars, chars_mask, word_orig_idx, sentlens, wordlens):
        """Return per-word representations grouped by sentence.

        chars: padded char id tensor, one row per word, words sorted by
            decreasing length (required by pack_padded_sequence) —
            TODO confirm against callers
        chars_mask: unused here
        word_orig_idx: permutation used to restore the original word order
        sentlens: number of words per sentence, used to regroup words
        wordlens: length of each word, used for packing
        """
        embs = self.dropout(self.char_emb(chars))
        batch_size = embs.size(0)
        embs = pack_padded_sequence(embs, wordlens, batch_first=True)
        output = self.charlstm(embs, wordlens, hx=(\
                self.charlstm_h_init.expand(self.num_dir * self.args['char_num_layers'], batch_size, self.args['char_hidden_dim']).contiguous(), \
                self.charlstm_c_init.expand(self.num_dir * self.args['char_num_layers'], batch_size, self.args['char_hidden_dim']).contiguous()))

        # apply attention, otherwise take final states
        if self.attn:
            # weighted sum over character states; weights computed directly
            # on the packed data to skip the padding positions
            char_reps = output[0]
            weights = torch.sigmoid(self.char_attn(self.dropout(char_reps.data)))
            char_reps = PackedSequence(char_reps.data * weights, char_reps.batch_sizes)
            char_reps, _ = pad_packed_sequence(char_reps, batch_first=True)
            res = char_reps.sum(1)
        else:
            # concatenate the last layer's final states (both directions if bidirectional)
            h, c = output[1]
            res = h[-2:].transpose(0,1).contiguous().view(batch_size, -1)

        # recover character order and word separation
        res = tensor_unsort(res, word_orig_idx)
        res = pack_sequence(res.split(sentlens))
        if self.pad:
            res = pad_packed_sequence(res, batch_first=True)[0]

        return res
82
def build_charlm_vocab(path, cutoff=0):
    """
    Build a vocab for a CharacterLanguageModel

    Requires a large amount of memory, but only need to build once

    here we need some trick to deal with excessively large files
    for each file we accumulate the counter of characters, and
    at the end we simply pass a list of chars to the vocab builder
    """
    char_counts = Counter()
    if os.path.isdir(path):
        filenames = sorted(os.listdir(path))
    else:
        # a single file: split off the directory so the join below works
        path, single_file = os.path.split(path)
        filenames = [single_file]

    for filename in filenames:
        full_path = os.path.join(path, filename)
        with open_read_text(full_path) as fin:
            for line in fin:
                char_counts.update(line)

    if not char_counts:
        raise ValueError("Training data was empty!")
    # remove infrequent characters from vocab, then build a single sorted
    # "sentence" of all surviving characters for the vocab builder
    frequent_chars = sorted(ch for ch, count in char_counts.items() if count >= cutoff)
    if not frequent_chars:
        raise ValueError("All characters in the training data were less frequent than --cutoff!")
    return CharVocab([frequent_chars]) # skip cutoff argument because this has been dealt with
117
+
118
# Sentinel characters marking the start of a sequence and the end of a word
# for the character language model.  CHARLM_END (a space) doubles as the
# padding character when batching.
CHARLM_START = "\n"
CHARLM_END = " "
121
class CharacterLanguageModel(nn.Module):
    """Unidirectional character-level language model.

    Embeds characters, runs them through an LSTM, and predicts the next
    character with a linear decoder.  A model may be a forward or a
    backward LM (is_forward_lm); downstream models use it as a (usually
    frozen) feature extractor via get_representation /
    build_char_representation / per_char_representation.
    """

    def __init__(self, args, vocab, pad=False, is_forward_lm=True):
        super().__init__()
        self.args = args
        self.vocab = vocab
        self.is_forward_lm = is_forward_lm
        self.pad = pad                # if True, get_representation returns a padded tensor
        self.finetune = True # always finetune unless otherwise specified

        # char embeddings
        self.char_emb = nn.Embedding(len(self.vocab['char']), self.args['char_emb_dim'], padding_idx=None) # we use space as padding, so padding_idx is not necessary

        # modules
        self.charlstm = PackedLSTM(self.args['char_emb_dim'], self.args['char_hidden_dim'], self.args['char_num_layers'], batch_first=True, \
                dropout=0 if self.args['char_num_layers'] == 1 else args['char_dropout'], rec_dropout = self.args['char_rec_dropout'], bidirectional=False)
        # learned initial LSTM states, expanded to the batch size in forward()
        self.charlstm_h_init = nn.Parameter(torch.zeros(self.args['char_num_layers'], 1, self.args['char_hidden_dim']))
        self.charlstm_c_init = nn.Parameter(torch.zeros(self.args['char_num_layers'], 1, self.args['char_hidden_dim']))

        # decoder: projects hidden states to per-character logits
        self.decoder = nn.Linear(self.args['char_hidden_dim'], len(self.vocab['char']))
        self.dropout = nn.Dropout(args['char_dropout'])
        # randomly replaces input characters with UNK during training
        self.char_dropout = SequenceUnitDropout(args.get('char_unit_dropout', 0), UNK_ID)

    def forward(self, chars, charlens, hidden=None):
        """Run the LSTM over a padded batch of character ids.

        chars: padded char id tensor, rows sorted by decreasing length
        charlens: length of each row, for packing
        hidden: optional initial (h, c); defaults to the learned init states

        Returns (padded lstm outputs, final hidden state, per-position logits).
        """
        chars = self.char_dropout(chars)
        embs = self.dropout(self.char_emb(chars))
        batch_size = embs.size(0)
        embs = pack_padded_sequence(embs, charlens, batch_first=True)
        if hidden is None:
            hidden = (self.charlstm_h_init.expand(self.args['char_num_layers'], batch_size, self.args['char_hidden_dim']).contiguous(),
                      self.charlstm_c_init.expand(self.args['char_num_layers'], batch_size, self.args['char_hidden_dim']).contiguous())
        output, hidden = self.charlstm(embs, charlens, hx=hidden)
        output = self.dropout(pad_packed_sequence(output, batch_first=True)[0])
        decoded = self.decoder(output)
        return output, hidden, decoded

    def get_representation(self, chars, charoffsets, charlens, char_orig_idx):
        """Feature extraction (no gradients): LSTM states at the given
        character offsets, restored to the original sentence order."""
        with torch.no_grad():
            output, _, _ = self.forward(chars, charlens)
            res = [output[i, offsets] for i, offsets in enumerate(charoffsets)]
            res = unsort(res, char_orig_idx)
            res = pack_sequence(res)
            if self.pad:
                res = pad_packed_sequence(res, batch_first=True)[0]
        return res

    def per_char_representation(self, words):
        """Return a list of tensors, one per word, with a representation for
        each character of that word (no gradients)."""
        device = next(self.parameters()).device
        vocab = self.char_vocab()

        # sort words by decreasing length for packing, remembering original positions
        all_data = [(vocab.map(word), len(word), idx) for idx, word in enumerate(words)]
        all_data.sort(key=itemgetter(1), reverse=True)
        chars = [x[0] for x in all_data]
        char_lens = [x[1] for x in all_data]
        char_tensor = get_long_tensor(chars, len(chars), pad_id=vocab.unit2id(CHARLM_END)).to(device=device)
        with torch.no_grad():
            output, _, _ = self.forward(char_tensor, char_lens)
            # trim padding, then restore the caller's word order
            output = [x[:y, :] for x, y in zip(output, char_lens)]
            output = unsort(output, [x[2] for x in all_data])
        return output

    def build_char_representation(self, sentences):
        """
        Return values from this charlm for a list of list of words

        input: [[str]]
          K sentences, each of length Ki (can be different for each sentence)
        output: [tensor(Ki x dim)]
          list of tensors, each one with shape Ki by the dim of the character model

        Values are taken from the last character in a word for each word.
        The words are effectively treated as if they are whitespace separated
        (which may actually be somewhat inaccurate for languages such as Chinese or for MWT)
        """
        forward = self.is_forward_lm
        vocab = self.char_vocab()
        device = next(self.parameters()).device

        all_data = []
        for idx, words in enumerate(sentences):
            if not forward:
                # a backward LM reads the sentence reversed, char-reversed words included
                words = [x[::-1] for x in reversed(words)]

            chars = [CHARLM_START]
            offsets = []
            for w in words:
                chars.extend(w)
                chars.append(CHARLM_END)
                # record the index of the separator after each word:
                # its LSTM state summarizes the word just read
                offsets.append(len(chars) - 1)
            if not forward:
                # put offsets back in the original word order
                offsets.reverse()
            chars = vocab.map(chars)
            all_data.append((chars, offsets, len(chars), len(all_data)))

        # sort sentences by decreasing char length for packing
        all_data.sort(key=itemgetter(2), reverse=True)
        chars, char_offsets, char_lens, orig_idx = tuple(zip(*all_data))
        # TODO: can this be faster?
        chars = get_long_tensor(chars, len(all_data), pad_id=vocab.unit2id(CHARLM_END)).to(device=device)

        with torch.no_grad():
            output, _, _ = self.forward(chars, char_lens)
            res = [output[i, offsets] for i, offsets in enumerate(char_offsets)]
            res = unsort(res, orig_idx)

        return res

    def hidden_dim(self):
        # size of the representations produced by this model
        return self.args['char_hidden_dim']

    def char_vocab(self):
        return self.vocab['char']

    def train(self, mode=True):
        """
        Override the default train() function, so that when self.finetune == False, the training mode
        won't be impacted by the parent models' status change.
        """
        if not mode: # eval() is always allowed, regardless of finetune status
            super().train(mode)
        else:
            if self.finetune: # only set to training mode in finetune status
                super().train(mode)

    def full_state(self):
        """Everything needed to reconstruct this model with from_full_state()."""
        state = {
            'vocab': self.vocab['char'].state_dict(),
            'args': self.args,
            'state_dict': self.state_dict(),
            'pad': self.pad,
            'is_forward_lm': self.is_forward_lm
        }
        return state

    def save(self, filename):
        # creates the parent directory if needed
        os.makedirs(os.path.split(filename)[0], exist_ok=True)
        state = self.full_state()
        torch.save(state, filename, _use_new_zipfile_serialization=False)

    @classmethod
    def from_full_state(cls, state, finetune=False):
        """Rebuild a model from the dict produced by full_state()."""
        vocab = {'char': CharVocab.load_state_dict(state['vocab'])}
        model = cls(state['args'], vocab, state['pad'], state['is_forward_lm'])
        model.load_state_dict(state['state_dict'])
        model.eval()
        model.finetune = finetune # set finetune status
        return model

    @classmethod
    def load(cls, filename, finetune=False):
        """Load a saved charlm; set finetune=True to allow further training."""
        state = torch.load(filename, lambda storage, loc: storage, weights_only=True)
        # allow saving just the Model object,
        # and allow for old charlms to still work
        if 'state_dict' in state:
            return cls.from_full_state(state, finetune)
        return cls.from_full_state(state['model'], finetune)
277
+
278
class CharacterLanguageModelWordAdapter(nn.Module):
    """
    Adapts a character model to return embeddings for each character in a word
    """
    def __init__(self, charlms):
        super().__init__()
        self.charlms = charlms

    def forward(self, words):
        # wrap each word with the charlm's sentinel start/end characters
        wrapped = [CHARLM_START + word + CHARLM_END for word in words]
        all_padded = []
        for charlm in self.charlms:
            reps = charlm.per_char_representation(wrapped)
            longest = max(rep.shape[0] for rep in reps)
            # pad each word's per-char representation out to the longest word
            padded = torch.zeros(len(reps), longest, reps[0].shape[1],
                                 dtype=reps[0].dtype, device=reps[0].device)
            for word_idx, rep in enumerate(reps):
                padded[word_idx, :rep.shape[0], :] = rep
            all_padded.append(padded)
        # stack the charlms' outputs along the feature axis
        return torch.cat(all_padded, dim=2)

    def hidden_dim(self):
        return sum(charlm.hidden_dim() for charlm in self.charlms)
300
+
301
class CharacterLanguageModelTrainer():
    """Bundles a CharacterLanguageModel with its optimizer, loss, and
    scheduler, plus enough bookkeeping (epoch, global_step) to resume
    training from a checkpoint."""
    def __init__(self, model, params, optimizer, criterion, scheduler, epoch=1, global_step=0):
        self.model = model
        self.params = params          # trainable parameters handed to the optimizer
        self.optimizer = optimizer
        self.criterion = criterion
        self.scheduler = scheduler
        self.epoch = epoch
        self.global_step = global_step

    def save(self, filename, full=True):
        """Save model state; with full=True also save optimizer/criterion/scheduler
        state so training can be resumed."""
        os.makedirs(os.path.split(filename)[0], exist_ok=True)
        state = {
            'model': self.model.full_state(),
            'epoch': self.epoch,
            'global_step': self.global_step,
        }
        if full and self.optimizer is not None:
            state['optimizer'] = self.optimizer.state_dict()
        if full and self.criterion is not None:
            state['criterion'] = self.criterion.state_dict()
        if full and self.scheduler is not None:
            state['scheduler'] = self.scheduler.state_dict()
        torch.save(state, filename, _use_new_zipfile_serialization=False)

    @classmethod
    def from_new_model(cls, args, vocab):
        """Build a fresh trainer: new model, SGD optimizer, cross-entropy loss,
        and a reduce-on-plateau LR scheduler."""
        model = CharacterLanguageModel(args, vocab, is_forward_lm=True if args['direction'] == 'forward' else False)
        model = model.to(args['device'])
        params = [param for param in model.parameters() if param.requires_grad]
        optimizer = torch.optim.SGD(params, lr=args['lr0'], momentum=args['momentum'], weight_decay=args['weight_decay'])
        criterion = torch.nn.CrossEntropyLoss()
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, factor=args['anneal'], patience=args['patience'])
        return cls(model, params, optimizer, criterion, scheduler)


    @classmethod
    def load(cls, args, filename, finetune=False):
        """
        Load the model along with any other saved state for training

        Note that you MUST set finetune=True if planning to continue training
        Otherwise the only benefit you will get will be a warm GPU
        """
        state = torch.load(filename, lambda storage, loc: storage, weights_only=True)
        model = CharacterLanguageModel.from_full_state(state['model'], finetune)
        model = model.to(args['device'])

        # optimizer/criterion/scheduler state is optional in the checkpoint
        # (save(full=False) omits it); fresh instances are used when missing
        params = [param for param in model.parameters() if param.requires_grad]
        optimizer = torch.optim.SGD(params, lr=args['lr0'], momentum=args['momentum'], weight_decay=args['weight_decay'])
        if 'optimizer' in state: optimizer.load_state_dict(state['optimizer'])

        criterion = torch.nn.CrossEntropyLoss()
        if 'criterion' in state: criterion.load_state_dict(state['criterion'])

        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, factor=args['anneal'], patience=args['patience'])
        if 'scheduler' in state: scheduler.load_state_dict(state['scheduler'])

        epoch = state.get('epoch', 1)
        global_step = state.get('global_step', 0)
        return cls(model, params, optimizer, criterion, scheduler, epoch, global_step)
362
+
stanza/stanza/models/common/chuliu_edmonds.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from Tim's code here: https://github.com/tdozat/Parser-v3/blob/master/scripts/chuliu_edmonds.py
2
+
3
+ import numpy as np
4
+
5
def tarjan(tree):
    """Finds the cycles in a dependency graph

    The input should be a numpy array of integers,
    where in the standard use case,
    tree[i] is the head of node i.

    tree[0] == 0 to represent the root

    so for example, for the English sentence "This is a test",
    the input is

    [0 4 4 4 0]

    "Arthritis makes my hip hurt"

    [0 2 0 4 2 2]

    The return is a list of cycles, where in cycle has True if the
    node at that index is participating in the cycle.
    So, for example, the previous examples both return empty lists,
    whereas an input of
    np.array([0, 3, 1, 2])
    has an output of
    [np.array([False, True, True, True])]
    """
    # standard Tarjan SCC bookkeeping: discovery index and lowest reachable
    # index per node (-1 == not yet visited), plus an explicit node stack
    indices = -np.ones_like(tree)
    lowlinks = -np.ones_like(tree)
    onstack = np.zeros_like(tree, dtype=bool)
    stack = list()
    _index = [0]   # mutable counter shared with the nested functions
    cycles = []
    #-------------------------------------------------------------
    def maybe_pop_cycle(i):
        # an SCC root: pop its component off the stack; only components
        # with more than one node count as cycles
        if lowlinks[i] == indices[i]:
            # There's a cycle!
            cycle = np.zeros_like(indices, dtype=bool)
            while stack[-1] != i:
                j = stack.pop()
                onstack[j] = False
                cycle[j] = True
            stack.pop()
            onstack[i] = False
            cycle[i] = True
            if cycle.sum() > 1:
                cycles.append(cycle)

    def initialize_strong_connect(i):
        # assign the next discovery index to node i and push it
        _index[0] += 1
        index = _index[-1]
        indices[i] = lowlinks[i] = index - 1
        stack.append(i)
        onstack[i] = True

    def strong_connect(i):
        # this ridiculous atrocity is because somehow people keep
        # coming up with graphs which overflow python's call stack
        # so instead we make our own call stack and turn the recursion
        # into a loop
        # see for example
        # https://github.com/stanfordnlp/stanza/issues/962
        # https://github.com/spraakbanken/sparv-pipeline/issues/166
        # in an ideal world this block of code would look like this
        #  initialize_strong_connect(i)
        #  dependents = iter(np.where(np.equal(tree, i))[0])
        #  for j in dependents:
        #      if indices[j] == -1:
        #          strong_connect(j)
        #          lowlinks[i] = min(lowlinks[i], lowlinks[j])
        #      elif onstack[j]:
        #          lowlinks[i] = min(lowlinks[i], indices[j])
        #
        #  maybe_pop_cycle(i)
        # each frame is (node, its partially-consumed dependents iterator,
        # the dependent whose recursive "call" just returned)
        call_stack = [(i, None, None)]
        while len(call_stack) > 0:
            i, dependents_iterator, j = call_stack.pop()
            if dependents_iterator is None: # first time getting here for this i
                initialize_strong_connect(i)
                dependents_iterator = iter(np.where(np.equal(tree, i))[0])
            else: # been here before. j was the dependent we were just considering
                lowlinks[i] = min(lowlinks[i], lowlinks[j])
            for j in dependents_iterator:
                if indices[j] == -1:
                    # have to remember where we were...
                    # put the current iterator & its state on the "call stack"
                    # we will come back to it later
                    call_stack.append((i, dependents_iterator, j))
                    # also, this is what we do next...
                    call_stack.append((j, None, None))
                    # this will break this iterator for now
                    # the next time through, we will continue progressing this iterator
                    break
                elif onstack[j]:
                    lowlinks[i] = min(lowlinks[i], indices[j])
            else:
                # this is an intended use of for/else
                # please stop filing git issues on obscure language features
                # we finished iterating without a break
                # and can finally resolve any possible cycles
                maybe_pop_cycle(i)
            # at this point, there are two cases:
            #
            # we iterated all the way through an iterator (the else in the for/else)
            #   and have resolved any possible cycles. can then proceed to the previous
            #   iterator we were considering (or finish, if there are no others)
            # OR
            # we have hit a break in the iteration over the dependents
            #   for a node
            #   and we need to dig deeper into the graph and resolve the dependent's dependents
            #   before we can continue the previous node
            #
            # either way, we check to see if there are unfinished subtrees
            # when that is finally done, we can return

    #-------------------------------------------------------------
    # run the SCC search from every unvisited node
    for i in range(len(tree)):
        if indices[i] == -1:
            strong_connect(i)
    return cycles
124
+
125
def process_cycle(tree, cycle, scores):
    """
    Build a subproblem with one cycle broken

    tree: current head assignment per node
    cycle: boolean mask over nodes marking the cycle to contract
    scores: scores[i, j] = score of j heading i

    Returns the contracted score matrix (the cycle collapsed into one extra
    metanode in the last row/column) plus the index bookkeeping needed by
    expand_contracted_tree to undo the contraction.
    """
    # indices of cycle in original tree; (c) in t
    cycle_locs = np.where(cycle)[0]
    # heads of cycle in original tree; (c) in t
    cycle_subtree = tree[cycle]
    # scores of cycle in original tree; (c) in R
    cycle_scores = scores[cycle, cycle_subtree]
    # total score of cycle; () in R
    cycle_score = cycle_scores.sum()

    # locations of noncycle; (t) in [0,1]
    noncycle = np.logical_not(cycle)
    # indices of noncycle in original tree; (n) in t
    noncycle_locs = np.where(noncycle)[0]
    #print(cycle_locs, noncycle_locs)

    # scores of cycle's potential heads; (c x n) - (c) + () -> (n x c) in R
    metanode_head_scores = scores[cycle][:,noncycle] - cycle_scores[:,None] + cycle_score
    # scores of cycle's potential dependents; (n x c) in R
    metanode_dep_scores = scores[noncycle][:,cycle]
    # best noncycle head for each cycle dependent; (n) in c
    metanode_heads = np.argmax(metanode_head_scores, axis=0)
    # best cycle head for each noncycle dependent; (n) in c
    metanode_deps = np.argmax(metanode_dep_scores, axis=1)

    # scores of noncycle graph; (n x n) in R
    subscores = scores[noncycle][:,noncycle]
    # pad to contracted graph; (n+1 x n+1) in R
    subscores = np.pad(subscores, ( (0,1) , (0,1) ), 'constant')
    # set the contracted graph scores of cycle's potential heads; (c x n)[:, (n) in n] in R -> (n) in R
    subscores[-1, :-1] = metanode_head_scores[metanode_heads, np.arange(len(noncycle_locs))]
    # set the contracted graph scores of cycle's potential dependents; (n x c)[(n) in n] in R-> (n) in R
    subscores[:-1,-1] = metanode_dep_scores[np.arange(len(noncycle_locs)), metanode_deps]
    return subscores, cycle_locs, noncycle_locs, metanode_heads, metanode_deps
162
+
163
+
164
def expand_contracted_tree(tree, contracted_tree, cycle_locs, noncycle_locs, metanode_heads, metanode_deps):
    """
    Given a partially solved tree with a cycle and a solved subproblem
    for the cycle, build a larger solution without the cycle

    The bookkeeping arguments are exactly those returned by process_cycle;
    contracted_tree is the MST solution of the contracted graph, whose last
    node is the metanode standing in for the whole cycle.
    """
    # head of the cycle; () in n
    #print(contracted_tree)
    cycle_head = contracted_tree[-1]
    # fixed tree: (n) in n+1
    contracted_tree = contracted_tree[:-1]
    # initialize new tree; (t) in 0
    new_tree = -np.ones_like(tree)
    #print(0, new_tree)
    # fixed tree with no heads coming from the cycle: (n) in [0,1]
    contracted_subtree = contracted_tree < len(contracted_tree)
    # add the nodes to the new tree (t)[(n)[(n) in [0,1]] in t] in t = (n)[(n)[(n) in [0,1]] in n] in t
    new_tree[noncycle_locs[contracted_subtree]] = noncycle_locs[contracted_tree[contracted_subtree]]
    #print(1, new_tree)
    # fixed tree with heads coming from the cycle: (n) in [0,1]
    contracted_subtree = np.logical_not(contracted_subtree)
    # add the nodes to the tree (t)[(n)[(n) in [0,1]] in t] in t = (c)[(n)[(n) in [0,1]] in c] in t
    new_tree[noncycle_locs[contracted_subtree]] = cycle_locs[metanode_deps[contracted_subtree]]
    #print(2, new_tree)
    # add the old cycle to the tree; (t)[(c) in t] in t = (t)[(c) in t] in t
    new_tree[cycle_locs] = tree[cycle_locs]
    #print(3, new_tree)
    # root of the cycle; (n)[() in n] in c = () in c
    cycle_root = metanode_heads[cycle_head]
    # add the root of the cycle to the new tree; (t)[(c)[() in c] in t] = (c)[() in c]
    # (this is the edge that breaks the cycle)
    new_tree[cycle_locs[cycle_root]] = noncycle_locs[cycle_head]
    #print(4, new_tree)
    return new_tree
196
+
197
def prepare_scores(scores):
    """Modify `scores` in place: forbid self-loops and pin node 0 as the root.

    All of row 0 (candidate heads for the root) is disallowed except the
    0->0 self-arc, which anchors the root with score 0.
    """
    neg_inf = -float('inf')
    np.fill_diagonal(scores, neg_inf)   # no node may head itself
    scores[0, :] = neg_inf              # root takes no head...
    scores[0, 0] = 0                    # ...except itself, at zero cost
205
+
206
def chuliu_edmonds(scores):
    """Chu-Liu/Edmonds maximum spanning arborescence.

    scores[i, j] is the score of choosing j as the head of i; scores is
    modified in place by prepare_scores.  Returns an array `tree` where
    tree[i] is the chosen head of node i and node 0 heads itself (root).

    Repeatedly takes the greedy argmax tree, contracts one cycle at a time
    into a metanode (process_cycle), and unrolls the contractions in
    reverse order (expand_contracted_tree).
    """
    subtree_stack = []

    prepare_scores(scores)
    tree = np.argmax(scores, axis=1)
    cycles = tarjan(tree)

    #print(scores)
    #print(cycles)

    # recursive implementation:
    #if cycles:
    #    # t = len(tree); c = len(cycle); n = len(noncycle)
    #    # cycles.pop(): locations of cycle; (t) in [0,1]
    #    subscores, cycle_locs, noncycle_locs, metanode_heads, metanode_deps = process_cycle(tree, cycles.pop(), scores)
    #    # MST with contraction; (n+1) in n+1
    #    contracted_tree = chuliu_edmonds(subscores)
    #    tree = expand_contracted_tree(tree, contracted_tree, cycle_locs, noncycle_locs, metanode_heads, metanode_deps)
    # unfortunately, while the recursion is simpler to understand, it can get too deep for python's stack limit
    # so instead we make our own recursion, with blackjack and (you know how it goes)

    # contraction phase: each iteration contracts one cycle and pushes the
    # state needed to undo it later
    while cycles:
        # t = len(tree); c = len(cycle); n = len(noncycle)
        # cycles.pop(): locations of cycle; (t) in [0,1]
        subscores, cycle_locs, noncycle_locs, metanode_heads, metanode_deps = process_cycle(tree, cycles.pop(), scores)
        subtree_stack.append((tree, cycles, scores, subscores, cycle_locs, noncycle_locs, metanode_heads, metanode_deps))

        scores = subscores
        prepare_scores(scores)
        tree = np.argmax(scores, axis=1)
        cycles = tarjan(tree)

    # expansion phase: unroll the contractions in LIFO order
    while len(subtree_stack) > 0:
        contracted_tree = tree
        (tree, cycles, scores, subscores, cycle_locs, noncycle_locs, metanode_heads, metanode_deps) = subtree_stack.pop()
        tree = expand_contracted_tree(tree, contracted_tree, cycle_locs, noncycle_locs, metanode_heads, metanode_deps)

    return tree
244
+
245
+ #===============================================================
246
def chuliu_edmonds_one_root(scores):
    """Chu-Liu/Edmonds constrained to exactly one root.

    scores[i, j] is the score of j heading i.  If the unconstrained MST
    already has a single child of node 0, it is returned directly;
    otherwise each candidate root is tried in turn with all other edges
    into 0 forbidden, and the best-scoring resulting tree is returned.

    Raises ValueError if no candidate root yields a tree free of -inf
    edges (including the degenerate case of no candidate roots at all).
    """

    scores = scores.astype(np.float64)
    tree = chuliu_edmonds(scores)
    roots_to_try = np.where(np.equal(tree[1:], 0))[0]+1
    if len(roots_to_try) == 1:
        return tree

    #-------------------------------------------------------------
    def set_root(scores, root):
        """Copy of scores where `root` is the only node allowed under 0;
        also returns the original score of the root edge."""
        root_score = scores[root,0]
        scores = np.array(scores)
        scores[1:,0] = -float('inf')
        scores[root] = -float('inf')
        scores[root,0] = 0
        return scores, root_score
    #-------------------------------------------------------------

    best_score, best_tree = -np.inf, None
    for root in roots_to_try:
        _scores, root_score = set_root(scores, root)
        _tree = chuliu_edmonds(_scores)
        # a tree using any -inf edge is invalid; score it as -inf
        tree_probs = _scores[np.arange(len(_scores)), _tree]
        tree_score = tree_probs.sum() + root_score if (tree_probs > -np.inf).all() else -np.inf
        if tree_score > best_score:
            best_score = tree_score
            best_tree = _tree
    if best_tree is None:
        # previously this was a bare except around an assert which wrote a
        # debug.log file (and could hit unbound locals when roots_to_try was
        # empty); raise with the diagnostics instead
        raise ValueError("Could not find a valid single-root tree.\n"
                         "tree: {}\nscores: {}\nroots_to_try: {}".format(tree, scores, roots_to_try))
    return best_tree
stanza/stanza/models/common/constant.py ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Global constants.
3
+
4
+ These language codes mirror UD language codes when possible
5
+ """
6
+
7
+ import re
8
+
9
class UnknownLanguageError(ValueError):
    """Raised by lang_to_langcode when a language name or code cannot be resolved."""
    pass
11
+
12
+ # tuples in a list so we can assert that the langcodes are all unique
13
+ # When applicable, we favor the UD decision over any other possible
14
+ # language code or language name
15
+ # ISO 639-1 is out of date, but many of the UD datasets are labeled
16
+ # using the two letter abbreviations, so we add those for non-UD
17
+ # languages in the hopes that we've guessed right if those languages
18
+ # are eventually processed
19
+ lcode2lang_raw = [
20
+ ("abq", "Abaza"),
21
+ ("ab", "Abkhazian"),
22
+ ("aa", "Afar"),
23
+ ("af", "Afrikaans"),
24
+ ("ak", "Akan"),
25
+ ("akk", "Akkadian"),
26
+ ("aqz", "Akuntsu"),
27
+ ("sq", "Albanian"),
28
+ ("am", "Amharic"),
29
+ ("grc", "Ancient_Greek"),
30
+ ("hbo", "Ancient_Hebrew"),
31
+ ("apu", "Apurina"),
32
+ ("ar", "Arabic"),
33
+ ("arz", "Egyptian_Arabic"),
34
+ ("an", "Aragonese"),
35
+ ("hy", "Armenian"),
36
+ ("as", "Assamese"),
37
+ ("aii", "Assyrian"),
38
+ ("ast", "Asturian"),
39
+ ("av", "Avaric"),
40
+ ("ae", "Avestan"),
41
+ ("ay", "Aymara"),
42
+ ("az", "Azerbaijani"),
43
+ ("bm", "Bambara"),
44
+ ("ba", "Bashkir"),
45
+ ("eu", "Basque"),
46
+ ("bar", "Bavarian"),
47
+ ("bej", "Beja"),
48
+ ("be", "Belarusian"),
49
+ ("bn", "Bengali"),
50
+ ("bho", "Bhojpuri"),
51
+ ("bpy", "Bishnupriya_Manipuri"),
52
+ ("bi", "Bislama"),
53
+ ("bor", "Bororo"),
54
+ ("bs", "Bosnian"),
55
+ ("br", "Breton"),
56
+ ("bg", "Bulgarian"),
57
+ ("bxr", "Buryat"),
58
+ ("yue", "Cantonese"),
59
+ ("cpg", "Cappadocian"),
60
+ ("ca", "Catalan"),
61
+ ("ceb", "Cebuano"),
62
+ ("km", "Central_Khmer"),
63
+ ("ch", "Chamorro"),
64
+ ("ce", "Chechen"),
65
+ ("ny", "Chichewa"),
66
+ ("ckt", "Chukchi"),
67
+ ("cv", "Chuvash"),
68
+ ("xcl", "Classical_Armenian"),
69
+ ("lzh", "Classical_Chinese"),
70
+ ("cop", "Coptic"),
71
+ ("kw", "Cornish"),
72
+ ("co", "Corsican"),
73
+ ("cr", "Cree"),
74
+ ("hr", "Croatian"),
75
+ ("cs", "Czech"),
76
+ ("da", "Danish"),
77
+ ("dar", "Dargwa"),
78
+ ("dv", "Dhivehi"),
79
+ ("nl", "Dutch"),
80
+ ("dz", "Dzongkha"),
81
+ ("egy", "Egyptian"),
82
+ ("en", "English"),
83
+ ("myv", "Erzya"),
84
+ ("eo", "Esperanto"),
85
+ ("et", "Estonian"),
86
+ ("ee", "Ewe"),
87
+ ("ext", "Extremaduran"),
88
+ ("fo", "Faroese"),
89
+ ("fj", "Fijian"),
90
+ ("fi", "Finnish"),
91
+ ("fon", "Fon"),
92
+ ("fr", "French"),
93
+ ("qfn", "Frisian_Dutch"),
94
+ ("ff", "Fulah"),
95
+ ("gl", "Galician"),
96
+ ("lg", "Ganda"),
97
+ ("ka", "Georgian"),
98
+ ("de", "German"),
99
+ ("aln", "Gheg"),
100
+ ("bbj", "Ghomálá'"),
101
+ ("got", "Gothic"),
102
+ ("el", "Greek"),
103
+ ("kl", "Greenlandic"),
104
+ ("gub", "Guajajara"),
105
+ ("gn", "Guarani"),
106
+ ("gu", "Gujarati"),
107
+ ("gwi", "Gwichin"),
108
+ ("ht", "Haitian"),
109
+ ("ha", "Hausa"),
110
+ ("he", "Hebrew"),
111
+ ("hz", "Herero"),
112
+ ("azz", "Highland_Puebla_Nahuatl"),
113
+ ("hil", "Hiligaynon"),
114
+ ("hi", "Hindi"),
115
+ ("qhe", "Hindi_English"),
116
+ ("ho", "Hiri_Motu"),
117
+ ("hit", "Hittite"),
118
+ ("hu", "Hungarian"),
119
+ ("is", "Icelandic"),
120
+ ("io", "Ido"),
121
+ ("ig", "Igbo"),
122
+ ("ilo", "Ilocano"),
123
+ ("arc", "Imperial_Aramaic"),
124
+ ("id", "Indonesian"),
125
+ ("iu", "Inuktitut"),
126
+ ("ik", "Inupiaq"),
127
+ ("ga", "Irish"),
128
+ ("it", "Italian"),
129
+ ("ja", "Japanese"),
130
+ ("jv", "Javanese"),
131
+ ("urb", "Kaapor"),
132
+ ("kab", "Kabyle"),
133
+ ("xnr", "Kangri"),
134
+ ("kn", "Kannada"),
135
+ ("kr", "Kanuri"),
136
+ ("pam", "Kapampangan"),
137
+ ("krl", "Karelian"),
138
+ ("arr", "Karo"),
139
+ ("ks", "Kashmiri"),
140
+ ("kk", "Kazakh"),
141
+ ("kfm", "Khunsari"),
142
+ ("quc", "Kiche"),
143
+ ("cgg", "Kiga"),
144
+ ("ki", "Kikuyu"),
145
+ ("rw", "Kinyarwanda"),
146
+ ("ky", "Kyrgyz"),
147
+ ("kv", "Komi"),
148
+ ("koi", "Komi_Permyak"),
149
+ ("kpv", "Komi_Zyrian"),
150
+ ("kg", "Kongo"),
151
+ ("ko", "Korean"),
152
+ ("ku", "Kurdish"),
153
+ ("kmr", "Kurmanji"),
154
+ ("kj", "Kwanyama"),
155
+ ("lad", "Ladino"),
156
+ ("lo", "Lao"),
157
+ ("ltg", "Latgalian"),
158
+ ("la", "Latin"),
159
+ ("lv", "Latvian"),
160
+ ("lij", "Ligurian"),
161
+ ("li", "Limburgish"),
162
+ ("ln", "Lingala"),
163
+ ("lt", "Lithuanian"),
164
+ ("liv", "Livonian"),
165
+ ("olo", "Livvi"),
166
+ ("nds", "Low_Saxon"),
167
+ ("lu", "Luba_Katanga"),
168
+ ("lb", "Luxembourgish"),
169
+ ("mk", "Macedonian"),
170
+ ("jaa", "Madi"),
171
+ ("mag", "Magahi"),
172
+ ("qaf", "Maghrebi_Arabic_French"),
173
+ ("mai", "Maithili"),
174
+ ("mpu", "Makurap"),
175
+ ("mg", "Malagasy"),
176
+ ("ms", "Malay"),
177
+ ("ml", "Malayalam"),
178
+ ("mt", "Maltese"),
179
+ ("mjl", "Mandyali"),
180
+ ("gv", "Manx"),
181
+ ("mi", "Maori"),
182
+ ("mr", "Marathi"),
183
+ ("mh", "Marshallese"),
184
+ ("mzn", "Mazandarani"),
185
+ ("gun", "Mbya_Guarani"),
186
+ ("enm", "Middle_English"),
187
+ ("frm", "Middle_French"),
188
+ ("min", "Minangkabau"),
189
+ ("xmf", "Mingrelian"),
190
+ ("mwl", "Mirandese"),
191
+ ("mdf", "Moksha"),
192
+ ("mn", "Mongolian"),
193
+ ("mos", "Mossi"),
194
+ ("myu", "Munduruku"),
195
+ ("my", "Myanmar"),
196
+ ("nqo", "N'Ko"),
197
+ ("nah", "Nahuatl"),
198
+ ("pcm", "Naija"),
199
+ ("na", "Nauru"),
200
+ ("nv", "Navajo"),
201
+ ("nyq", "Nayini"),
202
+ ("ng", "Ndonga"),
203
+ ("nap", "Neapolitan"),
204
+ ("ne", "Nepali"),
205
+ ("new", "Newar"),
206
+ ("yrl", "Nheengatu"),
207
+ ("nyn", "Nkore"),
208
+ ("frr", "North_Frisian"),
209
+ ("nd", "North_Ndebele"),
210
+ ("sme", "North_Sami"),
211
+ ("nso", "Northern_Sotho"),
212
+ ("gya", "Northwest_Gbaya"),
213
+ ("nb", "Norwegian_Bokmaal"),
214
+ ("nn", "Norwegian_Nynorsk"),
215
+ ("ii", "Nuosu"),
216
+ ("oc", "Occitan"),
217
+ ("or", "Odia"),
218
+ ("oj", "Ojibwa"),
219
+ ("cu", "Old_Church_Slavonic"),
220
+ ("orv", "Old_East_Slavic"),
221
+ ("ang", "Old_English"),
222
+ ("fro", "Old_French"),
223
+ ("sga", "Old_Irish"),
224
+ ("ojp", "Old_Japanese"),
225
+ ("otk", "Old_Turkish"),
226
+ ("om", "Oromo"),
227
+ ("os", "Ossetian"),
228
+ ("ota", "Ottoman_Turkish"),
229
+ ("pi", "Pali"),
230
+ ("ps", "Pashto"),
231
+ ("pad", "Paumari"),
232
+ ("fa", "Persian"),
233
+ ("pay", "Pesh"),
234
+ ("xpg", "Phrygian"),
235
+ ("pbv", "Pnar"),
236
+ ("pl", "Polish"),
237
+ ("qpm", "Pomak"),
238
+ ("pnt", "Pontic"),
239
+ ("pt", "Portuguese"),
240
+ ("pra", "Prakrit"),
241
+ ("pa", "Punjabi"),
242
+ ("qu", "Quechua"),
243
+ ("rhg", "Rohingya"),
244
+ ("ro", "Romanian"),
245
+ ("rm", "Romansh"),
246
+ ("rn", "Rundi"),
247
+ ("ru", "Russian"),
248
+ ("sm", "Samoan"),
249
+ ("sg", "Sango"),
250
+ ("sa", "Sanskrit"),
251
+ ("skr", "Saraiki"),
252
+ ("sc", "Sardinian"),
253
+ ("sco", "Scots"),
254
+ ("gd", "Scottish_Gaelic"),
255
+ ("sr", "Serbian"),
256
+ ("sn", "Shona"),
257
+ ("zh-hans", "Simplified_Chinese"),
258
+ ("sd", "Sindhi"),
259
+ ("si", "Sinhala"),
260
+ ("sms", "Skolt_Sami"),
261
+ ("sk", "Slovak"),
262
+ ("sl", "Slovenian"),
263
+ ("soj", "Soi"),
264
+ ("so", "Somali"),
265
+ ("ckb", "Sorani"),
266
+ ("ajp", "South_Levantine_Arabic"),
267
+ ("nr", "South_Ndebele"),
268
+ ("st", "Southern_Sotho"),
269
+ ("es", "Spanish"),
270
+ ("ssp", "Spanish_Sign_Language"),
271
+ ("su", "Sundanese"),
272
+ ("sw", "Swahili"),
273
+ ("ss", "Swati"),
274
+ ("sv", "Swedish"),
275
+ ("swl", "Swedish_Sign_Language"),
276
+ ("gsw", "Swiss_German"),
277
+ ("syr", "Syriac"),
278
+ ("tl", "Tagalog"),
279
+ ("ty", "Tahitian"),
280
+ ("tg", "Tajik"),
281
+ ("ta", "Tamil"),
282
+ ("tt", "Tatar"),
283
+ ("eme", "Teko"),
284
+ ("te", "Telugu"),
285
+ ("qte", "Telugu_English"),
286
+ ("th", "Thai"),
287
+ ("bo", "Tibetan"),
288
+ ("ti", "Tigrinya"),
289
+ ("to", "Tonga"),
290
+ ("zh-hant", "Traditional_Chinese"),
291
+ ("ts", "Tsonga"),
292
+ ("tn", "Tswana"),
293
+ ("tpn", "Tupinamba"),
294
+ ("tr", "Turkish"),
295
+ ("qtd", "Turkish_German"),
296
+ ("tk", "Turkmen"),
297
+ ("tw", "Twi"),
298
+ ("uk", "Ukrainian"),
299
+ ("xum", "Umbrian"),
300
+ ("hsb", "Upper_Sorbian"),
301
+ ("ur", "Urdu"),
302
+ ("ug", "Uyghur"),
303
+ ("uz", "Uzbek"),
304
+ ("ve", "Venda"),
305
+ ("vep", "Veps"),
306
+ ("vi", "Vietnamese"),
307
+ ("vo", "Volapük"),
308
+ ("wa", "Walloon"),
309
+ ("war", "Waray"),
310
+ ("wbp", "Warlpiri"),
311
+ ("cy", "Welsh"),
312
+ ("hyw", "Western_Armenian"),
313
+ ("fy", "Western_Frisian"),
314
+ ("nhi", "Western_Sierra_Puebla_Nahuatl"),
315
+ ("wo", "Wolof"),
316
+ ("xav", "Xavante"),
317
+ ("xh", "Xhosa"),
318
+ ("sjo", "Xibe"),
319
+ ("sah", "Yakut"),
320
+ ("yi", "Yiddish"),
321
+ ("yo", "Yoruba"),
322
+ ("ess", "Yupik"),
323
+ ("say", "Zaar"),
324
+ ("zza", "Zazaki"),
325
+ ("zea", "Zeelandic"),
326
+ ("za", "Zhuang"),
327
+ ("zu", "Zulu"),
328
+ ]
329
+
330
# build the dictionary, checking for duplicate language codes
lcode2lang = {}
for code, language in lcode2lang_raw:
    assert code not in lcode2lang
    lcode2lang[code] = language

# invert the dictionary, checking for possible duplicate language names
lang2lcode = {}
for code, language in lcode2lang_raw:
    assert language not in lang2lcode
    lang2lcode[language] = code

# check that nothing got clobbered
# (redundant with the per-entry asserts above, but cheap insurance)
assert len(lcode2lang_raw) == len(lcode2lang)
assert len(lcode2lang_raw) == len(lang2lcode)
345
+
346
# some of the two letter langcodes get used elsewhere as three letters
# for example, Wolof is abbreviated "wo" in UD, but "wol" in Masakhane NER
two_to_three_letters_raw = (
    ("bm", "bam"),
    ("ee", "ewe"),
    ("ha", "hau"),
    ("ig", "ibo"),
    ("rw", "kin"),
    ("lg", "lug"),
    ("ny", "nya"),
    ("sn", "sna"),
    ("sw", "swa"),
    ("tn", "tsn"),
    ("tw", "twi"),
    ("wo", "wol"),
    ("xh", "xho"),
    ("yo", "yor"),
    ("zu", "zul"),

    # this is a weird case where a 2 letter code was available,
    # but UD used the 3 letter code instead
    ("se", "sme"),
)

# register each alias: whichever form is already known is canonical,
# and the other form is added as an alias for the same language
for two, three in two_to_three_letters_raw:
    if two in lcode2lang:
        # two-letter form is canonical; the three-letter form must be new
        # (removed the tautological `assert two in lcode2lang` that used
        # to sit inside this branch)
        assert three not in lcode2lang
        assert three not in lang2lcode
        lang2lcode[three] = two
        lcode2lang[three] = lcode2lang[two]
    elif three in lcode2lang:
        # three-letter form is canonical; the two-letter form must be new
        assert two not in lcode2lang
        assert two not in lang2lcode
        lang2lcode[two] = three
        lcode2lang[two] = lcode2lang[three]
    else:
        raise AssertionError("Found a proposed alias %s -> %s when neither code was already known" % (two, three))

two_to_three_letters = {
    two: three for two, three in two_to_three_letters_raw
}

three_to_two_letters = {
    three: two for two, three in two_to_three_letters_raw
}

# neither side of the raw alias list may contain duplicates
assert len(two_to_three_letters) == len(two_to_three_letters_raw)
assert len(three_to_two_letters) == len(two_to_three_letters_raw)
396
+
397
# additional useful code to language mapping
# added after dict invert to avoid conflict
lcode2lang['nb'] = 'Norwegian' # Norwegian Bokmaal mapped to default norwegian
lcode2lang['no'] = 'Norwegian'
lcode2lang['zh'] = 'Simplified_Chinese'

# alternate names for languages whose codes are already registered above;
# these only extend lang2lcode (name -> code), never lcode2lang
extra_lang_to_lcodes = [
    ("ab", "Abkhaz"),
    ("gsw", "Alemannic"),
    ("my", "Burmese"),
    ("ckb", "Central_Kurdish"),
    ("ny", "Chewa"),
    ("zh", "Chinese"),
    ("za", "Chuang"),
    ("dv", "Divehi"),
    ("eme", "Emerillon"),
    ("lij", "Genoese"),
    ("ga", "Gaelic"),
    ("ne", "Gorkhali"),
    ("ht", "Haitian_Creole"),
    ("ilo", "Ilokano"),
    ("nr", "isiNdebele"),
    ("xh", "isiXhosa"),
    ("zu", "isiZulu"),
    ("jaa", "Jamamadí"),
    ("kab", "Kabylian"),
    ("kl", "Kalaallisut"),
    ("km", "Khmer"),
    ("ky", "Kirghiz"),
    ("lb", "Letzeburgesch"),
    ("lg", "Luganda"),
    ("jaa", "Madí"),
    ("dv", "Maldivian"),
    ("mjl", "Mandeali"),
    ("skr", "Multani"),
    ("nb", "Norwegian"),
    ("ny", "Nyanja"),
    ("sga", "Old_Gaelic"),
    ("or", "Oriya"),
    ("arr", "Ramarama"),
    ("sah", "Sakha"),
    ("nso", "Sepedi"),
    ("tn", "Setswana"),
    ("ii", "Sichuan_Yi"),
    ("si", "Sinhalese"),
    ("ss", "Siswati"),
    ("soj", "Sohi"),
    ("st", "Sesotho"),
    ("ve", "Tshivenda"),
    ("ts", "Xitsonga"),
    ("fy", "West_Frisian"),
    ("zza", "Zaza"),
    ]

for code, language in extra_lang_to_lcodes:
    # each alternate name must be new, and its code must already exist
    assert language not in lang2lcode
    assert code in lcode2lang
    lang2lcode[language] = code

# treebank names changed from Old Russian to Old East Slavic in 2.8
lang2lcode['Old_Russian'] = 'orv'
458
+
459
# build a lowercase map from language to langcode
# (lets lang_to_langcode match names case-insensitively)
langlower2lcode = {}
for k in lang2lcode:
    langlower2lcode[k.lower()] = lang2lcode[k]

# treebanks whose short names cannot be derived mechanically:
# Chinese treebanks distinguish simplified vs traditional script,
# and Norwegian treebanks distinguish Bokmaal vs Nynorsk
treebank_special_cases = {
    "UD_Chinese-Beginner": "zh-hans_beginner",
    "UD_Chinese-GSDSimp": "zh-hans_gsdsimp",
    "UD_Chinese-GSD": "zh-hant_gsd",
    "UD_Chinese-HK": "zh-hant_hk",
    "UD_Chinese-CFL": "zh-hans_cfl",
    "UD_Chinese-PatentChar": "zh-hans_patentchar",
    "UD_Chinese-PUD": "zh-hant_pud",
    "UD_Norwegian-Bokmaal": "nb_bokmaal",
    "UD_Norwegian-Nynorsk": "nn_nynorsk",
    "UD_Norwegian-NynorskLIA": "nn_nynorsklia",
}

# matches a name already in short form: langcode_corpus
SHORTNAME_RE = re.compile("^[a-z-]+_[a-z0-9-_]+$")
478
+
479
def langcode_to_lang(lcode):
    """Return the canonical language name for a langcode.

    Tries the code as given, then lowercased; if neither is known,
    returns the code unchanged.
    """
    for candidate in (lcode, lcode.lower()):
        if candidate in lcode2lang:
            return lcode2lang[candidate]
    return lcode
486
+
487
def pretty_langcode_to_lang(lcode):
    """Return a human-friendly language name for a langcode.

    Underscores become spaces, and the two Chinese script variants get
    their conventional parenthesized form.
    """
    pretty = langcode_to_lang(lcode).replace("_", " ")
    if pretty == 'Simplified Chinese':
        return 'Chinese (Simplified)'
    if pretty == 'Traditional Chinese':
        return 'Chinese (Traditional)'
    return pretty
495
+
496
def lang_to_langcode(lang):
    """Resolve a language name or langcode to the canonical langcode.

    Lookup order: exact name, lowercased name, exact code, lowercased code.
    Raises UnknownLanguageError if nothing matches.
    """
    if lang in lang2lcode:
        return lang2lcode[lang]
    lowered = lang.lower()
    if lowered in langlower2lcode:
        return langlower2lcode[lowered]
    if lang in lcode2lang:
        return lang
    if lowered in lcode2lang:
        return lowered
    raise UnknownLanguageError("Unable to find language code for %s" % lang)
508
+
509
# langcodes for right-to-left languages, including many stanza does not support
RIGHT_TO_LEFT = set(["ar", "arc", "az", "ckb", "dv", "ff", "he", "ku", "mzn", "nqo", "ps", "fa", "rhg", "sd", "syr", "ur"])

def is_right_to_left(lang):
    """
    Covers all the RtL languages we support, as well as many we don't.

    If a language is left out, please let us know!
    """
    # accepts either a language name or a langcode;
    # raises UnknownLanguageError for completely unknown input
    lcode = lang_to_langcode(lang)
    return lcode in RIGHT_TO_LEFT
519
+
520
def treebank_to_short_name(treebank):
    """ Convert treebank name to short code, eg UD_English-EWT -> en_ewt """
    # a few treebanks (Chinese script variants, Norwegian Bokmaal/Nynorsk)
    # cannot be converted mechanically
    if treebank in treebank_special_cases:
        return treebank_special_cases.get(treebank)
    # already in short form: just canonicalize the langcode part
    if SHORTNAME_RE.match(treebank):
        lang, corpus = treebank.split("_", 1)
        lang = lang_to_langcode(lang)
        return lang + "_" + corpus

    if treebank.startswith('UD_'):
        treebank = treebank[3:]
    # special case starting with zh in case the input is an already-converted ZH treebank
    if treebank.startswith("zh-hans") or treebank.startswith("zh-hant"):
        # split after the 7 character script prefix, skipping the separator
        splits = (treebank[:len("zh-hans")], treebank[len("zh-hans")+1:])
    else:
        # UD style is Language-Corpus; fall back to Language_Corpus
        splits = treebank.split('-')
        if len(splits) == 1:
            splits = treebank.split("_", 1)
    assert len(splits) == 2, "Unable to process %s" % treebank
    lang, corpus = splits

    lcode = lang_to_langcode(lang)

    short = "{}_{}".format(lcode, corpus.lower())
    return short
545
+
546
def treebank_to_langid(treebank):
    """Convert a treebank name to its langid (the part of the short name before the first underscore)."""
    return treebank_to_short_name(treebank).split("_", 1)[0]
550
+
stanza/stanza/models/common/count_ner_coverage.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from stanza.models.common import pretrain
2
+ import argparse
3
+
4
def parse_args():
    """Parse the command line: NER files to score plus the pretrain to score them against."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument('ners', type=str, nargs='*', help='Which treebanks to run on')
    argparser.add_argument('--pretrain', type=str, default="/home/john/stanza_resources/hi/pretrain/hdtb.pt", help='Which pretrain to use')
    argparser.set_defaults(ners=["/home/john/stanza/data/ner/hi_fire2013.train.csv",
                                 "/home/john/stanza/data/ner/hi_fire2013.dev.csv"])
    return argparser.parse_args()
12
+
13
+
14
def read_ner(filename):
    """Read a tab-separated NER file and return the tokens whose tag is not 'O'.

    Each non-blank line is expected to be "token<TAB>tag"; blank lines
    (sentence separators) are skipped.

    Fixes a leaked file handle: the file is now opened with a context
    manager instead of open(...).readlines(), and each line is split once.
    """
    words = []
    with open(filename, encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            pieces = line.split("\t")
            if pieces[1] == 'O':
                continue
            words.append(pieces[0])
    return words
24
+
25
def count_coverage(pretrain, words):
    """Return the fraction of `words` found in the pretrain's vocabulary.

    Fixes a ZeroDivisionError on an empty word list by returning 0.0,
    and replaces the manual counter with sum().
    """
    if not words:
        return 0.0
    found = sum(1 for w in words if w in pretrain.vocab)
    return found / len(words)
31
+
32
# command-line driver: load the pretrain once, then report the vocab
# coverage of each NER dataset given on the command line
args = parse_args()
pt = pretrain.Pretrain(args.pretrain)
for dataset in args.ners:
    words = read_ner(dataset)
    print(dataset)
    print(count_coverage(pt, words))
    print()
stanza/stanza/models/common/count_pretrain_coverage.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """A simple script to count the fraction of words in a UD dataset which are in a particular pretrain.
2
+
3
+ For example, this script shows that the word2vec Armenian vectors,
4
+ truncated at 250K words, have 75% coverage of the Western Armenian
5
+ dataset, whereas the vectors available here have 88% coverage:
6
+
7
+ https://github.com/ispras-texterra/word-embeddings-eval-hy
8
+ """
9
+
10
+ from stanza.models.common import pretrain
11
+ from stanza.utils.conll import CoNLL
12
+
13
+ import argparse
14
+
15
def parse_args():
    """Parse the command line: conllu treebanks to score plus the pretrain to score them against."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument('treebanks', type=str, nargs='*', help='Which treebanks to run on')
    argparser.add_argument('--pretrain', type=str, default="/home/john/extern_data/wordvec/glove/armenian.pt", help='Which pretrain to use')
    argparser.set_defaults(treebanks=["/home/john/extern_data/ud2/ud-treebanks-v2.7/UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-train.conllu",
                                      "/home/john/extern_data/ud2/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu"])
    return argparser.parse_args()
23
+
24
+
25
# command-line driver: load the pretrain once, then report what fraction
# of each treebank's words appear in the pretrain vocabulary
args = parse_args()
pt = pretrain.Pretrain(args.pretrain)
pt.load()
print("Pretrain stats: {} vectors, {} dim".format(len(pt.vocab), pt.emb[0].shape[0]))

for treebank in args.treebanks:
    print(treebank)
    found = 0
    total = 0
    doc = CoNLL.conll2doc(treebank)
    # count exact-match coverage over every word token in the treebank
    for sentence in doc.sentences:
        for word in sentence.words:
            total = total + 1
            if word.text in pt.vocab:
                found = found + 1

    print (found / total)
stanza/stanza/models/common/crf.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CRF loss and viterbi decoding.
3
+ """
4
+
5
+ import math
6
+ from numbers import Number
7
+ import numpy as np
8
+ import torch
9
+ from torch import nn
10
+ import torch.nn.init as init
11
+
12
class CRFLoss(nn.Module):
    """
    Calculate log-space crf loss, given unary potentials, a transition matrix
    and gold tag sequences.

    The masks tensors mark PADDING positions with True/1: masked unary and
    binary scores are zeroed, and masks.eq(0) counts the real tokens.
    """
    def __init__(self, num_tag, batch_average=True):
        super().__init__()
        # tag-to-tag transition scores, learned starting from all zeros
        self._transitions = nn.Parameter(torch.zeros(num_tag, num_tag))
        self._batch_average = batch_average # if not batch average, average on all tokens

    def forward(self, inputs, masks, tag_indices):
        """
        inputs: batch_size x seq_len x num_tags
        masks: batch_size x seq_len
        tag_indices: batch_size x seq_len

        @return:
        loss: CRF negative log likelihood on all instances.
        transitions: the transition matrix
        """
        # TODO: handle <start> and <end> tags
        input_bs, input_sl, input_nc = inputs.size()
        # numerator: emission + transition scores of the gold tag path
        unary_scores = self.crf_unary_score(inputs, masks, tag_indices, input_bs, input_sl, input_nc)
        binary_scores = self.crf_binary_score(inputs, masks, tag_indices, input_bs, input_sl, input_nc)
        # denominator: log partition over all possible tag paths
        log_norm = self.crf_log_norm(inputs, masks, tag_indices)
        log_likelihood = unary_scores + binary_scores - log_norm # batch_size
        loss = torch.sum(-log_likelihood)
        if self._batch_average:
            loss = loss / input_bs
        else:
            # average over real (non-padding) tokens instead of sentences
            total = masks.eq(0).sum()
            loss = loss / (total + 1e-8)
        return loss, self._transitions

    def crf_unary_score(self, inputs, masks, tag_indices, input_bs, input_sl, input_nc):
        """
        Sum of the emission scores of the gold tags for each sentence.

        @return:
        unary_scores: batch_size
        """
        # flatten to (bs, sl*nc) so position t with tag k lives at t*nc + k
        flat_inputs = inputs.view(input_bs, -1)
        flat_tag_indices = tag_indices + torch.arange(input_sl, device=tag_indices.device).long().unsqueeze(0) * input_nc
        unary_scores = torch.gather(flat_inputs, 1, flat_tag_indices).view(input_bs, -1)
        # padding positions contribute nothing
        unary_scores.masked_fill_(masks, 0)
        return unary_scores.sum(dim=1)

    def crf_binary_score(self, inputs, masks, tag_indices, input_bs, input_sl, input_nc):
        """
        Sum of the transition scores between consecutive gold tags.

        @return:
        binary_scores: batch_size
        """
        # get number of transitions
        nt = tag_indices.size(-1) - 1
        start_indices = tag_indices[:, :nt]
        end_indices = tag_indices[:, 1:]
        # flat matrices: transition (i -> j) lives at index i*nc + j
        flat_transition_indices = start_indices * input_nc + end_indices
        flat_transition_indices = flat_transition_indices.view(-1)
        flat_transition_matrix = self._transitions.view(-1)
        binary_scores = torch.gather(flat_transition_matrix, 0, flat_transition_indices)\
                .view(input_bs, -1)
        # a transition whose target position is padding does not count
        score_masks = masks[:, 1:]
        binary_scores.masked_fill_(score_masks, 0)
        return binary_scores.sum(dim=1)

    def crf_log_norm(self, inputs, masks, tag_indices):
        """
        Calculate the CRF partition in log space for each instance, following:
        http://www.cs.columbia.edu/~mcollins/fb.pdf
        @return:
        log_norm: batch_size
        """
        start_inputs = inputs[:,0,:] # bs x nc
        rest_inputs = inputs[:,1:,:]
        # TODO: technically we need to pay attention to the initial
        # value being masked. Currently we do compensate for the
        # entire row being masked at the end of the operation
        rest_masks = masks[:,1:]
        alphas = start_inputs # bs x nc
        trans = self._transitions.unsqueeze(0) # 1 x nc x nc
        # accumulate alphas in log space (forward algorithm)
        for i in range(rest_inputs.size(1)):
            transition_scores = alphas.unsqueeze(2) + trans # bs x nc x nc
            new_alphas = rest_inputs[:,i,:] + log_sum_exp(transition_scores, dim=1)
            m = rest_masks[:,i].unsqueeze(1).expand_as(new_alphas) # bs x nc, 1 for padding idx
            # apply masks: keep the previous alphas at padding positions
            new_alphas.masked_scatter_(m, alphas.masked_select(m))
            alphas = new_alphas
        log_norm = log_sum_exp(alphas, dim=1)

        # if any row was entirely masked, we just turn its log denominator to 0
        # eg, the empty summation for the denominator will be 1, and its log will be 0
        all_masked = torch.all(masks, dim=1)
        log_norm = log_norm * torch.logical_not(all_masked)
        return log_norm
106
+
107
def viterbi_decode(scores, transition_params):
    """
    Decode the best tag sequence with the viterbi algorithm.

    scores: seq_len x num_tags (numpy array) of per-position tag scores
    transition_params: num_tags x num_tags (numpy array)

    @return:
    viterbi: list of tag ids forming the highest scoring path
    viterbi_score: score of that path
    """
    seq_len = scores.shape[0]
    trellis = np.zeros_like(scores)
    backpointers = np.zeros_like(scores, dtype=np.int32)
    trellis[0] = scores[0]

    # forward pass: best score for each tag at each position,
    # remembering which previous tag produced it
    for t in range(1, seq_len):
        candidates = trellis[t - 1][:, None] + transition_params
        backpointers[t] = np.argmax(candidates, 0)
        trellis[t] = scores[t] + np.max(candidates, 0)

    # backward pass: follow the backpointers from the best final tag
    path = [np.argmax(trellis[-1])]
    for bp in backpointers[:0:-1]:
        path.append(bp[path[-1]])
    path.reverse()
    return path, np.max(trellis[-1])
131
+
132
def log_sum_exp(value, dim=None, keepdim=False):
    """Numerically stable implementation of the operation

    value.exp().sum(dim, keepdim).log()

    Shifts by the max before exponentiating to avoid overflow.
    With dim=None, reduces over the whole tensor.
    """
    if dim is None:
        m = torch.max(value)
        total = torch.sum(torch.exp(value - m))
        # a plain Python number comes back for 0-dim inputs
        if isinstance(total, Number):
            return m + math.log(total)
        return m + torch.log(total)
    m, _ = torch.max(value, dim=dim, keepdim=True)
    shifted = value - m
    if keepdim is False:
        m = m.squeeze(dim)
    return m + torch.log(torch.sum(torch.exp(shifted), dim=dim, keepdim=keepdim))
stanza/stanza/models/common/data.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for data transformations.
3
+ """
4
+
5
+ import logging
6
+ import random
7
+
8
+ import torch
9
+
10
+ import stanza.models.common.seq2seq_constant as constant
11
+ from stanza.models.common.doc import HEAD, ID, UPOS
12
+
13
+ logger = logging.getLogger('stanza')
14
+
15
def map_to_ids(tokens, vocab):
    """Map each token to its id in `vocab`, substituting UNK_ID for unknown tokens."""
    return [vocab[tok] if tok in vocab else constant.UNK_ID for tok in tokens]
18
+
19
def get_long_tensor(tokens_list, batch_size, pad_id=constant.PAD_ID):
    """ Convert (list of )+ tokens to a padded LongTensor. """
    # measure the max length at every nesting level by repeatedly
    # flattening one level; supports arbitrarily nested lists of ints
    sizes = []
    x = tokens_list
    while isinstance(x[0], list):
        sizes.append(max(len(y) for y in x))
        x = [z for y in x for z in y]
    # TODO: pass in a device parameter and put it directly on the relevant device?
    # that might be faster than creating it and then moving it
    tokens = torch.LongTensor(batch_size, *sizes).fill_(pad_id)
    # NOTE(review): for nested inputs this assumes each element s is
    # rectangular enough for torch.LongTensor(s) to succeed — confirm
    for i, s in enumerate(tokens_list):
        tokens[i, :len(s)] = torch.LongTensor(s)
    return tokens
32
+
33
def get_float_tensor(features_list, batch_size):
    """Pad per-token feature lists into a (batch_size, max_len, feat_dim) FloatTensor.

    Unused positions stay zero.  Returns None when no features are provided.
    """
    if features_list is None or features_list[0] is None:
        return None
    max_len = max(len(feats) for feats in features_list)
    feat_dim = len(features_list[0][0])
    out = torch.zeros(batch_size, max_len, feat_dim, dtype=torch.float)
    for row, feats in enumerate(features_list):
        out[row, :len(feats), :] = torch.FloatTensor(feats)
    return out
42
+
43
def sort_all(batch, lens):
    """Sort every field in `batch` by descending `lens`.

    Returns (sorted fields, original indices), where the indices give each
    sorted row's position in the original order.
    """
    if batch == [[]]:
        return [[]], []
    # prepend lens and the original positions, sort rows together,
    # then transpose back into columns
    augmented = [lens, range(len(lens))] + list(batch)
    resorted = [list(col) for col in zip(*sorted(zip(*augmented), reverse=True))]
    return resorted[2:], resorted[1]
50
+
51
def get_augment_ratio(train_data, should_augment_predicate, can_augment_predicate, desired_ratio=0.1, max_ratio=0.5):
    """
    Returns X so that if you randomly select X * N sentences, you get 10%

    The ratio assumes the final dataset stays at size N rather than
    growing to N + X * N.

    should_augment_predicate: returns True if the sentence has some
      feature we may occasionally want to change (eg depparse sentences
      ending in punct)
    can_augment_predicate: True only when removing that feature is safe
      (eg the final punct heads no other word); must be at least as
      restrictive as should_augment_predicate, otherwise AssertionError
      is raised
    """
    n_data = len(train_data)
    n_should = sum(should_augment_predicate(sent) for sent in train_data)
    n_can = sum(can_augment_predicate(sent) for sent in train_data)
    n_bad = sum(can_augment_predicate(sent) and not should_augment_predicate(sent)
                for sent in train_data)
    if n_bad > 0:
        raise AssertionError("can_augment_predicate allowed sentences not allowed by should_augment_predicate")

    if n_can == 0:
        logger.warning("Found no sentences which matched can_augment_predicate {}".format(can_augment_predicate))
        return 0.0
    # sentences which already lack the feature count toward the target,
    # so only the shortfall needs to be generated
    needed = n_data * desired_ratio - (n_data - n_should)
    if needed < 0:
        return 0.0
    return min(needed / n_can, max_ratio)
86
+
87
+
88
def should_augment_nopunct_predicate(sentence):
    """Return True if the sentence (a list of conllu word dicts) ends with a PUNCT word."""
    last_word = sentence[-1]
    return last_word.get(UPOS, None) == 'PUNCT'
91
+
92
def can_augment_nopunct_predicate(sentence):
    """
    Check that the sentence ends with PUNCT and also doesn't have any words which depend on the last word
    """
    last_word = sentence[-1]
    if last_word.get(UPOS, None) != 'PUNCT':
        return False
    # don't cut off MWT (a multiword token has a multi-part ID)
    if len(last_word[ID]) > 1:
        return False
    # the final punct must not be the head of any other word
    if any(len(word[ID]) == 1 and word[HEAD] == last_word[ID][0] for word in sentence):
        return False
    return True
105
+
106
def augment_punct(train_data, augment_ratio,
                  should_augment_predicate=None,
                  can_augment_predicate=None,
                  keep_original_sentences=True):

    """
    Adds extra training data to compensate for some models having all sentences end with PUNCT

    Some of the models (for example, UD_Hebrew-HTB) have the flaw that
    all of the training sentences end with PUNCT.  The model therefore
    learns to finish every sentence with punctuation, even if it is
    given a sentence with non-punct at the end.

    One simple way to fix this is to train on some fraction of training data with punct.

    Params:
    train_data: list of list of dicts, eg a conll doc
    augment_ratio: the fraction to augment. if None, a best guess is made to get to 10%

    should_augment_predicate: a function which returns T/F if a sentence already ends with not PUNCT
    can_augment_predicate: a function which returns T/F if it makes sense to remove the last PUNCT
    (both default to the nopunct predicates in this module when None)

    Fixes two defects: the non-augmented branch used to append the local
    `new_sentence` (wrong object, and NameError if never yet bound)
    instead of the original sentence, and sentences failing
    can_augment_predicate were dropped entirely even when
    keep_original_sentences was True.

    TODO: do this dynamically, as part of the DataLoader or elsewhere?
    One complication is the data comes back from the DataLoader as
    tensors & indices, so it is much more complicated to manipulate
    """
    # bind the module-level default predicates lazily; passing None keeps
    # the old behavior while staying easy to override in tests
    if should_augment_predicate is None:
        should_augment_predicate = should_augment_nopunct_predicate
    if can_augment_predicate is None:
        can_augment_predicate = can_augment_nopunct_predicate

    if len(train_data) == 0:
        return []

    if augment_ratio is None:
        augment_ratio = get_augment_ratio(train_data, should_augment_predicate, can_augment_predicate)

    if augment_ratio <= 0:
        if keep_original_sentences:
            return list(train_data)
        else:
            return []

    new_data = []
    for sentence in train_data:
        if can_augment_predicate(sentence):
            if random.random() < augment_ratio and len(sentence) > 1:
                # shallow copy without the final punct word
                # todo: could deep copy the words, or not copy at all
                new_data.append(list(sentence[:-1]))
            elif keep_original_sentences:
                new_data.append(sentence)
        elif keep_original_sentences:
            # sentences we cannot augment are kept as-is
            new_data.append(sentence)

    return new_data
stanza/stanza/models/common/doc.py ADDED
@@ -0,0 +1,1741 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Basic data structures
3
+ """
4
+
5
+ import io
6
+ from itertools import repeat
7
+ import re
8
+ import json
9
+ import pickle
10
+ import warnings
11
+
12
+ from enum import Enum
13
+
14
+ import networkx as nx
15
+
16
+ from stanza.models.common.stanza_object import StanzaObject
17
+ from stanza.models.common.utils import misc_to_space_after, space_after_to_misc, misc_to_space_before, space_before_to_misc
18
+ from stanza.models.ner.utils import decode_from_bioes
19
+ from stanza.models.constituency import tree_reader
20
+ from stanza.models.coref.coref_chain import CorefMention, CorefChain, CorefAttachment
21
+
22
class MWTProcessingType(Enum):
    """How set_mwt_expansions should treat a single token."""
    FLATTEN = 0 # flatten the current token into one ID instead of MWT
    PROCESS = 1 # process the current token as an MWT and expand it as such
    SKIP = 2 # do nothing on this token, simply increment IDs
26
+
27
# Recognizers for multi-word tokens: either an id of the form "3-4",
# or a MISC column carrying the MWT=Yes annotation
multi_word_token_id = re.compile(r"([0-9]+)-([0-9]+)")
multi_word_token_misc = re.compile(r".*MWT=Yes.*")

# Canonical key names used in the CoNLL-U style dicts that back
# Document / Sentence / Token / Word objects
MEXP = 'manual_expansion'
ID = 'id'
TEXT = 'text'
LEMMA = 'lemma'
UPOS = 'upos'
XPOS = 'xpos'
FEATS = 'feats'
HEAD = 'head'
DEPREL = 'deprel'
DEPS = 'deps'
MISC = 'misc'
NER = 'ner'
MULTI_NER = 'multi_ner' # will represent tags from multiple NER models
START_CHAR = 'start_char'
END_CHAR = 'end_char'
TYPE = 'type'
SENTIMENT = 'sentiment'
CONSTITUENCY = 'constituency'
COREF_CHAINS = 'coref_chains'

# field indices when converting the document to conll
FIELD_TO_IDX = {ID: 0, TEXT: 1, LEMMA: 2, UPOS: 3, XPOS: 4, FEATS: 5, HEAD: 6, DEPREL: 7, DEPS: 8, MISC: 9}
FIELD_NUM = len(FIELD_TO_IDX)
53
+
54
class DocJSONEncoder(json.JSONEncoder):
    """JSON encoder that also understands the coref objects attached to a Document."""

    def default(self, obj):
        # CorefMention is a plain record; its attribute dict is already JSON-ready
        if isinstance(obj, CorefMention):
            return obj.__dict__
        # CorefAttachment knows how to serialize itself
        if isinstance(obj, CorefAttachment):
            return obj.to_json()
        # anything else is handed to the base class, which raises TypeError
        return super().default(obj)
61
+
62
class Document(StanzaObject):
    """ A document class that stores attributes of a document and carries a list of sentences.
    """

    def __init__(self, sentences, text=None, comments=None, empty_sentences=None):
        """ Construct a document given a list of sentences in the form of lists of CoNLL-U dicts.

        Args:
            sentences: a list of sentences, which being a list of token entry, in the form of a CoNLL-U dict.
            text: the raw text of the document.
            comments: A list of list of strings to use as comments on the sentences, either None or the same length as sentences
            empty_sentences: optional per-sentence lists of CoNLL-U "empty word" entries
        """
        self._sentences = []
        self._lang = None
        self._text = text
        self._num_tokens = 0
        self._num_words = 0

        self._process_sentences(sentences, comments, empty_sentences)
        self._ents = []
        self._coref = []
        # character offsets are only meaningful when the raw text is known
        if self._text is not None:
            self.build_ents()
            self.mark_whitespace()

    def mark_whitespace(self):
        """Record the exact whitespace surrounding each token, using the raw text and char offsets."""
        for sentence in self._sentences:
            # TODO: pairwise, once we move to minimum 3.10
            for prev_token, next_token in zip(sentence.tokens[:-1], sentence.tokens[1:]):
                whitespace = self._text[prev_token.end_char:next_token.start_char]
                prev_token.spaces_after = whitespace
        # whitespace spanning the gap between consecutive sentences
        for prev_sentence, next_sentence in zip(self._sentences[:-1], self._sentences[1:]):
            prev_token = prev_sentence.tokens[-1]
            next_token = next_sentence.tokens[0]
            whitespace = self._text[prev_token.end_char:next_token.start_char]
            prev_token.spaces_after = whitespace
        # trailing whitespace after the last token of the document
        if len(self._sentences) > 0 and len(self._sentences[-1].tokens) > 0:
            final_token = self._sentences[-1].tokens[-1]
            whitespace = self._text[final_token.end_char:]
            final_token.spaces_after = whitespace
        # leading whitespace before the first token of the document
        if len(self._sentences) > 0 and len(self._sentences[0].tokens) > 0:
            first_token = self._sentences[0].tokens[0]
            whitespace = self._text[:first_token.start_char]
            first_token.spaces_before = whitespace


    @property
    def lang(self):
        """ Access the language of this document """
        return self._lang

    @lang.setter
    def lang(self, value):
        """ Set the language of this document """
        self._lang = value

    @property
    def text(self):
        """ Access the raw text for this document. """
        return self._text

    @text.setter
    def text(self, value):
        """ Set the raw text for this document. """
        self._text = value

    @property
    def sentences(self):
        """ Access the list of sentences for this document. """
        return self._sentences

    @sentences.setter
    def sentences(self, value):
        """ Set the list of sentences for this document. """
        self._sentences = value

    @property
    def num_tokens(self):
        """ Access the number of tokens for this document. """
        return self._num_tokens

    @num_tokens.setter
    def num_tokens(self, value):
        """ Set the number of tokens for this document. """
        self._num_tokens = value

    @property
    def num_words(self):
        """ Access the number of words for this document. """
        return self._num_words

    @num_words.setter
    def num_words(self, value):
        """ Set the number of words for this document. """
        self._num_words = value

    @property
    def ents(self):
        """ Access the list of entities in this document. """
        return self._ents

    @ents.setter
    def ents(self, value):
        """ Set the list of entities in this document. """
        self._ents = value

    @property
    def entities(self):
        """ Access the list of entities. This is just an alias of `ents`. """
        return self._ents

    @entities.setter
    def entities(self, value):
        """ Set the list of entities in this document. """
        self._ents = value

    def _process_sentences(self, sentences, comments=None, empty_sentences=None):
        """Build Sentence objects from the raw CoNLL-U dicts, attach text snippets and comments."""
        self.sentences = []
        if empty_sentences is None:
            empty_sentences = repeat([])
        for sent_idx, (tokens, empty_words) in enumerate(zip(sentences, empty_sentences)):
            try:
                sentence = Sentence(tokens, doc=self, empty_words=empty_words)
            except IndexError as e:
                raise IndexError("Could not process document at sentence %d" % sent_idx) from e
            except ValueError as e:
                tokens = ["|%s|" % t for t in tokens]
                tokens = ", ".join(tokens)
                raise ValueError("Could not process document at sentence %d\n  Raw tokens: %s" % (sent_idx, tokens)) from e
            self.sentences.append(sentence)
            begin_idx, end_idx = sentence.tokens[0].start_char, sentence.tokens[-1].end_char
            if all((self.text is not None, begin_idx is not None, end_idx is not None)): sentence.text = self.text[begin_idx: end_idx]
            sentence.index = sent_idx

        self._count_words()

        # Add a #text comment to each sentence in a doc if it doesn't already exist
        if not comments:
            comments = [[] for x in self.sentences]
        else:
            comments = [list(x) for x in comments]
        for sentence, sentence_comments in zip(self.sentences, comments):
            # the space after text can occur in treebanks such as the Naija-NSC treebank,
            # which extensively uses `# text_en =` and `# text_ortho`
            if sentence.text and not any(comment.startswith("# text ") or comment.startswith("#text ") or comment.startswith("# text=") or comment.startswith("#text=") for comment in sentence_comments):
                # split/join to handle weird whitespace, especially newlines
                sentence_comments.append("# text = " + ' '.join(sentence.text.split()))
            elif not sentence.text:
                # recover the sentence text from a pre-existing #text comment if possible
                for comment in sentence_comments:
                    if comment.startswith("# text ") or comment.startswith("#text ") or comment.startswith("# text=") or comment.startswith("#text="):
                        sentence.text = comment.split("=", 1)[-1].strip()
                        break

            for comment in sentence_comments:
                sentence.add_comment(comment)

            # look for sent_id in the comments
            # if it's there, overwrite the sent_idx id from above
            for comment in sentence_comments:
                if comment.startswith("# sent_id"):
                    sentence.sent_id = comment.split("=", 1)[-1].strip()
                    break
            else:
                # no sent_id found.  add a comment with our enumerated id
                # setting the sent_id on the sentence will automatically add the comment
                sentence.sent_id = str(sentence.index)

    def _count_words(self):
        """
        Count the number of tokens and words
        """
        self.num_tokens = sum([len(sentence.tokens) for sentence in self.sentences])
        self.num_words = sum([len(sentence.words) for sentence in self.sentences])

    def get(self, fields, as_sentences=False, from_token=False):
        """ Get fields from a list of field names.
        If only one field name (string or singleton list) is provided,
        return a list of that field; if more than one, return a list of list.
        Note that all returned fields are after multi-word expansion.

        Args:
            fields: name of the fields as a list or a single string
            as_sentences: if True, return the fields as a list of sentences; otherwise as a whole list
            from_token: if True, get the fields from Token; otherwise from Word

        Returns:
            All requested fields.
        """
        if isinstance(fields, str):
            fields = [fields]
        assert isinstance(fields, list), "Must provide field names as a list."
        assert len(fields) >= 1, "Must have at least one field."

        results = []
        for sentence in self.sentences:
            cursent = []
            # decide word or token
            if from_token:
                units = sentence.tokens
            else:
                units = sentence.words
            for unit in units:
                if len(fields) == 1:
                    cursent += [getattr(unit, fields[0])]
                else:
                    cursent += [[getattr(unit, field) for field in fields]]

            # decide whether append the results as a sentence or a whole list
            if as_sentences:
                results.append(cursent)
            else:
                results += cursent
        return results

    def set(self, fields, contents, to_token=False, to_sentence=False):
        """Set fields based on contents. If only one field (string or
        singleton list) is provided, then a list of content will be
        expected; otherwise a list of list of contents will be expected.

        Args:
            fields: name of the fields as a list or a single string
            contents: field values to set; total length should be equal to number of words/tokens
            to_token: if True, set field values to tokens; otherwise to words
            to_sentence: if True, set field values on the sentences themselves

        """
        if isinstance(fields, str):
            fields = [fields]
        assert isinstance(fields, (tuple, list)), "Must provide field names as a list."
        assert isinstance(contents, (tuple, list)), "Must provide contents as a list (one item per line)."
        assert len(fields) >= 1, "Must have at least one field."

        assert not to_sentence or not to_token, "Both to_token and to_sentence set to True, which is very confusing"

        if to_sentence:
            assert len(self.sentences) == len(contents), \
                "Contents must have the same length as the sentences"
            for sentence, content in zip(self.sentences, contents):
                if len(fields) == 1:
                    setattr(sentence, fields[0], content)
                else:
                    for field, piece in zip(fields, content):
                        setattr(sentence, field, piece)
        else:
            assert (to_token and self.num_tokens == len(contents)) or self.num_words == len(contents), \
                "Contents must have the same length as the original file."

            cidx = 0
            for sentence in self.sentences:
                # decide word or token
                if to_token:
                    units = sentence.tokens
                else:
                    units = sentence.words
                for unit in units:
                    if len(fields) == 1:
                        setattr(unit, fields[0], contents[cidx])
                    else:
                        for field, content in zip(fields, contents[cidx]):
                            setattr(unit, field, content)
                    cidx += 1

    def set_mwt_expansions(self, expansions,
                           fake_dependencies=False,
                           process_manual_expanded=None):
        """ Extend the multi-word tokens annotated by tokenizer.  A list of list of expansions
        will be expected for each multi-word token.  Use `process_manual_expanded` to limit
        processing for tokens marked manually expanded:

        There are two types of MWT expansions: those with `misc`: `MWT=True`, and those with
        `manual_expansion`: True.  The latter of which means that it is an expansion which the
        user manually specified through a postprocessor; the former means that it is a MWT
        which the detector picked out, but needs to be automatically expanded.

        process_manual_expanded = None - default; doesn't process manually expanded tokens
                                = True - process only manually expanded tokens (with `manual_expansion`: True)
                                = False - process only tokens explicitly tagged as MWT (`misc`: `MWT=True`)
        """

        idx_e = 0
        for sentence in self.sentences:
            idx_w = 0
            for token in sentence.tokens:
                idx_w += 1
                is_multi = (len(token.id) > 1)
                is_mwt = (multi_word_token_misc.match(token.misc) if token.misc is not None else None)
                is_manual_expansion = token.manual_expansion

                # by default a token is flattened to a single id unless one
                # of the cases below upgrades it to PROCESS or SKIP
                perform_mwt_processing = MWTProcessingType.FLATTEN

                if (process_manual_expanded and is_manual_expansion):
                    perform_mwt_processing = MWTProcessingType.PROCESS
                elif (process_manual_expanded==False and is_mwt):
                    perform_mwt_processing = MWTProcessingType.PROCESS
                elif (process_manual_expanded==False and is_manual_expansion):
                    perform_mwt_processing = MWTProcessingType.SKIP
                elif (process_manual_expanded==None and (is_mwt or is_multi)):
                    perform_mwt_processing = MWTProcessingType.PROCESS

                if perform_mwt_processing == MWTProcessingType.FLATTEN:
                    for word in token.words:
                        token.id = (idx_w, )
                        # delete dependency information
                        word.deps = None
                        word.head, word.deprel = None, None
                        word.id = idx_w
                elif perform_mwt_processing == MWTProcessingType.PROCESS:
                    expanded = [x for x in expansions[idx_e].split(' ') if len(x) > 0]
                    # in the event the MWT annotator only split the
                    # Token into a single Word, we preserve its text
                    # otherwise the Token's text is different from its
                    # only Word's text
                    if len(expanded) == 1:
                        expanded = [token.text]
                    idx_e += 1
                    idx_w_end = idx_w + len(expanded) - 1
                    if token.misc:  # None can happen when using a prebuilt doc
                        # drop the MWT=Yes annotation now that the expansion has been applied
                        token.misc = None if token.misc == 'MWT=Yes' else '|'.join([x for x in token.misc.split('|') if x != 'MWT=Yes'])
                    token.id = (idx_w, idx_w_end) if len(expanded) > 1 else (idx_w,)
                    token.words = []
                    for i, e_word in enumerate(expanded):
                        token.words.append(Word(sentence, {ID: idx_w + i, TEXT: e_word}))
                    idx_w = idx_w_end
                elif perform_mwt_processing == MWTProcessingType.SKIP:
                    # shift the ids past the expansions seen so far without touching the words
                    token.id = tuple(orig_id + idx_e for orig_id in token.id)
                    for i in token.words:
                        i.id += idx_e
                    idx_w = token.id[-1]
                # NOTE(review): clearing the flag here applies to every token,
                # not just the SKIP branch — confirm this matches upstream intent
                token.manual_expansion = None

            # reprocess the words using the new tokens
            sentence.words = []
            for token in sentence.tokens:
                token.sent = sentence
                for word in token.words:
                    word.sent = sentence
                    word.parent = token
                    sentence.words.append(word)
                # distribute character offsets to the words when the expansion
                # text exactly covers the token text
                if token.start_char is not None and token.end_char is not None and "".join(word.text for word in token.words) == token.text:
                    start_char = token.start_char
                    for word in token.words:
                        end_char = start_char + len(word.text)
                        word.start_char = start_char
                        word.end_char = end_char
                        start_char = end_char

            if fake_dependencies:
                sentence.build_fake_dependencies()
            else:
                sentence.rebuild_dependencies()

        self._count_words() # update number of words & tokens
        assert idx_e == len(expansions), "{} {}".format(idx_e, len(expansions))
        return

    def get_mwt_expansions(self, evaluation=False):
        """ Get the multi-word tokens.  For training (evaluation=False), return a list of
        [multi-word token text, expanded text] pairs; with evaluation=True, return only
        the multi-word token surface strings.

        Tokens marked as manual expansions are skipped unless their MISC explicitly
        carries MWT=Yes.
        """
        expansions = []
        for sentence in self.sentences:
            for token in sentence.tokens:
                is_multi = (len(token.id) > 1)
                is_mwt = multi_word_token_misc.match(token.misc) if token.misc is not None else None
                is_manual_expansion = token.manual_expansion
                if (is_multi and not is_manual_expansion) or is_mwt:
                    src = token.text
                    dst = ' '.join([word.text for word in token.words])
                    expansions.append([src, dst])
        if evaluation: expansions = [e[0] for e in expansions]
        return expansions

    def build_ents(self):
        """ Build the list of entities by iterating over all words. Return all entities as a list. """
        self.ents = []
        for s in self.sentences:
            s_ents = s.build_ents()
            self.ents += s_ents
        return self.ents

    def sort_features(self):
        """ Sort the features on all the words... useful for prototype treebanks, for example """
        for sentence in self.sentences:
            for word in sentence.words:
                if not word.feats:
                    continue
                pieces = word.feats.split("|")
                pieces = sorted(pieces)
                word.feats = "|".join(pieces)

    def iter_words(self):
        """ An iterator that returns all of the words in this Document. """
        for s in self.sentences:
            yield from s.words

    def iter_tokens(self):
        """ An iterator that returns all of the tokens in this Document. """
        for s in self.sentences:
            yield from s.tokens

    def sentence_comments(self):
        """ Returns a list of list of comments for the sentences """
        return [[comment for comment in sentence.comments] for sentence in self.sentences]

    @property
    def coref(self):
        """
        Access the coref lists of the document
        """
        return self._coref

    @coref.setter
    def coref(self, chains):
        """ Set the document's coref lists """
        self._coref = chains
        self._attach_coref_mentions(chains)

    def _attach_coref_mentions(self, chains):
        """Annotate each word with the CorefAttachments of the mentions covering it."""
        for sentence in self.sentences:
            for word in sentence.words:
                word.coref_chains = []

        for chain in chains:
            for mention_idx, mention in enumerate(chain.mentions):
                sentence = self.sentences[mention.sentence]
                for word_idx in range(mention.start_word, mention.end_word):
                    is_start = word_idx == mention.start_word
                    is_end = word_idx == mention.end_word - 1
                    is_representative = mention_idx == chain.representative_index
                    attachment = CorefAttachment(chain, is_start, is_end, is_representative)
                    sentence.words[word_idx].coref_chains.append(attachment)

    def reindex_sentences(self, start_index):
        """Renumber the sent_id of each sentence, starting from start_index."""
        for sent_id, sentence in zip(range(start_index, start_index + len(self.sentences)), self.sentences):
            sentence.sent_id = str(sent_id)

    def to_dict(self):
        """ Dumps the whole document into a list of list of dictionary for each token in each sentence in the doc.
        """
        return [sentence.to_dict() for sentence in self.sentences]

    def __repr__(self):
        return json.dumps(self.to_dict(), indent=2, ensure_ascii=False, cls=DocJSONEncoder)

    def __format__(self, spec):
        # 'c' / 'C' delegate to the Sentence CoNLL formatting specs
        if spec == 'c':
            return "\n\n".join("{:c}".format(s) for s in self.sentences)
        elif spec == 'C':
            return "\n\n".join("{:C}".format(s) for s in self.sentences)
        else:
            return str(self)

    def to_serialized(self):
        """ Dumps the whole document including text to a byte array containing a list of list of dictionaries for each token in each sentence in the doc.
        """
        return pickle.dumps((self.text, self.to_dict(), self.sentence_comments()))

    @classmethod
    def from_serialized(cls, serialized_string):
        """ Create and initialize a new document from a serialized string generated by Document.to_serialized_string():
        """
        # NOTE(review): pickle.loads is called again below after this
        # validation pass — the already-unpickled `stuff` could be reused
        stuff = pickle.loads(serialized_string)
        if not isinstance(stuff, tuple):
            raise TypeError("Serialized data was not a tuple when building a Document")
        if len(stuff) == 2:
            # older serialization format without comments
            text, sentences = pickle.loads(serialized_string)
            doc = cls(sentences, text)
        else:
            text, sentences, comments = pickle.loads(serialized_string)
            doc = cls(sentences, text, comments)
        return doc
534
+
535
+
536
+ class Sentence(StanzaObject):
537
+ """ A sentence class that stores attributes of a sentence and carries a list of tokens.
538
+ """
539
+
540
+ def __init__(self, tokens, doc=None, empty_words=None):
541
+ """ Construct a sentence given a list of tokens in the form of CoNLL-U dicts.
542
+ """
543
+ self._tokens = []
544
+ self._words = []
545
+ self._dependencies = []
546
+ self._text = None
547
+ self._ents = []
548
+ self._doc = doc
549
+ self._constituency = None
550
+ self._sentiment = None
551
+ # comments are a list of comment lines occurring before the
552
+ # sentence in a CoNLL-U file. Can be empty
553
+ self._comments = []
554
+ self._doc_id = None
555
+
556
+ # enhanced_dependencies represents the DEPS column
557
+ # this is a networkx MultiDiGraph
558
+ # with edges from the parent to the dependent
559
+ # however, we set it to None until needed, as it is somewhat slow
560
+ self._enhanced_dependencies = None
561
+ self._process_tokens(tokens)
562
+
563
+ if empty_words is not None:
564
+ self._empty_words = [Word(self, entry) for entry in empty_words]
565
+ else:
566
+ self._empty_words = []
567
+
568
    def _process_tokens(self, tokens):
        """Build the parallel Token and Word lists from raw CoNLL-U dicts.

        st/en track the id span of the most recently seen multi-word token so
        that following single-word entries can be attached to it.
        """
        st, en = -1, -1
        self.tokens, self.words = [], []
        for i, entry in enumerate(tokens):
            if ID not in entry: # manually set a 1-based id for word if not exist
                entry[ID] = (i+1, )
            if isinstance(entry[ID], int):
                entry[ID] = (entry[ID], )
            if len(entry.get(ID)) > 1: # if this token is a multi-word token
                st, en = entry[ID]
                self.tokens.append(Token(self, entry))
            else: # else this token is a word
                new_word = Word(self, entry)
                if len(self.words) > 0 and self.words[-1].id == new_word.id:
                    # this can happen in the following context:
                    # a document was created with MWT=Yes to mark that a token should be split
                    # and then there was an MWT "expansion" with a single word after that token
                    # we replace the Word in the Token assuming that the expansion token might
                    # have more information than the Token dict did
                    # note that a single word MWT like that can be detected with something like
                    # multi_word_token_misc.match(entry.get(MISC)) if entry.get(MISC, None)
                    self.words[-1] = new_word
                    self.tokens[-1].words[-1] = new_word
                    continue
                self.words.append(new_word)
                idx = entry.get(ID)[0]
                if idx <= en:
                    # this word falls inside the currently open multi-word token
                    self.tokens[-1].words.append(new_word)
                else:
                    # free-standing word: wrap it in a Token of its own
                    self.tokens.append(Token(self, entry, words=[new_word]))
                new_word.parent = self.tokens[-1]

        # put all of the whitespace annotations (if any) on the Tokens instead of the Words
        for token in self.tokens:
            token.consolidate_whitespace()
        self.rebuild_dependencies()
604
+
605
+ def has_enhanced_dependencies(self):
606
+ """
607
+ Whether or not the enhanced dependencies are part of this sentence
608
+ """
609
+ return self._enhanced_dependencies is not None and len(self._enhanced_dependencies) > 0
610
+
611
    @property
    def index(self):
        """
        Access the index of this sentence within the doc.

        If multiple docs were processed together,
        the sentence index will continue counting across docs.
        """
        return self._index

    @index.setter
    def index(self, value):
        """ Set the sentence's index value. """
        self._index = value
625
+
626
    @property
    def id(self):
        """
        Deprecated alias of `index`: the position of this sentence within the doc.

        If multiple docs were processed together,
        the sentence index will continue counting across docs.
        """
        warnings.warn("Use of sentence.id is deprecated.  Please use sentence.index instead", stacklevel=2)
        return self._index

    @id.setter
    def id(self, value):
        """ Deprecated alias of the `index` setter. """
        warnings.warn("Use of sentence.id is deprecated.  Please use sentence.index instead", stacklevel=2)
        self._index = value
642
+
643
+ @property
644
+ def sent_id(self):
645
+ """ conll-style sent_id Will be set from index if unknown """
646
+ return self._sent_id
647
+
648
+ @sent_id.setter
649
+ def sent_id(self, value):
650
+ """ Set the sentence's sent_id value. """
651
+ self._sent_id = value
652
+ sent_id_comment = "# sent_id = " + str(value)
653
+ for comment_idx, comment in enumerate(self._comments):
654
+ if comment.startswith("# sent_id = "):
655
+ self._comments[comment_idx] = sent_id_comment
656
+ break
657
+ else: # this is intended to be a for/else loop
658
+ self._comments.append(sent_id_comment)
659
+
660
+ @property
661
+ def doc_id(self):
662
+ """ conll-style doc_id Can be left blank if unknown """
663
+ return self._doc_id
664
+
665
+ @doc_id.setter
666
+ def doc_id(self, value):
667
+ """ Set the sentence's doc_id value. """
668
+ self._doc_id = value
669
+ doc_id_comment = "# doc_id = " + str(value)
670
+ for comment_idx, comment in enumerate(self._comments):
671
+ if comment.startswith("# doc_id = "):
672
+ self._comments[comment_idx] = doc_id_comment
673
+ break
674
+ else: # this is intended to be a for/else loop
675
+ self._comments.append(doc_id_comment)
676
+
677
    @property
    def doc(self):
        """ Access the parent doc of this sentence. """
        return self._doc

    @doc.setter
    def doc(self, value):
        """ Set the parent doc of this sentence. """
        self._doc = value
686
+
687
    @property
    def text(self):
        """ Access the raw text for this sentence. """
        return self._text

    @text.setter
    def text(self, value):
        """ Set the raw text for this sentence. """
        self._text = value
696
+
697
    @property
    def dependencies(self):
        """ Access list of dependencies for this sentence. """
        return self._dependencies

    @dependencies.setter
    def dependencies(self, value):
        """ Set the list of dependencies for this sentence. """
        self._dependencies = value
706
+
707
    @property
    def tokens(self):
        """ Access the list of tokens for this sentence. """
        return self._tokens

    @tokens.setter
    def tokens(self, value):
        """ Set the list of tokens for this sentence. """
        self._tokens = value
716
+
717
    @property
    def words(self):
        """ Access the list of words for this sentence. """
        return self._words

    @words.setter
    def words(self, value):
        """ Set the list of words for this sentence. """
        self._words = value
726
+
727
    @property
    def empty_words(self):
        """ Access the list of empty words (decimal-id CoNLL-U entries) for this sentence. """
        return self._empty_words

    @empty_words.setter
    def empty_words(self, value):
        """ Set the list of empty words for this sentence. """
        self._empty_words = value
736
+
737
    @property
    def ents(self):
        """ Access the list of entities in this sentence. """
        return self._ents

    @ents.setter
    def ents(self, value):
        """ Set the list of entities in this sentence. """
        self._ents = value
746
+
747
    @property
    def entities(self):
        """ Access the list of entities. This is just an alias of `ents`. """
        return self._ents

    @entities.setter
    def entities(self, value):
        """ Set the list of entities in this sentence. """
        self._ents = value
756
+
757
def build_ents(self):
    """ Build the list of entities by iterating over all tokens. Return all entities as a list.

    Note that unlike other attributes, since NER requires raw text, the actual tagging are always
    performed at and attached to the `Token`s, instead of `Word`s.
    """
    self.ents = []
    ner_tags = [token.ner for token in self.tokens]
    for entity in decode_from_bioes(ner_tags):
        covered_tokens = self.tokens[entity['start']:entity['end'] + 1]
        self.ents.append(Span(tokens=covered_tokens, type=entity['type'], doc=self.doc, sent=self))
    return self.ents
770
+
771
@property
def sentiment(self):
    """ Returns the sentiment value for this sentence """
    return self._sentiment

@sentiment.setter
def sentiment(self, value):
    """ Set the sentiment value.

    Keeps the CoNLL comments in sync: an existing "# sentiment = ..."
    comment is replaced in place, otherwise a new one is appended.
    """
    self._sentiment = value
    new_comment = "# sentiment = " + str(value)
    for idx in range(len(self._comments)):
        if self._comments[idx].startswith("# sentiment = "):
            self._comments[idx] = new_comment
            return
    self._comments.append(new_comment)
787
+
788
@property
def constituency(self):
    """ Returns the constituency tree for this sentence """
    return self._constituency

@constituency.setter
def constituency(self, value):
    """
    Set the constituency tree

    This incidentally updates the #constituency comment if it already exists,
    or otherwise creates a new comment # constituency = ...
    """
    self._constituency = value
    # newlines cannot survive inside a single CoNLL comment line,
    # so they are replaced with a *NL* marker
    rendered = ("# constituency = " + str(value)).replace("\n", "*NL*").replace("\r", "")
    for idx, existing in enumerate(self._comments):
        if existing.startswith("# constituency = "):
            self._comments[idx] = rendered
            break
    else:  # no existing constituency comment found
        self._comments.append(rendered)
810
+
811
+
812
@property
def comments(self):
    """ Returns CoNLL-style comments for this sentence.

    Read-only view; use add_comment() to append new comments.
    """
    return self._comments
816
+
817
def add_comment(self, comment):
    """ Adds a single comment to this sentence.

    If the comment does not already have # at the start, it will be added.

    Comments carrying known annotations (constituency, sentiment, sent_id,
    doc_id) also update the corresponding attribute; any previously stored
    comment for the same annotation is removed before the new one is appended.
    """
    if not comment.startswith("#"):
        comment = "# " + comment
    if comment.startswith("# constituency ="):
        _, tree_text = comment.split("=", 1)
        tree = tree_reader.read_trees(tree_text)
        if len(tree) > 1:
            raise ValueError("Multiple constituency trees for one sentence: %s" % tree_text)
        self._constituency = tree[0]
        # drop any stale constituency comment; the new comment is appended below
        self._comments = [x for x in self._comments if not x.startswith("# constituency =")]
    elif comment.startswith("# sentiment ="):
        _, sentiment = comment.split("=", 1)
        sentiment = int(sentiment.strip())
        self._sentiment = sentiment
        self._comments = [x for x in self._comments if not x.startswith("# sentiment =")]
    elif comment.startswith("# sent_id ="):
        _, sent_id = comment.split("=", 1)
        sent_id = sent_id.strip()
        self._sent_id = sent_id
        self._comments = [x for x in self._comments if not x.startswith("# sent_id =")]
    elif comment.startswith("# doc_id ="):
        _, doc_id = comment.split("=", 1)
        doc_id = doc_id.strip()
        self._doc_id = doc_id
        self._comments = [x for x in self._comments if not x.startswith("# doc_id =")]
    self._comments.append(comment)
847
+
848
def rebuild_dependencies(self):
    """Rebuild the dependency list, but only if complete head/deprel info is present."""
    have_all_deps = all(w.head is not None and w.deprel is not None for w in self.words)
    have_all_words = (len(self.words) >= len(self.tokens)) and (len(self.words) == self.words[-1].id)
    if have_all_deps and have_all_words:
        self.build_dependencies()
853
+
854
def build_dependencies(self):
    """ Build the dependency graph for this sentence. Each dependency graph entry is
    a list of (head, deprel, word).
    """
    self.dependencies = []
    for word in self.words:
        if word.head == 0:
            # the root attaches to a synthetic ROOT word
            governor = Word(self, {ID: 0, TEXT: "ROOT"})
        else:
            # head is 1-based, so subtract one to index into words
            try:
                governor = self.words[word.head - 1]
            except IndexError as e:
                raise IndexError("Word head {} is not a valid word index for word {}".format(word.head, word.id)) from e
        if word.head != governor.id:
            raise ValueError("Dependency tree is incorrectly constructed")
        self.dependencies.append((governor, word.deprel, word))
873
+
874
def build_fake_dependencies(self):
    """Attach a trivial left-branching dependency structure to this sentence.

    The first word becomes the root; every following word depends on the
    previous word with the generic relation "dep".

    NOTE(review): the first element of each appended triple is the integer
    index, unlike build_dependencies which stores the head Word object —
    confirm that consumers such as print_dependencies (which reads .id on
    the head) are never used on fake dependencies.
    """
    self.dependencies = []
    for word_idx, word in enumerate(self.words):
        word.head = word_idx # note that this goes one previous to the index
        word.deprel = "root" if word_idx == 0 else "dep"
        word.deps = "%d:%s" % (word.head, word.deprel)
        self.dependencies.append((word_idx, word.deprel, word))
881
+
882
def print_dependencies(self, file=None):
    """ Print each dependency of this sentence as a (text, head id, relation) tuple. """
    for governor, relation, dependent in self.dependencies:
        print((dependent.text, governor.id, relation), file=file)
886
+
887
def dependencies_string(self):
    """ Render the dependencies of this sentence as a single stripped string. """
    buffer = io.StringIO()
    self.print_dependencies(file=buffer)
    return buffer.getvalue().strip()
892
+
893
def print_tokens(self, file=None):
    """ Print every token of this sentence, one pretty-printed token per line. """
    for token in self.tokens:
        print(token.pretty_print(), file=file)
897
+
898
def tokens_string(self):
    """ Render the tokens of this sentence as a single stripped string. """
    buffer = io.StringIO()
    self.print_tokens(file=buffer)
    return buffer.getvalue().strip()
903
+
904
def print_words(self, file=None):
    """ Print every word of this sentence, one pretty-printed word per line. """
    for word in self.words:
        print(word.pretty_print(), file=file)
908
+
909
def words_string(self):
    """ Render the words of this sentence as a single stripped string. """
    buffer = io.StringIO()
    self.print_words(file=buffer)
    return buffer.getvalue().strip()
914
+
915
def to_dict(self):
    """ Dumps the sentence into a list of dictionary for each token in the sentence.

    Empty (null) words are interleaved in id order: each empty word is
    emitted just before the first token whose id exceeds its anchor id,
    and any trailing empty words are emitted at the end.
    """
    result = []
    empties = self._empty_words
    next_empty = 0
    for token in self.tokens:
        while next_empty < len(empties) and empties[next_empty].id[0] < token.id[0]:
            result.append(empties[next_empty].to_dict())
            next_empty += 1
        result.extend(token.to_dict())
    for leftover in empties[next_empty:]:
        result.append(leftover.to_dict())
    return result
928
+
929
def __repr__(self):
    # JSON rendering of to_dict(), mainly useful for debugging
    return json.dumps(self.to_dict(), indent=2, ensure_ascii=False, cls=DocJSONEncoder)
931
+
932
def __format__(self, spec):
    """'c' renders CoNLL token lines; 'C' additionally prefixes the comments; anything else falls back to str()."""
    if spec != 'c' and spec != 'C':
        return str(self)

    # interleave empty (null) words with the tokens in id order,
    # mirroring the layout used by to_dict()
    lines = []
    empties = self._empty_words
    next_empty = 0
    for token in self.tokens:
        while next_empty < len(empties) and empties[next_empty].id[0] < token.id[0]:
            lines.append(empties[next_empty].to_conll_text())
            next_empty += 1
        lines.append(token.to_conll_text())
    for leftover in empties[next_empty:]:
        lines.append(leftover.to_conll_text())

    token_text = "\n".join(lines)
    if spec == 'C' and len(self.comments) > 0:
        return "\n".join(self.comments) + "\n" + token_text
    return token_text
954
+
955
def init_from_misc(unit):
    """Create attributes by parsing from the `misc` field.

    Also, remove start_char, end_char, and any other values we can set
    from the misc field if applicable, so that we don't repeat ourselves
    """
    kept_items = []
    for item in unit._misc.split('|'):
        parts = item.split('=', 1)
        if len(parts) == 2:
            key, value = parts
            # start & end char are kept as ints
            if key in (START_CHAR, END_CHAR):
                value = int(value)
            # only consume the item if the unit has a matching private attribute
            attr = '_' + key
            if hasattr(unit, attr):
                setattr(unit, attr, value)
                continue
            if key == NER:
                # special case skipping NER for Words, since there is no Word NER field
                continue
        # items without '=' or with unknown keys stay in misc
        kept_items.append(item)
    unit._misc = "|".join(kept_items)
980
+
981
+
982
def dict_to_conll_text(token_dict, id_connector="-"):
    """Convert one token/word dictionary into a single CoNLL-U line (FIELD_NUM tab-separated columns).

    id_connector joins the parts of a tuple id: "-" for MWT ranges (e.g. 1-2),
    "." for empty nodes (e.g. 1.1).  Annotations with no column of their own
    (start_char/end_char/ner/coref chains) are folded into the MISC column.
    """
    token_conll = ['_' for i in range(FIELD_NUM)]
    misc = []
    for key in token_dict:
        if key == START_CHAR or key == END_CHAR:
            misc.append("{}={}".format(key, token_dict[key]))
        elif key == NER:
            # TODO: potentially need to escape =|\ in the NER
            misc.append("{}={}".format(key, token_dict[key]))
        elif key == COREF_CHAINS:
            chains = token_dict[key]
            if len(chains) > 0:
                misc_chains = []
                for chain in chains:
                    # encode where in the mention span this word sits
                    if chain.is_start and chain.is_end:
                        coref_position = "unit-"
                    elif chain.is_start:
                        coref_position = "start-"
                    elif chain.is_end:
                        coref_position = "end-"
                    else:
                        coref_position = "middle-"
                    is_representative = "repr-" if chain.is_representative else ""
                    misc_chains.append("%s%sid%d" % (coref_position, is_representative, chain.chain.index))
                misc.append("{}={}".format(key, ",".join(misc_chains)))
        elif key == MISC:
            # avoid appending a blank misc entry.
            # otherwise the resulting misc field in the conll doc will wind up being blank text
            # TODO: potentially need to escape =|\ in the MISC as well
            if token_dict[key]:
                misc.append(token_dict[key])
        elif key == ID:
            token_conll[FIELD_TO_IDX[key]] = id_connector.join([str(x) for x in token_dict[key]]) if isinstance(token_dict[key], tuple) else str(token_dict[key])
        elif key in FIELD_TO_IDX:
            token_conll[FIELD_TO_IDX[key]] = str(token_dict[key])
    if misc:
        token_conll[FIELD_TO_IDX[MISC]] = "|".join(misc)
    else:
        token_conll[FIELD_TO_IDX[MISC]] = '_'
    # when a word (not mwt token) without head is found, we insert dummy head as required by the UD eval script
    if '-' not in token_conll[FIELD_TO_IDX[ID]] and '.' not in token_conll[FIELD_TO_IDX[ID]] and HEAD not in token_dict:
        token_conll[FIELD_TO_IDX[HEAD]] = str(int(token_dict[ID] if isinstance(token_dict[ID], int) else token_dict[ID][0]) - 1) # evaluation script requires head: int
    return "\t".join(token_conll)
1025
+
1026
+
1027
class Token(StanzaObject):
    """ A token class that stores attributes of a token and carries a list of words. A token corresponds to a unit in the raw
    text. In some languages such as English, a token has a one-to-one mapping to a word, while in other languages such as French,
    a (multi-word) token might be expanded into multiple words that carry syntactic annotations.
    """

    def __init__(self, sentence, token_entry, words=None):
        """
        Construct a token given a dictionary format token entry. Optionally link itself to the corresponding words.
        The owning sentence must be passed in.

        Raises ValueError if the entry has no id or no text.
        """
        self._id = token_entry.get(ID)
        self._text = token_entry.get(TEXT)
        if not self._id:
            raise ValueError('id not included for the token')
        if not self._text:
            raise ValueError('text not included for the token')
        self._misc = token_entry.get(MISC, None)
        self._ner = token_entry.get(NER, None)
        self._multi_ner = token_entry.get(MULTI_NER, None)
        self._words = words if words is not None else []
        self._start_char = token_entry.get(START_CHAR, None)
        self._end_char = token_entry.get(END_CHAR, None)
        self._sent = sentence
        self._mexp = token_entry.get(MEXP, None)
        self._spaces_before = ""
        self._spaces_after = " "

        # misc may carry start_char / end_char / ner annotations; promote
        # those to attributes and strip them from misc
        if self._misc is not None:
            init_from_misc(self)

    @property
    def id(self):
        """ Access the index of this token. """
        return self._id

    @id.setter
    def id(self, value):
        """ Set the token's id value. """
        self._id = value

    @property
    def manual_expansion(self):
        """ Access the whether this token was manually expanded. """
        return self._mexp

    @manual_expansion.setter
    def manual_expansion(self, value):
        """ Set the whether this token was manually expanded. """
        self._mexp = value

    @property
    def text(self):
        """ Access the text of this token. Example: 'The' """
        return self._text

    @text.setter
    def text(self, value):
        """ Set the token's text value. Example: 'The' """
        self._text = value

    @property
    def misc(self):
        """ Access the miscellaneousness of this token. """
        return self._misc

    @misc.setter
    def misc(self, value):
        """ Set the token's miscellaneousness value. None and '_' are normalized to None. """
        self._misc = value if self._is_null(value) == False else None

    def consolidate_whitespace(self):
        """
        Remove whitespace misc annotations from the Words and mark the whitespace on the Tokens
        """
        found_after = False
        found_before = False
        num_words = len(self.words)
        for word_idx, word in enumerate(self.words):
            misc = word.misc
            if not misc:
                continue
            pieces = misc.split("|")
            # SpacesBefore is only meaningful on the first word of a token
            if word_idx == 0:
                if any(piece.startswith("SpacesBefore=") for piece in pieces):
                    self.spaces_before = misc_to_space_before(misc)
                    found_before = True
            else:
                if any(piece.startswith("SpacesBefore=") for piece in pieces):
                    warnings.warn("Found a SpacesBefore MISC annotation on a Word that was not the first Word in a Token")
            # SpaceAfter / SpacesAfter is only meaningful on the last word of a token
            if word_idx == num_words - 1:
                if any(piece.startswith("SpaceAfter=") or piece.startswith("SpacesAfter=") for piece in pieces):
                    self.spaces_after = misc_to_space_after(misc)
                    found_after = True
            else:
                if any(piece.startswith("SpaceAfter=") or piece.startswith("SpacesAfter=") for piece in pieces):
                    unexpected_space_after = misc_to_space_after(misc)
                    if unexpected_space_after == "":
                        warnings.warn("Unexpected SpaceAfter=No annotation on a word in the middle of an MWT")
                    else:
                        warnings.warn("Unexpected SpacesAfter on a word in the middle on an MWT")
            pieces = [x for x in pieces if not x.startswith("SpacesAfter=") and not x.startswith("SpaceAfter=") and not x.startswith("SpacesBefore=")]
            word.misc = "|".join(pieces)

        # the token's own misc may also carry whitespace annotations;
        # warn if they conflict with what the words said
        misc = self.misc
        if misc:
            pieces = misc.split("|")
            if any(piece.startswith("SpacesBefore=") for piece in pieces):
                spaces_before = misc_to_space_before(misc)
                if found_before:
                    if spaces_before != self.spaces_before:
                        warnings.warn("Found conflicting SpacesBefore on a token and its word!")
                else:
                    self.spaces_before = spaces_before
            if any(piece.startswith("SpaceAfter=") or piece.startswith("SpacesAfter=") for piece in pieces):
                spaces_after = misc_to_space_after(misc)
                if found_after:
                    if spaces_after != self.spaces_after:
                        warnings.warn("Found conflicting SpaceAfter / SpacesAfter on a token and its word!")
                else:
                    self.spaces_after = spaces_after
            pieces = [x for x in pieces if not x.startswith("SpacesAfter=") and not x.startswith("SpaceAfter=") and not x.startswith("SpacesBefore=")]
            self.misc = "|".join(pieces)

    @property
    def spaces_before(self):
        """ SpacesBefore for the token. Translated from the MISC fields """
        return self._spaces_before

    @spaces_before.setter
    def spaces_before(self, value):
        self._spaces_before = value

    @property
    def spaces_after(self):
        """ SpaceAfter or SpacesAfter for the token. Translated from the MISC field """
        return self._spaces_after

    @spaces_after.setter
    def spaces_after(self, value):
        self._spaces_after = value

    @property
    def words(self):
        """ Access the list of syntactic words underlying this token. """
        return self._words

    @words.setter
    def words(self, value):
        """ Set this token's list of underlying syntactic words, reparenting each word to this token. """
        self._words = value
        for w in self._words:
            w.parent = self

    @property
    def start_char(self):
        """ Access the start character index for this token in the raw text. """
        return self._start_char

    @property
    def end_char(self):
        """ Access the end character index for this token in the raw text. """
        return self._end_char

    @property
    def ner(self):
        """ Access the NER tag of this token. Example: 'B-ORG'"""
        return self._ner

    @ner.setter
    def ner(self, value):
        """ Set the token's NER tag. Example: 'B-ORG'"""
        self._ner = value if self._is_null(value) == False else None

    @property
    def multi_ner(self):
        """ Access the MULTI_NER tag of this token. Example: '(B-ORG, B-DISEASE)'"""
        return self._multi_ner

    @multi_ner.setter
    def multi_ner(self, value):
        """ Set the token's MULTI_NER tag. Example: '(B-ORG, B-DISEASE)'"""
        self._multi_ner = value if self._is_null(value) == False else None

    @property
    def sent(self):
        """ Access the pointer to the sentence that this token belongs to. """
        return self._sent

    @sent.setter
    def sent(self, value):
        """ Set the pointer to the sentence that this token belongs to. """
        self._sent = value

    def __repr__(self):
        return json.dumps(self.to_dict(), indent=2, ensure_ascii=False, cls=DocJSONEncoder)

    def __format__(self, spec):
        if spec == 'C':
            # BUGFIX: this used to be "\n".join(self.to_conll_text()), but
            # to_conll_text() already returns a single newline-joined string,
            # so joining it inserted a newline between every character.
            # Word.__format__ returns the conll text directly, as we now do.
            return self.to_conll_text()
        elif spec == 'P':
            return self.pretty_print()
        else:
            return str(self)

    def to_conll_text(self):
        """ Render this token (and its words, for an MWT) as CoNLL-U lines. """
        return "\n".join(dict_to_conll_text(x) for x in self.to_dict())

    # default is a tuple rather than a list so the shared default cannot be mutated
    def to_dict(self, fields=(ID, TEXT, MISC, START_CHAR, END_CHAR, NER, MULTI_NER, MEXP)):
        """ Dumps the token into a list of dictionary for this token with its extended words
        if the token is a multi-word token.
        """
        ret = []
        if len(self.id) > 1:
            # multi-word token: emit a dict for the token range itself
            token_dict = {}
            for field in fields:
                if getattr(self, field) is not None:
                    token_dict[field] = getattr(self, field)
            if MISC in fields:
                spaces_after = self.spaces_after
                if spaces_after is not None and spaces_after != ' ':
                    space_misc = space_after_to_misc(spaces_after)
                    if token_dict.get(MISC):
                        token_dict[MISC] = token_dict[MISC] + "|" + space_misc
                    else:
                        token_dict[MISC] = space_misc

                spaces_before = self.spaces_before
                if spaces_before is not None and spaces_before != '':
                    space_misc = space_before_to_misc(spaces_before)
                    if token_dict.get(MISC):
                        token_dict[MISC] = token_dict[MISC] + "|" + space_misc
                    else:
                        token_dict[MISC] = space_misc

            ret.append(token_dict)
        for word in self.words:
            word_dict = word.to_dict()
            if len(self.id) == 1 and NER in fields and getattr(self, NER) is not None: # propagate NER label to Word if it is a single-word token
                word_dict[NER] = getattr(self, NER)
            if len(self.id) == 1 and MULTI_NER in fields and getattr(self, MULTI_NER) is not None: # propagate MULTI_NER label to Word if it is a single-word token
                word_dict[MULTI_NER] = getattr(self, MULTI_NER)
            if len(self.id) == 1 and MISC in fields:
                # for a single-word token, whitespace annotations live on the word
                spaces_after = self.spaces_after
                if spaces_after is not None and spaces_after != ' ':
                    space_misc = space_after_to_misc(spaces_after)
                    if word_dict.get(MISC):
                        word_dict[MISC] = word_dict[MISC] + "|" + space_misc
                    else:
                        word_dict[MISC] = space_misc

                spaces_before = self.spaces_before
                if spaces_before is not None and spaces_before != '':
                    space_misc = space_before_to_misc(spaces_before)
                    if word_dict.get(MISC):
                        word_dict[MISC] = word_dict[MISC] + "|" + space_misc
                    else:
                        word_dict[MISC] = space_misc
            ret.append(word_dict)
        return ret

    def pretty_print(self):
        """ Print this token with its extended words in one line. """
        return f"<{self.__class__.__name__} id={'-'.join([str(x) for x in self.id])};words=[{', '.join([word.pretty_print() for word in self.words])}]>"

    def _is_null(self, value):
        # '_' is the CoNLL-U placeholder for a missing value
        return (value is None) or (value == '_')

    def is_mwt(self):
        """ Return True if this token expands into more than one word. """
        return len(self.words) > 1
1297
+
1298
class Word(StanzaObject):
    """ A word class that stores attributes of a word.
    """

    def __init__(self, sentence, word_entry):
        """ Construct a word given a dictionary format word entry.

        The owning sentence must be passed in; the word registers its
        enhanced dependencies on the sentence's dependency graph.
        """
        self._id = word_entry.get(ID, None)
        # a 1-element tuple id is unwrapped to a plain int
        if isinstance(self._id, tuple):
            if len(self._id) == 1:
                self._id = self._id[0]
        self._text = word_entry.get(TEXT, None)

        assert self._id is not None and self._text is not None, 'id and text should be included for the word. {}'.format(word_entry)

        self._lemma = word_entry.get(LEMMA, None)
        self._upos = word_entry.get(UPOS, None)
        self._xpos = word_entry.get(XPOS, None)
        self._feats = word_entry.get(FEATS, None)
        self._head = word_entry.get(HEAD, None)
        self._deprel = word_entry.get(DEPREL, None)
        self._misc = word_entry.get(MISC, None)
        self._start_char = word_entry.get(START_CHAR, None)
        self._end_char = word_entry.get(END_CHAR, None)
        self._parent = None
        self._sent = sentence
        self._mexp = word_entry.get(MEXP, None)
        self._coref_chains = None

        # misc may carry start_char/end_char annotations; promote those to attributes
        if self._misc is not None:
            init_from_misc(self)

        # use the setter, which will go up to the sentence and set the
        # dependencies on that graph
        self.deps = word_entry.get(DEPS, None)

    @property
    def manual_expansion(self):
        """ Access the whether this token was manually expanded. """
        return self._mexp

    @manual_expansion.setter
    def manual_expansion(self, value):
        """ Set the whether this token was manually expanded. """
        self._mexp = value

    @property
    def id(self):
        """ Access the index of this word. """
        return self._id

    @id.setter
    def id(self, value):
        """ Set the word's index value. """
        self._id = value

    @property
    def text(self):
        """ Access the text of this word. Example: 'The'"""
        return self._text

    @text.setter
    def text(self, value):
        """ Set the word's text value. Example: 'The'"""
        self._text = value

    @property
    def lemma(self):
        """ Access the lemma of this word. """
        return self._lemma

    @lemma.setter
    def lemma(self, value):
        """ Set the word's lemma value.

        A '_' lemma is normalized to None, except when the word text itself
        is '_', in which case '_' is a legitimate lemma.
        """
        self._lemma = value if self._is_null(value) == False or self._text == '_' else None

    @property
    def upos(self):
        """ Access the universal part-of-speech of this word. Example: 'NOUN'"""
        return self._upos

    @upos.setter
    def upos(self, value):
        """ Set the word's universal part-of-speech value. Example: 'NOUN'"""
        self._upos = value if self._is_null(value) == False else None

    @property
    def xpos(self):
        """ Access the treebank-specific part-of-speech of this word. Example: 'NNP'"""
        return self._xpos

    @xpos.setter
    def xpos(self, value):
        """ Set the word's treebank-specific part-of-speech value. Example: 'NNP'"""
        self._xpos = value if self._is_null(value) == False else None

    @property
    def feats(self):
        """ Access the morphological features of this word. Example: 'Gender=Fem'"""
        return self._feats

    @feats.setter
    def feats(self, value):
        """ Set this word's morphological features. Example: 'Gender=Fem'"""
        self._feats = value if self._is_null(value) == False else None

    @property
    def head(self):
        """ Access the id of the governor of this word. """
        return self._head

    @head.setter
    def head(self, value):
        """ Set the word's governor id value. Coerced to int; '_'/None becomes None. """
        self._head = int(value) if self._is_null(value) == False else None

    @property
    def deprel(self):
        """ Access the dependency relation of this word. Example: 'nmod'"""
        return self._deprel

    @deprel.setter
    def deprel(self, value):
        """ Set the word's dependency relation value. Example: 'nmod'"""
        self._deprel = value if self._is_null(value) == False else None

    @property
    def deps(self):
        """ Access the dependencies of this word.

        Rendered from the sentence's enhanced-dependency multigraph as a
        CoNLL-U DEPS string ("head:rel|head:rel..."); empty-word heads use
        the "a.b" id form.  Returns None when there are no incoming edges.
        """
        graph = self._sent._enhanced_dependencies
        if graph is None or not graph.has_node(self.id):
            return None

        data = []
        # normalize int ids to 1-tuples so mixed int/tuple ids sort together
        predecessors = sorted(list(graph.predecessors(self.id)), key=lambda x: x if isinstance(x, tuple) else (x,))
        for parent in predecessors:
            deps = sorted(list(graph.get_edge_data(parent, self.id)))
            for dep in deps:
                if isinstance(parent, int):
                    data.append("%d:%s" % (parent, dep))
                else:
                    data.append("%d.%d:%s" % (parent[0], parent[1], dep))
        if not data:
            return None

        return "|".join(data)

    @deps.setter
    def deps(self, value):
        """ Set the word's dependencies value.

        Accepts either a CoNLL-U DEPS string or a list of (parent, relation)
        pairs.  Existing incoming edges for this word are removed first.
        """
        graph = self._sent._enhanced_dependencies
        # if we don't have a graph, and we aren't trying to set any actual
        # dependencies, we can save the time of doing anything else
        if graph is None and value is None:
            return

        if graph is None:
            graph = nx.MultiDiGraph()
            self._sent._enhanced_dependencies = graph
        # need to make a new list: cannot iterate and delete at the same time
        if graph.has_node(self.id):
            in_edges = list(graph.in_edges(self.id))
            graph.remove_edges_from(in_edges)

        if value is None:
            return

        if isinstance(value, str):
            value = value.split("|")
        if all(isinstance(x, str) for x in value):
            value = [x.split(":", maxsplit=1) for x in value]
        for parent, dep in value:
            # we have to match the format of the IDs. since the IDs
            # of the words are int if they aren't empty words, we need
            # to convert single int IDs into int instead of tuple
            parent = tuple(map(int, parent.split(".", maxsplit=1)))
            if len(parent) == 1:
                parent = parent[0]
            graph.add_edge(parent, self.id, dep)

    @property
    def misc(self):
        """ Access the miscellaneousness of this word. """
        return self._misc

    @misc.setter
    def misc(self, value):
        """ Set the word's miscellaneousness value. None and '_' are normalized to None. """
        self._misc = value if self._is_null(value) == False else None

    @property
    def start_char(self):
        """ Access the start character index for this token in the raw text. """
        return self._start_char

    @start_char.setter
    def start_char(self, value):
        self._start_char = value

    @property
    def end_char(self):
        """ Access the end character index for this token in the raw text. """
        return self._end_char

    @end_char.setter
    def end_char(self, value):
        self._end_char = value

    @property
    def parent(self):
        """ Access the parent token of this word. In the case of a multi-word token, a token can be the parent of
        multiple words. Note that this should return a reference to the parent token object.
        """
        return self._parent

    @parent.setter
    def parent(self, value):
        """ Set this word's parent token. In the case of a multi-word token, a token can be the parent of
        multiple words. Note that value here should be a reference to the parent token object.
        """
        self._parent = value

    @property
    def pos(self):
        """ Access the universal part-of-speech of this word. Example: 'NOUN'. Alias of `upos`. """
        return self._upos

    @pos.setter
    def pos(self, value):
        """ Set the word's universal part-of-speech value. Example: 'NOUN'. Alias of the `upos` setter. """
        self._upos = value if self._is_null(value) == False else None

    @property
    def coref_chains(self):
        """
        coref_chains points to a list of CorefChain namedtuple, which has a list of mentions and a representative mention.

        Useful for disambiguating words such as "him" (in languages where coref is available)

        Theoretically it is possible for multiple corefs to occur at the same word. For example,
        "Chris Manning's NLP Group"
        could have "Chris Manning" and "Chris Manning's NLP Group" as overlapping entities
        """
        return self._coref_chains

    @coref_chains.setter
    def coref_chains(self, chain):
        """ Set the backref for the coref chains """
        self._coref_chains = chain

    @property
    def sent(self):
        """ Access the pointer to the sentence that this word belongs to. """
        return self._sent

    @sent.setter
    def sent(self, value):
        """ Set the pointer to the sentence that this word belongs to. """
        self._sent = value

    def __repr__(self):
        # JSON rendering of to_dict(), mainly useful for debugging
        return json.dumps(self.to_dict(), indent=2, ensure_ascii=False, cls=DocJSONEncoder)

    def __format__(self, spec):
        # 'C' -> CoNLL-U line; 'P' -> compact one-line summary
        if spec == 'C':
            return self.to_conll_text()
        elif spec == 'P':
            return self.pretty_print()
        else:
            return str(self)

    def to_conll_text(self):
        """
        Turn a word into a conll representation (10 column tab separated)
        """
        token_dict = self.to_dict()
        # '.' connects empty-word ids such as 1.1
        return dict_to_conll_text(token_dict, '.')

    def to_dict(self, fields=[ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC, START_CHAR, END_CHAR, MEXP, COREF_CHAINS]):
        """ Dumps the word into a dictionary.

        Only fields with a non-None value are included.
        """
        word_dict = {}
        for field in fields:
            if getattr(self, field) is not None:
                word_dict[field] = getattr(self, field)
        return word_dict

    def pretty_print(self):
        """ Print the word in one line. """
        features = [ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL]
        feature_str = ";".join(["{}={}".format(k, getattr(self, k)) for k in features if getattr(self, k) is not None])
        return f"<{self.__class__.__name__} {feature_str}>"

    def _is_null(self, value):
        # '_' is the CoNLL-U placeholder for a missing value
        return (value is None) or (value == '_')
1593
+
1594
+
1595
class Span(StanzaObject):
    """ A span class that stores attributes of a textual span. A span can be typed.
    A range of objects (e.g., entity mentions) can be represented as spans.
    """

    def __init__(self, span_entry=None, tokens=None, type=None, doc=None, sent=None):
        """ Construct a span given a span entry or a list of tokens. A valid reference to a doc
        must be provided to construct a span (otherwise the text of the span cannot be initialized).
        """
        assert span_entry is not None or (tokens is not None and type is not None), \
            'Either a span_entry or a token list needs to be provided to construct a span.'
        assert doc is not None, 'A parent doc must be provided to construct a span.'
        # initialize all scalar attributes to None before filling them in
        self._text, self._type, self._start_char, self._end_char = [None] * 4
        self._tokens = []
        self._words = []
        self._doc = doc
        self._sent = sent

        if span_entry is not None:
            self.init_from_entry(span_entry)

        if tokens is not None:
            self.init_from_tokens(tokens, type)

    def init_from_entry(self, span_entry):
        # copy the basic fields from a dict-style span entry
        self.text = span_entry.get(TEXT, None)
        self.type = span_entry.get(TYPE, None)
        self.start_char = span_entry.get(START_CHAR, None)
        self.end_char = span_entry.get(END_CHAR, None)

    def init_from_tokens(self, tokens, type):
        assert isinstance(tokens, list), 'Tokens must be provided as a list to construct a span.'
        assert len(tokens) > 0, "Tokens of a span cannot be an empty list."
        self.tokens = tokens
        self.type = type
        # load start and end char offsets from tokens
        self.start_char = self.tokens[0].start_char
        self.end_char = self.tokens[-1].end_char
        if self.doc is not None and self.doc.text is not None:
            # preferred path: slice the span text straight out of the document text
            self.text = self.doc.text[self.start_char:self.end_char]
        elif tokens[0].sent is tokens[-1].sent:
            # no document text available: recover the span text from the
            # sentence text using offsets relative to the sentence start
            sentence = tokens[0].sent
            text_start = tokens[0].start_char - sentence.tokens[0].start_char
            text_end = tokens[-1].end_char - sentence.tokens[0].start_char
            self.text = sentence.text[text_start:text_end]
        else:
            # TODO: do any spans ever cross sentences?
            raise RuntimeError("Document text does not exist, and the span tested crosses two sentences, so it is impossible to extract the entity text!")
        # collect the words of the span following tokens
        self.words = [w for t in tokens for w in t.words]
        # set the sentence back-pointer to point to the sentence of the first token
        self.sent = tokens[0].sent

    @property
    def doc(self):
        """ Access the parent doc of this span. """
        return self._doc

    @doc.setter
    def doc(self, value):
        """ Set the parent doc of this span. """
        self._doc = value

    @property
    def text(self):
        """ Access the text of this span. Example: 'Stanford University'"""
        return self._text

    @text.setter
    def text(self, value):
        """ Set the span's text value. Example: 'Stanford University'"""
        self._text = value

    @property
    def tokens(self):
        """ Access reference to a list of tokens that correspond to this span. """
        return self._tokens

    @tokens.setter
    def tokens(self, value):
        """ Set the span's list of tokens. """
        self._tokens = value

    @property
    def words(self):
        """ Access reference to a list of words that correspond to this span. """
        return self._words

    @words.setter
    def words(self, value):
        """ Set the span's list of words. """
        self._words = value

    @property
    def type(self):
        """ Access the type of this span. Example: 'PERSON'"""
        return self._type

    @type.setter
    def type(self, value):
        """ Set the type of this span. """
        self._type = value

    @property
    def start_char(self):
        """ Access the start character offset of this span. """
        return self._start_char

    @start_char.setter
    def start_char(self, value):
        """ Set the start character offset of this span. """
        self._start_char = value

    @property
    def end_char(self):
        """ Access the end character offset of this span. """
        return self._end_char

    @end_char.setter
    def end_char(self, value):
        """ Set the end character offset of this span. """
        self._end_char = value

    @property
    def sent(self):
        """ Access the pointer to the sentence that this span belongs to. """
        return self._sent

    @sent.setter
    def sent(self, value):
        """ Set the pointer to the sentence that this span belongs to. """
        self._sent = value

    def to_dict(self):
        """ Dumps the span into a dictionary, keeping only text/type/char offsets
        (tokens, words and back-pointers are intentionally not serialized). """
        attrs = ['text', 'type', 'start_char', 'end_char']
        span_dict = dict([(attr_name, getattr(self, attr_name)) for attr_name in attrs])
        return span_dict

    def __repr__(self):
        return json.dumps(self.to_dict(), indent=2, ensure_ascii=False, cls=DocJSONEncoder)

    def pretty_print(self):
        """ Print the span in one line. """
        span_dict = self.to_dict()
        feature_str = ";".join(["{}={}".format(k,v) for k,v in span_dict.items()])
        return f"<{self.__class__.__name__} {feature_str}>"
stanza/stanza/models/common/dropout.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
class WordDropout(nn.Module):
    """ Word-level dropout for embedded inputs (e.g., anything fed to an LSTM).

    Randomly zeroes out whole vectors along the last (hidden) dimension; if a
    replacement vector is supplied, dropped positions receive it instead of zero.
    """
    def __init__(self, dropprob):
        super().__init__()
        self.dropprob = dropprob

    def forward(self, x, replacement=None):
        """ Drop whole units of x with probability dropprob while training. """
        if not self.training or self.dropprob == 0:
            return x

        # one bernoulli draw per unit, broadcast over the hidden dimension
        mask_shape = list(x.size())
        mask_shape[-1] = 1
        drop_mask = torch.rand(*mask_shape, device=x.device) < self.dropprob

        dropped = x.masked_fill(drop_mask, 0)
        if replacement is not None:
            dropped = dropped + drop_mask.float() * replacement
        return dropped

    def extra_repr(self):
        return 'p={}'.format(self.dropprob)
29
+
30
class LockedDropout(nn.Module):
    """
    Variational ("locked") dropout: the same dropout mask is reused across the
    time dimension of a sequence.  Modified from the LockedDropout implementation
    in the flair library (https://github.com/zalandoresearch/flair).
    """
    def __init__(self, dropprob, batch_first=True):
        super().__init__()
        self.dropprob = dropprob
        self.batch_first = batch_first

    def forward(self, x):
        if not self.training or self.dropprob == 0:
            return x

        keep_prob = 1 - self.dropprob
        # sample one mask per sequence; the time axis (dim 1 if batch_first,
        # dim 0 otherwise) is broadcast so every timestep shares the mask
        if self.batch_first:
            noise = x.new_empty(x.size(0), 1, x.size(2), requires_grad=False).bernoulli_(keep_prob)
        else:
            noise = x.new_empty(1, x.size(1), x.size(2), requires_grad=False).bernoulli_(keep_prob)

        # inverted dropout scaling keeps the expected activation unchanged
        return noise.div(keep_prob).expand_as(x) * x

    def extra_repr(self):
        return 'p={}'.format(self.dropprob)
54
+
55
class SequenceUnitDropout(nn.Module):
    """ Unit dropout over sequences of indices (word ids, char ids, ...).

    While training, each index is independently replaced with replacement_id
    (typically the id of <UNK>) with probability dropprob.
    """
    def __init__(self, dropprob, replacement_id):
        super().__init__()
        self.dropprob = dropprob
        self.replacement_id = replacement_id

    def forward(self, x):
        """ :param: x must be a LongTensor of unit indices. """
        if not self.training or self.dropprob == 0:
            return x
        drop_mask = torch.rand(x.size(), device=x.device) < self.dropprob
        return x.masked_fill(drop_mask, self.replacement_id)

    def extra_repr(self):
        return 'p={}, replacement_id={}'.format(self.dropprob, self.replacement_id)
75
+
stanza/stanza/models/common/exceptions.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A couple more specific FileNotFoundError exceptions
3
+
4
+ The idea being, the caller can catch it and report a more useful error resolution
5
+ """
6
+
7
+ import errno
8
+
9
class ForwardCharlmNotFoundError(FileNotFoundError):
    """Raised when a forward character LM file is missing.

    Carries errno.ENOENT plus the offending filename so callers can report
    a specific resolution for forward charlms.
    """
    def __init__(self, msg, filename):
        super().__init__(errno.ENOENT, msg, filename)
12
+
13
class BackwardCharlmNotFoundError(FileNotFoundError):
    """Raised when a backward character LM file is missing.

    Carries errno.ENOENT plus the offending filename so callers can report
    a specific resolution for backward charlms.
    """
    def __init__(self, msg, filename):
        super().__init__(errno.ENOENT, msg, filename)
stanza/stanza/models/common/foundation_cache.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Keeps BERT, charlm, word embedings in a cache to save memory
3
+ """
4
+
5
+ from collections import namedtuple
6
+ from copy import deepcopy
7
+ import logging
8
+ import threading
9
+
10
+ from stanza.models.common import bert_embedding
11
+ from stanza.models.common.char_model import CharacterLanguageModel
12
+ from stanza.models.common.pretrain import Pretrain
13
+
14
+ logger = logging.getLogger('stanza')
15
+
16
+ BertRecord = namedtuple('BertRecord', ['model', 'tokenizer', 'peft_ids'])
17
+
18
class FoundationCache:
    """Caches transformers (with peft bookkeeping), charlms, and pretrains.

    Each resource is loaded at most once; a single lock guards all three caches.
    Passing another FoundationCache as `other` shares its caches and lock.
    """
    def __init__(self, other=None, local_files_only=False):
        if other is None:
            self.bert = {}
            self.charlms = {}
            self.pretrains = {}
            # future proof the module by using a lock for the glorious day
            # when the GIL is finally gone
            self.lock = threading.Lock()
        else:
            # share the underlying caches (and the lock) with the other cache
            self.bert = other.bert
            self.charlms = other.charlms
            self.pretrains = other.pretrains
            self.lock = other.lock
        self.local_files_only = local_files_only

    def load_bert(self, transformer_name, local_files_only=None):
        """Load (or reuse) a transformer; returns (model, tokenizer)."""
        model, tokenizer, _ = self.load_bert_with_peft(transformer_name, None, local_files_only=local_files_only)
        return model, tokenizer

    def load_bert_with_peft(self, transformer_name, peft_name, local_files_only=None):
        """
        Load a transformer only once; returns (model, tokenizer, unique_peft_name).

        Uses a lock for thread safety
        """
        if transformer_name is None:
            return None, None, None
        with self.lock:
            if transformer_name not in self.bert:
                if local_files_only is None:
                    local_files_only = self.local_files_only
                model, tokenizer = bert_embedding.load_bert(transformer_name, local_files_only=local_files_only)
                self.bert[transformer_name] = BertRecord(model, tokenizer, {})
            else:
                logger.debug("Reusing bert %s", transformer_name)

            record = self.bert[transformer_name]
            if not peft_name:
                return record.model, record.tokenizer, None
            # hand out a unique adapter name per request so repeated loads
            # of the same peft_name don't collide on the shared model
            if peft_name not in record.peft_ids:
                record.peft_ids[peft_name] = 0
            else:
                record.peft_ids[peft_name] += 1
            return record.model, record.tokenizer, "%s_%d" % (peft_name, record.peft_ids[peft_name])

    def load_charlm(self, filename):
        """Load (or reuse) a character language model from filename."""
        if not filename:
            return None
        with self.lock:
            if filename not in self.charlms:
                logger.debug("Loading charlm from %s", filename)
                self.charlms[filename] = CharacterLanguageModel.load(filename, finetune=False)
            else:
                logger.debug("Reusing charlm from %s", filename)
            return self.charlms[filename]

    def load_pretrain(self, filename):
        """
        Load a pretrained word embedding only once

        Uses a lock for thread safety
        """
        if filename is None:
            return None
        with self.lock:
            if filename not in self.pretrains:
                logger.debug("Loading pretrain %s", filename)
                self.pretrains[filename] = Pretrain(filename)
            else:
                logger.debug("Reusing pretrain %s", filename)
            return self.pretrains[filename]
94
+
95
class NoTransformerFoundationCache(FoundationCache):
    """
    A FoundationCache which deliberately does NOT cache transformers.

    Charlms and pretrains still go through the shared cache, but every
    transformer request falls back to a fresh load.  Useful when loading a
    downstream model such as POS with a finetuned transformer: reusing the
    cached copy would leak those finetuned weights into other models.
    """
    def load_bert(self, transformer_name, local_files_only=None):
        if local_files_only is None:
            local_files_only = self.local_files_only
        # delegate to the uncached module-level loader
        return load_bert(transformer_name, local_files_only=local_files_only)

    def load_bert_with_peft(self, transformer_name, peft_name, local_files_only=None):
        if local_files_only is None:
            local_files_only = self.local_files_only
        return load_bert_with_peft(transformer_name, peft_name, local_files_only=local_files_only)
109
+
110
def load_bert(model_name, foundation_cache=None, local_files_only=None):
    """
    Load a bert, going through the foundation cache when one is given,
    otherwise loading directly from bert_embedding.
    """
    if foundation_cache is not None:
        return foundation_cache.load_bert(model_name, local_files_only=local_files_only)
    return bert_embedding.load_bert(model_name, local_files_only=local_files_only)
118
+
119
def load_bert_with_peft(model_name, peft_name, foundation_cache=None, local_files_only=None):
    """
    Load a bert plus a peft adapter name, using the foundation cache when available.

    Without a cache the requested peft_name is returned unchanged.
    """
    if foundation_cache is not None:
        return foundation_cache.load_bert_with_peft(model_name, peft_name, local_files_only=local_files_only)
    model, tokenizer = bert_embedding.load_bert(model_name, local_files_only=local_files_only)
    return model, tokenizer, peft_name
124
+
125
def load_charlm(charlm_file, foundation_cache=None, finetune=False):
    """
    Load a character language model, optionally through the foundation cache.

    A model which will be finetuned is always loaded fresh: its weights will
    diverge, so sharing it through the cache would corrupt other users.
    """
    if not charlm_file:
        return None

    if finetune:
        # can't use the cache in the case of a model which will be finetuned
        # and the numbers will be different for other users of the model
        return CharacterLanguageModel.load(charlm_file, finetune=True)

    if foundation_cache is not None:
        return foundation_cache.load_charlm(charlm_file)

    logger.debug("Loading charlm from %s", charlm_file)
    return CharacterLanguageModel.load(charlm_file, finetune=False)
139
+
140
def load_pretrain(filename, foundation_cache=None):
    """
    Load a pretrained word embedding, optionally through the foundation cache.
    """
    if not filename:
        return None

    if foundation_cache is not None:
        return foundation_cache.load_pretrain(filename)

    logger.debug("Loading pretrain from %s", filename)
    return Pretrain(filename)
stanza/stanza/models/common/hlstm.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pack_sequence, PackedSequence
5
+
6
+ from stanza.models.common.packed_lstm import PackedLSTM
7
+
8
class HLSTMCell(nn.Module):
    """
    A Highway LSTM Cell as proposed in Zhang et al. (2018) Highway Long Short-Term Memory RNNs for
    Distant Speech Recognition.

    forward() takes input of shape (N, input_size), an optional c_l_minus_one of
    shape (N, hidden_size) (the cell state of the same timestep in the layer
    below), and an optional hx = (h, c) pair from the previous timestep; it
    returns the new (h, c), each of shape (N, hidden_size).

    Note: this used to subclass nn.modules.rnn.RNNCellBase with a no-argument
    super().__init__() and used its check_forward_input/check_forward_hidden
    helpers.  Modern PyTorch requires constructor arguments for RNNCellBase and
    has removed those helpers, which broke this class, so it now subclasses
    nn.Module and validates shapes inline.  torch.sigmoid/torch.tanh replace
    the deprecated F.sigmoid/F.tanh (identical results).
    """
    def __init__(self, input_size, hidden_size, bias=True):
        super(HLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # LSTM parameters
        self.Wi = nn.Linear(input_size + hidden_size, hidden_size, bias=bias)
        self.Wf = nn.Linear(input_size + hidden_size, hidden_size, bias=bias)
        self.Wo = nn.Linear(input_size + hidden_size, hidden_size, bias=bias)
        self.Wg = nn.Linear(input_size + hidden_size, hidden_size, bias=bias)

        # highway gate parameters
        self.gate = nn.Linear(input_size + 2 * hidden_size, hidden_size, bias=bias)

    def _check_hidden(self, input, hidden, name):
        """Validate that a hidden/cell tensor matches the input batch and hidden_size."""
        if input.size(0) != hidden.size(0):
            raise RuntimeError("Input batch size {} doesn't match hidden{} batch size {}".format(
                input.size(0), name, hidden.size(0)))
        if hidden.size(1) != self.hidden_size:
            raise RuntimeError("hidden{} has inconsistent hidden_size: got {}, expected {}".format(
                name, hidden.size(1), self.hidden_size))

    def forward(self, input, c_l_minus_one=None, hx=None):
        if input.size(1) != self.input_size:
            raise RuntimeError("input has inconsistent input_size: got {}, expected {}".format(
                input.size(1), self.input_size))
        if hx is None:
            hx = input.new_zeros(input.size(0), self.hidden_size, requires_grad=False)
            hx = (hx, hx)
        if c_l_minus_one is None:
            c_l_minus_one = input.new_zeros(input.size(0), self.hidden_size, requires_grad=False)

        self._check_hidden(input, hx[0], '[0]')
        self._check_hidden(input, hx[1], '[1]')
        self._check_hidden(input, c_l_minus_one, 'c_l_minus_one')

        # vanilla LSTM computation
        rec_input = torch.cat([input, hx[0]], 1)
        i = torch.sigmoid(self.Wi(rec_input))
        f = torch.sigmoid(self.Wf(rec_input))
        o = torch.sigmoid(self.Wo(rec_input))
        g = torch.tanh(self.Wg(rec_input))

        # highway gate mixes in the lower layer's cell state
        gate = torch.sigmoid(self.gate(torch.cat([c_l_minus_one, hx[1], input], 1)))

        c = gate * c_l_minus_one + f * hx[1] + i * g
        h = o * torch.tanh(c)

        return h, c
53
+
54
# Highway LSTM network, does NOT use the HLSTMCell above
class HighwayLSTM(nn.Module):
    """
    A Highway LSTM network, as used in the original Tensorflow version of the Dozat parser. Note that this
    is independent from the HLSTMCell above.

    Each layer is a (possibly bidirectional) PackedLSTM; the layer's output is
    combined with a gated highway connection computed from the layer's input:
        out = lstm(x) + sigmoid(gate(x)) * highway_func(highway(x))
    """
    def __init__(self, input_size, hidden_size,
            num_layers=1, bias=True, batch_first=False,
            dropout=0, bidirectional=False, rec_dropout=0, highway_func=None, pad=False):
        super(HighwayLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.dropout = dropout
        self.dropout_state = {}
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional else 1
        self.highway_func = highway_func
        self.pad = pad

        self.lstm = nn.ModuleList()
        self.highway = nn.ModuleList()
        self.gate = nn.ModuleList()
        self.drop = nn.Dropout(dropout, inplace=True)

        # stack per-layer LSTMs with matching highway/gate linear maps;
        # after the first layer, the input size is the (possibly bidirectional)
        # hidden size of the previous layer
        in_size = input_size
        for l in range(num_layers):
            self.lstm.append(PackedLSTM(in_size, hidden_size, num_layers=1, bias=bias,
                batch_first=batch_first, dropout=0, bidirectional=bidirectional, rec_dropout=rec_dropout))
            self.highway.append(nn.Linear(in_size, hidden_size * self.num_directions))
            self.gate.append(nn.Linear(in_size, hidden_size * self.num_directions))
            # zero biases so the highway path starts neutral
            self.highway[-1].bias.data.zero_()
            self.gate[-1].bias.data.zero_()
            in_size = hidden_size * self.num_directions

    def forward(self, input, seqlens, hx=None):
        # identity is used as the highway activation when none was provided
        highway_func = (lambda x: x) if self.highway_func is None else self.highway_func

        hs = []
        cs = []

        if not isinstance(input, PackedSequence):
            input = pack_padded_sequence(input, seqlens, batch_first=self.batch_first)

        for l in range(self.num_layers):
            if l > 0:
                # inter-layer dropout on the packed data
                input = PackedSequence(self.drop(input.data), input.batch_sizes, input.sorted_indices, input.unsorted_indices)
            # slice out this layer's directions from the stacked initial state, if given
            layer_hx = (hx[0][l * self.num_directions:(l+1)*self.num_directions], hx[1][l * self.num_directions:(l+1)*self.num_directions]) if hx is not None else None
            h, (ht, ct) = self.lstm[l](input, seqlens, layer_hx)

            hs.append(ht)
            cs.append(ct)

            # highway connection: add the gated transform of the layer INPUT to the lstm output
            input = PackedSequence(h.data + torch.sigmoid(self.gate[l](input.data)) * highway_func(self.highway[l](input.data)), input.batch_sizes, input.sorted_indices, input.unsorted_indices)

        if self.pad:
            input = pad_packed_sequence(input, batch_first=self.batch_first)[0]
        # final states are concatenated over layers along dim 0
        return input, (torch.cat(hs, 0), torch.cat(cs, 0))
114
+
115
if __name__ == "__main__":
    # Smoke test for HighwayLSTM.
    T = 10
    bidir = True
    num_dir = 2 if bidir else 1
    rnn = HighwayLSTM(10, 20, num_layers=2, bidirectional=True)
    input = torch.randn(T, 3, 10)
    hx = torch.randn(2 * num_dir, 3, 20)
    cx = torch.randn(2 * num_dir, 3, 20)
    # BUGFIX: forward() is (input, seqlens, hx); the hidden state used to be
    # passed in the seqlens slot, which crashed pack_padded_sequence.
    seqlens = [T, T, T]
    output = rnn(input, seqlens, (hx, cx))
    print(output)
stanza/stanza/models/common/large_margin_loss.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LargeMarginInSoftmax, from the article
3
+
4
+ @inproceedings{kobayashi2019bmvc,
5
+ title={Large Margin In Softmax Cross-Entropy Loss},
6
+ author={Takumi Kobayashi},
7
+ booktitle={Proceedings of the British Machine Vision Conference (BMVC)},
8
+ year={2019}
9
+ }
10
+
11
+ implementation from
12
+
13
+ https://github.com/tk1980/LargeMarginInSoftmax
14
+
15
+ There is no license specifically chosen; they just ask people to cite the paper if the work is useful.
16
+ """
17
+
18
+
19
+ import math
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.init as init
24
+ import torch.nn.functional as F
25
+
26
+
27
class LargeMarginInSoftmaxLoss(nn.CrossEntropyLoss):
    r"""
    Softmax cross-entropy combined with the large-margin inducing regularization of
    T. Kobayashi, "Large-Margin In Softmax Cross-Entropy Loss." In BMVC2019.

    The regularizer pushes the softmax over the NON-target classes towards the
    uniform distribution, which enlarges the margin of the target logit.

    Inherits all nn.CrossEntropyLoss parameters; in addition:
        reg_lambda (float, optional): regularization strength. (default: 0.3)
        deg_logit (bool, optional): underestimate (degrade) the target logit by -1 or not.
            (default: None) If set, realizes the modified-loss variant described in
            Table 4 of the paper.
    """
    def __init__(self, reg_lambda=0.3, deg_logit=None,
                 weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='mean'):
        super(LargeMarginInSoftmaxLoss, self).__init__(weight=weight, size_average=size_average,
                                                       ignore_index=ignore_index, reduce=reduce, reduction=reduction)
        self.reg_lambda = reg_lambda
        self.deg_logit = deg_logit

    def forward(self, input, target):
        num_samples = input.size(0)
        num_classes = input.size(1)
        # one-hot indicator of the target class for each sample
        target_mask = torch.zeros_like(input, requires_grad=False)
        target_mask[range(num_samples), target] = 1

        if self.deg_logit is not None:
            input = input - self.deg_logit * target_mask

        loss = F.cross_entropy(input, target, weight=self.weight,
                               ignore_index=self.ignore_index, reduction=self.reduction)

        # push the target logit far down so the softmax effectively ranges
        # over the non-target classes only
        nontarget = input - 1.e6 * target_mask
        reg = 0.5 * ((F.softmax(nontarget, dim=1) - 1.0 / (num_classes - 1))
                     * F.log_softmax(nontarget, dim=1) * (1.0 - target_mask)).sum(dim=1)
        if self.reduction == 'sum':
            reg = reg.sum()
        elif self.reduction == 'mean':
            reg = reg.mean()
        # 'none' keeps the per-sample regularizer as-is

        return loss + self.reg_lambda * reg
stanza/stanza/models/common/loss.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Different loss functions.
3
+ """
4
+
5
+ import logging
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+ import stanza.models.common.seq2seq_constant as constant
11
+
12
+ logger = logging.getLogger('stanza')
13
+
14
def SequenceLoss(vocab_size):
    """Build an NLLLoss over the vocabulary which ignores the PAD symbol
    by giving it zero weight."""
    token_weights = torch.ones(vocab_size)
    token_weights[constant.PAD_ID] = 0
    return nn.NLLLoss(token_weights)
19
+
20
def weighted_cross_entropy_loss(labels, log_dampened=False):
    """
    Build a CrossEntropyLoss whose class weights rebalance the classes.

    Each class weight is the inverse of that class's relative frequency in
    `labels`, so every class has the same effective weight.  With
    log_dampened=True, the weights are softened to 1 + log(inverse_freq)
    so the majority class keeps some priority.

    labels: sequence of integer class labels; classes are assumed to be
        0..num_classes-1 with every class present (np.unique order).

    Fixes: removed the dead `all_labels` local (it was computed but never
    used) and replaced the dated `.type('torch.FloatTensor')` with the
    equivalent `.float()`.
    """
    _, counts = np.unique(labels, return_counts=True)
    freqs = counts / float(np.sum(counts))       # relative frequency per class
    weights = np.sum(freqs) / freqs              # inverse frequency (freqs sum to 1)
    if log_dampened:
        weights = 1 + np.log(weights)
    logging.getLogger('stanza').debug("Reweighting cross entropy by {}".format(weights))
    return nn.CrossEntropyLoss(weight=torch.from_numpy(weights).float())
38
+
39
class FocalLoss(nn.Module):
    """
    Multi-category focal loss: each example's cross-entropy is downweighted
    by how confidently the model already predicts the correct class.

    From "Focal Loss for Dense Object Detection"
    https://arxiv.org/abs/1708.02002
    """
    def __init__(self, reduction='mean', gamma=2.0):
        super().__init__()
        if reduction not in ('sum', 'none', 'mean'):
            raise ValueError("Unknown reduction: %s" % reduction)

        self.reduction = reduction
        # per-sample CE; the focal weighting and reduction are applied afterwards
        self.ce_loss = nn.CrossEntropyLoss(reduction='none')
        self.gamma = gamma

    def forward(self, inputs, targets):
        """
        Weight the loss using the model's assessment of the correct answer.

        inputs: [N, C]
        targets: [N]
        """
        if inputs.dim() == 2 and targets.dim() == 1:
            if inputs.shape[0] != targets.shape[0]:
                raise ValueError("Expected inputs N,C and targets N, but got {} and {}".format(inputs.shape, targets.shape))
        elif inputs.dim() == 1 and targets.dim() == 0:
            raise NotImplementedError("This would be a reasonable thing to implement, but we haven't done it yet")
        else:
            raise ValueError("Expected inputs N,C and targets N, but got {} and {}".format(inputs.shape, targets.shape))

        per_sample = self.ce_loss(inputs, targets)
        assert per_sample.dim() == 1 and per_sample.shape[0] == inputs.shape[0]

        # exp(-CE) is the probability assigned to the correct class, so this is
        # CE * (1 - p_correct)^gamma
        # https://www.tutorialexample.com/implement-focal-loss-for-multi-label-classification-in-pytorch-pytorch-tutorial/
        weighted = per_sample * ((1 - torch.exp(-per_sample)) ** self.gamma)
        assert weighted.dim() == 1 and weighted.shape[0] == inputs.shape[0]
        if self.reduction == 'sum':
            return weighted.sum()
        if self.reduction == 'mean':
            return weighted.mean()
        if self.reduction == 'none':
            return weighted
        raise AssertionError("unknown reduction! how did this happen??")
87
+
88
class MixLoss(nn.Module):
    """
    A mixture of SequenceLoss and CrossEntropyLoss:

        Loss = SequenceLoss + alpha * CELoss
    """
    def __init__(self, vocab_size, alpha):
        super().__init__()
        self.seq_loss = SequenceLoss(vocab_size)
        self.ce_loss = nn.CrossEntropyLoss()
        assert alpha >= 0
        self.alpha = alpha

    def forward(self, seq_inputs, seq_targets, class_inputs, class_targets):
        """Sequence NLL loss plus alpha-weighted classification CE loss."""
        return self.seq_loss(seq_inputs, seq_targets) + self.alpha * self.ce_loss(class_inputs, class_targets)
105
+
106
class MaxEntropySequenceLoss(nn.Module):
    """
    A max entropy loss that encourages the model to have large entropy,
    therefore giving more diverse outputs:

        Loss = NLLLoss + alpha * EntropyLoss
    """
    def __init__(self, vocab_size, alpha):
        super().__init__()
        nll_weight = torch.ones(vocab_size)
        nll_weight[constant.PAD_ID] = 0      # ignore padding
        self.nll = nn.NLLLoss(nll_weight)
        self.alpha = alpha

    def forward(self, inputs, targets):
        """
        inputs: [N, C] log probabilities
        targets: [N]
        """
        assert inputs.size(0) == targets.size(0)
        nll_loss = self.nll(inputs, targets)
        # entropy term: zero out rows whose target is PAD, then compute
        # sum(p * log p) averaged over the minibatch
        pad_mask = targets.eq(constant.PAD_ID).unsqueeze(1).expand_as(inputs)
        log_probs = inputs.clone().masked_fill_(pad_mask, 0.0)
        ent_loss = torch.exp(log_probs).mul(log_probs).sum() / inputs.size(0)
        return nll_loss + self.alpha * ent_loss
134
+
stanza/stanza/models/common/maxout_linear.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A layer which implements maxout from the "Maxout Networks" paper
3
+
4
+ https://arxiv.org/pdf/1302.4389v4.pdf
5
+ Goodfellow, Warde-Farley, Mirza, Courville, Bengio
6
+
7
+ or a simpler explanation here:
8
+
9
+ https://stats.stackexchange.com/questions/129698/what-is-maxout-in-neural-network/298705#298705
10
+
11
+ The implementation here:
12
+ for k layers of maxout, in -> out channels, we make a single linear
13
+ map of size in -> out*k
14
+ then we reshape the end to be (..., k, out)
15
+ and return the max over the k layers
16
+ """
17
+
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
class MaxoutLinear(nn.Module):
    """Maxout layer from "Maxout Networks" (Goodfellow et al.),
    https://arxiv.org/pdf/1302.4389v4.pdf

    Implemented as a single linear map of size in -> out * k whose output is
    reshaped to (..., k, out); the max over the k pieces is returned.
    """
    def __init__(self, in_channels, out_channels, maxout_k):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.maxout_k = maxout_k

        # one oversized linear holds all k pieces; keeping them in a single
        # matmul is simpler and easier for pytorch to parallelize
        self.linear = nn.Linear(in_channels, out_channels * maxout_k)

    def forward(self, inputs):
        """Apply the k linear pieces at once and take the elementwise max."""
        pieces = self.linear(inputs)
        pieces = pieces.view(*pieces.shape[:-1], self.maxout_k, self.out_channels)
        return torch.max(pieces, dim=-2)[0]
42
+
stanza/stanza/models/common/packed_lstm.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pack_sequence, PackedSequence
5
+
6
class PackedLSTM(nn.Module):
    """LSTM wrapper that accepts either padded or already-packed input.

    With rec_dropout == 0 it delegates to the fast native nn.LSTM; otherwise it
    uses LSTMwRecDropout, which supports recurrent dropout.  With pad=True the
    packed output is padded back before being returned.
    """
    def __init__(self, input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, pad=False, rec_dropout=0):
        super().__init__()

        self.batch_first = batch_first
        self.pad = pad
        if rec_dropout == 0:
            # use the fast, native LSTM implementation
            self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional)
        else:
            self.lstm = LSTMwRecDropout(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional, rec_dropout=rec_dropout)

    def forward(self, input, lengths, hx=None):
        """Run the LSTM; lengths are only used when input still needs packing."""
        if not isinstance(input, PackedSequence):
            input = pack_padded_sequence(input, lengths, batch_first=self.batch_first)

        output, hidden = self.lstm(input, hx)
        if self.pad:
            output = pad_packed_sequence(output, batch_first=self.batch_first)[0]
        return output, hidden
26
+
27
class LSTMwRecDropout(nn.Module):
    """ An LSTM implementation that supports recurrent dropout.

    Recurrent dropout samples one mask per sequence and applies it to the
    hidden state at every timestep of a layer/direction (variational dropout
    on the recurrent connection).  Input and output are PackedSequences.
    """
    def __init__(self, input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, pad=False, rec_dropout=0):
        super().__init__()
        self.batch_first = batch_first
        self.pad = pad
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.dropout = dropout
        self.drop = nn.Dropout(dropout, inplace=True)
        self.rec_drop = nn.Dropout(rec_dropout, inplace=True)

        self.num_directions = 2 if bidirectional else 1

        # one LSTMCell per (layer, direction); layers past the first consume
        # the concatenated output of both directions of the previous layer
        self.cells = nn.ModuleList()
        for l in range(num_layers):
            in_size = input_size if l == 0 else self.num_directions * hidden_size
            for d in range(self.num_directions):
                self.cells.append(nn.LSTMCell(in_size, hidden_size, bias=bias))

    def forward(self, input, hx=None):
        def rnn_loop(x, batch_sizes, cell, inits, reverse=False):
            # RNN loop for one layer in one direction with recurrent dropout
            # Assumes input is PackedSequence, returns PackedSequence as well
            batch_size = batch_sizes[0].item()
            # per-sequence state slots; rows drop out of a packed batch as
            # shorter sequences end, so states are kept as 1-row pieces
            states = [list(init.split([1] * batch_size)) for init in inits]
            # one recurrent dropout mask per sequence, reused for all timesteps
            h_drop_mask = x.new_ones(batch_size, self.hidden_size)
            h_drop_mask = self.rec_drop(h_drop_mask)
            resh = []

            if not reverse:
                st = 0
                for bs in batch_sizes:
                    s1 = cell(x[st:st+bs], (torch.cat(states[0][:bs], 0) * h_drop_mask[:bs], torch.cat(states[1][:bs], 0)))
                    resh.append(s1[0])
                    for j in range(bs):
                        states[0][j] = s1[0][j].unsqueeze(0)
                        states[1][j] = s1[1][j].unsqueeze(0)
                    st += bs
            else:
                # walk the packed data back to front so the reverse direction
                # starts from each sequence's last real timestep
                en = x.size(0)
                for i in range(batch_sizes.size(0)-1, -1, -1):
                    bs = batch_sizes[i]
                    s1 = cell(x[en-bs:en], (torch.cat(states[0][:bs], 0) * h_drop_mask[:bs], torch.cat(states[1][:bs], 0)))
                    resh.append(s1[0])
                    for j in range(bs):
                        states[0][j] = s1[0][j].unsqueeze(0)
                        states[1][j] = s1[1][j].unsqueeze(0)
                    en -= bs
                resh = list(reversed(resh))

            return torch.cat(resh, 0), tuple(torch.cat(s, 0) for s in states)

        all_states = [[], []]
        inputdata, batch_sizes = input.data, input.batch_sizes
        for l in range(self.num_layers):
            new_input = []

            # inter-layer (non-recurrent) dropout
            if self.dropout > 0 and l > 0:
                inputdata = self.drop(inputdata)
            for d in range(self.num_directions):
                idx = l * self.num_directions + d
                cell = self.cells[idx]
                out, states = rnn_loop(inputdata, batch_sizes, cell, (hx[i][idx] for i in range(2)) if hx is not None else (input.data.new_zeros(input.batch_sizes[0].item(), self.hidden_size, requires_grad=False) for _ in range(2)), reverse=(d == 1))

                new_input.append(out)
                all_states[0].append(states[0].unsqueeze(0))
                all_states[1].append(states[1].unsqueeze(0))

            if self.num_directions > 1:
                # concatenate both directions
                inputdata = torch.cat(new_input, 1)
            else:
                inputdata = new_input[0]

            # NOTE(review): this PackedSequence is rebuilt without
            # sorted_indices/unsorted_indices — appears to assume the input was
            # packed pre-sorted; confirm against callers
            input = PackedSequence(inputdata, batch_sizes)

        return input, tuple(torch.cat(x, 0) for x in all_states)
stanza/stanza/models/common/peft_config.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Set a few common flags for peft uage
3
+ """
4
+
5
+
6
+ TRANSFORMER_LORA_RANK = {}
7
+ DEFAULT_LORA_RANK = 64
8
+
9
+ TRANSFORMER_LORA_ALPHA = {}
10
+ DEFAULT_LORA_ALPHA = 128
11
+
12
+ TRANSFORMER_LORA_DROPOUT = {}
13
+ DEFAULT_LORA_DROPOUT = 0.1
14
+
15
+ TRANSFORMER_LORA_TARGETS = {}
16
+ DEFAULT_LORA_TARGETS = "query,value,output.dense,intermediate.dense"
17
+
18
+ TRANSFORMER_LORA_SAVE = {}
19
+ DEFAULT_LORA_SAVE = ""
20
+
21
+ def add_peft_args(parser):
22
+ """
23
+ Add common default flags to an argparse
24
+ """
25
+ parser.add_argument('--lora_rank', type=int, default=None, help="Rank of a LoRA approximation. Default will be %d or a model-specific parameter" % DEFAULT_LORA_RANK)
26
+ parser.add_argument('--lora_alpha', type=int, default=None, help="Alpha of a LoRA approximation. Default will be %d or a model-specific parameter" % DEFAULT_LORA_ALPHA)
27
+ parser.add_argument('--lora_dropout', type=float, default=None, help="Dropout for the LoRA approximation. Default will be %s or a model-specific parameter" % DEFAULT_LORA_DROPOUT)
28
+ parser.add_argument('--lora_target_modules', type=str, default=None, help="Comma separated list of LoRA targets. Default will be '%s' or a model-specific parameter" % DEFAULT_LORA_TARGETS)
29
+ parser.add_argument('--lora_modules_to_save', type=str, default=None, help="Comma separated list of modules to save (eg, fully tune) when using LoRA. Default will be '%s' or a model-specific parameter" % DEFAULT_LORA_SAVE)
30
+
31
+ parser.add_argument('--use_peft', default=False, action='store_true', help="Finetune Bert using peft")
32
+
33
def pop_peft_args(args):
    """
    Pop all of the peft-related arguments from a given dict

    Useful for making sure a model loaded from disk is recreated with
    the right shapes, for example
    """
    peft_keys = ('lora_rank', 'lora_alpha', 'lora_dropout',
                 'lora_target_modules', 'lora_modules_to_save',
                 'use_peft')
    for key in peft_keys:
        # missing keys are silently ignored
        args.pop(key, None)
47
+
48
+
49
def resolve_peft_args(args, logger, check_bert_finetune=True):
    """
    Fill in any unset LoRA arguments from the per-transformer tables.

    Comma separated target / save strings are turned into lists.  When
    check_bert_finetune is set, --use_peft implies --bert_finetune.
    """
    # models without a transformer have nothing to resolve
    if not hasattr(args, 'bert_model'):
        return

    if args.lora_rank is None:
        args.lora_rank = TRANSFORMER_LORA_RANK.get(args.bert_model, DEFAULT_LORA_RANK)

    if args.lora_alpha is None:
        args.lora_alpha = TRANSFORMER_LORA_ALPHA.get(args.bert_model, DEFAULT_LORA_ALPHA)

    if args.lora_dropout is None:
        args.lora_dropout = TRANSFORMER_LORA_DROPOUT.get(args.bert_model, DEFAULT_LORA_DROPOUT)

    if args.lora_target_modules is None:
        args.lora_target_modules = TRANSFORMER_LORA_TARGETS.get(args.bert_model, DEFAULT_LORA_TARGETS)
    # a blank string means "no targets", otherwise split on commas
    targets = args.lora_target_modules
    args.lora_target_modules = targets.split(",") if targets.strip() else []

    if args.lora_modules_to_save is None:
        args.lora_modules_to_save = TRANSFORMER_LORA_SAVE.get(args.bert_model, DEFAULT_LORA_SAVE)
    to_save = args.lora_modules_to_save
    args.lora_modules_to_save = to_save.split(",") if to_save.strip() else []

    if check_bert_finetune and hasattr(args, 'bert_finetune'):
        if args.use_peft and not args.bert_finetune:
            logger.info("--use_peft set. setting --bert_finetune as well")
            args.bert_finetune = True
80
+
81
def build_peft_config(args, logger):
    """
    Build a LoraConfig from the lora_* entries of an args dict.
    """
    # Hide import so that the peft dependency is optional
    from peft import LoraConfig
    logger.debug("Creating lora adapter with rank %d and alpha %d", args['lora_rank'], args['lora_alpha'])
    return LoraConfig(inference_mode=False,
                      r=args['lora_rank'],
                      target_modules=args['lora_target_modules'],
                      lora_alpha=args['lora_alpha'],
                      lora_dropout=args['lora_dropout'],
                      modules_to_save=args['lora_modules_to_save'],
                      bias="none")
93
+
94
def build_peft_wrapper(bert_model, args, logger, adapter_name="default"):
    """
    Wrap bert_model with a freshly built LoRA adapter and activate it.
    """
    # Hide import so that the peft dependency is optional
    from peft import get_peft_model
    config = build_peft_config(args, logger)

    wrapped = get_peft_model(bert_model, config, adapter_name=adapter_name)
    # apparently get_peft_model doesn't actually mark that
    # peft configs are loaded, making it impossible to turn off (or on)
    # the peft adapter later
    bert_model._hf_peft_config_loaded = True
    wrapped._hf_peft_config_loaded = True
    wrapped.set_adapter(adapter_name)
    return wrapped
107
+
108
def load_peft_wrapper(bert_model, lora_params, args, logger, adapter_name):
    """
    Attach saved LoRA weights to bert_model under adapter_name and activate it.
    """
    config = build_peft_config(args, logger)

    try:
        bert_model.load_adapter(adapter_name=adapter_name, peft_config=config, adapter_state_dict=lora_params)
    except (ValueError, TypeError):
        # this can happen if the adapter already exists...
        # in that case, try setting the adapter weights?
        from peft import set_peft_model_state_dict
        set_peft_model_state_dict(bert_model, lora_params, adapter_name=adapter_name)
    bert_model.set_adapter(adapter_name)
    return bert_model
stanza/stanza/models/common/seq2seq_constant.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Constants for seq2seq models.
3
+ """
4
+
5
+ PAD = '<PAD>'
6
+ PAD_ID = 0
7
+ UNK = '<UNK>'
8
+ UNK_ID = 1
9
+ SOS = '<SOS>'
10
+ SOS_ID = 2
11
+ EOS = '<EOS>'
12
+ EOS_ID = 3
13
+
14
+ VOCAB_PREFIX = [PAD, UNK, SOS, EOS]
15
+
16
+ EMB_INIT_RANGE = 1.0
17
+ INFINITY_NUMBER = 1e12
stanza/stanza/models/common/seq2seq_model.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The full encoder-decoder model, built on top of the base seq2seq modules.
3
+ """
4
+
5
+ import logging
6
+ import torch
7
+ from torch import nn
8
+ import torch.nn.functional as F
9
+ import numpy as np
10
+
11
+ import stanza.models.common.seq2seq_constant as constant
12
+ from stanza.models.common import utils
13
+ from stanza.models.common.seq2seq_modules import LSTMAttention
14
+ from stanza.models.common.beam import Beam
15
+ from stanza.models.common.seq2seq_constant import UNK_ID
16
+
17
+ logger = logging.getLogger('stanza')
18
+
19
class Seq2SeqModel(nn.Module):
    """
    A complete encoder-decoder model, with optional attention.

    A parent class which makes use of the contextual_embedding (such as a charlm)
    can make use of unsaved_modules when saving.
    """
    def __init__(self, args, emb_matrix=None, contextual_embedding=None):
        super().__init__()

        # names of submodules which should be skipped when saving
        self.unsaved_modules = []

        self.vocab_size = args['vocab_size']
        self.emb_dim = args['emb_dim']
        self.hidden_dim = args['hidden_dim']
        self.nlayers = args['num_layers'] # encoder layers, decoder layers = 1
        self.emb_dropout = args.get('emb_dropout', 0.0)
        self.dropout = args['dropout']
        self.pad_token = constant.PAD_ID
        self.max_dec_len = args['max_dec_len']
        # finetune only the top N embedding rows; 1e10 effectively means "all"
        self.top = args.get('top', 1e10)
        self.args = args
        self.emb_matrix = emb_matrix
        self.add_unsaved_module("contextual_embedding", contextual_embedding)

        logger.debug("Building an attentional Seq2Seq model...")
        logger.debug("Using a Bi-LSTM encoder")
        self.num_directions = 2
        # halve the per-direction size so both directions concatenated
        # match the decoder's hidden size
        self.enc_hidden_dim = self.hidden_dim // 2
        self.dec_hidden_dim = self.hidden_dim

        self.use_pos = args.get('pos', False)
        self.pos_dim = args.get('pos_dim', 0)
        self.pos_vocab_size = args.get('pos_vocab_size', 0)
        self.pos_dropout = args.get('pos_dropout', 0)
        self.edit = args.get('edit', False)
        self.num_edit = args.get('num_edit', 0)
        self.copy = args.get('copy', False)

        self.emb_drop = nn.Dropout(self.emb_dropout)
        self.drop = nn.Dropout(self.dropout)
        # third positional argument of nn.Embedding is padding_idx
        self.embedding = nn.Embedding(self.vocab_size, self.emb_dim, self.pad_token)
        self.input_dim = self.emb_dim
        if self.contextual_embedding is not None:
            self.input_dim += self.contextual_embedding.hidden_dim()
        self.encoder = nn.LSTM(self.input_dim, self.enc_hidden_dim, self.nlayers, \
            bidirectional=True, batch_first=True, dropout=self.dropout if self.nlayers > 1 else 0)
        self.decoder = LSTMAttention(self.emb_dim, self.dec_hidden_dim, \
            batch_first=True, attn_type=self.args['attn_type'])
        self.dec2vocab = nn.Linear(self.dec_hidden_dim, self.vocab_size)
        if self.use_pos and self.pos_dim > 0:
            logger.debug("Using POS in encoder")
            self.pos_embedding = nn.Embedding(self.pos_vocab_size, self.pos_dim, self.pad_token)
            self.pos_drop = nn.Dropout(self.pos_dropout)
        if self.edit:
            # small MLP classifier over the final encoder state
            edit_hidden = self.hidden_dim//2
            self.edit_clf = nn.Sequential(
                nn.Linear(self.hidden_dim, edit_hidden),
                nn.ReLU(),
                nn.Linear(edit_hidden, self.num_edit))

        if self.copy:
            # scalar gate: copy from the source vs generate from the vocab
            self.copy_gate = nn.Linear(self.dec_hidden_dim, 1)

        # buffer so the SOS id moves with the module across devices
        SOS_tensor = torch.LongTensor([constant.SOS_ID])
        self.register_buffer('SOS_tensor', SOS_tensor)

        self.init_weights()

    def add_unsaved_module(self, name, module):
        """Attach a submodule while marking it to be skipped when saving."""
        self.unsaved_modules += [name]
        setattr(self, name, module)

    def init_weights(self):
        """Initialize embeddings (optionally from a pretrained matrix) and set up finetuning."""
        # initialize embeddings
        init_range = constant.EMB_INIT_RANGE
        if self.emb_matrix is not None:
            if isinstance(self.emb_matrix, np.ndarray):
                self.emb_matrix = torch.from_numpy(self.emb_matrix)
            assert self.emb_matrix.size() == (self.vocab_size, self.emb_dim), \
                "Input embedding matrix must match size: {} x {}".format(self.vocab_size, self.emb_dim)
            self.embedding.weight.data.copy_(self.emb_matrix)
        else:
            self.embedding.weight.data.uniform_(-init_range, init_range)
        # decide finetuning
        if self.top <= 0:
            logger.debug("Do not finetune embedding layer.")
            self.embedding.weight.requires_grad = False
        elif self.top < self.vocab_size:
            logger.debug("Finetune top {} embeddings.".format(self.top))
            # the hook zeroes gradients for every row past the top
            self.embedding.weight.register_hook(lambda x: utils.keep_partial_grad(x, self.top))
        else:
            logger.debug("Finetune all embeddings.")
        # initialize pos embeddings
        if self.use_pos:
            self.pos_embedding.weight.data.uniform_(-init_range, init_range)

    def zero_state(self, inputs):
        """Return zeroed (h0, c0) encoder states sized for this batch."""
        batch_size = inputs.size(0)
        device = self.SOS_tensor.device
        h0 = torch.zeros(self.encoder.num_layers*2, batch_size, self.enc_hidden_dim, requires_grad=False, device=device)
        c0 = torch.zeros(self.encoder.num_layers*2, batch_size, self.enc_hidden_dim, requires_grad=False, device=device)
        return h0, c0

    def encode(self, enc_inputs, lens):
        """ Encode source sequence. """
        h0, c0 = self.zero_state(enc_inputs)

        packed_inputs = nn.utils.rnn.pack_padded_sequence(enc_inputs, lens, batch_first=True)
        packed_h_in, (hn, cn) = self.encoder(packed_inputs, (h0, c0))
        h_in, _ = nn.utils.rnn.pad_packed_sequence(packed_h_in, batch_first=True)
        # concatenate the final states of both directions for the decoder init
        hn = torch.cat((hn[-1], hn[-2]), 1)
        cn = torch.cat((cn[-1], cn[-2]), 1)
        return h_in, (hn, cn)

    def decode(self, dec_inputs, hn, cn, ctx, ctx_mask=None, src=None, never_decode_unk=False):
        """ Decode a step, based on context encoding and source context states."""
        dec_hidden = (hn, cn)
        # the decoder also returns the attention log-weights when copying
        decoder_output = self.decoder(dec_inputs, dec_hidden, ctx, ctx_mask, return_logattn=self.copy)
        if self.copy:
            h_out, dec_hidden, log_attn = decoder_output
        else:
            h_out, dec_hidden = decoder_output

        h_out_reshape = h_out.contiguous().view(h_out.size(0) * h_out.size(1), -1)
        decoder_logits = self.dec2vocab(h_out_reshape)
        decoder_logits = decoder_logits.view(h_out.size(0), h_out.size(1), -1)
        log_probs = self.get_log_prob(decoder_logits)

        if self.copy:
            copy_logit = self.copy_gate(h_out)
            if self.use_pos:
                # can't copy the UPOS
                log_attn = log_attn[:, :, 1:]

            # renormalize
            log_attn = torch.log_softmax(log_attn, -1)
            # calculate copy probability for each word in the vocab
            log_copy_prob = torch.nn.functional.logsigmoid(copy_logit) + log_attn
            # scatter logsumexp
            mx = log_copy_prob.max(-1, keepdim=True)[0]
            log_copy_prob = log_copy_prob - mx
            # here we make space in the log probs for vocab items
            # which might be copied from the encoder side, but which
            # were not known at training time
            # note that such an item cannot possibly be predicted by
            # the model as a raw output token
            # however, the copy gate might score high on copying a
            # previously unknown vocab item
            copy_prob = torch.exp(log_copy_prob)
            copied_vocab_shape = list(log_probs.size())
            if torch.max(src) >= copied_vocab_shape[-1]:
                copied_vocab_shape[-1] = torch.max(src) + 1
            copied_vocab_prob = log_probs.new_zeros(copied_vocab_shape)
            scattered_copy = src.unsqueeze(1).expand(src.size(0), copy_prob.size(1), src.size(1))
            # fill in the copy tensor with the copy probs of each character
            # the rest of the copy tensor will be filled with -largenumber
            copied_vocab_prob = copied_vocab_prob.scatter_add(-1, scattered_copy, copy_prob)
            zero_mask = (copied_vocab_prob == 0)
            log_copied_vocab_prob = torch.log(copied_vocab_prob.masked_fill(zero_mask, 1e-12)) + mx
            log_copied_vocab_prob = log_copied_vocab_prob.masked_fill(zero_mask, -1e12)

            # combine with normal vocab probability
            log_nocopy_prob = -torch.log(1 + torch.exp(copy_logit))
            if log_probs.shape[-1] < copied_vocab_shape[-1]:
                # for previously unknown vocab items which are in the encoder,
                # we reuse the UNK_ID prediction
                # this gives a baseline number which we can combine with
                # the copy gate prediction
                # technically this makes log_probs no longer represent
                # a probability distribution when looking at unknown vocab
                # this is probably not a serious problem
                # an example of this usage is in the Lemmatizer, such as a
                # plural word in English with the character "ã" in it instead of "a"
                # if "ã" is not known in the training data, the lemmatizer would
                # ordinarily be unable to output it, and thus the seq2seq model
                # would have no chance to depluralize "ãntennae" -> "ãntenna"
                # however, if we temporarily add "ã" to the encoder vocab,
                # then let the copy gate accept that letter, we find the Lemmatizer
                # seq2seq model will want to copy that particular vocab item
                # this allows the Lemmatizer to produce "ã" instead of requiring
                # that it produces UNK, then going back to the input text to
                # figure out which UNK it intended to produce
                new_log_probs = log_probs.new_zeros(copied_vocab_shape)
                new_log_probs[:, :, :log_probs.shape[-1]] = log_probs
                new_log_probs[:, :, log_probs.shape[-1]:] = new_log_probs[:, :, UNK_ID].unsqueeze(2)
                log_probs = new_log_probs
            log_probs = log_probs + log_nocopy_prob
            log_probs = torch.logsumexp(torch.stack([log_copied_vocab_prob, log_probs]), 0)

        if never_decode_unk:
            log_probs[:, :, UNK_ID] = float("-inf")
        return log_probs, dec_hidden

    def embed(self, src, src_mask, pos, raw):
        """
        Build the encoder inputs.

        Returns (enc_inputs, batch_size, src_lens, src_mask); when POS is
        used, the POS embedding is prepended as an extra first "token" and
        the mask is extended to match.
        """
        embed_src = src.clone()
        # ids >= vocab_size come from the extended (copy) vocab; embed them as UNK
        embed_src[embed_src >= self.vocab_size] = UNK_ID
        enc_inputs = self.emb_drop(self.embedding(embed_src))
        batch_size = enc_inputs.size(0)
        if self.use_pos:
            assert pos is not None, "Missing POS input for seq2seq lemmatizer."
            pos_inputs = self.pos_drop(self.pos_embedding(pos))
            enc_inputs = torch.cat([pos_inputs.unsqueeze(1), enc_inputs], dim=1)
            pos_src_mask = src_mask.new_zeros([batch_size, 1])
            src_mask = torch.cat([pos_src_mask, src_mask], dim=1)
        if raw is not None and self.contextual_embedding is not None:
            raw_inputs = self.contextual_embedding(raw)
            if self.use_pos:
                raw_zeros = raw_inputs.new_zeros((raw_inputs.shape[0], 1, raw_inputs.shape[2]))
                raw_inputs = torch.cat([raw_inputs, raw_zeros], dim=1)
            enc_inputs = torch.cat([enc_inputs, raw_inputs], dim=2)
        # lengths = count of mask entries equal to PAD_ID (0)
        # NOTE(review): this assumes non-zero mask values mark padding — confirm against callers
        src_lens = list(src_mask.data.eq(constant.PAD_ID).long().sum(1))
        return enc_inputs, batch_size, src_lens, src_mask

    def forward(self, src, src_mask, tgt_in, pos=None, raw=None):
        """Run a training step: encode src, then decode with teacher forcing on tgt_in."""
        # prepare for encoder/decoder
        enc_inputs, batch_size, src_lens, src_mask = self.embed(src, src_mask, pos, raw)

        # encode source
        h_in, (hn, cn) = self.encode(enc_inputs, src_lens)

        if self.edit:
            edit_logits = self.edit_clf(hn)
        else:
            edit_logits = None

        dec_inputs = self.emb_drop(self.embedding(tgt_in))

        log_probs, _ = self.decode(dec_inputs, hn, cn, h_in, src_mask, src=src)
        return log_probs, edit_logits

    def get_log_prob(self, logits):
        """Log-softmax over the vocab dimension, preserving the input's rank."""
        logits_reshape = logits.view(-1, self.vocab_size)
        log_probs = F.log_softmax(logits_reshape, dim=1)
        if logits.dim() == 2:
            return log_probs
        return log_probs.view(logits.size(0), logits.size(1), logits.size(2))

    def predict_greedy(self, src, src_mask, pos=None, raw=None, never_decode_unk=False):
        """ Predict with greedy decoding. """
        enc_inputs, batch_size, src_lens, src_mask = self.embed(src, src_mask, pos, raw)

        # encode source
        h_in, (hn, cn) = self.encode(enc_inputs, src_lens)

        if self.edit:
            edit_logits = self.edit_clf(hn)
        else:
            edit_logits = None

        # greedy decode by step
        dec_inputs = self.embedding(self.SOS_tensor)
        dec_inputs = dec_inputs.expand(batch_size, dec_inputs.size(0), dec_inputs.size(1))

        done = [False for _ in range(batch_size)]
        total_done = 0
        max_len = 0
        output_seqs = [[] for _ in range(batch_size)]

        while total_done < batch_size and max_len < self.max_dec_len:
            log_probs, (hn, cn) = self.decode(dec_inputs, hn, cn, h_in, src_mask, src=src, never_decode_unk=never_decode_unk)
            assert log_probs.size(1) == 1, "Output must have 1-step of output."
            _, preds = log_probs.squeeze(1).max(1, keepdim=True)
            # if a unlearned character is predicted via the copy mechanism,
            # use the UNK embedding for it
            dec_inputs = preds.clone()
            dec_inputs[dec_inputs >= self.vocab_size] = UNK_ID
            dec_inputs = self.embedding(dec_inputs) # update decoder inputs
            max_len += 1
            for i in range(batch_size):
                if not done[i]:
                    token = preds.data[i][0].item()
                    if token == constant.EOS_ID:
                        done[i] = True
                        total_done += 1
                    else:
                        output_seqs[i].append(token)
        return output_seqs, edit_logits

    def predict(self, src, src_mask, pos=None, beam_size=5, raw=None, never_decode_unk=False):
        """ Predict with beam search. """
        if beam_size == 1:
            return self.predict_greedy(src, src_mask, pos, raw, never_decode_unk=never_decode_unk)

        enc_inputs, batch_size, src_lens, src_mask = self.embed(src, src_mask, pos, raw)

        # (1) encode source
        h_in, (hn, cn) = self.encode(enc_inputs, src_lens)

        if self.edit:
            edit_logits = self.edit_clf(hn)
        else:
            edit_logits = None

        # (2) set up beam
        with torch.no_grad():
            h_in = h_in.data.repeat(beam_size, 1, 1) # repeat data for beam search
            src_mask = src_mask.repeat(beam_size, 1)
            # repeat decoder hidden states
            hn = hn.data.repeat(beam_size, 1)
            cn = cn.data.repeat(beam_size, 1)
        device = self.SOS_tensor.device
        beam = [Beam(beam_size, device) for _ in range(batch_size)]

        def update_state(states, idx, positions, beam_size):
            """ Select the states according to back pointers. """
            for e in states:
                br, d = e.size()
                s = e.contiguous().view(beam_size, br // beam_size, d)[:,idx]
                s.data.copy_(s.data.index_select(0, positions))

        # (3) main loop
        for i in range(self.max_dec_len):
            dec_inputs = torch.stack([b.get_current_state() for b in beam]).t().contiguous().view(-1, 1)
            # if a unlearned character is predicted via the copy mechanism,
            # use the UNK embedding for it
            dec_inputs[dec_inputs >= self.vocab_size] = UNK_ID
            dec_inputs = self.embedding(dec_inputs)
            log_probs, (hn, cn) = self.decode(dec_inputs, hn, cn, h_in, src_mask, src=src, never_decode_unk=never_decode_unk)
            log_probs = log_probs.view(beam_size, batch_size, -1).transpose(0,1).contiguous() # [batch, beam, V]

            # advance each beam
            done = []
            for b in range(batch_size):
                is_done = beam[b].advance(log_probs.data[b])
                if is_done:
                    done += [b]
                # update beam state
                update_state((hn, cn), b, beam[b].get_current_origin(), beam_size)

            if len(done) == batch_size:
                break

        # back trace and find hypothesis
        all_hyp, all_scores = [], []
        for b in range(batch_size):
            scores, ks = beam[b].sort_best()
            all_scores += [scores[0]]
            k = ks[0]
            hyp = beam[b].get_hyp(k)
            hyp = utils.prune_hyp(hyp)
            hyp = [i.item() for i in hyp]
            all_hyp += [hyp]

        return all_hyp, edit_logits
364
+
stanza/stanza/models/common/seq2seq_utils.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utils for seq2seq models.
3
+ """
4
+ from collections import Counter
5
+ import random
6
+ import json
7
+ import torch
8
+
9
+ import stanza.models.common.seq2seq_constant as constant
10
+
11
+ # torch utils
12
def get_optimizer(name, parameters, lr):
    """
    Create an optimizer by name.

    sgd and adagrad use the provided lr; adam and adamax use their defaults.
    """
    if name == 'sgd':
        return torch.optim.SGD(parameters, lr=lr)
    if name == 'adagrad':
        return torch.optim.Adagrad(parameters, lr=lr)
    if name == 'adam':
        return torch.optim.Adam(parameters) # use default lr
    if name == 'adamax':
        return torch.optim.Adamax(parameters) # use default lr
    raise Exception("Unsupported optimizer: {}".format(name))
23
+
24
def change_lr(optimizer, new_lr):
    """Set the learning rate of every param group to new_lr."""
    for group in optimizer.param_groups:
        group['lr'] = new_lr
27
+
28
def flatten_indices(seq_lens, width):
    """Flatten per-row positions into flat indices for rows of the given width."""
    return [row * width + col
            for row, length in enumerate(seq_lens)
            for col in range(length)]
34
+
35
def keep_partial_grad(grad, topk):
    """
    Keep only the topk rows of grads.
    """
    # zeroes rows past topk in place and returns the same tensor;
    # presumably registered as an embedding gradient hook so only the
    # most frequent vocab entries are finetuned -- confirm with callers
    assert topk < grad.size(0)
    grad.data[topk:].zero_()
    return grad
42
+
43
+ # other utils
44
def save_config(config, path, verbose=True):
    """Write config to path as indented JSON and return it unchanged."""
    with open(path, 'w') as fout:
        json.dump(config, fout, indent=2)
    if verbose:
        print("Config saved to file {}".format(path))
    return config
50
+
51
def load_config(path, verbose=True):
    """Load and return a JSON config from path."""
    with open(path) as fin:
        config = json.load(fin)
    if verbose:
        print("Config loaded from file {}".format(path))
    return config
57
+
58
def unmap_with_copy(indices, src_tokens, vocab):
    """
    Unmap a list of list of indices, by optionally copying from src_tokens.

    Non-negative ids are looked up in the vocab; a negative id -k encodes
    a copy of src token k-1.
    """
    result = []
    for id_seq, tokens in zip(indices, src_tokens):
        words = []
        for token_id in id_seq:
            if token_id >= 0:
                words.append(vocab.id2word[token_id])
            else:
                # flip and minus 1
                words.append(tokens[-token_id - 1])
        result.append(words)
    return result
73
+
74
def prune_decoded_seqs(seqs):
    """
    Prune decoded sequences after EOS token.

    Each sequence containing the EOS symbol is truncated just before it;
    sequences without EOS are kept whole.
    """
    out = []
    for s in seqs:
        if constant.EOS in s:
            # the constants module defines EOS ('<EOS>'), not EOS_TOKEN;
            # the previous constant.EOS_TOKEN lookup raised AttributeError
            # whenever an EOS symbol was actually present
            idx = s.index(constant.EOS)
            out.append(s[:idx])
        else:
            out.append(s)
    return out
86
+
87
def prune_hyp(hyp):
    """
    Prune a decoded hypothesis

    Truncates at the first EOS id; returns the hypothesis unchanged when
    no EOS id is present.
    """
    try:
        return hyp[:hyp.index(constant.EOS_ID)]
    except ValueError:
        # no EOS id in the hypothesis
        return hyp
96
+
97
def prune(data_list, lens):
    """Truncate each sequence in data_list to its corresponding length."""
    assert len(data_list) == len(lens)
    return [seq[:length] for seq, length in zip(data_list, lens)]
103
+
104
def sort(packed, ref, reverse=True):
    """
    Sort a series of packed list, according to a ref list.
    Also return the original index before the sort.
    """
    assert (isinstance(packed, tuple) or isinstance(packed, list)) and isinstance(ref, list)
    # sort whole rows of (ref value, original index, *packed values);
    # the original index breaks ties deterministically
    rows = sorted(zip(ref, range(len(ref)), *packed), reverse=reverse)
    columns = [list(col) for col in zip(*rows)]
    # drop the ref column; keep original indices plus the sorted lists
    return tuple(columns[1:])
113
+
114
def unsort(sorted_list, oidx):
    """
    Unsort a sorted list, based on the original idx.

    Returns a new list where element oidx[i] of the output is sorted_list[i].
    An empty input returns an empty list (the previous tuple-unpacking
    implementation raised ValueError on empty input).
    """
    assert len(sorted_list) == len(oidx), "Number of list elements must match with original indices."
    return [item for _, item in sorted(zip(oidx, sorted_list))]
121
+
stanza/stanza/models/common/short_name_to_treebank.py ADDED
@@ -0,0 +1,619 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This module is autogenerated by build_short_name_to_treebank.py
2
+ # Please do not edit
3
+
4
+ SHORT_NAMES = {
5
+ 'abq_atb': 'UD_Abaza-ATB',
6
+ 'ab_abnc': 'UD_Abkhaz-AbNC',
7
+ 'af_afribooms': 'UD_Afrikaans-AfriBooms',
8
+ 'akk_pisandub': 'UD_Akkadian-PISANDUB',
9
+ 'akk_riao': 'UD_Akkadian-RIAO',
10
+ 'aqz_tudet': 'UD_Akuntsu-TuDeT',
11
+ 'sq_staf': 'UD_Albanian-STAF',
12
+ 'sq_tsa': 'UD_Albanian-TSA',
13
+ 'am_att': 'UD_Amharic-ATT',
14
+ 'grc_proiel': 'UD_Ancient_Greek-PROIEL',
15
+ 'grc_ptnk': 'UD_Ancient_Greek-PTNK',
16
+ 'grc_perseus': 'UD_Ancient_Greek-Perseus',
17
+ 'hbo_ptnk': 'UD_Ancient_Hebrew-PTNK',
18
+ 'apu_ufpa': 'UD_Apurina-UFPA',
19
+ 'ar_nyuad': 'UD_Arabic-NYUAD',
20
+ 'ar_padt': 'UD_Arabic-PADT',
21
+ 'ar_pud': 'UD_Arabic-PUD',
22
+ 'hy_armtdp': 'UD_Armenian-ArmTDP',
23
+ 'hy_bsut': 'UD_Armenian-BSUT',
24
+ 'aii_as': 'UD_Assyrian-AS',
25
+ 'az_tuecl': 'UD_Azerbaijani-TueCL',
26
+ 'bm_crb': 'UD_Bambara-CRB',
27
+ 'eu_bdt': 'UD_Basque-BDT',
28
+ 'bar_maibaam': 'UD_Bavarian-MaiBaam',
29
+ 'bej_autogramm': 'UD_Beja-Autogramm',
30
+ 'be_hse': 'UD_Belarusian-HSE',
31
+ 'bn_bru': 'UD_Bengali-BRU',
32
+ 'bho_bhtb': 'UD_Bhojpuri-BHTB',
33
+ 'bor_bdt': 'UD_Bororo-BDT',
34
+ 'br_keb': 'UD_Breton-KEB',
35
+ 'bg_btb': 'UD_Bulgarian-BTB',
36
+ 'bxr_bdt': 'UD_Buryat-BDT',
37
+ 'yue_hk': 'UD_Cantonese-HK',
38
+ 'cpg_amgic': 'UD_Cappadocian-AMGiC',
39
+ 'cpg_tuecl': 'UD_Cappadocian-TueCL',
40
+ 'ca_ancora': 'UD_Catalan-AnCora',
41
+ 'ceb_gja': 'UD_Cebuano-GJA',
42
+ 'zh-hans_beginner': 'UD_Chinese-Beginner',
43
+ 'zh_beginner': 'UD_Chinese-Beginner',
44
+ 'zh-hans_cfl': 'UD_Chinese-CFL',
45
+ 'zh_cfl': 'UD_Chinese-CFL',
46
+ 'zh-hant_gsd': 'UD_Chinese-GSD',
47
+ 'zh_gsd': 'UD_Chinese-GSD',
48
+ 'zh-hans_gsdsimp': 'UD_Chinese-GSDSimp',
49
+ 'zh_gsdsimp': 'UD_Chinese-GSDSimp',
50
+ 'zh-hant_hk': 'UD_Chinese-HK',
51
+ 'zh_hk': 'UD_Chinese-HK',
52
+ 'zh-hant_pud': 'UD_Chinese-PUD',
53
+ 'zh_pud': 'UD_Chinese-PUD',
54
+ 'zh-hans_patentchar': 'UD_Chinese-PatentChar',
55
+ 'zh_patentchar': 'UD_Chinese-PatentChar',
56
+ 'ckt_hse': 'UD_Chukchi-HSE',
57
+ 'xcl_caval': 'UD_Classical_Armenian-CAVaL',
58
+ 'lzh_kyoto': 'UD_Classical_Chinese-Kyoto',
59
+ 'lzh_tuecl': 'UD_Classical_Chinese-TueCL',
60
+ 'cop_scriptorium': 'UD_Coptic-Scriptorium',
61
+ 'hr_set': 'UD_Croatian-SET',
62
+ 'cs_cac': 'UD_Czech-CAC',
63
+ 'cs_cltt': 'UD_Czech-CLTT',
64
+ 'cs_fictree': 'UD_Czech-FicTree',
65
+ 'cs_pdt': 'UD_Czech-PDT',
66
+ 'cs_pud': 'UD_Czech-PUD',
67
+ 'cs_poetry': 'UD_Czech-Poetry',
68
+ 'da_ddt': 'UD_Danish-DDT',
69
+ 'nl_alpino': 'UD_Dutch-Alpino',
70
+ 'nl_lassysmall': 'UD_Dutch-LassySmall',
71
+ 'egy_ujaen': 'UD_Egyptian-UJaen',
72
+ 'en_atis': 'UD_English-Atis',
73
+ 'en_ctetex': 'UD_English-CTeTex',
74
+ 'en_eslspok': 'UD_English-ESLSpok',
75
+ 'en_ewt': 'UD_English-EWT',
76
+ 'en_gentle': 'UD_English-GENTLE',
77
+ 'en_gum': 'UD_English-GUM',
78
+ 'en_gumreddit': 'UD_English-GUMReddit',
79
+ 'en_lines': 'UD_English-LinES',
80
+ 'en_pud': 'UD_English-PUD',
81
+ 'en_partut': 'UD_English-ParTUT',
82
+ 'en_pronouns': 'UD_English-Pronouns',
83
+ 'myv_jr': 'UD_Erzya-JR',
84
+ 'et_edt': 'UD_Estonian-EDT',
85
+ 'et_ewt': 'UD_Estonian-EWT',
86
+ 'fo_farpahc': 'UD_Faroese-FarPaHC',
87
+ 'fo_oft': 'UD_Faroese-OFT',
88
+ 'fi_ftb': 'UD_Finnish-FTB',
89
+ 'fi_ood': 'UD_Finnish-OOD',
90
+ 'fi_pud': 'UD_Finnish-PUD',
91
+ 'fi_tdt': 'UD_Finnish-TDT',
92
+ 'fr_fqb': 'UD_French-FQB',
93
+ 'fr_gsd': 'UD_French-GSD',
94
+ 'fr_pud': 'UD_French-PUD',
95
+ 'fr_partut': 'UD_French-ParTUT',
96
+ 'fr_parisstories': 'UD_French-ParisStories',
97
+ 'fr_rhapsodie': 'UD_French-Rhapsodie',
98
+ 'fr_sequoia': 'UD_French-Sequoia',
99
+ 'qfn_fame': 'UD_Frisian_Dutch-Fame',
100
+ 'gl_ctg': 'UD_Galician-CTG',
101
+ 'gl_pud': 'UD_Galician-PUD',
102
+ 'gl_treegal': 'UD_Galician-TreeGal',
103
+ 'ka_glc': 'UD_Georgian-GLC',
104
+ 'de_gsd': 'UD_German-GSD',
105
+ 'de_hdt': 'UD_German-HDT',
106
+ 'de_lit': 'UD_German-LIT',
107
+ 'de_pud': 'UD_German-PUD',
108
+ 'aln_gps': 'UD_Gheg-GPS',
109
+ 'got_proiel': 'UD_Gothic-PROIEL',
110
+ 'el_gdt': 'UD_Greek-GDT',
111
+ 'el_gud': 'UD_Greek-GUD',
112
+ 'gub_tudet': 'UD_Guajajara-TuDeT',
113
+ 'gn_oldtudet': 'UD_Guarani-OldTuDeT',
114
+ 'gu_gujtb': 'UD_Gujarati-GujTB',
115
+ 'gwi_tuecl': 'UD_Gwichin-TueCL',
116
+ 'ht_autogramm': 'UD_Haitian_Creole-Autogramm',
117
+ 'ha_northernautogramm': 'UD_Hausa-NorthernAutogramm',
118
+ 'ha_southernautogramm': 'UD_Hausa-SouthernAutogramm',
119
+ 'he_htb': 'UD_Hebrew-HTB',
120
+ 'he_iahltknesset': 'UD_Hebrew-IAHLTknesset',
121
+ 'he_iahltwiki': 'UD_Hebrew-IAHLTwiki',
122
+ 'azz_itml': 'UD_Highland_Puebla_Nahuatl-ITML',
123
+ 'hi_hdtb': 'UD_Hindi-HDTB',
124
+ 'hi_pud': 'UD_Hindi-PUD',
125
+ 'hit_hittb': 'UD_Hittite-HitTB',
126
+ 'hu_szeged': 'UD_Hungarian-Szeged',
127
+ 'is_gc': 'UD_Icelandic-GC',
128
+ 'is_icepahc': 'UD_Icelandic-IcePaHC',
129
+ 'is_modern': 'UD_Icelandic-Modern',
130
+ 'is_pud': 'UD_Icelandic-PUD',
131
+ 'id_csui': 'UD_Indonesian-CSUI',
132
+ 'id_gsd': 'UD_Indonesian-GSD',
133
+ 'id_pud': 'UD_Indonesian-PUD',
134
+ 'ga_cadhan': 'UD_Irish-Cadhan',
135
+ 'ga_idt': 'UD_Irish-IDT',
136
+ 'ga_twittirish': 'UD_Irish-TwittIrish',
137
+ 'it_isdt': 'UD_Italian-ISDT',
138
+ 'it_markit': 'UD_Italian-MarkIT',
139
+ 'it_old': 'UD_Italian-Old',
140
+ 'it_pud': 'UD_Italian-PUD',
141
+ 'it_partut': 'UD_Italian-ParTUT',
142
+ 'it_parlamint': 'UD_Italian-ParlaMint',
143
+ 'it_postwita': 'UD_Italian-PoSTWITA',
144
+ 'it_twittiro': 'UD_Italian-TWITTIRO',
145
+ 'it_vit': 'UD_Italian-VIT',
146
+ 'it_valico': 'UD_Italian-Valico',
147
+ 'ja_bccwj': 'UD_Japanese-BCCWJ',
148
+ 'ja_bccwjluw': 'UD_Japanese-BCCWJLUW',
149
+ 'ja_gsd': 'UD_Japanese-GSD',
150
+ 'ja_gsdluw': 'UD_Japanese-GSDLUW',
151
+ 'ja_pud': 'UD_Japanese-PUD',
152
+ 'ja_pudluw': 'UD_Japanese-PUDLUW',
153
+ 'jv_csui': 'UD_Javanese-CSUI',
154
+ 'urb_tudet': 'UD_Kaapor-TuDeT',
155
+ 'xnr_kdtb': 'UD_Kangri-KDTB',
156
+ 'krl_kkpp': 'UD_Karelian-KKPP',
157
+ 'arr_tudet': 'UD_Karo-TuDeT',
158
+ 'kk_ktb': 'UD_Kazakh-KTB',
159
+ 'kfm_aha': 'UD_Khunsari-AHA',
160
+ 'quc_iu': 'UD_Kiche-IU',
161
+ 'koi_uh': 'UD_Komi_Permyak-UH',
162
+ 'kpv_ikdp': 'UD_Komi_Zyrian-IKDP',
163
+ 'kpv_lattice': 'UD_Komi_Zyrian-Lattice',
164
+ 'ko_gsd': 'UD_Korean-GSD',
165
+ 'ko_ksl': 'UD_Korean-KSL',
166
+ 'ko_kaist': 'UD_Korean-Kaist',
167
+ 'ko_pud': 'UD_Korean-PUD',
168
+ 'kmr_mg': 'UD_Kurmanji-MG',
169
+ 'ky_ktmu': 'UD_Kyrgyz-KTMU',
170
+ 'ky_tuecl': 'UD_Kyrgyz-TueCL',
171
+ 'ltg_cairo': 'UD_Latgalian-Cairo',
172
+ 'la_circse': 'UD_Latin-CIRCSE',
173
+ 'la_ittb': 'UD_Latin-ITTB',
174
+ 'la_llct': 'UD_Latin-LLCT',
175
+ 'la_proiel': 'UD_Latin-PROIEL',
176
+ 'la_perseus': 'UD_Latin-Perseus',
177
+ 'la_udante': 'UD_Latin-UDante',
178
+ 'lv_cairo': 'UD_Latvian-Cairo',
179
+ 'lv_lvtb': 'UD_Latvian-LVTB',
180
+ 'lij_glt': 'UD_Ligurian-GLT',
181
+ 'lt_alksnis': 'UD_Lithuanian-ALKSNIS',
182
+ 'lt_hse': 'UD_Lithuanian-HSE',
183
+ 'olo_kkpp': 'UD_Livvi-KKPP',
184
+ 'nds_lsdc': 'UD_Low_Saxon-LSDC',
185
+ 'lb_luxbank': 'UD_Luxembourgish-LuxBank',
186
+ 'mk_mtb': 'UD_Macedonian-MTB',
187
+ 'jaa_jarawara': 'UD_Madi-Jarawara',
188
+ 'qaf_arabizi': 'UD_Maghrebi_Arabic_French-Arabizi',
189
+ 'mpu_tudet': 'UD_Makurap-TuDeT',
190
+ 'ml_ufal': 'UD_Malayalam-UFAL',
191
+ 'mt_mudt': 'UD_Maltese-MUDT',
192
+ 'gv_cadhan': 'UD_Manx-Cadhan',
193
+ 'mr_ufal': 'UD_Marathi-UFAL',
194
+ 'gun_dooley': 'UD_Mbya_Guarani-Dooley',
195
+ 'gun_thomas': 'UD_Mbya_Guarani-Thomas',
196
+ 'frm_profiterole': 'UD_Middle_French-PROFITEROLE',
197
+ 'mdf_jr': 'UD_Moksha-JR',
198
+ 'myu_tudet': 'UD_Munduruku-TuDeT',
199
+ 'pcm_nsc': 'UD_Naija-NSC',
200
+ 'nyq_aha': 'UD_Nayini-AHA',
201
+ 'nap_rb': 'UD_Neapolitan-RB',
202
+ 'yrl_complin': 'UD_Nheengatu-CompLin',
203
+ 'sme_giella': 'UD_North_Sami-Giella',
204
+ 'gya_autogramm': 'UD_Northwest_Gbaya-Autogramm',
205
+ 'nb_bokmaal': 'UD_Norwegian-Bokmaal',
206
+ 'no_bokmaal': 'UD_Norwegian-Bokmaal',
207
+ 'nn_nynorsk': 'UD_Norwegian-Nynorsk',
208
+ 'cu_proiel': 'UD_Old_Church_Slavonic-PROIEL',
209
+ 'orv_birchbark': 'UD_Old_East_Slavic-Birchbark',
210
+ 'orv_rnc': 'UD_Old_East_Slavic-RNC',
211
+ 'orv_ruthenian': 'UD_Old_East_Slavic-Ruthenian',
212
+ 'orv_torot': 'UD_Old_East_Slavic-TOROT',
213
+ 'fro_profiterole': 'UD_Old_French-PROFITEROLE',
214
+ 'sga_dipsgg': 'UD_Old_Irish-DipSGG',
215
+ 'sga_dipwbg': 'UD_Old_Irish-DipWBG',
216
+ 'otk_clausal': 'UD_Old_Turkish-Clausal',
217
+ 'ota_boun': 'UD_Ottoman_Turkish-BOUN',
218
+ 'ota_dudu': 'UD_Ottoman_Turkish-DUDU',
219
+ 'ps_sikaram': 'UD_Pashto-Sikaram',
220
+ 'pad_tuecl': 'UD_Paumari-TueCL',
221
+ 'fa_perdt': 'UD_Persian-PerDT',
222
+ 'fa_seraji': 'UD_Persian-Seraji',
223
+ 'pay_chibergis': 'UD_Pesh-ChibErgIS',
224
+ 'xpg_kul': 'UD_Phrygian-KUL',
225
+ 'pl_lfg': 'UD_Polish-LFG',
226
+ 'pl_pdb': 'UD_Polish-PDB',
227
+ 'pl_pud': 'UD_Polish-PUD',
228
+ 'qpm_philotis': 'UD_Pomak-Philotis',
229
+ 'pt_bosque': 'UD_Portuguese-Bosque',
230
+ 'pt_cintil': 'UD_Portuguese-CINTIL',
231
+ 'pt_dantestocks': 'UD_Portuguese-DANTEStocks',
232
+ 'pt_gsd': 'UD_Portuguese-GSD',
233
+ 'pt_pud': 'UD_Portuguese-PUD',
234
+ 'pt_petrogold': 'UD_Portuguese-PetroGold',
235
+ 'pt_porttinari': 'UD_Portuguese-Porttinari',
236
+ 'ro_art': 'UD_Romanian-ArT',
237
+ 'ro_nonstandard': 'UD_Romanian-Nonstandard',
238
+ 'ro_rrt': 'UD_Romanian-RRT',
239
+ 'ro_simonero': 'UD_Romanian-SiMoNERo',
240
+ 'ro_tuecl': 'UD_Romanian-TueCL',
241
+ 'ru_gsd': 'UD_Russian-GSD',
242
+ 'ru_pud': 'UD_Russian-PUD',
243
+ 'ru_poetry': 'UD_Russian-Poetry',
244
+ 'ru_syntagrus': 'UD_Russian-SynTagRus',
245
+ 'ru_taiga': 'UD_Russian-Taiga',
246
+ 'sa_ufal': 'UD_Sanskrit-UFAL',
247
+ 'sa_vedic': 'UD_Sanskrit-Vedic',
248
+ 'gd_arcosg': 'UD_Scottish_Gaelic-ARCOSG',
249
+ 'sr_set': 'UD_Serbian-SET',
250
+ 'si_stb': 'UD_Sinhala-STB',
251
+ 'sms_giellagas': 'UD_Skolt_Sami-Giellagas',
252
+ 'sk_snk': 'UD_Slovak-SNK',
253
+ 'sl_ssj': 'UD_Slovenian-SSJ',
254
+ 'sl_sst': 'UD_Slovenian-SST',
255
+ 'soj_aha': 'UD_Soi-AHA',
256
+ 'ajp_madar': 'UD_South_Levantine_Arabic-MADAR',
257
+ 'es_ancora': 'UD_Spanish-AnCora',
258
+ 'es_coser': 'UD_Spanish-COSER',
259
+ 'es_gsd': 'UD_Spanish-GSD',
260
+ 'es_pud': 'UD_Spanish-PUD',
261
+ 'ssp_lse': 'UD_Spanish_Sign_Language-LSE',
262
+ 'sv_lines': 'UD_Swedish-LinES',
263
+ 'sv_pud': 'UD_Swedish-PUD',
264
+ 'sv_talbanken': 'UD_Swedish-Talbanken',
265
+ 'swl_sslc': 'UD_Swedish_Sign_Language-SSLC',
266
+ 'gsw_uzh': 'UD_Swiss_German-UZH',
267
+ 'tl_trg': 'UD_Tagalog-TRG',
268
+ 'tl_ugnayan': 'UD_Tagalog-Ugnayan',
269
+ 'ta_mwtt': 'UD_Tamil-MWTT',
270
+ 'ta_ttb': 'UD_Tamil-TTB',
271
+ 'tt_nmctt': 'UD_Tatar-NMCTT',
272
+ 'eme_tudet': 'UD_Teko-TuDeT',
273
+ 'te_mtg': 'UD_Telugu-MTG',
274
+ 'qte_tect': 'UD_Telugu_English-TECT',
275
+ 'th_pud': 'UD_Thai-PUD',
276
+ 'tn_popapolelo': 'UD_Tswana-Popapolelo',
277
+ 'tpn_tudet': 'UD_Tupinamba-TuDeT',
278
+ 'tr_atis': 'UD_Turkish-Atis',
279
+ 'tr_boun': 'UD_Turkish-BOUN',
280
+ 'tr_framenet': 'UD_Turkish-FrameNet',
281
+ 'tr_gb': 'UD_Turkish-GB',
282
+ 'tr_imst': 'UD_Turkish-IMST',
283
+ 'tr_kenet': 'UD_Turkish-Kenet',
284
+ 'tr_pud': 'UD_Turkish-PUD',
285
+ 'tr_penn': 'UD_Turkish-Penn',
286
+ 'tr_tourism': 'UD_Turkish-Tourism',
287
+ 'qtd_sagt': 'UD_Turkish_German-SAGT',
288
+ 'uk_iu': 'UD_Ukrainian-IU',
289
+ 'uk_parlamint': 'UD_Ukrainian-ParlaMint',
290
+ 'xum_ikuvina': 'UD_Umbrian-IKUVINA',
291
+ 'hsb_ufal': 'UD_Upper_Sorbian-UFAL',
292
+ 'ur_udtb': 'UD_Urdu-UDTB',
293
+ 'ug_udt': 'UD_Uyghur-UDT',
294
+ 'uz_ut': 'UD_Uzbek-UT',
295
+ 'vep_vwt': 'UD_Veps-VWT',
296
+ 'vi_tuecl': 'UD_Vietnamese-TueCL',
297
+ 'vi_vtb': 'UD_Vietnamese-VTB',
298
+ 'wbp_ufal': 'UD_Warlpiri-UFAL',
299
+ 'cy_ccg': 'UD_Welsh-CCG',
300
+ 'hyw_armtdp': 'UD_Western_Armenian-ArmTDP',
301
+ 'nhi_itml': 'UD_Western_Sierra_Puebla_Nahuatl-ITML',
302
+ 'wo_wtb': 'UD_Wolof-WTB',
303
+ 'xav_xdt': 'UD_Xavante-XDT',
304
+ 'sjo_xdt': 'UD_Xibe-XDT',
305
+ 'sah_yktdt': 'UD_Yakut-YKTDT',
306
+ 'yo_ytb': 'UD_Yoruba-YTB',
307
+ 'ess_sli': 'UD_Yupik-SLI',
308
+ 'say_autogramm': 'UD_Zaar-Autogramm',
309
+ }
310
+
311
+
312
def short_name_to_treebank(short_name):
    """Look up the full UD treebank name for a shorthand such as 'en_ewt'.

    Raises KeyError if the shorthand is not a known treebank.
    """
    treebank = SHORT_NAMES[short_name]
    return treebank
314
+
315
+
316
# CANONICAL_NAMES maps a lowercased full treebank name to its canonical
# mixed-case UD name, e.g. 'ud_english-ewt' -> 'UD_English-EWT'.
# Every canonical name is exactly a value stored in SHORT_NAMES and every key
# is exactly that value lowercased, so the table is derived from SHORT_NAMES
# instead of being duplicated as a second hand-maintained literal that could
# drift out of sync.
# NOTE(review): this assumes SHORT_NAMES covers every treebank that appeared
# in the previous literal table -- confirm against the generator script
# (build_short_name_to_treebank.py) if SHORT_NAMES is ever edited by hand.
CANONICAL_NAMES = {name.lower(): name for name in SHORT_NAMES.values()}
614
+
615
+
616
def canonical_treebank_name(ud_name):
    """Return the canonically cased UD treebank name for *ud_name*.

    Accepts either a shorthand ('en_ewt') or a full name in any casing
    ('ud_english-ewt').  Names that are not recognized at all are returned
    unchanged.
    """
    if ud_name in SHORT_NAMES:
        return SHORT_NAMES[ud_name]
    lowered = ud_name.lower()
    return CANONICAL_NAMES.get(lowered, ud_name)
stanza/stanza/models/common/trainer.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
class Trainer:
    """Base class bundling a model and optimizer with save/load helpers.

    Subclasses are expected to provide ``self.model``, ``self.optimizer``
    and ``self.args`` (a dict with at least a 'mode' key).
    """

    def change_lr(self, new_lr):
        """Set the learning rate of every optimizer param group to new_lr."""
        for group in self.optimizer.param_groups:
            group['lr'] = new_lr

    def save(self, filename):
        """Serialize the model and optimizer state dicts to *filename*."""
        checkpoint = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
        }
        torch.save(checkpoint, filename)

    def load(self, filename):
        """Restore model weights and, when training, the optimizer state."""
        checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
        self.model.load_state_dict(checkpoint['model'])
        # The optimizer state is only needed when training will continue.
        if self.args['mode'] == 'train':
            self.optimizer.load_state_dict(checkpoint['optimizer'])
stanza/stanza/models/common/utils.py ADDED
@@ -0,0 +1,816 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions.
3
+ """
4
+
5
+ import argparse
6
+ from collections import Counter
7
+ from contextlib import contextmanager
8
+ import gzip
9
+ import json
10
+ import logging
11
+ import lzma
12
+ import os
13
+ import random
14
+ import re
15
+ import sys
16
+ import unicodedata
17
+ import zipfile
18
+
19
+ import torch
20
+ import numpy as np
21
+
22
+ from stanza.models.common.constant import lcode2lang
23
+ import stanza.models.common.seq2seq_constant as constant
24
+ from stanza.resources.default_packages import TRANSFORMER_NICKNAMES
25
+ import stanza.utils.conll18_ud_eval as ud_eval
26
+ from stanza.utils.conll18_ud_eval import UDError
27
+
28
+ logger = logging.getLogger('stanza')
29
+
30
+ # filenames
31
def get_wordvec_file(wordvec_dir, shorthand, wordvec_type=None):
    """Locate the word vector file for a treebank shorthand like 'en_ewt'.

    If wordvec_type is given, only that subdirectory is considered and a
    missing directory is an error; otherwise the 'word2vec' directory is
    preferred over 'fasttext'.  Within the chosen directory, an .xz file is
    preferred, then .txt; if neither exists the extensionless base path is
    returned as-is.

    Raises FileNotFoundError when no suitable directory can be found.
    """
    lcode, tcode = shorthand.split('_', 1)
    lang = lcode2lang[lcode]
    # candidate language folders
    word2vec_dir = os.path.join(wordvec_dir, 'word2vec', lang)
    fasttext_dir = os.path.join(wordvec_dir, 'fasttext', lang)
    if wordvec_type is not None:
        lang_dir = os.path.join(wordvec_dir, wordvec_type, lang)
        if not os.path.exists(lang_dir):
            raise FileNotFoundError("Word vector type {} was specified, but directory {} does not exist".format(wordvec_type, lang_dir))
    elif os.path.exists(word2vec_dir):  # first try word2vec
        lang_dir = word2vec_dir
    elif os.path.exists(fasttext_dir):  # otherwise try fasttext
        lang_dir = fasttext_dir
    else:
        raise FileNotFoundError("Cannot locate word vector directory for language: {} Looked in {} and {}".format(lang, word2vec_dir, fasttext_dir))
    # look for the wordvec filename inside lang_dir, preferring compressed
    base = os.path.join(lang_dir, '{}.vectors'.format(lcode))
    for extension in (".xz", ".txt"):
        if os.path.exists(base + extension):
            return base + extension
    return base
57
+
58
@contextmanager
def output_stream(filename=None):
    """Context manager yielding a writable text stream.

    With a filename, the file is opened for UTF-8 writing and closed when
    the context exits; with None, sys.stdout is yielded (and left open).
    """
    if filename is None:
        yield sys.stdout
        return
    with open(filename, "w", encoding="utf-8") as fout:
        yield fout
70
+
71
+
72
@contextmanager
def open_read_text(filename, encoding="utf-8"):
    """Open *filename* for text reading, transparently decompressing.

    Files ending in .xz or .gz are decompressed on the fly; anything else
    is opened as a plain text file.  Use as a context manager:

        with open_read_text(filename) as fin:
            ...

    The file is closed once the context exits.
    """
    # pick the opener by extension, then manage the handle uniformly
    if filename.endswith(".xz"):
        fin = lzma.open(filename, mode='rt', encoding=encoding)
    elif filename.endswith(".gz"):
        fin = gzip.open(filename, mode='rt', encoding=encoding)
    else:
        fin = open(filename, encoding=encoding)
    with fin:
        yield fin
94
+
95
@contextmanager
def open_read_binary(filename):
    """Open *filename* for binary reading, transparently decompressing.

    .xz and .gz files are decompressed on the fly; a .zip archive can be
    read only if it contains exactly one member.  Anything else is opened
    as a plain binary file.  Use as a context manager:

        with open_read_binary(filename) as fin:
            ...

    The file is closed once the context exits.

    Raises ValueError for an empty zip archive or one with several members.
    """
    if filename.endswith(".xz"):
        with lzma.open(filename, mode='rb') as fin:
            yield fin
    elif filename.endswith(".gz"):
        with gzip.open(filename, mode='rb') as fin:
            yield fin
    elif filename.endswith(".zip"):
        with zipfile.ZipFile(filename) as zin:
            input_names = zin.namelist()
            if len(input_names) == 0:
                raise ValueError("Empty zip archive")
            if len(input_names) > 1:
                # bug fix: the filename was never interpolated into the message
                raise ValueError("zip file %s has more than one file in it" % filename)
            with zin.open(input_names[0]) as fin:
                yield fin
    else:
        with open(filename, mode='rb') as fin:
            yield fin
128
+
129
+ # training schedule
130
def get_adaptive_eval_interval(cur_dev_size, thres_dev_size, base_interval):
    """Adjust the evaluation interval to the size of the dev set.

    For dev sets no larger than thres_dev_size the base interval is used;
    larger dev sets scale the interval linearly, rounded to an integer
    multiple of base_interval.
    """
    if cur_dev_size <= thres_dev_size:
        return base_interval
    multiplier = round(cur_dev_size / thres_dev_size)
    return base_interval * multiplier
140
+
141
+ # ud utils
142
def ud_scores(gold_conllu_file, system_conllu_file):
    """Evaluate a system CoNLL-U file against a gold CoNLL-U file.

    Returns the evaluation produced by conll18_ud_eval.  A UDError raised
    while reading either file is re-raised with the offending filename.
    """
    def load_or_raise(path):
        # wrap parse failures so the error names the unreadable file
        try:
            return ud_eval.load_conllu_file(path)
        except UDError as e:
            raise UDError("Could not read %s" % path) from e

    gold_ud = load_or_raise(gold_conllu_file)
    system_ud = load_or_raise(system_conllu_file)
    return ud_eval.evaluate(gold_ud, system_ud)
155
+
156
def harmonic_mean(a, weights=None):
    """Compute the (optionally weighted) harmonic mean of the values in *a*.

    Returns 0 if any element is zero: the harmonic mean is dominated by
    zeros, and this also avoids a ZeroDivisionError.
    """
    if any(x == 0 for x in a):
        return 0
    assert weights is None or len(weights) == len(a), 'Weights has length {} which is different from that of the array ({}).'.format(len(weights), len(a))
    if weights is None:
        return len(a) / sum(1/x for x in a)
    return sum(weights) / sum(w/x for x, w in zip(a, weights))
165
+
166
+ # torch utils
167
def dispatch_optimizer(name, parameters, opt_logger, lr=None, betas=None, eps=None, momentum=None, **extra_args):
    """Build and return a torch optimizer chosen by *name*.

    Supported names: amsgrad, amsgradw, sgd, adagrad, adam, adamw, adamax,
    adadelta, adabelief, madgrad, mirror_madgrad.  Each branch logs (at
    debug level on opt_logger) the hyperparameters it actually uses and
    forwards **extra_args to the optimizer constructor.  The adabelief and
    madgrad variants import their third-party packages lazily and raise
    ModuleNotFoundError with an install hint when the package is missing.

    Raises ValueError for an unsupported optimizer name.
    """
    # extra kwargs are appended to each debug line so the full configuration
    # of the optimizer is visible in the logs
    extra_logging = ""
    if len(extra_args) > 0:
        extra_logging = ", " + ", ".join("%s=%s" % (x, y) for x, y in extra_args.items())

    if name == 'amsgrad':
        opt_logger.debug("Building Adam w/ amsgrad with lr=%f, betas=%s, eps=%f%s", lr, betas, eps, extra_logging)
        return torch.optim.Adam(parameters, amsgrad=True, lr=lr, betas=betas, eps=eps, **extra_args)
    elif name == 'amsgradw':
        opt_logger.debug("Building AdamW w/ amsgrad with lr=%f, betas=%s, eps=%f%s", lr, betas, eps, extra_logging)
        return torch.optim.AdamW(parameters, amsgrad=True, lr=lr, betas=betas, eps=eps, **extra_args)
    elif name == 'sgd':
        opt_logger.debug("Building SGD with lr=%f, momentum=%f%s", lr, momentum, extra_logging)
        return torch.optim.SGD(parameters, lr=lr, momentum=momentum, **extra_args)
    elif name == 'adagrad':
        opt_logger.debug("Building Adagrad with lr=%f%s", lr, extra_logging)
        return torch.optim.Adagrad(parameters, lr=lr, **extra_args)
    elif name == 'adam':
        opt_logger.debug("Building Adam with lr=%f, betas=%s, eps=%f%s", lr, betas, eps, extra_logging)
        return torch.optim.Adam(parameters, lr=lr, betas=betas, eps=eps, **extra_args)
    elif name == 'adamw':
        opt_logger.debug("Building AdamW with lr=%f, betas=%s, eps=%f%s", lr, betas, eps, extra_logging)
        return torch.optim.AdamW(parameters, lr=lr, betas=betas, eps=eps, **extra_args)
    elif name == 'adamax':
        opt_logger.debug("Building Adamax%s", extra_logging)
        return torch.optim.Adamax(parameters, **extra_args) # use default lr
    elif name == 'adadelta':
        opt_logger.debug("Building Adadelta with lr=%f%s", lr, extra_logging)
        return torch.optim.Adadelta(parameters, lr=lr, **extra_args)
    elif name == 'adabelief':
        # lazy import: adabelief-pytorch is an optional dependency
        try:
            from adabelief_pytorch import AdaBelief
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError("Could not create adabelief optimizer. Perhaps the adabelief-pytorch package is not installed") from e
        opt_logger.debug("Building AdaBelief with lr=%f, eps=%f%s", lr, eps, extra_logging)
        # TODO: add weight_decouple and rectify as extra args?
        return AdaBelief(parameters, lr=lr, eps=eps, weight_decouple=True, rectify=True, **extra_args)
    elif name == 'madgrad':
        # lazy import: madgrad is an optional dependency
        try:
            import madgrad
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError("Could not create madgrad optimizer. Perhaps the madgrad package is not installed") from e
        opt_logger.debug("Building MADGRAD with lr=%f, momentum=%f%s", lr, momentum, extra_logging)
        return madgrad.MADGRAD(parameters, lr=lr, momentum=momentum, **extra_args)
    elif name == 'mirror_madgrad':
        # lazy import: madgrad is an optional dependency
        try:
            import madgrad
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError("Could not create mirror_madgrad optimizer. Perhaps the madgrad package is not installed") from e
        opt_logger.debug("Building MirrorMADGRAD with lr=%f, momentum=%f%s", lr, momentum, extra_logging)
        return madgrad.MirrorMADGRAD(parameters, lr=lr, momentum=momentum, **extra_args)
    else:
        raise ValueError("Unsupported optimizer: {}".format(name))
220
+
221
+
222
def get_optimizer(name, model, lr, betas=(0.9, 0.999), eps=1e-8, momentum=0, weight_decay=None, bert_learning_rate=0.0, bert_weight_decay=None, charlm_learning_rate=0.0, is_peft=False, bert_finetune_layers=None, opt_logger=None):
    """
    Build one optimizer over all trainable parameters of model, split into named param groups.

    Parameters are partitioned by attribute-name prefix into three groups:
    'base' (everything else), 'charlm' (charmodel_forward./charmodel_backward.),
    and 'bert' (bert_model.).  The charlm and bert groups use scaled learning
    rates (lr * charlm_learning_rate, lr * bert_learning_rate) and are only
    added when that scale is positive.  bert_weight_decay, if given, overrides
    weight_decay for the bert group only.
    """
    opt_logger = opt_logger if opt_logger is not None else logger
    base_parameters = [p for n, p in model.named_parameters()
                       if p.requires_grad and not n.startswith("bert_model.")
                       and not n.startswith("charmodel_forward.") and not n.startswith("charmodel_backward.")]
    parameters = [{'param_group_name': 'base', 'params': base_parameters}]

    charlm_parameters = [p for n, p in model.named_parameters()
                         if p.requires_grad and (n.startswith("charmodel_forward.") or n.startswith("charmodel_backward."))]
    if len(charlm_parameters) > 0 and charlm_learning_rate > 0:
        parameters.append({'param_group_name': 'charlm', 'params': charlm_parameters, 'lr': lr * charlm_learning_rate})

    if not is_peft:
        bert_parameters = [p for n, p in model.named_parameters() if p.requires_grad and n.startswith("bert_model.")]

        # bert_finetune_layers limits the bert finetuning to the *last* N layers of the model
        if len(bert_parameters) > 0 and bert_finetune_layers is not None:
            num_layers = model.bert_model.config.num_hidden_layers
            start_layer = num_layers - bert_finetune_layers
            # rebuild the list keeping only params whose name mentions one of the kept layers
            bert_parameters = []
            for layer_num in range(start_layer, num_layers):
                bert_parameters.extend([param for name, param in model.named_parameters()
                                        if param.requires_grad and name.startswith("bert_model.") and "layer.%d." % layer_num in name])

        if len(bert_parameters) > 0 and bert_learning_rate > 0:
            opt_logger.debug("Finetuning %d bert parameters with LR %s and WD %s", len(bert_parameters), lr * bert_learning_rate, bert_weight_decay)
            parameters.append({'param_group_name': 'bert', 'params': bert_parameters, 'lr': lr * bert_learning_rate})
            if bert_weight_decay is not None:
                parameters[-1]['weight_decay'] = bert_weight_decay
    else:
        # some optimizers seem to train some even with a learning rate of 0...
        if bert_learning_rate > 0:
            # because PEFT handles what to hand to an optimizer, we don't want to touch that
            parameters.append({'param_group_name': 'bert', 'params': model.bert_model.parameters(), 'lr': lr * bert_learning_rate})
            if bert_weight_decay is not None:
                parameters[-1]['weight_decay'] = bert_weight_decay

    extra_args = {}
    if weight_decay is not None:
        extra_args["weight_decay"] = weight_decay

    return dispatch_optimizer(name, parameters, opt_logger=opt_logger, lr=lr, betas=betas, eps=eps, momentum=momentum, **extra_args)
264
+
265
def get_split_optimizer(name, model, lr, betas=(0.9, 0.999), eps=1e-8, momentum=0, weight_decay=None, bert_learning_rate=0.0, bert_weight_decay=None, charlm_learning_rate=0.0, is_peft=False, bert_finetune_layers=None):
    """
    Same as `get_optimizer`, but splits the optimizer for Bert into a separate optimizer.

    Returns a dict with "general_optimizer" (base + charlm groups) and, when
    there are bert parameters and bert_learning_rate > 0, "bert_optimizer".
    """
    base_parameters = [p for n, p in model.named_parameters()
                       if p.requires_grad and not n.startswith("bert_model.")
                       and not n.startswith("charmodel_forward.") and not n.startswith("charmodel_backward.")]
    parameters = [{'param_group_name': 'base', 'params': base_parameters}]

    charlm_parameters = [p for n, p in model.named_parameters()
                         if p.requires_grad and (n.startswith("charmodel_forward.") or n.startswith("charmodel_backward."))]
    if len(charlm_parameters) > 0 and charlm_learning_rate > 0:
        parameters.append({'param_group_name': 'charlm', 'params': charlm_parameters, 'lr': lr * charlm_learning_rate})

    bert_parameters = None
    if not is_peft:
        trainable_parameters = [p for n, p in model.named_parameters() if p.requires_grad and n.startswith("bert_model.")]

        # bert_finetune_layers limits the bert finetuning to the *last* N layers of the model
        if len(trainable_parameters) > 0 and bert_finetune_layers is not None:
            num_layers = model.bert_model.config.num_hidden_layers
            start_layer = num_layers - bert_finetune_layers
            trainable_parameters = []
            for layer_num in range(start_layer, num_layers):
                trainable_parameters.extend([param for name, param in model.named_parameters()
                                             if param.requires_grad and name.startswith("bert_model.") and "layer.%d." % layer_num in name])

        if len(trainable_parameters) > 0:
            bert_parameters = [{'param_group_name': 'bert', 'params': trainable_parameters, 'lr': lr * bert_learning_rate}]
    else:
        # because PEFT handles what to hand to an optimizer, we don't want to touch that
        bert_parameters = [{'param_group_name': 'bert', 'params': model.bert_model.parameters(), 'lr': lr * bert_learning_rate}]

    extra_args = {}
    if weight_decay is not None:
        extra_args["weight_decay"] = weight_decay

    optimizers = {
        "general_optimizer": dispatch_optimizer(name, parameters, opt_logger=logger, lr=lr, betas=betas, eps=eps, momentum=momentum, **extra_args)
    }
    if bert_parameters is not None and bert_learning_rate > 0.0:
        # bert_weight_decay overrides weight_decay for the bert optimizer
        if bert_weight_decay is not None:
            extra_args['weight_decay'] = bert_weight_decay
        optimizers["bert_optimizer"] = dispatch_optimizer(name, bert_parameters, opt_logger=logger, lr=lr, betas=betas, eps=eps, momentum=momentum, **extra_args)
    return optimizers
308
+
309
+
310
def change_lr(optimizer, new_lr):
    """Set the learning rate of every param group in the optimizer to new_lr."""
    for group in optimizer.param_groups:
        group['lr'] = new_lr
313
+
314
def flatten_indices(seq_lens, width):
    """
    Map (row, col) positions in a padded batch to flat indices.

    Row i of a batch padded to `width` contributes indices
    i*width + 0 .. i*width + seq_lens[i]-1.
    """
    return [row * width + col
            for row, length in enumerate(seq_lens)
            for col in range(length)]
320
+
321
def keep_partial_grad(grad, topk):
    """
    Zero out all but the first topk rows of grad, in place.

    Returns the same tensor for convenience.
    """
    assert topk < grad.size(0)
    # zero the tail rows without allocating a new tensor
    grad.data.narrow(0, topk, grad.size(0) - topk).zero_()
    return grad
328
+
329
# other utils
def ensure_dir(d, verbose=True):
    """Create directory d (and any missing parents) if it does not already exist."""
    if os.path.exists(d):
        return
    if verbose:
        logger.info("Directory {} does not exist; creating...".format(d))
    # exist_ok: guard against race conditions
    os.makedirs(d, exist_ok=True)
336
+
337
def save_config(config, path, verbose=True):
    """Write config to path as indented JSON and return the config unchanged."""
    with open(path, 'w') as fout:
        json.dump(config, fout, indent=2)
    if verbose:
        print("Config saved to file {}".format(path))
    return config
343
+
344
def load_config(path, verbose=True):
    """Read a JSON config from path and return it as a dict."""
    with open(path) as fin:
        config = json.load(fin)
    if verbose:
        print("Config loaded from file {}".format(path))
    return config
350
+
351
def print_config(config):
    """Log every key/value pair of the config, one per line."""
    info = "Running with the following configs:\n"
    for key, value in config.items():
        info += "\t{} : {}\n".format(key, str(value))
    logger.info("\n" + info + "\n")
356
+
357
def normalize_text(text):
    """Return text in Unicode NFD (canonical decomposition) form."""
    return unicodedata.normalize('NFD', text)
359
+
360
def unmap_with_copy(indices, src_tokens, vocab):
    """
    Convert id sequences back to words, optionally copying from src_tokens.

    Non-negative ids are looked up in vocab.id2word; a negative id -k
    encodes position k-1 in the corresponding source sentence and copies
    that source token through verbatim.
    """
    result = []
    for ids, tokens in zip(indices, src_tokens):
        words = []
        for i in ids:
            if i >= 0:
                words.append(vocab.id2word[i])
            else:
                # negative ids encode copy positions: -1 -> 0, -2 -> 1, ...
                words.append(tokens[-i - 1])
        result.append(words)
    return result
375
+
376
def prune_decoded_seqs(seqs):
    """
    Prune decoded sequences after the EOS token.

    Sequences that contain no EOS are returned unchanged.
    """
    out = []
    for s in seqs:
        if constant.EOS in s:
            # bug fix: this previously called s.index(constant.EOS_TOKEN),
            # a different symbol than the one tested for membership above,
            # which would fail whenever the two names differ
            idx = s.index(constant.EOS)
            out.append(s[:idx])
        else:
            out.append(s)
    return out
388
+
389
def prune_hyp(hyp):
    """
    Prune a decoded hypothesis at the first EOS id, if any.
    """
    try:
        return hyp[:hyp.index(constant.EOS_ID)]
    except ValueError:
        # no EOS found; keep the whole hypothesis
        return hyp
398
+
399
def prune(data_list, lens):
    """Truncate each element of data_list to the corresponding length in lens."""
    assert len(data_list) == len(lens)
    return [seq[:length] for seq, length in zip(data_list, lens)]
405
+
406
def sort(packed, ref, reverse=True):
    """
    Sort a series of packed list, according to a ref list.
    Also return the original index before the sort.
    """
    assert (isinstance(packed, tuple) or isinstance(packed, list)) and isinstance(ref, list)
    # prepend the sort key (ref) and the original positions, transpose so each
    # element carries its key, sort, transpose back, then drop the key column;
    # the first returned list is therefore the original indices
    packed = [ref] + [range(len(ref))] + list(packed)
    sorted_packed = [list(t) for t in zip(*sorted(zip(*packed), reverse=reverse))]
    return tuple(sorted_packed[1:])
415
+
416
def unsort(sorted_list, oidx):
    """
    Undo a sort: restore sorted_list to the order given by the original indices oidx.
    """
    assert len(sorted_list) == len(oidx), "Number of list elements must match with original indices."
    if not sorted_list:
        return []
    # pairing each item with its original index and sorting by that index
    # puts the items back in their pre-sort order
    pairs = sorted(zip(oidx, sorted_list))
    return [item for _, item in pairs]
425
+
426
def sort_with_indices(data, key=None, reverse=False):
    """
    Sort data and return both the data and the original indices.

    One useful application is to sort by length, which can be done with key=len
    Returns the data as a sorted tuple, then the indices of the original list.
    """
    if not data:
        return [], []
    # sort (index, value) pairs by the value (optionally through key)
    sort_key = (lambda pair: key(pair[1])) if key else (lambda pair: pair[1])
    ordered = sorted(enumerate(data), key=sort_key, reverse=reverse)
    indices, values = zip(*ordered)
    return values, indices
442
+
443
def split_into_batches(data, batch_size):
    """
    Returns a list of intervals so that each interval is either <= batch_size or one element long.

    Long elements are not dropped from the intervals.
    data is a list of lists
    batch_size is how long to make each batch
    return value is a list of pairs, start_idx end_idx
    """
    intervals = []
    interval_start = 0
    interval_size = 0   # total length of the lines accumulated since interval_start
    for idx, line in enumerate(data):
        if len(line) > batch_size:
            # guess we'll just hope the model can handle a batch of this size after all
            # flush whatever was accumulated, then give this line its own interval
            if interval_size > 0:
                intervals.append((interval_start, idx))
            intervals.append((idx, idx+1))
            interval_start = idx+1
            interval_size = 0
        elif len(line) + interval_size > batch_size:
            # this line puts us over batch_size
            # close the current interval and start a new one at this line
            intervals.append((interval_start, idx))
            interval_start = idx
            interval_size = len(line)
        else:
            interval_size = interval_size + len(line)
    if interval_size > 0:
        # there's some leftover
        intervals.append((interval_start, len(data)))
    return intervals
474
+
475
def tensor_unsort(sorted_tensor, oidx):
    """
    Unsort a sorted tensor on its 0-th dimension, based on the original indices.
    """
    assert sorted_tensor.size(0) == len(oidx), "Number of list elements must match with original indices."
    # invert the permutation: for each original position, find where it went
    backidx = [pos for pos, _ in sorted(enumerate(oidx), key=lambda pair: pair[1])]
    return sorted_tensor[backidx]
482
+
483
+
484
def set_random_seed(seed):
    """
    Set a random seed on all of the things which might need it.
    torch, np, python random, and torch.cuda

    If seed is None, a seed is chosen at random (and returned) so that the
    run can still be reproduced after the fact.  Returns the seed used.
    """
    if seed is None:
        seed = random.randint(0, 1000000000)

    # the original called torch.manual_seed twice; the duplicate was removed
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        # manual_seed_all seeds every GPU; the single-device call is kept
        # alongside it as in the original (probably redundant)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    return seed
501
+
502
def find_missing_tags(known_tags, test_tags):
    """
    Return the sorted tags from test_tags that never occur in known_tags.

    Either argument may be a flat list of tags or a list of lists (sentences).
    """
    if isinstance(known_tags, list) and isinstance(known_tags[0], list):
        known_tags = {tag for sent in known_tags for tag in sent}
    if isinstance(test_tags, list) and isinstance(test_tags[0], list):
        test_tags = sorted({tag for sent in test_tags for tag in sent})
    return sorted(tag for tag in test_tags if tag not in known_tags)
509
+
510
def warn_missing_tags(known_tags, test_tags, test_set_name):
    """
    Print a warning if any tags present in the second list are not in the first list.

    Can also handle a list of lists.  Returns True if anything was missing.
    """
    missing_tags = find_missing_tags(known_tags, test_tags)
    if not missing_tags:
        return False
    logger.warning("Found tags in {} missing from the expected tag set: {}".format(test_set_name, missing_tags))
    return True
521
+
522
def checkpoint_name(save_dir, save_name, checkpoint_name):
    """
    Will return a recommended checkpoint name for the given dir, save_name, optional checkpoint_name

    For example, can pass in args['save_dir'], args['save_name'], args['checkpoint_save_name']
    """
    if checkpoint_name:
        # an explicit checkpoint name wins; anchor it under save_dir
        # unless it already lives there
        if os.path.split(checkpoint_name)[0] == save_dir:
            return checkpoint_name
        return os.path.join(save_dir, checkpoint_name)

    if os.path.split(save_name)[0] != save_dir:
        save_name = os.path.join(save_dir, save_name)
    # derive the checkpoint name from the model file name
    if save_name.endswith(".pt"):
        return save_name[:-3] + "_checkpoint.pt"
    return save_name + "_checkpoint"
541
+
542
def default_device():
    """
    Pick a default device based on what's available on this system
    """
    return 'cuda' if torch.cuda.is_available() else 'cpu'
549
+
550
def add_device_args(parser):
    """
    Add args which specify cpu, cuda, or arbitrary device
    """
    # --device takes any torch device string; --cuda/--cpu are shortcuts that
    # overwrite the same dest, so the last flag given wins
    parser.add_argument('--device', type=str, default=default_device(), help='Which device to run on - use a torch device string name')
    parser.add_argument('--cuda', dest='device', action='store_const', const='cuda', help='Run on CUDA')
    parser.add_argument('--cpu', dest='device', action='store_const', const='cpu', help='Ignore CUDA and run on CPU')
557
+
558
def load_elmo(elmo_model):
    """
    Load an elmoformanylangs Embedder from the given model path.

    The import is deferred so that Elmo support remains an optional dependency.
    """
    # This import is here so that Elmo integration can be treated
    # as an optional feature
    import elmoformanylangs

    logger.info("Loading elmo: %s" % elmo_model)
    elmo_model = elmoformanylangs.Embedder(elmo_model)
    return elmo_model
566
+
567
def log_training_args(args, args_logger, name="training"):
    """
    For record keeping purposes, log the arguments when training
    """
    if isinstance(args, argparse.Namespace):
        args = vars(args)
    # one "key: value" line per argument, sorted for stable output
    log_lines = ['%s: %s' % (key, args[key]) for key in sorted(args.keys())]
    args_logger.info('ARGS USED AT %s TIME:\n%s\n', name.upper(), '\n'.join(log_lines))
576
+
577
def embedding_name(args):
    """
    Return the generic name of the biggest embedding used by a model.

    Used by POS and depparse, for example.

    TODO: Probably will make the transformer names a bit more informative,
    such as electra, roberta, etc. Maybe even phobert for VI, for example
    """
    # priority: transformer > charlm > word vectors; expressed here as
    # early returns from highest priority downwards
    if args['bert_model']:
        if args['bert_model'] in TRANSFORMER_NICKNAMES:
            return TRANSFORMER_NICKNAMES[args['bert_model']]
        return "transformer"
    if args.get('charlm', True) and (args['charlm_forward_file'] or args['charlm_backward_file']):
        return "charlm"
    if args['wordvec_pretrain_file'] is None and args['wordvec_file'] is None:
        return "nopretrain"
    return "nocharlm"
598
+
599
def standard_model_file_name(args, model_type, **kwargs):
    """
    Returns a model file name based on some common args found in the various models.

    The expectation is that the args will have something like

    parser.add_argument('--save_name', type=str, default="{shorthand}_{embedding}_parser.pt", help="File name to save the model")

    Then the model shorthand, embedding type, and other args will be
    turned into arguments in a format string
    """
    embedding = embedding_name(args)

    finetune = ""
    transformer_lr = ""
    if args.get("bert_finetune", False):
        finetune = "finetuned"
        if "bert_learning_rate" in args:
            transformer_lr = "{}".format(args["bert_learning_rate"])

    use_peft = "nopeft"
    if args.get("bert_finetune", False) and args.get("use_peft", False):
        use_peft = "peft"

    bert_finetuning = ""
    if args.get("bert_finetune", False):
        if args.get("use_peft", False):
            bert_finetuning = "peft"
        else:
            bert_finetuning = "ft"

    seed = args.get('seed', None)
    if seed is None:
        seed = ""
    else:
        seed = str(seed)

    format_args = {
        "batch_size": args['batch_size'],
        "bert_finetuning": bert_finetuning,
        "embedding": embedding,
        "finetune": finetune,
        "peft": use_peft,
        "seed": seed,
        "shorthand": args['shorthand'],
        "transformer_lr": transformer_lr,
    }
    format_args.update(**kwargs)
    model_file = args['save_name'].format(**format_args)
    # collapse runs of underscores left behind by empty format fields
    model_file = re.sub("_+", "_", model_file)

    # dead local `model_dir = os.path.split(model_file)[0]` removed: it was
    # computed here but never used

    # prefer an existing file outside save_dir if one matches exactly;
    # otherwise anchor the name under save_dir
    if not os.path.exists(os.path.join(args['save_dir'], model_file)) and os.path.exists(model_file):
        return model_file
    return os.path.join(args['save_dir'], model_file)
655
+
656
def escape_misc_space(space):
    """
    Escape whitespace and MISC metacharacters for storage in a CoNLL-U MISC field.

    Inverse of unescape_misc_space.
    """
    # one-to-one character replacements; anything not listed passes through
    replacements = {
        ' ': '\\s',
        '\t': '\\t',
        '\r': '\\r',
        '\n': '\\n',
        '|': '\\p',
        '\\': '\\\\',
        '\u00A0': '\\u00A0',
    }
    return "".join(replacements.get(char, char) for char in space)
677
+
678
def unescape_misc_space(misc_space):
    """
    Decode a MISC-escaped whitespace string back to raw characters.

    Inverse of escape_misc_space.
    """
    # escape codes checked in this exact order at each position; unmatched
    # characters pass through unchanged
    codes = [('\\s', ' '), ('\\t', '\t'), ('\\r', '\r'), ('\\n', '\n'),
             ('\\p', '|'), ('\\\\', '\\'), ('\\u00A0', '\u00A0')]
    chars = []
    pos = 0
    length = len(misc_space)
    while pos < length:
        for code, replacement in codes:
            if misc_space.startswith(code, pos):
                chars.append(replacement)
                pos += len(code)
                break
        else:
            chars.append(misc_space[pos])
            pos += 1
    return "".join(chars)
708
+
709
def space_before_to_misc(space):
    """
    Convert whitespace to SpacesBefore specifically for the start of a document.

    In general, UD datasets do not have both SpacesAfter on a token and SpacesBefore on the next token.

    The space(s) are only marked on one of the tokens.

    Only at the very beginning of a document is it necessary to mark what spaces occurred before the actual text,
    and the default assumption is that there is no space if there is no SpacesBefore annotation.
    """
    if not space:
        return ""
    return "SpacesBefore=%s" % escape_misc_space(space)
724
+
725
def space_after_to_misc(space):
    """
    Convert whitespace back to the escaped format - either SpaceAfter=No or SpacesAfter=...
    """
    if not space:
        return "SpaceAfter=No"
    if space == " ":
        # a single plain space is the default and needs no annotation
        return ""
    return "SpacesAfter=%s" % escape_misc_space(space)
735
+
736
def misc_to_space_before(misc):
    """
    Find any SpacesBefore annotation in the MISC column and turn it into a space value
    """
    if not misc:
        return ""
    for piece in misc.split("|"):
        if piece.lower().startswith("spacesbefore="):
            # only split on the first '='; the value itself may contain '='
            return unescape_misc_space(piece.split("=", maxsplit=1)[1])
    return ""
749
+
750
def misc_to_space_after(misc):
    """
    Convert either SpaceAfter=No or the SpacesAfter annotation

    see https://universaldependencies.org/misc.html#spacesafter

    We compensate for some treebanks using SpaceAfter=\n instead of SpacesAfter=\n
    On the way back, though, those annotations will be turned into SpacesAfter
    """
    if not misc:
        return " "
    pieces = misc.split("|")
    if any(piece.lower() == "spaceafter=no" for piece in pieces):
        return ""
    if "SpaceAfter=Yes" in pieces:
        # as of UD 2.11, the Cantonese treebank had this as a misc feature
        return " "
    if "SpaceAfter=No~" in pieces:
        # as of UD 2.11, a weird typo in the Russian Taiga dataset
        return ""
    for piece in pieces:
        if piece.startswith(("SpaceAfter=", "SpacesAfter=")):
            return unescape_misc_space(piece.split("=", maxsplit=1)[1])
    return " "
775
+
776
def log_norms(model):
    """
    Log the L2 norm and element count of every trainable parameter in the model.

    Useful for eyeballing whether training is diverging or some weights are dead.
    """
    pieces = []
    for name, param in model.named_parameters():
        if param.requires_grad:
            pieces.append((name, "%.6g" % torch.norm(param).item(), "%d" % param.numel()))
    lines = ["NORMS FOR MODEL PARAMETERS"]   # fixed typo: was "PARAMTERS"
    # guard: the max() calls below raise ValueError on an empty sequence,
    # so a model with no trainable parameters now logs just the header
    if pieces:
        name_len = max(len(x[0]) for x in pieces)
        norm_len = max(len(x[1]) for x in pieces)
        line_format = " %-" + str(name_len) + "s %" + str(norm_len) + "s %s"
        for line in pieces:
            lines.append(line_format % line)
    logger.info("\n".join(lines))
788
+
789
def attach_bert_model(model, bert_model, bert_tokenizer, use_peft, force_bert_saved):
    """
    Attach a transformer and its tokenizer to a model, deciding whether its weights are saved.

    - use_peft: attach as an unsaved module (PEFT has its own save path) and
      leave it in train mode
    - force_bert_saved: attach as a regular submodule so the weights are
      serialized with the model (e.g. when finetuning)
    - otherwise: attach unsaved and freeze all transformer parameters
    The tokenizer is always attached as an unsaved module.
    """
    if use_peft:
        # we use a peft-specific pathway for saving peft weights
        model.add_unsaved_module('bert_model', bert_model)
        model.bert_model.train()
    elif force_bert_saved:
        model.bert_model = bert_model
    elif bert_model is not None:
        model.add_unsaved_module('bert_model', bert_model)
        # frozen transformer: no gradients flow into it
        for _, parameter in bert_model.named_parameters():
            parameter.requires_grad = False
    else:
        model.bert_model = None
    model.add_unsaved_module('bert_tokenizer', bert_tokenizer)
803
+
804
def build_save_each_filename(base_filename):
    """
    If the given name doesn't have %d in it, add %04d at the end of the filename

    This way, there's something to count how many models have been saved
    """
    try:
        # a successful % formatting means the name already has a counter slot
        base_filename % 1
    except TypeError:
        # bug fix: this branch previously referenced an undefined name
        # (model_save_each_file), raising NameError instead of building the name
        # so models.pt -> models_%04d.pt, etc
        pieces = os.path.splitext(base_filename)
        base_filename = pieces[0] + "_%04d" + pieces[1]
    return base_filename
stanza/stanza/models/common/vocab.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import copy
2
+ from collections import Counter, OrderedDict
3
+ from collections.abc import Iterable
4
+ import os
5
+ import pickle
6
+
7
# Special symbols shared by all vocabularies.  Their ids are fixed by their
# position in VOCAB_PREFIX, so the *_ID constants must stay in sync with it.
PAD = '<PAD>'
PAD_ID = 0
UNK = '<UNK>'
UNK_ID = 1
EMPTY = '<EMPTY>'
EMPTY_ID = 2
ROOT = '<ROOT>'
ROOT_ID = 3
VOCAB_PREFIX = [PAD, UNK, EMPTY, ROOT]
VOCAB_PREFIX_SIZE = len(VOCAB_PREFIX)
+
18
class BaseVocab:
    """ A base class for common vocabulary operations. Each subclass should at least
    implement its own build_vocab() function."""
    def __init__(self, data=None, lang="", idx=0, cutoff=0, lower=False):
        # data: training examples the vocab is built from (format is subclass-specific)
        # idx: which field of each word/record to read when building
        # cutoff: minimum frequency for a unit to be kept (used by e.g. CharVocab)
        # lower: lowercase units before lookup
        self.data = data
        self.lang = lang
        self.idx = idx
        self.cutoff = cutoff
        self.lower = lower
        if data is not None:
            self.build_vocab()
        # attributes serialized by state_dict(); _unit2id/_id2unit come from build_vocab
        self.state_attrs = ['lang', 'idx', 'cutoff', 'lower', '_unit2id', '_id2unit']

    def build_vocab(self):
        raise NotImplementedError("This BaseVocab does not have build_vocab implemented. This method should create _id2unit and _unit2id")

    def state_dict(self):
        """ Returns a dictionary containing all states that are necessary to recover
        this vocab. Useful for serialization."""
        state = OrderedDict()
        for attr in self.state_attrs:
            if hasattr(self, attr):
                state[attr] = getattr(self, attr)
        return state

    @classmethod
    def load_state_dict(cls, state_dict):
        """ Returns a new Vocab instance constructed from a state dict. """
        new = cls()
        for attr, value in state_dict.items():
            setattr(new, attr, value)
        return new

    def normalize_unit(self, unit):
        """Apply the vocab's normalization (currently just lowercasing) to a unit."""
        # be sure to look in subclasses for other normalization being done
        # especially PretrainWordVocab
        if unit is None:
            return unit
        if self.lower:
            return unit.lower()
        return unit

    def unit2id(self, unit):
        """Map a unit to its id, falling back to the UNK id for unseen units."""
        unit = self.normalize_unit(unit)
        if unit in self._unit2id:
            return self._unit2id[unit]
        else:
            return self._unit2id[UNK]

    def id2unit(self, id):
        """Map an id back to its unit."""
        return self._id2unit[id]

    def map(self, units):
        """Convert a sequence of units to a list of ids."""
        return [self.unit2id(x) for x in units]

    def unmap(self, ids):
        """Convert a sequence of ids back to a list of units."""
        return [self.id2unit(x) for x in ids]

    def __str__(self):
        lang_str = "(%s)" % self.lang if self.lang else ""
        name = str(type(self)) + lang_str
        return "<%s: %s>" % (name, self._id2unit)

    def __len__(self):
        return len(self._id2unit)

    def __getitem__(self, key):
        # str -> id lookup; int (or list, for composite subclasses) -> unit lookup
        if isinstance(key, str):
            return self.unit2id(key)
        elif isinstance(key, int) or isinstance(key, list):
            return self.id2unit(key)
        else:
            raise TypeError("Vocab key must be one of str, list, or int")

    def __contains__(self, key):
        return self.normalize_unit(key) in self._unit2id

    @property
    def size(self):
        return len(self)
+
99
class DeltaVocab(BaseVocab):
    """
    A vocab that starts off with a BaseVocab, then possibly adds more tokens based on the text in the given data

    Currently meant only for characters, such as built by MWT or Lemma

    Expected data format is either a list of strings, or a list of list of strings
    """
    def __init__(self, data, orig_vocab):
        # keep the original vocab so its ids stay stable; new characters are
        # appended after the originals
        self.orig_vocab = orig_vocab
        super().__init__(data=data, lang=orig_vocab.lang, idx=orig_vocab.idx, cutoff=orig_vocab.cutoff, lower=orig_vocab.lower)

    def build_vocab(self):
        # flatten the data into one string of characters, handling both the
        # list-of-strings and list-of-list-of-strings formats
        if all(isinstance(word, str) for word in self.data):
            allchars = "".join(self.data)
        else:
            allchars = "".join([word for sentence in self.data for word in sentence])

        # characters not known to the original vocab get appended, preserving
        # every existing id; if nothing is new, the original tables are shared
        unk = [c for c in allchars if c not in self.orig_vocab._unit2id]
        if len(unk) > 0:
            unk = sorted(set(unk))
            self._id2unit = self.orig_vocab._id2unit + unk
            self._unit2id = dict(self.orig_vocab._unit2id)
            for c in unk:
                self._unit2id[c] = len(self._unit2id)
        else:
            self._id2unit = self.orig_vocab._id2unit
            self._unit2id = self.orig_vocab._unit2id
+
128
class CompositeVocab(BaseVocab):
    ''' Vocabulary class that handles parsing and printing composite values such as
    compositional XPOS and universal morphological features (UFeats).

    Two key options are `keyed` and `sep`. `sep` specifies the separator used between
    different parts of the composite values, which is `|` for UFeats, for example.
    If `keyed` is `True`, then the incoming value is treated similarly to UFeats, where
    each part is a key/value pair separated by an equal sign (`=`). There is no inherent
    order to the keys, and we sort them alphabetically for serialization and deserialization.
    Whenever a part is absent, its internal value is a special `<EMPTY>` symbol that will
    be treated accordingly when generating the output. If `keyed` is `False`, then the parts
    are treated as positioned values, and `<EMPTY>` is used to pad parts at the end when the
    incoming value is not long enough.'''

    def __init__(self, data=None, lang="", idx=0, sep="", keyed=False):
        self.sep = sep
        self.keyed = keyed
        super().__init__(data, lang, idx=idx)
        self.state_attrs += ['sep', 'keyed']

    def unit2parts(self, unit):
        # unpack parts of a unit: a dict for keyed vocabs, a list otherwise
        if not self.sep:
            # empty separator means each character is its own part
            parts = [x for x in unit]
        else:
            parts = unit.split(self.sep)
        if self.keyed:
            if len(parts) == 1 and parts[0] == '_':
                return dict()
            parts = [x.split('=') for x in parts]
            if any(len(x) != 2 for x in parts):
                raise ValueError('Received "%s" for a dictionary which is supposed to be keyed, eg the entries should all be of the form key=value and separated by %s' % (unit, self.sep))

            # Just treat multi-valued properties values as one possible value
            parts = dict(parts)
        elif unit == '_':
            # '_' is the CoNLL-U marker for "no value"
            parts = []
        return parts

    def unit2id(self, unit):
        # returns one id per sub-vocab (per key when keyed, per position otherwise)
        parts = self.unit2parts(unit)
        if self.keyed:
            # treat multi-valued properties as singletons
            return [self._unit2id[k].get(parts[k], UNK_ID) if k in parts else EMPTY_ID for k in self._unit2id]
        else:
            return [self._unit2id[i].get(parts[i], UNK_ID) if i < len(parts) else EMPTY_ID for i in range(len(self._unit2id))]

    def id2unit(self, id):
        # special case: allow single ids for vocabs with length 1
        if len(self._id2unit) == 1 and not isinstance(id, Iterable):
            id = (id,)
        items = []
        for v, k in zip(id, self._id2unit.keys()):
            # EMPTY parts are simply omitted from the output
            if v == EMPTY_ID: continue
            if self.keyed:
                items.append("{}={}".format(k, self._id2unit[k][v]))
            else:
                items.append(self._id2unit[k][v])
        if self.sep is not None:
            res = self.sep.join(items)
            if res == "":
                res = "_"
            return res
        else:
            return items

    def build_vocab(self):
        allunits = [w[self.idx] for sent in self.data for w in sent]
        if self.keyed:
            self._id2unit = dict()

            for u in allunits:
                parts = self.unit2parts(u)
                for key in parts:
                    if key not in self._id2unit:
                        self._id2unit[key] = copy(VOCAB_PREFIX)

                    # treat multi-valued properties as singletons
                    if parts[key] not in self._id2unit[key]:
                        self._id2unit[key].append(parts[key])

            # special handle for the case where upos/xpos/ufeats are always empty
            if len(self._id2unit) == 0:
                self._id2unit['_'] = copy(VOCAB_PREFIX) # use an arbitrary key

        else:
            self._id2unit = dict()

            # note: a dead `maxlen` computation and a redundant always-true
            # `i < len(parts)` guard were removed from this branch
            allparts = [self.unit2parts(u) for u in allunits]

            for parts in allparts:
                for i, p in enumerate(parts):
                    if i not in self._id2unit:
                        self._id2unit[i] = copy(VOCAB_PREFIX)
                    if p not in self._id2unit[i]:
                        self._id2unit[i].append(p)

            # special handle for the case where upos/xpos/ufeats are always empty
            if len(self._id2unit) == 0:
                self._id2unit[0] = copy(VOCAB_PREFIX) # use an arbitrary key

        self._id2unit = OrderedDict([(k, self._id2unit[k]) for k in sorted(self._id2unit.keys())])
        self._unit2id = {k: {w:i for i, w in enumerate(self._id2unit[k])} for k in self._id2unit}

    def lens(self):
        """Number of entries in each sub-vocab, in key order."""
        return [len(self._unit2id[k]) for k in self._unit2id]

    def items(self, idx):
        """The unit list for a single key / position."""
        return self._id2unit[idx]

    def __str__(self):
        pieces = ["[" + ",".join(x) + "]" for _, x in self._id2unit.items()]
        rep = "<{}:\n {}>".format(type(self), "\n ".join(pieces))
        return rep
+
244
class BaseMultiVocab:
    """ A convenient vocab container that can store multiple BaseVocab instances, and support
    safe serialization of all instances via state dicts. Each subclass of this base class
    should implement the load_state_dict() function to specify how a saved state dict
    should be loaded back."""
    def __init__(self, vocab_dict=None):
        # insertion order is preserved so state_dict round-trips deterministically
        self._vocabs = OrderedDict()
        if vocab_dict is None:
            return
        # check all values provided must be a subclass of the Vocab base class
        assert all([isinstance(v, BaseVocab) for v in vocab_dict.values()])
        for k, v in vocab_dict.items():
            self._vocabs[k] = v

    def __setitem__(self, key, item):
        self._vocabs[key] = item

    def __getitem__(self, key):
        return self._vocabs[key]

    def __str__(self):
        return "<{}: [{}]>".format(type(self), ", ".join(self._vocabs.keys()))

    def __contains__(self, key):
        return key in self._vocabs

    def keys(self):
        return self._vocabs.keys()

    def state_dict(self):
        """ Build a state dict by iteratively calling state_dict() of all vocabs. """
        state = OrderedDict()
        for k, v in self._vocabs.items():
            state[k] = v.state_dict()
        return state

    @classmethod
    def load_state_dict(cls, state_dict):
        """ Construct a MultiVocab by reading from a state dict."""
        raise NotImplementedError
+ raise NotImplementedError
284
+
285
+
286
+
287
class CharVocab(BaseVocab):
    """A character-level vocab; ids are sorted by descending frequency after the VOCAB_PREFIX symbols."""
    def build_vocab(self):
        if isinstance(self.data[0][0], (list, tuple)): # general data from DataLoader
            counter = Counter([c for sent in self.data for w in sent for c in w[self.idx]])
            # the frequency cutoff is only applied to this data format
            for k in list(counter.keys()):
                if counter[k] < self.cutoff:
                    del counter[k]
        else: # special data from Char LM
            counter = Counter([c for sent in self.data for c in sent])
        # sort by (count, char) descending so ids are stable across runs
        self._id2unit = VOCAB_PREFIX + list(sorted(list(counter.keys()), key=lambda k: (counter[k], k), reverse=True))
        self._unit2id = {w:i for i, w in enumerate(self._id2unit)}
298
+
stanza/stanza/models/constituency/base_model.py ADDED
@@ -0,0 +1,532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The BaseModel is passed to the transitions so that the transitions
3
+ can operate on a parsing state without knowing the exact
4
+ representation used in the model.
5
+
6
+ For example, a SimpleModel simply looks at the top of the various stacks in the state.
7
+
8
+ A model with LSTM representations for the different transitions may
9
+ attach the hidden and output states of the LSTM to the word /
10
+ constituent / transition stacks.
11
+
12
+ Reminder: the parsing state is a list of words to parse, the
13
+ transitions used to build a (possibly incomplete) parse, and the
14
+ constituent(s) built so far by those transitions. Each of these
15
+ components are represented using stacks to improve the efficiency
16
+ of operations such as "combine the most recent 4 constituents"
17
+ or "turn the next input word into a constituent"
18
+ """
19
+
20
+ from abc import ABC, abstractmethod
21
+ from collections import defaultdict
22
+ import logging
23
+
24
+ import torch
25
+
26
+ from stanza.models.common import utils
27
+ from stanza.models.constituency import transition_sequence
28
+ from stanza.models.constituency.parse_transitions import TransitionScheme, CloseConstituent
29
+ from stanza.models.constituency.parse_tree import Tree
30
+ from stanza.models.constituency.state import State
31
+ from stanza.models.constituency.tree_stack import TreeStack
32
+ from stanza.server.parser_eval import ParseResult, ScoredTree
33
+
34
+ # default unary limit. some treebanks may have longer chains (CTB, for example)
35
+ UNARY_LIMIT = 4
36
+
37
+ logger = logging.getLogger('stanza.constituency.trainer')
38
+
39
+ class BaseModel(ABC):
40
+ """
41
+ This base class defines abstract methods for manipulating a State.
42
+
43
+ Applying transitions may change important metadata about a State
44
+ such as the vectors associated with LSTM hidden states, for example.
45
+
46
+ The constructor forwards all unused arguments to other classes in the
47
+ constructor sequence, so put this before other classes such as nn.Module
48
+ """
49
def __init__(self, transition_scheme, unary_limit, reverse_sentence, root_labels, *args, **kwargs):
    """Record the parsing configuration shared by all models.

    Unused positional/keyword arguments are forwarded to the next class in
    the MRO so this mixin can sit in front of classes such as nn.Module.
    """
    super().__init__(*args, **kwargs)  # forwards all unused arguments

    self._transition_scheme = transition_scheme
    self._unary_limit = unary_limit
    self._reverse_sentence = reverse_sentence
    self._root_labels = sorted(root_labels)

    # `in` on a tuple of Enum members compares with ==, which for Enum is
    # identity — equivalent to the chained `is` comparisons
    self._is_top_down = self._transition_scheme in (TransitionScheme.TOP_DOWN,
                                                    TransitionScheme.TOP_DOWN_UNARY,
                                                    TransitionScheme.TOP_DOWN_COMPOUND)
60
+
61
+ @abstractmethod
62
+ def initial_word_queues(self, tagged_word_lists):
63
+ """
64
+ For each list of tagged words, builds a TreeStack of word nodes
65
+
66
+ The word lists should be backwards so that the first word is the last word put on the stack (LIFO)
67
+ """
68
+
69
+ @abstractmethod
70
+ def initial_transitions(self):
71
+ """
72
+ Builds an initial transition stack with whatever values need to go into first position
73
+ """
74
+
75
+ @abstractmethod
76
+ def initial_constituents(self):
77
+ """
78
+ Builds an initial constituent stack with whatever values need to go into first position
79
+ """
80
+
81
+ @abstractmethod
82
+ def get_word(self, word_node):
83
+ """
84
+ Get the word corresponding to this position in the word queue
85
+ """
86
+
87
+ @abstractmethod
88
+ def transform_word_to_constituent(self, state):
89
+ """
90
+ Transform the top node of word_queue to something that can push on the constituent stack
91
+ """
92
+
93
+ @abstractmethod
94
+ def dummy_constituent(self, dummy):
95
+ """
96
+ When using a dummy node as a sentinel, transform it to something usable by this model
97
+ """
98
+
99
+ @abstractmethod
100
+ def build_constituents(self, labels, children_lists):
101
+ """
102
+ Build multiple constituents at once. This gives the opportunity for batching operations
103
+ """
104
+
105
+ @abstractmethod
106
+ def push_constituents(self, constituent_stacks, constituents):
107
+ """
108
+ Add a multiple constituents to multiple constituent_stacks
109
+
110
+ Useful to factor this out in case batching will help
111
+ """
112
+
113
+ @abstractmethod
114
+ def get_top_constituent(self, constituents):
115
+ """
116
+ Get the first constituent from the constituent stack
117
+
118
+ For example, a model might want to remove embeddings and LSTM state vectors
119
+ """
120
+
121
+ @abstractmethod
122
+ def push_transitions(self, transition_stacks, transitions):
123
+ """
124
+ Add a multiple transitions to multiple transition_stacks
125
+
126
+ Useful to factor this out in case batching will help
127
+ """
128
+
129
+ @abstractmethod
130
+ def get_top_transition(self, transitions):
131
+ """
132
+ Get the first transition from the transition stack
133
+
134
+ For example, a model might want to remove transition embeddings before returning the transition
135
+ """
136
+
137
+ @property
138
+ def root_labels(self):
139
+ """
140
+ Return ROOT labels for this model. Probably ROOT, TOP, or both
141
+
142
+ (Danish uses 's', though)
143
+ """
144
+ return self._root_labels
145
+
146
+ def unary_limit(self):
147
+ """
148
+ Limit on the number of consecutive unary transitions
149
+ """
150
+ return self._unary_limit
151
+
152
+
153
+ def transition_scheme(self):
154
+ """
155
+ Transition scheme used - see parse_transitions
156
+ """
157
+ return self._transition_scheme
158
+
159
+ def has_unary_transitions(self):
160
+ """
161
+ Whether or not this model uses unary transitions, based on transition_scheme
162
+ """
163
+ return self._transition_scheme is TransitionScheme.TOP_DOWN_UNARY
164
+
165
+ @property
166
+ def is_top_down(self):
167
+ """
168
+ Whether or not this model is TOP_DOWN
169
+ """
170
+ return self._is_top_down
171
+
172
+ @property
173
+ def reverse_sentence(self):
174
+ """
175
+ Whether or not this model is built to parse backwards
176
+ """
177
+ return self._reverse_sentence
178
+
179
+ def predict(self, states, is_legal=True):
180
+ raise NotImplementedError("LSTMModel can predict, but SimpleModel cannot")
181
+
182
+ def weighted_choice(self, states):
183
+ raise NotImplementedError("LSTMModel can weighted_choice, but SimpleModel cannot")
184
+
185
def predict_gold(self, states, is_legal=True):
    """
    For each State, return the next item in the gold_sequence

    Returns the same (scores, transitions, logits) triple shape as
    predict(), with no scores attached.
    """
    transitions = [state.gold_sequence[state.num_transitions] for state in states]
    if is_legal:
        for state, trans in zip(states, transitions):
            if trans.is_legal(state, self):
                continue
            raise RuntimeError("Transition {}:{} was not legal in a transition sequence:\nOriginal tree: {}\nTransitions: {}".format(state.num_transitions, trans, state.gold_tree, state.gold_sequence))
    return None, transitions, None
195
+
196
def initial_state_from_preterminals(self, preterminal_lists, gold_trees, gold_sequences):
    """
    what is passed in should be a list of list of preterminals
    """
    word_queues = self.initial_word_queues(preterminal_lists)
    # the bottoms of these TreeStacks are shared between all the States
    transitions = self.initial_transitions()
    constituents = self.initial_constituents()
    states = []
    for wq in word_queues:
        # -2 because the queue starts and ends with a sentinel
        states.append(State(sentence_length=len(wq) - 2,
                            num_opens=0,
                            word_queue=wq,
                            gold_tree=None,
                            gold_sequence=None,
                            transitions=transitions,
                            constituents=constituents,
                            word_position=0,
                            score=0.0))
    if gold_trees:
        states = [state._replace(gold_tree=tree) for tree, state in zip(gold_trees, states)]
    if gold_sequences:
        states = [state._replace(gold_sequence=seq) for seq, state in zip(gold_sequences, states)]
    return states
219
+
220
def initial_state_from_words(self, word_lists):
    """Build initial States from lists of (word, tag) pairs, one list per sentence."""
    preterminal_lists = []
    for words in word_lists:
        preterminal_lists.append([Tree(tag, Tree(word)) for word, tag in words])
    return self.initial_state_from_preterminals(preterminal_lists, gold_trees=None, gold_sequences=None)
224
+
225
def initial_state_from_gold_trees(self, trees, gold_sequences=None):
    """Build initial States from gold trees, keeping the gold tree attached to each State."""
    preterminal_lists = [[Tree(preterminal.label, Tree(preterminal.children[0].label))
                          for preterminal in tree.yield_preterminals()]
                         for tree in trees]
    return self.initial_state_from_preterminals(preterminal_lists, gold_trees=trees, gold_sequences=gold_sequences)
230
+
231
def build_batch_from_trees(self, batch_size, data_iterator):
    """
    Read from the data_iterator batch_size trees and turn them into new parsing states
    """
    state_batch = []
    while len(state_batch) < batch_size:
        gold_tree = next(data_iterator, None)
        if gold_tree is None:
            break
        state_batch.append(gold_tree)

    if state_batch:
        state_batch = self.initial_state_from_gold_trees(state_batch)
    return state_batch
245
+
246
def build_batch_from_trees_with_gold_sequence(self, batch_size, data_iterator):
    """
    Same as build_batch_from_trees, but use the model parameters to turn the trees into gold sequences and include the sequence
    """
    state_batch = self.build_batch_from_trees(batch_size, data_iterator)
    if not state_batch:
        return state_batch

    gold_sequences = transition_sequence.build_treebank([state.gold_tree for state in state_batch], self.transition_scheme(), self.reverse_sentence)
    return [state._replace(gold_sequence=sequence)
            for state, sequence in zip(state_batch, gold_sequences)]
257
+
258
def build_batch_from_tagged_words(self, batch_size, data_iterator):
    """
    Read from the data_iterator batch_size tagged sentences and turn them into new parsing states

    Expects a list of list of (word, tag)
    """
    state_batch = []
    while len(state_batch) < batch_size:
        sentence = next(data_iterator, None)
        if sentence is None:
            break
        state_batch.append(sentence)

    if state_batch:
        state_batch = self.initial_state_from_words(state_batch)
    return state_batch
274
+
275
+
276
def parse_sentences(self, data_iterator, build_batch_fn, batch_size, transition_choice, keep_state=False, keep_constituents=False, keep_scores=False):
    """
    Repeat transitions to build a list of trees from the input batches.

    data_iterator yields the raw items to parse; build_batch_fn turns up
    to batch_size of them into State objects.  Batches are refilled as
    individual parses finish, so the batch stays full until the iterator
    is exhausted.

    transition_choice: which method of the model chooses the next
    transition (predict for the model's prediction, predict_gold to use
    the gold sequence).

    Returns a list of ParseResult, one per input, restored to input order.
    If keep_scores is true, each result's score is the sum of the values
    returned by the model for its transitions.
    """
    treebank = []
    treebank_indices = []
    state_batch = build_batch_fn(batch_size, data_iterator)
    # parses finish at different times; remembering each state's original
    # index lets us unsort the finished treebank at the end
    batch_indices = list(range(len(state_batch)))
    horizon_iterator = iter([])

    if keep_constituents:
        constituents = defaultdict(list)

    while state_batch:
        pred_scores, transitions, scores = transition_choice(state_batch)
        if keep_scores and scores is not None:
            state_batch = [state._replace(score=state.score + score)
                           for state, score in zip(state_batch, scores)]
        state_batch = self.bulk_apply(state_batch, transitions)

        if keep_constituents:
            for t_idx, transition in enumerate(transitions):
                if isinstance(transition, CloseConstituent):
                    # state.constituents is a TreeStack; .value is the stack
                    # node and .value.value the Constituent itself (tree +
                    # embedding)
                    constituents[batch_indices[t_idx]].append(state_batch[t_idx].constituents.value.value)

        finished = set()
        for idx, state in enumerate(state_batch):
            if not state.finished(self):
                continue
            predicted_tree = state.get_tree(self)
            if self.reverse_sentence:
                predicted_tree = predicted_tree.reverse()
            treebank.append(ParseResult(state.gold_tree,
                                        [ScoredTree(predicted_tree, state.score)],
                                        state if keep_state else None,
                                        constituents[batch_indices[idx]] if keep_constituents else None))
            treebank_indices.append(batch_indices[idx])
            finished.add(idx)

        if finished:
            state_batch = [state for idx, state in enumerate(state_batch) if idx not in finished]
            batch_indices = [b_idx for idx, b_idx in enumerate(batch_indices) if idx not in finished]

        # top the batch back up from the horizon iterator / data iterator
        for _ in range(batch_size - len(state_batch)):
            horizon_state = next(horizon_iterator, None)
            if not horizon_state:
                horizon_batch = build_batch_fn(batch_size, data_iterator)
                if len(horizon_batch) == 0:
                    break
                horizon_iterator = iter(horizon_batch)
                horizon_state = next(horizon_iterator, None)

            state_batch.append(horizon_state)
            batch_indices.append(len(treebank) + len(state_batch))

    return utils.unsort(treebank, treebank_indices)
347
+
348
def parse_sentences_no_grad(self, data_iterator, build_batch_fn, batch_size, transition_choice, keep_state=False, keep_constituents=False, keep_scores=False):
    """
    Inference-time wrapper around parse_sentences.

    Runs under torch.no_grad() so gradients are not tracked, which makes
    the model faster and lighter on memory at inference time.
    """
    with torch.no_grad():
        return self.parse_sentences(data_iterator, build_batch_fn, batch_size, transition_choice, keep_state, keep_constituents, keep_scores)
357
+
358
def analyze_trees(self, trees, batch_size=None, keep_state=True, keep_constituents=True, keep_scores=True):
    """
    Return a ParseResult for each tree in the trees list

    The transitions run are exactly those represented by each gold tree;
    the model's output layers are available in result.state.

    keep_state defaults to True here because a method which keeps the
    grad is likely to want the resulting state as well.
    """
    if batch_size is None:
        # TODO: refactor?
        batch_size = self.args['eval_batch_size']
    return self.parse_sentences(iter(trees), self.build_batch_from_trees_with_gold_sequence, batch_size, self.predict_gold, keep_state, keep_constituents, keep_scores=keep_scores)
374
+
375
def parse_tagged_words(self, words, batch_size):
    """
    This parses tagged words and returns a list of trees.

    `parse_tagged_words` is useful at Pipeline time -
    it takes words & tags and processes that into trees.

    The tagged words should be represented:
      one list per sentence
      each sentence is a list of (word, tag)
    The return value is a list of ParseTree objects
    """
    logger.debug("Processing %d sentences", len(words))
    # inference only: switch off dropout etc
    self.eval()

    treebank = self.parse_sentences_no_grad(iter(words), self.build_batch_from_tagged_words, batch_size, self.predict, keep_state=False, keep_constituents=False)

    # each ParseResult carries one ScoredTree; return just the trees
    return [result.predictions[0].tree for result in treebank]
395
+
396
def bulk_apply(self, state_batch, transitions, fail=False):
    """
    Apply the given list of Transitions to the given list of States, using the model as a reference

    state_batch: list of States
    transitions: list of transitions, one per state
    fail: throw an exception on a failed transition, as opposed to skipping the tree

    Returns the updated list of States.  States with a missing transition
    or a runaway transition count are dropped (or raise if fail=True).
    """
    dropped = set()

    word_positions = []
    constituent_stacks = []
    new_constituents = []
    callbacks = defaultdict(list)

    for idx, (state, transition) in enumerate(zip(state_batch, transitions)):
        if not transition:
            error = "Got stuck and couldn't find a legal transition on the following gold tree:\n{}\n\nFinal state:\n{}".format(state.gold_tree, state.to_string(self))
            if fail:
                raise ValueError(error)
            logger.error(error)
            dropped.add(idx)
            continue

        # x20 is somewhat empirically chosen based on certain treebanks
        # having deep unary structures, especially early on when the
        # model is fumbling around
        if state.num_transitions >= len(state.word_queue) * 20:
            if state.gold_tree:
                error = "Went infinite on the following gold tree:\n{}\n\nFinal state:\n{}".format(state.gold_tree, state.to_string(self))
            else:
                error = "Went infinite!:\nFinal state:\n{}".format(state.to_string(self))
            if fail:
                raise ValueError(error)
            logger.error(error)
            dropped.add(idx)
            continue

        wq, stack, constituent, callback = transition.update_state(state, self)

        word_positions.append(wq)
        constituent_stacks.append(stack)
        new_constituents.append(constituent)
        if callback:
            # indexed by position in new_constituents, not `idx`,
            # in case something was dropped above
            callbacks[callback].append(len(new_constituents) - 1)

    # batch-build the constituents which asked for a callback
    for key, positions in callbacks.items():
        built = key.build_constituents(self, [new_constituents[x] for x in positions])
        for position, constituent in zip(positions, built):
            new_constituents[position] = constituent

    if dropped:
        state_batch = [state for idx, state in enumerate(state_batch) if idx not in dropped]
        transitions = [trans for idx, trans in enumerate(transitions) if idx not in dropped]

    if not state_batch:
        return state_batch

    new_transitions = self.push_transitions([state.transitions for state in state_batch], transitions)
    new_constituents = self.push_constituents(constituent_stacks, new_constituents)

    return [state._replace(num_opens=state.num_opens + transition.delta_opens(),
                           word_position=word_position,
                           transitions=transition_stack,
                           constituents=constituents)
            for (state, transition, word_position, transition_stack, constituents)
            in zip(state_batch, transitions, word_positions, new_transitions, new_constituents)]
471
+
472
class SimpleModel(BaseModel):
    """
    A model which pushes and pops with no extra data attached.

    Primarily used for testing operations which don't need the NN's
    weights, and for rebuilding trees from transitions when the NN state
    is irrelevant, as this class is much faster than the NN model.
    """
    def __init__(self, transition_scheme=TransitionScheme.TOP_DOWN_UNARY, unary_limit=UNARY_LIMIT, reverse_sentence=False, root_labels=("ROOT",)):
        super().__init__(transition_scheme=transition_scheme, unary_limit=unary_limit, reverse_sentence=reverse_sentence, root_labels=root_labels)

    def initial_word_queues(self, tagged_word_lists):
        # each queue is sentinel + words + sentinel, reversed wholesale
        # when the model parses the sentence backwards
        word_queues = []
        for tagged_words in tagged_word_lists:
            word_queue = [None] + list(tagged_words) + [None]
            if self.reverse_sentence:
                word_queue.reverse()
            word_queues.append(word_queue)
        return word_queues

    def initial_transitions(self):
        return TreeStack(value=None, parent=None, length=1)

    def initial_constituents(self):
        return TreeStack(value=None, parent=None, length=1)

    def get_word(self, word_node):
        return word_node

    def transform_word_to_constituent(self, state):
        return state.get_word(state.word_position)

    def dummy_constituent(self, dummy):
        return dummy

    def build_constituents(self, labels, children_lists):
        constituents = []
        for label, children in zip(labels, children_lists):
            if isinstance(label, str):
                label = (label,)
            # wrap the children from the innermost label outwards,
            # producing a (possibly unary) chain of Trees
            for value in reversed(label):
                children = Tree(label=value, children=children)
            constituents.append(children)
        return constituents

    def push_constituents(self, constituent_stacks, constituents):
        return [stack.push(constituent)
                for stack, constituent in zip(constituent_stacks, constituents)]

    def get_top_constituent(self, constituents):
        return constituents.value

    def push_transitions(self, transition_stacks, transitions):
        return [stack.push(transition)
                for stack, transition in zip(transition_stacks, transitions)]

    def get_top_transition(self, transitions):
        return transitions.value
stanza/stanza/models/constituency/base_trainer.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ import logging
3
+ import os
4
+
5
+ import torch
6
+
7
+ from pickle import UnpicklingError
8
+ import warnings
9
+
10
+ logger = logging.getLogger('stanza')
11
+
12
class ModelType(Enum):
    """Which flavor of constituency model a checkpoint holds."""
    LSTM = 1
    ENSEMBLE = 2
15
+
16
class BaseTrainer:
    """
    Common checkpointing / bookkeeping for constituency parser trainers.

    Wraps a model plus its optimizer & scheduler and tracks training
    progress (epochs, batches, best dev score) so training can resume
    from a checkpoint.  Subclasses provide model_type, get_peft_params,
    model_from_params, load_optimizer, and load_scheduler.
    """
    def __init__(self, model, optimizer=None, scheduler=None, epochs_trained=0, batches_trained=0, best_f1=0.0, best_epoch=0, first_optimizer=False):
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        # keeping track of the epochs trained will be useful
        # for adjusting the learning scheme
        self.epochs_trained = epochs_trained
        self.batches_trained = batches_trained
        self.best_f1 = best_f1
        self.best_epoch = best_epoch
        self.first_optimizer = first_optimizer

    def save(self, filename, save_optimizer=True):
        """Save model params + training progress (and optionally optimizer/scheduler state) to filename."""
        params = self.model.get_params()
        checkpoint = {
            'params': params,
            'epochs_trained': self.epochs_trained,
            'batches_trained': self.batches_trained,
            'best_f1': self.best_f1,
            'best_epoch': self.best_epoch,
            'model_type': self.model_type.name,
            'first_optimizer': self.first_optimizer,
        }
        checkpoint["bert_lora"] = self.get_peft_params()
        if save_optimizer and self.optimizer is not None:
            checkpoint['optimizer_state_dict'] = self.optimizer.state_dict()
            # a trainer can have an optimizer but no scheduler; previously
            # this raised AttributeError when self.scheduler was None
            if self.scheduler is not None:
                checkpoint['scheduler_state_dict'] = self.scheduler.state_dict()
        torch.save(checkpoint, filename, _use_new_zipfile_serialization=False)
        logger.info("Model saved to %s", filename)

    def log_norms(self):
        self.model.log_norms()

    def log_shapes(self):
        self.model.log_shapes()

    @property
    def transitions(self):
        return self.model.transitions

    @property
    def root_labels(self):
        return self.model.root_labels

    @property
    def device(self):
        return next(self.model.parameters()).device

    def train(self):
        return self.model.train()

    def eval(self):
        return self.model.eval()

    # TODO: make ABC with methods such as model_from_params?
    # TODO: if we save the type in the checkpoint, use that here to figure out which to load
    @staticmethod
    def load(filename, args=None, load_optimizer=False, foundation_cache=None, peft_name=None):
        """
        Load back a model and possibly its optimizer.

        If filename does not exist, args['save_dir'] is tried as a
        fallback directory before giving up with FileNotFoundError.
        """
        # hide the import here to avoid circular imports
        from stanza.models.constituency.ensemble import EnsembleTrainer
        from stanza.models.constituency.trainer import Trainer

        if not os.path.exists(filename):
            # guard args being None: previously this raised AttributeError
            # instead of the intended FileNotFoundError
            save_dir = args.get('save_dir', None) if args is not None else None
            if save_dir is None:
                raise FileNotFoundError("Cannot find model in {} and args['save_dir'] is None".format(filename))
            elif os.path.exists(os.path.join(save_dir, filename)):
                filename = os.path.join(save_dir, filename)
            else:
                raise FileNotFoundError("Cannot find model in {} or in {}".format(filename, os.path.join(save_dir, filename)))
        try:
            # TODO: currently cannot switch this to weights_only=True
            # without in some way changing the model to save enums in
            # a safe manner, probably by converting to int
            try:
                checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
            except UnpicklingError as e:
                checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=False)
                warnings.warn("The saved constituency parser has an old format using Enum, set, unsanitized Transitions, etc. This version of Stanza can support reading both the new and the old formats. Future versions will only allow loading with weights_only=True. Please resave the constituency parser using this version ASAP.")
        except BaseException:
            logger.exception("Cannot load model from %s", filename)
            raise
        logger.debug("Loaded model from %s", filename)

        params = checkpoint['params']

        if 'model_type' not in checkpoint:
            # old models will have this trait
            # TODO: can remove this after 1.10
            checkpoint['model_type'] = ModelType.LSTM
        if isinstance(checkpoint['model_type'], str):
            checkpoint['model_type'] = ModelType[checkpoint['model_type']]
        if checkpoint['model_type'] == ModelType.LSTM:
            clazz = Trainer
        elif checkpoint['model_type'] == ModelType.ENSEMBLE:
            clazz = EnsembleTrainer
        else:
            raise ValueError("Unexpected model type: %s" % checkpoint['model_type'])
        model = clazz.model_from_params(params, checkpoint.get('bert_lora', None), args, foundation_cache, peft_name)

        epochs_trained = checkpoint['epochs_trained']
        batches_trained = checkpoint.get('batches_trained', 0)
        best_f1 = checkpoint['best_f1']
        best_epoch = checkpoint['best_epoch']

        if 'first_optimizer' not in checkpoint:
            # this will only apply to old (LSTM) Trainers
            # EnsembleTrainers will always have this value saved
            # so here we can compensate by looking at the old training statistics...
            # we use params['config'] here instead of model.args
            # because the args might have a different training
            # mechanism, but in order to reload the optimizer, we need
            # to match the optimizer we build with the one that was
            # used at training time
            build_simple_adadelta = params['config']['multistage'] and epochs_trained < params['config']['epochs'] // 2
            checkpoint['first_optimizer'] = build_simple_adadelta
        first_optimizer = checkpoint['first_optimizer']

        if load_optimizer:
            optimizer = clazz.load_optimizer(model, checkpoint, first_optimizer, filename)
            scheduler = clazz.load_scheduler(model, optimizer, checkpoint, first_optimizer)
        else:
            optimizer = None
            scheduler = None

        if checkpoint['model_type'] == ModelType.LSTM:
            logger.debug("-- MODEL CONFIG --")
            for k in model.args.keys():
                logger.debug("  --%s: %s", k, model.args[k])
            return Trainer(model=model, optimizer=optimizer, scheduler=scheduler, epochs_trained=epochs_trained, batches_trained=batches_trained, best_f1=best_f1, best_epoch=best_epoch, first_optimizer=first_optimizer)
        elif checkpoint['model_type'] == ModelType.ENSEMBLE:
            return EnsembleTrainer(ensemble=model, optimizer=optimizer, scheduler=scheduler, epochs_trained=epochs_trained, batches_trained=batches_trained, best_f1=best_f1, best_epoch=best_epoch, first_optimizer=first_optimizer)
        else:
            raise ValueError("Unexpected model type: %s" % checkpoint['model_type'])
153
+
stanza/stanza/models/constituency/ensemble.py ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Prototype of ensembling N models together on the same dataset
3
+
4
+ The main inference method is to run the normal transition sequence,
5
+ but sum the scores for the N models and use that to choose the highest
6
+ scoring transition
7
+
8
+ Example of how to run it to build a silver dataset
9
+ (or just parse a text file in general):
10
+
11
+ # first, use this tool to build a saved ensemble
12
+ python3 stanza/models/constituency/ensemble.py
13
+ saved_models/constituency/wsj_inorder_?.pt
14
+ --save_name saved_models/constituency/en_ensemble.pt
15
+
16
+ # then use the ensemble directly as a model in constituency_parser.py
17
+ python3 stanza/models/constituency_parser.py
18
+ --save_name saved_models/constituency/en_ensemble.pt
19
+ --mode parse_text
20
+ --tokenized_file /nlp/scr/horatio/en_silver/en_split_100
21
+ --predict_file /nlp/scr/horatio/en_silver/en_split_100.inorder.mrg
22
+ --retag_package en_combined_bert
23
+ --lang en
24
+
25
+ then, ideally, run a second time with a set of topdown models,
26
+ then take the trees which match from the files
27
+ """
28
+
29
+
30
import argparse
from collections import defaultdict
import copy
import logging
import os

import torch
import torch.nn as nn

from stanza.models.common import utils
from stanza.models.common.foundation_cache import FoundationCache
from stanza.models.constituency.base_trainer import BaseTrainer, ModelType
from stanza.models.constituency.state import MultiState
from stanza.models.constituency.trainer import Trainer
from stanza.models.constituency.utils import build_optimizer, build_scheduler
from stanza.server.parser_eval import ParseResult, ScoredTree
45
+
46
+ logger = logging.getLogger('stanza.constituency.trainer')
47
+
48
class Ensemble(nn.Module):
    """
    An ensemble of N constituency parser models over the same transition scheme.

    At each parsing step every submodel scores the candidate transitions;
    the per-model scores are summed (plus a learned per-model,
    per-transition weighting) and the highest scoring legal transition is
    applied to every submodel's state in lockstep.
    """
    def __init__(self, args, filenames=None, models=None, foundation_cache=None):
        """
        Loads each model in filenames

        If foundation_cache is None, we build one on our own,
        as the expectation is the models will reuse modules
        such as pretrain, charlm, bert
        """
        super().__init__()

        self.args = args
        if filenames:
            if models:
                raise ValueError("both filenames and models set when making the Ensemble")

            if foundation_cache is None:
                foundation_cache = FoundationCache()

            if isinstance(filenames, str):
                filenames = [filenames]
            logger.info("Models used for ensemble:\n %s", "\n ".join(filenames))
            models = [Trainer.load(filename, args, load_optimizer=False, foundation_cache=foundation_cache).model for filename in filenames]
        elif not models:
            raise ValueError("filenames and models both not set!")

        self.models = nn.ModuleList(models)

        # all submodels must agree on the transition system, constituent
        # inventory, and sentence direction or joint decoding is impossible
        # NOTE(review): these error messages index filenames[0], which would
        # fail if the Ensemble was built from `models` (filenames is None)
        # and a mismatch occurs -- confirm callers pass compatible models
        for model_idx, model in enumerate(self.models):
            if self.models[0].transition_scheme() != model.transition_scheme():
                raise ValueError("Models {} and {} are incompatible. {} vs {}".format(filenames[0], filenames[model_idx], self.models[0].transition_scheme(), model.transition_scheme()))
            if self.models[0].transitions != model.transitions:
                raise ValueError(f"Models {filenames[0]} and {filenames[model_idx]} are incompatible: different transitions\n{filenames[0]}:\n{self.models[0].transitions}\n{filenames[model_idx]}:\n{model.transitions}")
            if self.models[0].constituents != model.constituents:
                raise ValueError("Models %s and %s are incompatible: different constituents" % (filenames[0], filenames[model_idx]))
            if self.models[0].root_labels != model.root_labels:
                raise ValueError("Models %s and %s are incompatible: different root_labels" % (filenames[0], filenames[model_idx]))
            if self.models[0].uses_xpos() != model.uses_xpos():
                raise ValueError("Models %s and %s are incompatible: different uses_xpos" % (filenames[0], filenames[model_idx]))
            if self.models[0].reverse_sentence != model.reverse_sentence:
                raise ValueError("Models %s and %s are incompatible: different reverse_sentence" % (filenames[0], filenames[model_idx]))

        self._reverse_sentence = self.models[0].reverse_sentence

        # submodels are not trained (so far)
        self.detach_submodels()

        logger.debug("Number of models in the Ensemble: %d", len(self.models))
        # learned (num_models x num_transitions) weights added on top of the
        # plain sum of submodel scores; the only trainable parameter here
        self.register_parameter('weighted_sum', torch.nn.Parameter(torch.zeros(len(self.models), len(self.transitions), requires_grad=True)))

    def detach_submodels(self):
        # submodels are not trained (so far)
        for model in self.models:
            for _, parameter in model.named_parameters():
                parameter.requires_grad = False

    def train(self, mode=True):
        super().train(mode)
        if mode:
            # peft has a weird interaction where it turns requires_grad back on
            # even if it was previously off
            self.detach_submodels()

    @property
    def transitions(self):
        # all submodels share identical transitions (checked in __init__)
        return self.models[0].transitions

    @property
    def root_labels(self):
        return self.models[0].root_labels

    @property
    def device(self):
        return next(self.parameters()).device

    def unary_limit(self):
        """
        Limit on the number of consecutive unary transitions
        """
        # the most conservative limit across all submodels
        return min(m.unary_limit() for m in self.models)

    def transition_scheme(self):
        return self.models[0].transition_scheme()

    def has_unary_transitions(self):
        return self.models[0].has_unary_transitions()

    @property
    def is_top_down(self):
        return self.models[0].is_top_down

    @property
    def reverse_sentence(self):
        return self._reverse_sentence

    @property
    def retag_method(self):
        # TODO: make the method an enum
        return self.models[0].args['retag_method']

    def uses_xpos(self):
        return self.models[0].uses_xpos()

    def get_top_constituent(self, constituents):
        return self.models[0].get_top_constituent(constituents)

    def get_top_transition(self, transitions):
        return self.models[0].get_top_transition(transitions)

    def log_norms(self):
        # log the ensemble's own trainable parameters, then each submodel's
        lines = ["NORMS FOR MODEL PARAMETERS"]
        for name, param in self.named_parameters():
            if param.requires_grad and not name.startswith("models."):
                zeros = torch.sum(param.abs() < 0.000001).item()
                norm = "%.6g" % torch.norm(param).item()
                lines.append("%s %s %d %d" % (name, norm, zeros, param.nelement()))
        for model_idx, model in enumerate(self.models):
            sublines = model.get_norms()
            if len(sublines) > 0:
                lines.append(" ---- MODEL %d ----" % model_idx)
                lines.extend(sublines)
        logger.info("\n".join(lines))

    def log_shapes(self):
        # NOTE(review): header string says NORMS but this logs shapes --
        # looks like a copy-paste from log_norms
        lines = ["NORMS FOR MODEL PARAMETERS"]
        for name, param in self.named_parameters():
            if param.requires_grad:
                lines.append("{} {}".format(name, param.shape))
        logger.info("\n".join(lines))

    def get_params(self):
        """Return a checkpoint dict: ensemble-level params plus each child's params."""
        model_state = self.state_dict()
        # don't save the children in the base params
        model_state = {k: v for k, v in model_state.items() if not k.startswith("models.")}
        return {
            "base_params": model_state,
            "children_params": [x.get_params() for x in self.models]
        }

    def initial_state_from_preterminals(self, preterminal_lists, gold_trees, gold_sequences):
        # build one initial state per submodel, then zip them together into
        # one MultiState per sentence
        state_batch = [model.initial_state_from_preterminals(preterminal_lists, gold_trees, gold_sequences) for model in self.models]
        state_batch = list(zip(*state_batch))
        state_batch = [MultiState(states, gold_tree, gold_sequence, 0.0)
                       for states, gold_tree, gold_sequence in zip(state_batch, gold_trees, gold_sequences)]
        return state_batch

    def build_batch_from_tagged_words(self, batch_size, data_iterator):
        """
        Read from the data_iterator batch_size tagged sentences and turn them into new parsing states

        Expects a list of list of (word, tag)
        """
        state_batch = []
        for _ in range(batch_size):
            sentence = next(data_iterator, None)
            if sentence is None:
                break
            state_batch.append(sentence)

        if len(state_batch) > 0:
            state_batch = [model.initial_state_from_words(state_batch) for model in self.models]
            state_batch = list(zip(*state_batch))
            state_batch = [MultiState(states, None, None, 0.0) for states in state_batch]
        return state_batch

    def build_batch_from_trees(self, batch_size, data_iterator):
        """
        Read from the data_iterator batch_size trees and turn them into N lists of parsing states
        """
        state_batch = []
        for _ in range(batch_size):
            gold_tree = next(data_iterator, None)
            if gold_tree is None:
                break
            state_batch.append(gold_tree)

        if len(state_batch) > 0:
            state_batch = [model.initial_state_from_gold_trees(state_batch) for model in self.models]
            state_batch = list(zip(*state_batch))
            state_batch = [MultiState(states, None, None, 0.0) for states in state_batch]
        return state_batch

    def predict(self, states, is_legal=True):
        """
        Score the next transition for each state using all submodels.

        Returns (summed prediction scores, chosen transitions, chosen scores).
        When is_legal, an illegal top choice falls back to the highest
        scoring legal transition; None when nothing legal is found.
        """
        # regroup the MultiStates into one batch of states per submodel
        states = list(zip(*[x.states for x in states]))
        predictions = [model.forward(state_batch) for model, state_batch in zip(self.models, states)]

        # batch X num transitions X num models
        predictions = torch.stack(predictions, dim=2)

        # learned per-model, per-transition weights added to the plain sum
        flat_predictions = torch.einsum("BTM,MT->BT", predictions, self.weighted_sum)
        predictions = torch.sum(predictions, dim=2) + flat_predictions

        model = self.models[0]

        # TODO: possibly refactor with lstm_model.predict
        pred_max = torch.argmax(predictions, dim=1)
        scores = torch.take_along_dim(predictions, pred_max.unsqueeze(1), dim=1)
        pred_max = pred_max.detach().cpu()

        pred_trans = [model.transitions[pred_max[idx]] for idx in range(len(states[0]))]
        if is_legal:
            # legality is checked against model 0's state; all submodels use
            # the same transition scheme (checked in __init__)
            for idx, (state, trans) in enumerate(zip(states[0], pred_trans)):
                if not trans.is_legal(state, model):
                    _, indices = predictions[idx, :].sort(descending=True)
                    for index in indices:
                        if model.transitions[index].is_legal(state, model):
                            pred_trans[idx] = model.transitions[index]
                            scores[idx] = predictions[idx, index]
                            break
                    else: # yeah, else on a for loop, deal with it
                        # NOTE(review): assigning None into a tensor element
                        # would raise; presumably this path is unreachable
                        # because some transition is always legal -- confirm
                        pred_trans[idx] = None
                        scores[idx] = None

        return predictions, pred_trans, scores.squeeze(1)

    def bulk_apply(self, state_batch, transitions, fail=False):
        """Apply the chosen transitions to every submodel's copy of each state."""
        # NOTE(review): new_states is unused
        new_states = []

        states = list(zip(*[x.states for x in state_batch]))
        states = [x.bulk_apply(y, transitions, fail=fail) for x, y in zip(self.models, states)]
        states = list(zip(*states))
        state_batch = [x._replace(states=y) for x, y in zip(state_batch, states)]
        return state_batch

    def parse_tagged_words(self, words, batch_size):
        """
        This parses tagged words and returns a list of trees.

        `parse_tagged_words` is useful at Pipeline time -
        it takes words & tags and processes that into trees.

        The tagged words should be represented:
          one list per sentence
          each sentence is a list of (word, tag)
        The return value is a list of ParseTree objects

        TODO: this really ought to be refactored with base_model
        """
        logger.debug("Processing %d sentences", len(words))
        self.eval()

        sentence_iterator = iter(words)
        treebank = self.parse_sentences_no_grad(sentence_iterator, self.build_batch_from_tagged_words, batch_size, self.predict, keep_state=False, keep_constituents=False)

        results = [t.predictions[0].tree for t in treebank]
        return results

    def parse_sentences(self, data_iterator, build_batch_fn, batch_size, transition_choice, keep_state=False, keep_constituents=False, keep_scores=False):
        """
        Repeat transitions to build a list of trees from the input batches.

        The data_iterator should be anything which returns the data for a parse task via next()
        build_batch_fn is a function that turns that data into State objects
        This will be called to generate batches of size batch_size until the data is exhausted

        The return is a list of tuples: (gold_tree, [(predicted, score) ...])
        gold_tree will be left blank if the data did not include gold trees
        currently score is always 1.0, but the interface may be expanded
        to get a score from the result of the parsing

        transition_choice: which method of the model to use for
        choosing the next transition

        TODO: refactor with base_model
        """
        # NOTE(review): keep_state and keep_scores are accepted but unused here
        treebank = []
        treebank_indices = []
        # this will produce tuples of states
        # batch size lists of num models tuples
        state_batch = build_batch_fn(batch_size, data_iterator)
        batch_indices = list(range(len(state_batch)))
        horizon_iterator = iter([])

        if keep_constituents:
            # NOTE(review): requires `from collections import defaultdict` at
            # module level -- confirm it is imported; the dict is also never
            # filled in below, so this flag currently has no visible effect
            constituents = defaultdict(list)

        while len(state_batch) > 0:
            pred_scores, transitions, scores = transition_choice(state_batch)
            # num models lists of batch size states
            state_batch = self.bulk_apply(state_batch, transitions)

            remove = set()
            for idx, states in enumerate(state_batch):
                if states.finished(self):
                    predicted_tree = states.get_tree(self)
                    if self.reverse_sentence:
                        predicted_tree = predicted_tree.reverse()
                    gold_tree = states.gold_tree
                    # TODO: could easily store the score here
                    # not sure what it means to store the state,
                    # since each model is tracking its own state
                    treebank.append(ParseResult(gold_tree, [ScoredTree(predicted_tree, None)], None, None))
                    treebank_indices.append(batch_indices[idx])
                    remove.add(idx)

            if len(remove) > 0:
                state_batch = [state for idx, state in enumerate(state_batch) if idx not in remove]
                batch_indices = [batch_idx for idx, batch_idx in enumerate(batch_indices) if idx not in remove]

            # refill the batch from the horizon so the batch stays full
            for _ in range(batch_size - len(state_batch)):
                horizon_state = next(horizon_iterator, None)
                if not horizon_state:
                    horizon_batch = build_batch_fn(batch_size, data_iterator)
                    if len(horizon_batch) == 0:
                        break
                    horizon_iterator = iter(horizon_batch)
                    horizon_state = next(horizon_iterator, None)

                state_batch.append(horizon_state)
                batch_indices.append(len(treebank) + len(state_batch))

        # restore original input order
        treebank = utils.unsort(treebank, treebank_indices)
        return treebank

    def parse_sentences_no_grad(self, data_iterator, build_batch_fn, batch_size, transition_choice, keep_state=False, keep_constituents=False, keep_scores=False):
        # inference-only wrapper around parse_sentences (no autograd graph)
        with torch.no_grad():
            return self.parse_sentences(data_iterator, build_batch_fn, batch_size, transition_choice, keep_state, keep_constituents, keep_scores)
365
+
366
class EnsembleTrainer(BaseTrainer):
    """
    Stores a list of constituency models, useful for combining their results into one stronger model
    """
    def __init__(self, ensemble, optimizer=None, scheduler=None, epochs_trained=0, batches_trained=0, best_f1=0.0, best_epoch=0, first_optimizer=False):
        super().__init__(ensemble, optimizer, scheduler, epochs_trained, batches_trained, best_f1, best_epoch, first_optimizer)

    @staticmethod
    def from_files(args, filenames, foundation_cache=None):
        """Build an EnsembleTrainer by loading each model file in filenames."""
        ensemble = Ensemble(args, filenames, foundation_cache=foundation_cache)
        ensemble = ensemble.to(args.get('device', None))
        return EnsembleTrainer(ensemble)

    def get_peft_params(self):
        # one entry per submodel: the peft adapter state dict when the
        # submodel uses peft, otherwise None (keeps positions aligned)
        params = []
        for model in self.model.models:
            if model.args.get('use_peft', False):
                from peft import get_peft_model_state_dict
                params.append(get_peft_model_state_dict(model.bert_model, adapter_name=model.peft_name))
            else:
                params.append(None)

        return params

    @property
    def model_type(self):
        return ModelType.ENSEMBLE

    def log_num_words_known(self, words):
        # one summary line when all submodels agree, otherwise one per model
        nwk = [m.num_words_known(words) for m in self.model.models]
        if all(x == nwk[0] for x in nwk):
            logger.info("Number of words in the training set known to each sub-model: %d out of %d", nwk[0], len(words))
        else:
            logger.info("Number of words in the training set known to the sub-models:\n %s" % "\n ".join(["%d/%d" % (x, len(words)) for x in nwk]))

    @staticmethod
    def build_optimizer(args, model, first_optimizer):
        """Build an optimizer over only the ensemble-level parameters, not the submodels."""
        def fake_named_parameters():
            # expose only parameters which do not belong to the submodels
            for n, p in model.named_parameters():
                if not n.startswith("models."):
                    yield n, p

        # TODO: there has to be a cleaner way to do this, like maybe a "keep" callback
        # TODO: if we finetune the underlying models, we will want a series of optimizers
        # so that they can have a different learning rate from the ensemble's fields
        fake_model = copy.copy(model)
        fake_model.named_parameters = fake_named_parameters
        optimizer = build_optimizer(args, fake_model, first_optimizer)
        return optimizer

    @staticmethod
    def load_optimizer(model, checkpoint, first_optimizer, filename):
        """Rebuild the optimizer and restore its state from a checkpoint, if saved."""
        optimizer = EnsembleTrainer.build_optimizer(model.models[0].args, model, first_optimizer)
        if checkpoint.get('optimizer_state_dict', None) is not None:
            try:
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            except ValueError as e:
                raise ValueError("Failed to load optimizer from %s" % filename) from e
        else:
            logger.info("Attempted to load optimizer to resume training, but optimizer not saved. Creating new optimizer")
        return optimizer

    @staticmethod
    def load_scheduler(model, optimizer, checkpoint, first_optimizer):
        """Rebuild the LR scheduler and restore its state from a checkpoint, if saved."""
        scheduler = build_scheduler(model.models[0].args, optimizer, first_optimizer=first_optimizer)
        if 'scheduler_state_dict' in checkpoint:
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        return scheduler

    @staticmethod
    def model_from_params(params, peft_params, args, foundation_cache=None, peft_name=None):
        """Reconstruct an Ensemble from saved params (see Ensemble.get_params)."""
        # TODO: no need for the if/else once the models are rebuilt
        children_params = params["children_params"] if isinstance(params, dict) else params
        base_params = params["base_params"] if isinstance(params, dict) else {}

        # TODO: fill in peft_name
        if peft_params is None:
            peft_params = [None] * len(children_params)
        if peft_name is None:
            peft_name = [None] * len(children_params)

        if len(children_params) != len(peft_params):
            raise ValueError("Model file had params length %d and peft params length %d" % (len(params), len(peft_params)))
        if len(children_params) != len(peft_name):
            raise ValueError("Model file had params length %d and peft name length %d" % (len(params), len(peft_name)))

        models = [Trainer.model_from_params(model_param, peft_param, args, foundation_cache, peft_name=pname)
                  for model_param, peft_param, pname in zip(children_params, peft_params, peft_name)]
        ensemble = Ensemble(args, models=models)
        # strict=False: base_params deliberately omit the children's weights
        ensemble.load_state_dict(base_params, strict=False)
        ensemble = ensemble.to(args.get('device', None))
        return ensemble
458
+
459
def parse_args(args=None):
    """
    Parse the command line options for building a saved ensemble.

    args: optional list of argument strings; defaults to sys.argv when None

    Returns a dict of the parsed options.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--charlm_forward_file', type=str, default=None, help="Exact path to use for forward charlm")
    parser.add_argument('--charlm_backward_file', type=str, default=None, help="Exact path to use for backward charlm")
    parser.add_argument('--wordvec_pretrain_file', type=str, default=None, help='Exact name of the pretrain file to read')

    utils.add_device_args(parser)

    parser.add_argument('--lang', default='en', help='Language to use')

    parser.add_argument('models', type=str, nargs='+', default=None, help="Which model(s) to load")

    parser.add_argument('--save_name', type=str, default=None, required=True, help='Where to save the combined ensemble')

    # bug fix: previously the `args` parameter was ignored and sys.argv was
    # always parsed; pass it through so callers can supply their own list
    args = vars(parser.parse_args(args))

    return args
477
+
478
def main(args=None):
    """Build an ensemble from the model files given on the command line and save it."""
    options = parse_args(args)

    # share pretrain / charlm / bert modules between the loaded models
    cache = FoundationCache()
    trainer = EnsembleTrainer.from_files(options, options['models'], cache)
    trainer.save(options['save_name'], save_optimizer=False)
484
+
485
if __name__ == "__main__":
    # script entry point: combine the given model files into one saved ensemble
    main()
stanza/stanza/models/constituency/in_order_compound_oracle.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+
3
+ from stanza.models.constituency.dynamic_oracle import advance_past_constituents, find_in_order_constituent_end, find_previous_open, DynamicOracle
4
+ from stanza.models.constituency.parse_transitions import Shift, OpenConstituent, CloseConstituent, CompoundUnary, Finalize
5
+
6
def fix_missing_unary_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Repair a skipped CompoundUnary after a Shift.

    Applies when the parser jumped straight to the transition which follows
    the gold CompoundUnary; the repair drops the missed unary from the gold
    sequence and carries on.
    """
    # only relevant when the gold transition is the CompoundUnary itself
    if not isinstance(gold_transition, CompoundUnary):
        return None

    # the prediction must match what the gold sequence does next
    if pred_transition != gold_sequence[gold_index + 1]:
        return None
    # a Finalize here can happen when the entire tree is a single word,
    # but missing the ROOT transition cannot be repaired
    if isinstance(pred_transition, Finalize):
        return None

    repaired = list(gold_sequence[:gold_index])
    repaired.extend(gold_sequence[gold_index + 1:])
    return repaired
21
+
22
def fix_wrong_unary_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Accept a mispredicted CompoundUnary by splicing the predicted unary
    into the gold sequence in place of the gold one.
    """
    both_unary = (isinstance(gold_transition, CompoundUnary)
                  and isinstance(pred_transition, CompoundUnary))
    if not both_unary:
        return None

    assert gold_transition != pred_transition

    repaired = list(gold_sequence[:gold_index])
    repaired.append(pred_transition)
    repaired.extend(gold_sequence[gold_index + 1:])
    return repaired
32
+
33
def fix_spurious_unary_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Accept an extra predicted CompoundUnary by inserting it into the gold
    sequence, then continuing with the gold transitions unchanged.
    """
    spurious = (isinstance(pred_transition, CompoundUnary)
                and not isinstance(gold_transition, CompoundUnary))
    if not spurious:
        return None

    repaired = list(gold_sequence[:gold_index])
    repaired.append(pred_transition)
    repaired.extend(gold_sequence[gold_index:])
    return repaired
41
+
42
def fix_open_shift_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Fix a missed Open constituent where we predicted a Shift and the next transition was a Shift

    In fact, the subsequent transition MUST be a Shift with this transition scheme
    """
    if not isinstance(gold_transition, OpenConstituent):
        return None

    if not isinstance(pred_transition, Shift):
        return None

    #if not isinstance(gold_sequence[gold_index+1], Shift):
    #    return None
    assert isinstance(gold_sequence[gold_index+1], Shift)

    # close_index represents the Close for the missing Open
    close_index = advance_past_constituents(gold_sequence, gold_index+1)
    assert close_index is not None
    # the missed Open cannot be recovered: drop both the Open and its
    # matching Close, keeping everything in between (a recall error only)
    return gold_sequence[:gold_index] + gold_sequence[gold_index+1:close_index] + gold_sequence[close_index+1:]
62
+
63
def fix_open_open_two_subtrees_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Detect a wrong Open label over a bracket with exactly two subtrees.

    With no unary transitions in this scheme, a two-subtree bracket with a
    substituted label cannot be repaired.
    """
    if gold_transition == pred_transition:
        return None

    if not isinstance(gold_transition, OpenConstituent):
        return None
    if not isinstance(pred_transition, OpenConstituent):
        return None

    block_end = find_in_order_constituent_end(gold_sequence, gold_index+1)
    if isinstance(gold_sequence[block_end], Shift):
        # this is a multiple subtrees version of this error
        # we are only skipping the two subtrees errors for now
        return None

    # no fix is possible, so we just return here
    # NOTE(review): unlike the sibling repairs this returns a
    # (RepairType, None) tuple instead of a sequence -- presumably the
    # oracle framework treats the tuple as "classified but unfixable";
    # confirm against DynamicOracle's handling of repair results
    return RepairType.OPEN_OPEN_TWO_SUBTREES_ERROR, None
80
+
81
def fix_open_open_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, exactly_three):
    """
    Repair a wrong Open label by closing the predicted bracket early and
    reopening the gold label (see the OPEN_OPEN_*_SUBTREES comments below).

    exactly_three: when True, only handle the exactly-three-subtree case;
    when False, only handle the more-than-three case -- each caller bails
    out of the other case so the two RepairTypes stay disjoint.
    """
    if gold_transition == pred_transition:
        return None

    if not isinstance(gold_transition, OpenConstituent):
        return None
    if not isinstance(pred_transition, OpenConstituent):
        return None

    block_end = find_in_order_constituent_end(gold_sequence, gold_index+1)
    if not isinstance(gold_sequence[block_end], Shift):
        # this is a multiple subtrees version of this error
        # we are only skipping the two subtrees errors for now
        return None

    next_block_end = find_in_order_constituent_end(gold_sequence, block_end+1)
    if exactly_three and isinstance(gold_sequence[next_block_end], Shift):
        # for exactly three subtrees,
        # we can put back the missing open transition
        # and now we have no recall error, only precision error
        # for more than three, we separate that out as an ambiguous choice
        return None
    elif not exactly_three and isinstance(gold_sequence[next_block_end], CloseConstituent):
        # this is ambiguous, but we can still try this fix
        return None

    # at this point, we build a new sequence with the origin constituent inserted
    return gold_sequence[:gold_index] + [pred_transition] + gold_sequence[gold_index+1:block_end] + [CloseConstituent(), gold_transition] + gold_sequence[block_end:]
109
+
110
+
111
def fix_open_open_three_subtrees_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    # exactly three subtrees: the repair is unambiguous (see fix_open_open_error)
    return fix_open_open_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, exactly_three=True)
113
+
114
def fix_open_open_many_subtrees_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    # more than three subtrees: same fix, but the split point is ambiguous
    return fix_open_open_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, exactly_three=False)
116
+
117
def fix_open_close_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Find the closed bracket, reopen it

    The Open we just missed must be forgotten - it cannot be reopened
    """
    if not isinstance(gold_transition, OpenConstituent):
        return None

    if not isinstance(pred_transition, CloseConstituent):
        return None

    # find the appropriate Open so we can reopen it
    open_idx = find_previous_open(gold_sequence, gold_index)
    # actually, if the Close is legal, this can't happen
    # but it might happen in a unit test which doesn't check legality
    if open_idx is None:
        return None

    # also, since we are punting on the missed Open, we need to skip
    # the Close which would have closed it
    close_idx = advance_past_constituents(gold_sequence, gold_index+1)

    # accept the predicted Close, immediately reopen the same label, and
    # drop the now-orphaned Close belonging to the missed Open
    return gold_sequence[:gold_index] + [pred_transition, gold_sequence[open_idx]] + gold_sequence[gold_index+1:close_idx] + gold_sequence[close_idx+1:]
141
+
142
def fix_shift_close_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Repair a Close predicted where a Shift was expected.

    The wrongly closed bracket is immediately reopened with the same label,
    so only a precision error is introduced.
    """
    applicable = (isinstance(gold_transition, Shift)
                  and isinstance(pred_transition, CloseConstituent))
    if not applicable:
        return None

    # never applicable at the very start or right after an Open
    if gold_index == 0:
        return None
    if isinstance(gold_sequence[gold_index - 1], OpenConstituent):
        return None

    open_idx = find_previous_open(gold_sequence, gold_index)
    assert open_idx is not None

    repaired = list(gold_sequence[:gold_index])
    repaired.append(pred_transition)
    # reopen the constituent we just closed, then resume the gold sequence
    repaired.append(gold_sequence[open_idx])
    repaired.extend(gold_sequence[gold_index:])
    return repaired
160
+
161
def fix_shift_open_unambiguous_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Repair an Open predicted where a Shift was expected, when the end of
    the spurious constituent is unambiguous.
    """
    if not isinstance(gold_transition, Shift):
        return None

    if not isinstance(pred_transition, OpenConstituent):
        return None

    bracket_end = find_in_order_constituent_end(gold_sequence, gold_index)
    assert bracket_end is not None
    if isinstance(gold_sequence[bracket_end], Shift):
        # this is an ambiguous error
        # multiple possible places to end the wrong constituent
        return None
    assert isinstance(gold_sequence[bracket_end], CloseConstituent)

    # accept the spurious Open and close it at the unambiguous end point
    return gold_sequence[:gold_index] + [pred_transition] + gold_sequence[gold_index:bracket_end] + [CloseConstituent()] + gold_sequence[bracket_end:]
177
+
178
def fix_close_shift_unambiguous_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Repair a Shift predicted where Close-then-Shift was expected, when the
    place to close the still-open constituent is unambiguous.
    """
    if not isinstance(gold_transition, CloseConstituent):
        return None

    if not isinstance(pred_transition, Shift):
        return None
    if not isinstance(gold_sequence[gold_index+1], Shift):
        return None

    bracket_end = find_in_order_constituent_end(gold_sequence, gold_index+1)
    assert bracket_end is not None
    if isinstance(gold_sequence[bracket_end], Shift):
        # this is an ambiguous error
        # multiple possible places to end the wrong constituent
        return None
    assert isinstance(gold_sequence[bracket_end], CloseConstituent)

    # drop the missed Close and reinsert it once the next constituent ends
    return gold_sequence[:gold_index] + gold_sequence[gold_index+1:bracket_end] + [CloseConstituent()] + gold_sequence[bracket_end:]
196
+
197
class RepairType(Enum):
    """
    Keep track of which repair is used, if any, on an incorrect transition

    Effects of different repair types:
    no oracle: 0.9251 0.9226
    +missing_unary: 0.9246 0.9214
    +wrong_unary: 0.9236 0.9213
    +spurious_unary: 0.9247 0.9229
    +open_shift_error: 0.9258 0.9226
    +open_open_two_subtrees: 0.9256 0.9215 # nothing changes with this one...
    +open_open_three_subtrees: 0.9256 0.9226
    +open_open_many_subtrees: 0.9257 0.9234
    +shift_close: 0.9267 0.9250
    +shift_open: 0.9273 0.9247
    +close_shift: 0.9266 0.9229
    +open_close: 0.9267 0.9256
    """
    def __new__(cls, fn, correct=False, debug=False):
        """
        Enumerate values as normal, but also keep a pointer to a function which repairs that kind of error
        """
        # members get 1-based values in declaration order
        value = len(cls.__members__)
        obj = object.__new__(cls)
        obj._value_ = value + 1
        obj.fn = fn
        obj.correct = correct
        obj.debug = debug
        return obj

    @property
    def is_correct(self):
        """True only for members declared with correct=True (ie CORRECT)."""
        return self.correct

    # The correct sequence went Shift - Unary - Stuff
    # but the CompoundUnary was missed and Stuff predicted
    # so now we just proceed as if nothing happened
    # note that CompoundUnary happens immediately after a Shift
    # complicated nodes are created with single Open transitions
    MISSING_UNARY_ERROR = (fix_missing_unary_error,)

    # Predicted a wrong CompoundUnary. No way to fix this, so just keep going
    WRONG_UNARY_ERROR = (fix_wrong_unary_error,)

    # The correct sequence went Shift - Stuff
    # but instead we predicted a CompoundUnary
    # again, we just keep going
    SPURIOUS_UNARY_ERROR = (fix_spurious_unary_error,)

    # Were supposed to open a new constituent,
    # but instead shifted an item onto the stack
    #
    # The missed Open cannot be recovered
    #
    # One could ask, is it possible to open a bigger constituent later,
    # but if the constituent patterns go
    #   X (good open) Y (missed open) Z
    # when we eventually close Y and Z, because of the missed Open,
    # it is guaranteed to capture X as well
    # since it will grab constituents until one left of the previous Open before Y
    #
    # Therefore, in this case, we must simply forget about this Open (recall error)
    OPEN_SHIFT_ERROR = (fix_open_shift_error,)

    # With this transition scheme, it is not possible to fix the following pattern:
    #   T1 O_x T2 C -> T1 O_y T2 C
    # seeing as how there are no unary transitions
    # so whatever precision & recall errors are caused by substituting O_x -> O_y
    # (which could include multiple transitions)
    # those errors are unfixable in any way
    OPEN_OPEN_TWO_SUBTREES_ERROR = (fix_open_open_two_subtrees_error,)

    # With this transition scheme, a three subtree branch with a wrong Open
    # has a non-ambiguous fix
    #   T1 O_x T2 T3 C -> T1 O_y T2 T3 C
    # this can become
    #   T1 O_y T2 C O_x T3 C
    # now there are precision errors from the incorrectly added transition(s),
    # but the correctly replaced transitions are unambiguous
    OPEN_OPEN_THREE_SUBTREES_ERROR = (fix_open_open_three_subtrees_error,)

    # We were supposed to shift a new item onto the stack,
    # but instead we closed the previous constituent
    # This causes a precision error, but we can avoid the recall error
    # by immediately reopening the closed constituent.
    SHIFT_CLOSE_ERROR = (fix_shift_close_error,)

    # We opened a new constituent instead of shifting
    # In the event that the next constituent ends with a close,
    # rather than building another new constituent,
    # then there is no ambiguity
    SHIFT_OPEN_UNAMBIGUOUS_ERROR = (fix_shift_open_unambiguous_error,)

    # Suppose we were supposed to Close, then Shift
    # but instead we just did a Shift
    # Similar to shift_open_unambiguous, we now have an opened
    # constituent which shouldn't be there
    # We can scroll past the next constituent created to see
    # if the outer constituents close at that point
    # If so, we can close this constituent as well in an unambiguous manner
    # TODO: analyze the case where we were supposed to Close, Open
    # but instead did a Shift
    CLOSE_SHIFT_UNAMBIGUOUS_ERROR = (fix_close_shift_unambiguous_error,)

    # Supposed to open a new constituent,
    # instead closed an existing constituent
    #
    # X (good open) Y (open -> close) Z
    #
    # the constituent that should contain Y, Z is unfortunately lost
    # since now the stack has
    #
    # XY ...
    #
    # furthermore, there is now a precision error for the extra XY
    # constituent that should not exist
    # however, what we can do to minimize further errors is
    # to at least reopen the label between X and Y
    OPEN_CLOSE_ERROR = (fix_open_close_error,)

    # this is ambiguous, but we can still try the same fix as three_subtrees (see above)
    OPEN_OPEN_MANY_SUBTREES_ERROR = (fix_open_open_many_subtrees_error,)

    # no repair function; marks a prediction which matched gold
    CORRECT = (None, True)

    # bare None: __new__ receives fn=None, correct=False
    UNKNOWN = None
323
+
324
+
325
class InOrderCompoundOracle(DynamicOracle):
    """
    Dynamic oracle for the in-order transition scheme with compound unary
    transitions: wires the RepairType enum above into the generic DynamicOracle.
    """
    def __init__(self, root_labels, oracle_level, additional_oracle_levels, deactivated_oracle_levels):
        super().__init__(root_labels, oracle_level, RepairType, additional_oracle_levels, deactivated_oracle_levels)
stanza/stanza/models/constituency/in_order_oracle.py ADDED
@@ -0,0 +1,1029 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+
3
+ from stanza.models.constituency.dynamic_oracle import advance_past_constituents, find_in_order_constituent_end, find_previous_open, score_candidates, DynamicOracle, RepairEnum
4
+ from stanza.models.constituency.parse_transitions import Shift, OpenConstituent, CloseConstituent
5
+
6
def fix_wrong_open_root_error(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Repair an Open/Open error at the ROOT.

    If the gold transition opened a ROOT label but the model opened some
    other constituent, keep the predicted open, immediately close it as a
    unary, and then continue with the original gold sequence.
    Returns the repaired sequence, or None when the pattern does not match.
    """
    # guard: nothing to repair when the prediction was actually correct
    if gold_transition == pred_transition:
        return None
    if not isinstance(gold_transition, OpenConstituent):
        return None
    if not isinstance(pred_transition, OpenConstituent):
        return None
    if gold_transition.top_label not in root_labels:
        return None

    # wrap the mistake in a unary bracket, then resume the gold sequence
    unary_patch = [pred_transition, CloseConstituent()]
    return gold_sequence[:gold_index] + unary_patch + gold_sequence[gold_index:]
17
+
18
def fix_wrong_open_unary_chain(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Fix a wrong open/open in a unary chain by removing the skipped unary transitions

    Only applies if the wrong pred transition is a transition found higher up in the unary chain
    """
    # useful to have this check here in case the call is made independently in a unit test
    if gold_transition == pred_transition:
        return None

    if isinstance(gold_transition, OpenConstituent) and isinstance(pred_transition, OpenConstituent):
        cur_index = gold_index + 1  # This is now a Close if we are in this particular context
        # walk the Close/Open pairs of the unary chain, looking for the Open we predicted
        while cur_index + 1 < len(gold_sequence) and isinstance(gold_sequence[cur_index], CloseConstituent) and isinstance(gold_sequence[cur_index+1], OpenConstituent):
            cur_index = cur_index + 1  # advance to the next Open
            if gold_sequence[cur_index] == pred_transition:
                # splice out the unary levels that were skipped by the prediction
                return gold_sequence[:gold_index] + gold_sequence[cur_index:]
            cur_index = cur_index + 1  # advance to the next Close

    return None
37
+
38
def fix_wrong_open_subtrees(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, more_than_two):
    """
    Shared repair for an Open/Open error over finished subtrees.

    Keeps the predicted Open over the next constituent, closes it, then
    reopens the gold label so the outer bracket can still be built.
    more_than_two selects which shape of following sequence is accepted:
    False handles the two-subtree case, True the many-subtree case.
    Returns the repaired sequence or None when the pattern does not match.
    """
    if gold_transition == pred_transition:
        return None

    if not isinstance(gold_transition, OpenConstituent):
        return None
    if not isinstance(pred_transition, OpenConstituent):
        return None

    if isinstance(gold_sequence[gold_index+1], CloseConstituent):
        # if Close, the gold was a unary
        return None
    assert not isinstance(gold_sequence[gold_index+1], OpenConstituent)
    assert isinstance(gold_sequence[gold_index+1], Shift)

    block_end = find_in_order_constituent_end(gold_sequence, gold_index+1)
    assert block_end is not None

    # a Close at block_end means exactly one more subtree follows;
    # a Shift means several subtrees follow before the enclosing Close
    if more_than_two and isinstance(gold_sequence[block_end], CloseConstituent):
        return None
    if not more_than_two and isinstance(gold_sequence[block_end], Shift):
        return None

    # close the wrongly opened bracket after the next block, then reopen the gold label
    return gold_sequence[:gold_index] + [pred_transition] + gold_sequence[gold_index+1:block_end] + [CloseConstituent(), gold_transition] + gold_sequence[block_end:]
62
+
63
def fix_wrong_open_two_subtrees(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Open/Open repair for the case where exactly one subtree follows before the Close."""
    return fix_wrong_open_subtrees(gold_transition, pred_transition, gold_sequence,
                                   gold_index, root_labels, False)
65
+
66
def fix_wrong_open_multiple_subtrees(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Open/Open repair for the case where several subtrees follow before the Close."""
    return fix_wrong_open_subtrees(gold_transition, pred_transition, gold_sequence,
                                   gold_index, root_labels, True)
68
+
69
def advance_past_unaries(gold_sequence, cur_index):
    """
    Skip consecutive unary Open/Close pairs starting at cur_index.

    Returns the first index which does not start an Open immediately
    followed by a Close (with at least one transition left after the pair).
    """
    limit = len(gold_sequence)
    while cur_index + 2 < limit:
        if not isinstance(gold_sequence[cur_index], OpenConstituent):
            break
        if not isinstance(gold_sequence[cur_index + 1], CloseConstituent):
            break
        cur_index += 2
    return cur_index
73
+
74
def fix_wrong_open_stuff_unary(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Fix a wrong open/open when there is an intervening constituent and then the guessed NT

    This happens when the correct pattern is
      stuff_1 NT_X stuff_2 close NT_Y ...
    and instead of guessing the gold transition NT_X,
    the prediction was NT_Y

    Returns the repaired sequence, or None when the pattern does not match.
    """
    if gold_transition == pred_transition:
        return None

    if not isinstance(gold_transition, OpenConstituent):
        return None
    if not isinstance(pred_transition, OpenConstituent):
        return None
    # TODO: Here we could advance past unary transitions while
    # watching for hitting pred_transition. However, that is an open
    # question... is it better to try to keep such an Open as part of
    # the sequence, or is it better to skip them and attach the inner
    # nodes to the upper level
    stuff_start = gold_index + 1
    if not isinstance(gold_sequence[stuff_start], Shift):
        return None
    stuff_end = advance_past_constituents(gold_sequence, stuff_start)
    if stuff_end is None:
        return None
    # at this point, stuff_end points to the Close which occurred after stuff_2
    # also, stuff_start points to the first transition which makes stuff_2, the Shift
    cur_index = stuff_end + 1
    # bounds check added: if the Close is the last transition there is no
    # NT_Y to find, and indexing past the end would raise IndexError
    # (the other repair functions in this module guard the same way)
    while cur_index < len(gold_sequence) and isinstance(gold_sequence[cur_index], OpenConstituent):
        if gold_sequence[cur_index] == pred_transition:
            # attach stuff_2 directly under the predicted NT_Y
            return gold_sequence[:gold_index] + [pred_transition] + gold_sequence[stuff_start:stuff_end] + gold_sequence[cur_index+1:]
        # this was an OpenConstituent, but not the OpenConstituent we guessed
        # maybe there's a unary transition which lets us try again
        if cur_index + 2 < len(gold_sequence) and isinstance(gold_sequence[cur_index + 1], CloseConstituent):
            cur_index = cur_index + 2
        else:
            break

    # oh well, none of this worked
    return None
116
+
117
def fix_wrong_open_general(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Fix a general wrong open/open transition by accepting the open and continuing

    A couple other open/open patterns have already been carved out

    TODO: negative checks for the previous patterns, in case we turn those off
    """
    if gold_transition == pred_transition:
        return None

    if not isinstance(gold_transition, OpenConstituent):
        return None
    if not isinstance(pred_transition, OpenConstituent):
        return None
    # If the top is a ROOT, then replacing it with a non-ROOT creates an illegal
    # transition sequence. The ROOT case was already handled elsewhere anyway
    if gold_transition.top_label in root_labels:
        return None

    # simply substitute the predicted label and keep the rest of the gold sequence
    return gold_sequence[:gold_index] + [pred_transition] + gold_sequence[gold_index+1:]
138
+
139
def fix_missed_unary(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Fix a missed unary which is followed by an otherwise correct transition

    (also handles multiple missed unary transitions)
    """
    if gold_transition == pred_transition:
        return None

    # skip the unary Open/Close pairs; if the prediction matches what comes
    # after them, the only error was dropping the unary levels
    cur_index = gold_index
    cur_index = advance_past_unaries(gold_sequence, cur_index)
    if gold_sequence[cur_index] == pred_transition:
        return gold_sequence[:gold_index] + gold_sequence[cur_index:]
    return None
153
+
154
def fix_open_shift(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Fix an Open replaced with a Shift

    Suppose we were supposed to guess NT_X and instead did S

    We derive the repair as follows.

    For simplicity, assume the open is not a unary for now

    Since we know an Open was legal, there must be stuff
      stuff NT_X
    Shift is also legal, so there must be other stuff and a previous Open
      stuff_1 NT_Y stuff_2 NT_X
    After the NT_X which we missed, there was a bunch of stuff and a close for NT_X
      stuff_1 NT_Y stuff_2 NT_X stuff_3 C
    There could be more stuff here which can be saved...
      stuff_1 NT_Y stuff_2 NT_X stuff_3 C stuff_4 C
      stuff_1 NT_Y stuff_2 NT_X stuff_3 C C

    Returns the repaired sequence, or None when the pattern does not match.
    """
    if not isinstance(gold_transition, OpenConstituent):
        return None
    if not isinstance(pred_transition, Shift):
        return None

    cur_index = gold_index
    cur_index = advance_past_unaries(gold_sequence, cur_index)
    if not isinstance(gold_sequence[cur_index], OpenConstituent):
        return None
    if gold_sequence[cur_index].top_label in root_labels:
        return None
    # cur_index now points to the NT_X we missed (not counting unaries)

    stuff_start = cur_index + 1
    # can't be a Close, since we just went past an Open and checked for unaries
    # can't be an Open, since two Open in a row is illegal
    assert isinstance(gold_sequence[stuff_start], Shift)
    stuff_end = advance_past_constituents(gold_sequence, stuff_start)
    # stuff_end is now the Close which ends NT_X
    cur_index = stuff_end + 1
    if cur_index >= len(gold_sequence):
        return None
    if isinstance(gold_sequence[cur_index], OpenConstituent):
        # unary levels on the lost NT_X cannot be recovered; skip them
        cur_index = advance_past_unaries(gold_sequence, cur_index)
        if cur_index >= len(gold_sequence):
            return None
        if isinstance(gold_sequence[cur_index], OpenConstituent):
            # an Open here signifies that there was a bracket containing X underneath Y
            # TODO: perhaps try to salvage something out of that situation?
            return None
    # the repair starts with the sequence up through the error,
    # then stuff_3, which includes the error
    # skip the Close for the missed NT_X
    # then finish the sequence with any potential stuff_4, the next Close, and everything else
    repair = gold_sequence[:gold_index] + gold_sequence[stuff_start:stuff_end] + gold_sequence[cur_index:]
    return repair
210
+
211
def fix_open_close(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Fix an Open replaced with a Close

    Call the Open NT_X
    Open legal, so there must be stuff:
      stuff NT_X
    Close legal, so there must be something to close:
      stuff_1 NT_Y stuff_2 NT_X

    The incorrect close makes the following brackets:
      (Y stuff_1 stuff_2)
    We were supposed to build
      (Y stuff_1 (X stuff_2 ...) (possibly more stuff))
    The simplest fix here is to reopen Y at this point.

    One issue might be if there is another bracket which encloses X underneath Y
    So, for example, the tree was supposed to be
      (Y stuff_1 (Z (X stuff_2 stuff_3) stuff_4))
    The pattern for this case is
      stuff_1 NT_Y stuff_2 NT_X stuff_3 close NT_Z stuff_4 close close
    """
    if not isinstance(gold_transition, OpenConstituent):
        return None
    if not isinstance(pred_transition, CloseConstituent):
        return None

    cur_index = advance_past_unaries(gold_sequence, gold_index)
    if cur_index >= len(gold_sequence):
        return None
    if not isinstance(gold_sequence[cur_index], OpenConstituent):
        return None
    if gold_sequence[cur_index].top_label in root_labels:
        return None

    prev_open_index = find_previous_open(gold_sequence, gold_index)
    if prev_open_index is None:
        return None
    prev_open = gold_sequence[prev_open_index]
    # prev_open is now NT_Y from above

    stuff_start = cur_index + 1
    assert isinstance(gold_sequence[stuff_start], Shift)
    stuff_end = advance_past_constituents(gold_sequence, stuff_start)
    # stuff_end is now the Close which ends NT_X
    # stuff_start:stuff_end is the stuff_3 block above
    cur_index = stuff_end + 1
    if cur_index >= len(gold_sequence):
        return None
    # if there are unary transitions here, we want to skip those.
    # those are unary transitions on X and cannot be recovered, since X is gone
    cur_index = advance_past_unaries(gold_sequence, cur_index)
    # now there is a certain failure case which has to be accounted for.

    # specifically, if there is a new non-terminal which opens
    # immediately after X closes, it is encompassing X in a way that
    # cannot be recovered now that part of X is stuck under Y.
    # The two choices at this point would be to eliminate the new
    # transition or just reject the tree from the repair
    # For now, we reject the tree
    if isinstance(gold_sequence[cur_index], OpenConstituent):
        return None

    # accept the close, reopen NT_Y, then continue with stuff_3 and the rest
    repair = gold_sequence[:gold_index] + [pred_transition, prev_open] + gold_sequence[stuff_start:stuff_end] + gold_sequence[cur_index:]
    return repair
276
+
277
def fix_shift_close(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    This fixes Shift replaced with a Close transition.

    This error occurs in the following pattern:
      stuff_1 NT_X stuff... shift
    Instead of shift, you close the NT_X
    The easiest fix here is to just restore the NT_X.
    """

    if not isinstance(pred_transition, CloseConstituent):
        return None

    # this fix can also be applied if there were unaries on the
    # previous constituent. we just skip those until the Shift
    cur_index = gold_index
    if isinstance(gold_transition, OpenConstituent):
        cur_index = advance_past_unaries(gold_sequence, cur_index)
    if not isinstance(gold_sequence[cur_index], Shift):
        return None

    prev_open_index = find_previous_open(gold_sequence, gold_index)
    if prev_open_index is None:
        return None
    prev_open = gold_sequence[prev_open_index]
    # prev_open is now NT_X from above

    # accept the close, reopen NT_X, then continue from the Shift
    return gold_sequence[:gold_index] + [pred_transition, prev_open] + gold_sequence[cur_index:]
305
+
306
def fix_close_shift_open_bracket(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, ambiguous, late):
    """
    Shared repair for a missed Close followed by a different Open, when the model Shifted.

    The next bracket is built first, then the skipped Close/Open pair is
    replayed.  ambiguous selects whether the multi-bracket (True) or
    single-bracket (False) continuation is handled; late closes after all
    following brackets instead of just the first one.

    Returns the repaired sequence, or None when the pattern does not match.
    """
    if not isinstance(gold_transition, CloseConstituent):
        return None
    if not isinstance(pred_transition, Shift):
        return None

    if len(gold_sequence) < gold_index + 3:
        return None
    if not isinstance(gold_sequence[gold_index+1], OpenConstituent):
        return None

    open_index = advance_past_unaries(gold_sequence, gold_index+1)
    if not isinstance(gold_sequence[open_index], OpenConstituent):
        return None
    if not isinstance(gold_sequence[open_index+1], Shift):
        return None

    # check that the next operation was to open a *different* constituent
    # from the one we just closed
    prev_open_index = find_previous_open(gold_sequence, gold_index)
    if prev_open_index is None:
        return None
    prev_open = gold_sequence[prev_open_index]
    if gold_sequence[open_index] == prev_open:
        return None

    # check that the following stuff is a single bracket, not multiple brackets
    end_index = find_in_order_constituent_end(gold_sequence, open_index+1)
    # guard added: fix_close_shift_shift checks this result for None before
    # indexing with it; do the same here to avoid a TypeError on gold_sequence[None]
    if end_index is None:
        return None
    if ambiguous and isinstance(gold_sequence[end_index], CloseConstituent):
        return None
    elif not ambiguous and isinstance(gold_sequence[end_index], Shift):
        return None

    # if closing at the end of the next blocks,
    # instead of closing after the first block ends,
    # we go to the end of the last block
    if late:
        end_index = advance_past_constituents(gold_sequence, open_index+1)

    return gold_sequence[:gold_index] + gold_sequence[open_index+1:end_index] + gold_sequence[gold_index:open_index+1] + gold_sequence[end_index:]
346
+
347
def fix_close_open_shift_unambiguous_bracket(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Unambiguous single-bracket variant of the close/open/shift repair."""
    return fix_close_shift_open_bracket(gold_transition, pred_transition, gold_sequence,
                                        gold_index, root_labels, False, False)
349
+
350
def fix_close_open_shift_ambiguous_bracket_early(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Ambiguous close/open/shift repair, closing after the first following bracket."""
    return fix_close_shift_open_bracket(gold_transition, pred_transition, gold_sequence,
                                        gold_index, root_labels, True, False)
352
+
353
def fix_close_open_shift_ambiguous_bracket_late(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Ambiguous close/open/shift repair, closing after the last following bracket."""
    return fix_close_shift_open_bracket(gold_transition, pred_transition, gold_sequence,
                                        gold_index, root_labels, True, True)
355
+
356
def fix_close_open_shift_ambiguous_predicted(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Repair a missed Close + different Open by letting the model score the close position.

    Builds one candidate repair per possible end of the following brackets
    and asks score_candidates which close position the model prefers.
    Returns (repair_type, candidate) or None when the pattern does not match.
    """
    if not isinstance(gold_transition, CloseConstituent):
        return None
    if not isinstance(pred_transition, Shift):
        return None

    if len(gold_sequence) < gold_index + 3:
        return None
    if not isinstance(gold_sequence[gold_index+1], OpenConstituent):
        return None

    open_index = advance_past_unaries(gold_sequence, gold_index+1)
    if not isinstance(gold_sequence[open_index], OpenConstituent):
        return None
    if not isinstance(gold_sequence[open_index+1], Shift):
        return None

    # check that the next operation was to open a *different* constituent
    # from the one we just closed
    prev_open_index = find_previous_open(gold_sequence, gold_index)
    if prev_open_index is None:
        return None
    prev_open = gold_sequence[prev_open_index]
    if gold_sequence[open_index] == prev_open:
        return None

    # alright, at long last we have:
    #   a close that was missed
    #   a non-nested open that was missed
    end_index = find_in_order_constituent_end(gold_sequence, open_index+1)
    # guard added: without it, gold_sequence[end_index] below would raise
    # TypeError when the end of the constituent cannot be found
    if end_index is None:
        return None

    candidates = []
    candidates.append((gold_sequence[:gold_index], gold_sequence[open_index+1:end_index], gold_sequence[gold_index:open_index+1], gold_sequence[end_index:]))
    while end_index < len(gold_sequence) and isinstance(gold_sequence[end_index], Shift):
        end_index = find_in_order_constituent_end(gold_sequence, end_index+1)
        # guard added: stop extending candidates if the next block has no findable end
        if end_index is None:
            break
        candidates.append((gold_sequence[:gold_index], gold_sequence[open_index+1:end_index], gold_sequence[gold_index:open_index+1], gold_sequence[end_index:]))

    scores, best_idx, best_candidate = score_candidates(model, state, candidates, candidate_idx=2)
    if len(candidates) == 1:
        return RepairType.CLOSE_OPEN_SHIFT_UNAMBIGUOUS_BRACKET, best_candidate

    # the last candidate is reported as -1 so statistics group "closed at the very end"
    if best_idx == len(candidates) - 1:
        best_idx = -1
    repair_type = RepairEnum(name=RepairType.CLOSE_OPEN_SHIFT_AMBIGUOUS_PREDICTED.name,
                             value="%d.%d" % (RepairType.CLOSE_OPEN_SHIFT_AMBIGUOUS_PREDICTED.value, best_idx),
                             is_correct=False)
    return repair_type, best_candidate
403
+
404
def fix_close_open_shift_nested(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Fix a Close X..Open X..Shift pattern where both the Close and Open were skipped.

    Here the pattern we are trying to fix is
      stuff_A open_X stuff_B *close* open_X shift...
    replaced with
      stuff_A open_X stuff_B shift...
    the missed close & open means a missed recall error for (X A B)
    whereas the previous open_X can still get the outer bracket
    """
    if not isinstance(gold_transition, CloseConstituent):
        return None
    if not isinstance(pred_transition, Shift):
        return None

    if len(gold_sequence) < gold_index + 3:
        return None
    if not isinstance(gold_sequence[gold_index+1], OpenConstituent):
        return None

    # handle the sequence:
    #   stuff_A open_X stuff_B close open_Y close open_X shift
    open_index = advance_past_unaries(gold_sequence, gold_index+1)
    if not isinstance(gold_sequence[open_index], OpenConstituent):
        return None
    if not isinstance(gold_sequence[open_index+1], Shift):
        return None

    # check that the next operation was to open the same constituent
    # we just closed
    prev_open_index = find_previous_open(gold_sequence, gold_index)
    if prev_open_index is None:
        return None
    prev_open = gold_sequence[prev_open_index]
    if gold_sequence[open_index] != prev_open:
        return None

    # drop the Close/Open pair entirely and continue from the Shift
    return gold_sequence[:gold_index] + gold_sequence[open_index+1:]
443
+
444
def fix_close_shift_shift(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, ambiguous, late):
    """
    Repair Close/Shift -> Shift by moving the Close to after the next block is created
    """
    if not isinstance(gold_transition, CloseConstituent):
        return None
    if not isinstance(pred_transition, Shift):
        return None
    if len(gold_sequence) < gold_index + 2:
        return None
    start_index = gold_index + 1
    start_index = advance_past_unaries(gold_sequence, start_index)
    if len(gold_sequence) < start_index + 2:
        return None
    if not isinstance(gold_sequence[start_index], Shift):
        return None

    end_index = find_in_order_constituent_end(gold_sequence, start_index)
    if end_index is None:
        return None
    # if this *isn't* a close, we don't allow it in the unambiguous case
    # that case seems to be ambiguous...
    #   stuff_1 close stuff_2 stuff_3
    # if you would normally start building stuff_3,
    # it is not clear if you want to close at the end of
    # stuff_2 or build stuff_3 instead.
    if ambiguous and isinstance(gold_sequence[end_index], CloseConstituent):
        return None
    elif not ambiguous and isinstance(gold_sequence[end_index], Shift):
        return None

    # close at the end of the brackets, rather than once the first bracket is finished
    if late:
        end_index = advance_past_constituents(gold_sequence, start_index)

    return gold_sequence[:gold_index] + gold_sequence[start_index:end_index] + [CloseConstituent()] + gold_sequence[end_index:]
480
+
481
def fix_close_shift_shift_unambiguous(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Unambiguous variant of the close/shift repair."""
    return fix_close_shift_shift(gold_transition, pred_transition, gold_sequence,
                                 gold_index, root_labels, False, False)
483
+
484
def fix_close_shift_shift_ambiguous_early(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Ambiguous close/shift repair, closing after the first following bracket."""
    return fix_close_shift_shift(gold_transition, pred_transition, gold_sequence,
                                 gold_index, root_labels, True, False)
486
+
487
def fix_close_shift_shift_ambiguous_late(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Ambiguous close/shift repair, closing after the last following bracket."""
    return fix_close_shift_shift(gold_transition, pred_transition, gold_sequence,
                                 gold_index, root_labels, True, True)
489
+
490
def fix_close_shift_shift_ambiguous_predicted(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Repair Close/Shift -> Shift by letting the model score where to place the Close.

    One candidate repair is built per possible end of the following blocks;
    score_candidates picks the close position the model prefers.
    Returns (repair_type, candidate) or None when the pattern does not match.
    """
    if not isinstance(gold_transition, CloseConstituent):
        return None
    if not isinstance(pred_transition, Shift):
        return None
    if len(gold_sequence) < gold_index + 2:
        return None
    start_index = gold_index + 1
    start_index = advance_past_unaries(gold_sequence, start_index)
    if len(gold_sequence) < start_index + 2:
        return None
    if not isinstance(gold_sequence[start_index], Shift):
        return None

    # now we know that the gold pattern was
    #   Close (unaries) Shift
    # and instead the model predicted Shift
    candidates = []
    current_index = start_index
    while isinstance(gold_sequence[current_index], Shift):
        current_index = find_in_order_constituent_end(gold_sequence, current_index)
        assert current_index is not None
        candidates.append((gold_sequence[:gold_index], gold_sequence[start_index:current_index], [CloseConstituent()], gold_sequence[current_index:]))
    scores, best_idx, best_candidate = score_candidates(model, state, candidates, candidate_idx=2)
    if len(candidates) == 1:
        return RepairType.CLOSE_SHIFT_SHIFT, best_candidate
    # the last candidate is reported as -1 so statistics group "closed at the very end"
    if best_idx == len(candidates) - 1:
        best_idx = -1
    repair_type = RepairEnum(name=RepairType.CLOSE_SHIFT_SHIFT_AMBIGUOUS_PREDICTED.name,
                             value="%d.%d" % (RepairType.CLOSE_SHIFT_SHIFT_AMBIGUOUS_PREDICTED.value, best_idx),
                             is_correct=False)
    return repair_type, best_candidate
523
+
524
def ambiguous_shift_open_unary_close(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Repair a Shift mistaken for an Open by immediately closing the extra bracket.

    The wrongly opened constituent becomes a unary over nothing new; the
    gold sequence then continues unchanged.
    """
    if not (isinstance(gold_transition, Shift) and isinstance(pred_transition, OpenConstituent)):
        return None

    unary_patch = [pred_transition, CloseConstituent()]
    return gold_sequence[:gold_index] + unary_patch + gold_sequence[gold_index:]
531
+
532
def ambiguous_shift_open_early_close(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Repair a Shift mistaken for an Open by closing the extra bracket after the first block.

    Returns the repaired sequence, or None when the pattern does not match.
    """
    if not isinstance(gold_transition, Shift):
        return None
    if not isinstance(pred_transition, OpenConstituent):
        return None

    # Find when the current block ends,
    # either via a Shift or a Close
    # NOTE(review): assumes find_in_order_constituent_end cannot return None
    # here since gold_transition is a Shift - confirm against its contract
    end_index = find_in_order_constituent_end(gold_sequence, gold_index)
    return gold_sequence[:gold_index] + [pred_transition] + gold_sequence[gold_index:end_index] + [CloseConstituent()] + gold_sequence[end_index:]
542
+
543
def ambiguous_shift_open_late_close(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Repair a Shift mistaken for an Open by closing the extra bracket after the last block.

    Returns the repaired sequence, or None when the pattern does not match.
    """
    if not isinstance(gold_transition, Shift):
        return None
    if not isinstance(pred_transition, OpenConstituent):
        return None

    # skip all the following sibling constituents, then close the extra bracket
    end_index = advance_past_constituents(gold_sequence, gold_index)
    return gold_sequence[:gold_index] + [pred_transition] + gold_sequence[gold_index:end_index] + [CloseConstituent()] + gold_sequence[end_index:]
551
+
552
def ambiguous_shift_open_predicted_close(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Repair a Shift mistaken for an Open by letting the model choose the close position.

    Three candidates are considered: close immediately (unary, "U"), close
    after the first block ("E" / "S"), or close after the last block ("L").
    Returns (repair_type, candidate) or None when the pattern does not match.
    """
    if not isinstance(gold_transition, Shift):
        return None
    if not isinstance(pred_transition, OpenConstituent):
        return None

    unary_candidate = (gold_sequence[:gold_index], [pred_transition], [CloseConstituent()], gold_sequence[gold_index:])

    early_index = find_in_order_constituent_end(gold_sequence, gold_index)
    early_candidate = (gold_sequence[:gold_index], [pred_transition] + gold_sequence[gold_index:early_index], [CloseConstituent()], gold_sequence[early_index:])

    late_index = advance_past_constituents(gold_sequence, gold_index)
    if early_index == late_index:
        # early and late close positions coincide: only two real choices
        candidates = [unary_candidate, early_candidate]
        scores, best_idx, best_candidate = score_candidates(model, state, candidates, candidate_idx=2)
        if best_idx == 0:
            return_label = "U"
        else:
            return_label = "S"
    else:
        late_candidate = (gold_sequence[:gold_index], [pred_transition] + gold_sequence[gold_index:late_index], [CloseConstituent()], gold_sequence[late_index:])
        candidates = [unary_candidate, early_candidate, late_candidate]
        scores, best_idx, best_candidate = score_candidates(model, state, candidates, candidate_idx=2)
        if best_idx == 0:
            return_label = "U"
        elif best_idx == 1:
            return_label = "E"
        else:
            return_label = "L"
    repair_type = RepairEnum(name=RepairType.SHIFT_OPEN_PREDICTED_CLOSE.name,
                             value="%d.%s" % (RepairType.SHIFT_OPEN_PREDICTED_CLOSE.value, return_label),
                             is_correct=False)
    return repair_type, best_candidate
585
+
586
+
587
def report_close_shift(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Report (without repairing) a Close mistaken for a Shift."""
    matches = isinstance(gold_transition, CloseConstituent) and isinstance(pred_transition, Shift)
    if matches:
        return RepairType.OTHER_CLOSE_SHIFT, None
    return None
594
+
595
def report_close_open(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Report (without repairing) a Close mistaken for an Open."""
    matches = isinstance(gold_transition, CloseConstituent) and isinstance(pred_transition, OpenConstituent)
    if matches:
        return RepairType.OTHER_CLOSE_OPEN, None
    return None
602
+
603
def report_open_open(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Report (without repairing) an Open mistaken for a different Open."""
    matches = isinstance(gold_transition, OpenConstituent) and isinstance(pred_transition, OpenConstituent)
    if matches:
        return RepairType.OTHER_OPEN_OPEN, None
    return None
610
+
611
def report_open_shift(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Report (without repairing) an Open mistaken for a Shift."""
    matches = isinstance(gold_transition, OpenConstituent) and isinstance(pred_transition, Shift)
    if matches:
        return RepairType.OTHER_OPEN_SHIFT, None
    return None
618
+
619
def report_open_close(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Report (without repairing) an Open mistaken for a Close."""
    matches = isinstance(gold_transition, OpenConstituent) and isinstance(pred_transition, CloseConstituent)
    if matches:
        return RepairType.OTHER_OPEN_CLOSE, None
    return None
626
+
627
def report_shift_open(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """Report (without repairing) a Shift mistaken for an Open."""
    matches = isinstance(gold_transition, Shift) and isinstance(pred_transition, OpenConstituent)
    if matches:
        return RepairType.OTHER_SHIFT_OPEN, None
    return None
634
+
635
class RepairType(Enum):
    """
    Keep track of which repair is used, if any, on an incorrect transition

    Statistics on English w/ no charlm, no transformer,
    eg word vectors only, best model as of January 2024

    unambiguous transitions only:
      oracle scheme                             dev      test
      no oracle                                0.9245    0.9226
      +wrong_open_root                         0.9244    0.9224
      +wrong_unary_chain                       0.9243    0.9237
      +wrong_open_unary                        0.9249    0.9223
      +wrong_open_general                      0.9251    0.9215
      +missed_unary                            0.9248    0.9215
      +open_shift                              0.9243    0.9216
      +open_close                              0.9254    0.9217
      +shift_close                             0.9261    0.9238
      +close_shift_nested                      0.9253    0.9250

    Redoing the wrong_open_general, which seemed to hurt test scores:
      wrong_open_two_subtrees - L4             0.9244    0.9220
      everything else w/o ambiguous open/open fix  0.9259    0.9241
      everything w/ open_two_subtrees          0.9261    0.9246
      w/ ambiguous open_three_subtrees         0.9264    0.9243

    Testing three different possible repairs for shift-open:
      w/ ambiguous open_three_subtrees         0.9264    0.9243
      immediate close (unary)                  0.9267    0.9246
      close after first bracket                0.9265    0.9256
      close after last bracket                 0.9264    0.9240

    Testing three possible repairs for close-open-shift/shift
      w/ ambiguous open_three_subtrees         0.9264    0.9243
      unambiguous c-o-s/shift                  0.9265    0.9246
      ambiguous c-o-s/shift closed early       0.9262    0.9246
      ambiguous c-o-s/shift closed late        0.9259    0.9245

    Testing three possible repairs for close-shift/shift
      w/ ambiguous open_three_subtrees         0.9264    0.9243
      unambiguous c-s/shift                    0.9253    0.9239
      ambiguous c-s/shift closed early         0.9259    0.9235
      ambiguous c-s/shift closed late          0.9252    0.9241
      ambiguous c-s/shift predicted            0.9264    0.9243

    --------------------------------------------------------

    Running ID experiments to verify some of the above findings
    no charlm or bert, only 200 epochs

    Comparing wrong_open fixes
      w/ ambiguous open_two_subtrees           0.8448    0.8335
      w/ ambiguous open_three_subtrees         0.8424    0.8336

    Testing three possible repairs for close-shift/shift
      unambiguous c-s/shift                    0.8448    0.8360
      ambiguous c-s/shift closed early         0.8425    0.8352
      ambiguous c-s/shift closed late          0.8452    0.8334

    --------------------------------------------------------

    Running ID experiments to verify some of the above findings
    bert + peft, only 200 epochs

    Comparing wrong_open fixes
      w/o ambiguous open/open fix              0.8923    0.8834
      w/ ambiguous open_two_subtrees           0.8908    0.8828
      w/ ambiguous open_three_subtrees         0.8901    0.8801

    Testing three possible repairs for close-shift/shift
      unambiguous c-s/shift                    0.8921    0.8825
      ambiguous c-s/shift closed early         0.8924    0.8841
      ambiguous c-s/shift closed late          0.8921    0.8806
      ambiguous c-s/shift predicted            0.8923    0.8835

    --------------------------------------------------------

    Running DE experiments to verify some of the above findings
    bert + peft, only 200 epochs

    Comparing wrong_open fixes
      w/o ambiguous open/open fix              0.9576    0.9402
      w/ ambiguous open_two_subtrees           0.9570    0.9410
      w/ ambiguous open_three_subtrees         0.9569    0.9412

    Testing three possible repairs for close-shift/shift
      unambiguous c-s/shift                    0.9566    0.9408
      ambiguous c-s/shift closed early         0.9564    0.9394
      ambiguous c-s/shift closed late          0.9572    0.9408
      ambiguous c-s/shift predicted            0.9571    0.9404

    --------------------------------------------------------

    Running IT experiments to verify some of the above findings
    bert + peft, only 200 epochs

    Comparing wrong_open fixes
      w/o ambiguous open/open fix              0.8380    0.8361
      w/ ambiguous open_two_subtrees           0.8377    0.8351
      w/ ambiguous open_three_subtrees         0.8381    0.8368

    Testing three possible repairs for close-shift/shift
      unambiguous c-s/shift                    0.8376    0.8392
      ambiguous c-s/shift closed early         0.8363    0.8359
      ambiguous c-s/shift closed late          0.8365    0.8383
      ambiguous c-s/shift predicted            0.8379    0.8371

    --------------------------------------------------------

    Running ZH experiments to verify some of the above findings
    bert + peft, only 200 epochs

    Comparing wrong_open fixes
      w/o ambiguous open/open fix              0.9160    0.9143
      w/ ambiguous open_two_subtrees           0.9145    0.9144
      w/ ambiguous open_three_subtrees         0.9146    0.9142

    Testing three possible repairs for close-shift/shift
      unambiguous c-s/shift                    0.9155    0.9146
      ambiguous c-s/shift closed early         0.9145    0.9153
      ambiguous c-s/shift closed late          0.9138    0.9140
      ambiguous c-s/shift predicted            0.9154    0.9144

    --------------------------------------------------------

    Running VI experiments to verify some of the above findings
    bert + peft, only 200 epochs

    Comparing wrong_open fixes
      w/o ambiguous open/open fix              0.8282    0.7668
      w/ ambiguous open_two_subtrees           0.8272    0.7670
      w/ ambiguous open_three_subtrees         0.8282    0.7668

    Testing three possible repairs for close-shift/shift
      unambiguous c-s/shift                    0.8285    0.7683
      ambiguous c-s/shift closed early         0.8276    0.7678
      ambiguous c-s/shift closed late          0.8278    0.7668
      ambiguous c-s/shift predicted            0.8270    0.7668

    --------------------------------------------------------

    Testing a combination of ambiguous vs predicted transitions

    ambiguous
      EN: (no CSS_U)                           0.9258    0.9252
      ZH: (no CSS_U)                           0.9153    0.9145

    predicted
      EN: (no CSS_U)                           0.9264    0.9241
      ZH: (no CSS_U)                           0.9145    0.9141
    """
    def __new__(cls, fn, correct=False, debug=False):
        """
        Enumerate values as normal, but also keep a pointer to a function which repairs that kind of error

        correct: this represents a correct transition

        debug: always run this, as it just counts statistics
        """
        # members are numbered 1..N in declaration order - the ordering is
        # significant, since oracle levels are expressed in terms of these values
        value = len(cls.__members__)
        obj = object.__new__(cls)
        obj._value_ = value + 1
        obj.fn = fn
        obj.correct = correct
        obj.debug = debug
        return obj

    @property
    def is_correct(self):
        # True only for the CORRECT member, which represents no error at all
        return self.correct

    # The first section is a sequence of repairs when the parser
    # should have chosen NTx but instead chose NTy

    # Blocks of transitions which can be abstracted away to be
    # anything will be represented as S1, S2, etc... S for stuff

    # We carve out an exception for a wrong open at the root
    # The only possible transitions at this point are to close
    # the error and try again with the root
    WRONG_OPEN_ROOT_ERROR = (fix_wrong_open_root_error,)

    # The simplest form of such an error is when there is a sequence
    # of unary transitions and the parser chose a wrong parent.
    # Remember that a unary transition is represented by a pair
    # of transitions, NTx, Close.
    # In this case, the correct sequence was
    #   S1 NTx Close NTy Close NTz ...
    # but the parser chose NTy, NTz, etc
    # The repair in this case is to simply discard the unchosen
    # unary transitions and continue
    WRONG_OPEN_UNARY_CHAIN = (fix_wrong_open_unary_chain,)

    # Similar to the UNARY_CHAIN error, but in this case there is a
    # bunch of stuff (one or more constituents built) between the
    # missed open transition and the close transition
    WRONG_OPEN_STUFF_UNARY = (fix_wrong_open_stuff_unary,)

    # If the correct sequence is
    #   T1 O_x T2 C
    # and instead we predicted
    #   T1 O_y ...
    # this can be fixed with a unary transition after
    #   T1 O_y T2 C O_x C
    # note that this is technically ambiguous
    # could have done
    #   T1 O_x C O_y T2 C
    # but doing this should be easier for the parser to detect (untested)
    # also this way the same code paths can be used for two subtrees
    # and for multiple subtrees
    WRONG_OPEN_TWO_SUBTREES = (fix_wrong_open_two_subtrees,)

    # If the gold transition is an Open because it is part of
    # a unary transition, and the following transition is a
    # correct Shift or Close, we can just skip past the unary.
    MISSED_UNARY = (fix_missed_unary,)

    # Open -> Shift errors which don't just represent a unary
    # generally represent a missing bracket which cannot be
    # recovered using the in-order mechanism.  Dropping the
    # missing transition is generally the only fix.
    # (This means removing the corresponding Close)
    # One could theoretically create a new transition which
    # grabs two constituents, though
    OPEN_SHIFT = (fix_open_shift,)

    # Open -> Close is a rather drastic break in the
    # potential structure of the tree.  We can no longer
    # recover the missed Open, and we might not be able
    # to recover other following missed Opens as well.
    # In most cases, the only thing to do is reopen the
    # incorrectly closed outer bracket and keep going.
    OPEN_CLOSE = (fix_open_close,)

    # Similar to the Open -> Close error, but at least
    # in this case we are just introducing one wrong bracket
    # rather than also breaking some existing brackets.
    # The fix here is to reopen the closed bracket.
    SHIFT_CLOSE = (fix_shift_close,)

    # Specifically fixes an error where bracket X is
    # closed and then immediately opened to build a
    # new X bracket.  In this case, the simplest fix
    # will be to skip both the close and the new open
    # and continue from there.
    CLOSE_OPEN_SHIFT_NESTED = (fix_close_open_shift_nested,)

    # Fix an error where the correct sequence was to Close X, Open Y,
    # then continue building,
    # but instead the model did a Shift in place of C_X O_Y
    # The damage here is a recall error for the missed X and
    # a precision error for the incorrectly opened X
    # However, the Y can actually be recovered - whenever we finally
    # close X, we can then open Y
    # One form of that is unambiguous, that of
    #   T_A O_X T_B C O_Y T_C C
    # with only one subtree after the O_Y
    # In that case, the Close that would have closed Y
    # is the only place for the missing close of X
    # So we can produce the following:
    #   T_A O_X T_B T_C C O_Y C
    CLOSE_OPEN_SHIFT_UNAMBIGUOUS_BRACKET = (fix_close_open_shift_unambiguous_bracket,)

    # Similarly to WRONG_OPEN_TWO_SUBTREES, if the correct sequence is
    #   T1 O_x T2 T3 C
    # and instead we predicted
    #   T1 O_y ...
    # this can be fixed by closing O_y in any number of places
    #   T1 O_y T2 C O_x T3 C
    #   T1 O_y T2 C T3 O_x C
    # Either solution is a single precision error,
    # but keeps the O_x subtree correct
    # This is an ambiguous transition - we can experiment with different fixes
    WRONG_OPEN_MULTIPLE_SUBTREES = (fix_wrong_open_multiple_subtrees,)

    # the model's prediction matched the gold transition - no repair needed
    CORRECT = (None, True)

    # an error pattern not covered by any of the repairs above
    UNKNOWN = None

    # If the model is supposed to build a block after a Close
    # operation, attach that block to the piece to the left
    # a couple different variations on this were tried
    #   we tried attaching all constituents to the
    #     bracket which should have been closed
    #   we tried attaching exactly one constituent
    #   and we tried attaching only if there was
    #     exactly one following constituent
    # none of these improved f1.  for example, on the VI dataset, we
    # lost 0.15 F1 with the exactly one following constituent version
    # it might be worthwhile double checking some of the other
    # versions to make sure those also fail, though
    CLOSE_SHIFT_SHIFT = (fix_close_shift_shift_unambiguous,)

    # In the ambiguous close-shift/shift case, this closes the surrounding bracket
    # (which should have already been closed)
    # as soon as the next constituent is built
    # this turns
    #   (A (B s1 s2) s3 s4)
    # into
    #   (A (B s1 s2 s3) s4)
    CLOSE_SHIFT_SHIFT_AMBIGUOUS_EARLY = (fix_close_shift_shift_ambiguous_early,)

    # In the ambiguous close-shift/shift case, this closes the surrounding bracket
    # (which should have already been closed)
    # when the rest of the constituents in this bracket are built
    # this turns
    #   (A (B s1 s2) s3 s4)
    # into
    #   (A (B s1 s2 s3 s4))
    CLOSE_SHIFT_SHIFT_AMBIGUOUS_LATE = (fix_close_shift_shift_ambiguous_late,)

    # For the close-shift/shift errors which are ambiguous,
    # this uses the model's predictions to guess which block
    # to put the close after
    CLOSE_SHIFT_SHIFT_AMBIGUOUS_PREDICTED = (fix_close_shift_shift_ambiguous_predicted,)

    # If a sequence should have gone Close - Open - Shift,
    # and instead we went Shift,
    # we need to close the previous bracket
    # If it is ambiguous
    # such as Close - Open - Shift - Shift
    # close the bracket ASAP
    # eg, Shift - Close - Open - Shift
    CLOSE_OPEN_SHIFT_AMBIGUOUS_BRACKET_EARLY = (fix_close_open_shift_ambiguous_bracket_early,)

    # for Close - Open - Shift - Shift
    # close the bracket as late as possible
    # eg, Shift - Shift - Close - Open
    CLOSE_OPEN_SHIFT_AMBIGUOUS_BRACKET_LATE = (fix_close_open_shift_ambiguous_bracket_late,)

    # If the sequence should have gone
    # Close - Open - Shift
    # and instead we predicted a Shift
    # in a context where closing the bracket would be ambiguous
    # we use the model to predict where the close should actually happen
    CLOSE_OPEN_SHIFT_AMBIGUOUS_PREDICTED = (fix_close_open_shift_ambiguous_predicted,)

    # This particular repair effectively turns the shift -> ambiguous open
    # into a unary transition
    SHIFT_OPEN_UNARY_CLOSE = (ambiguous_shift_open_unary_close,)

    # Fix the shift -> ambiguous open by closing after the first constituent
    # This is an ambiguous solution because it could also be closed either
    # as a unary transition or with a close at the end of the outer bracket
    SHIFT_OPEN_EARLY_CLOSE = (ambiguous_shift_open_early_close,)

    # Fix the shift -> ambiguous open by closing after all constituents
    # This is an ambiguous solution because it could also be closed either
    # as a unary transition or with a close at the end of the first constituent
    SHIFT_OPEN_LATE_CLOSE = (ambiguous_shift_open_late_close,)

    # Use the model to predict when to close!
    # The different options for where to put the Close are put into the model,
    # and the highest scoring close is used
    SHIFT_OPEN_PREDICTED_CLOSE = (ambiguous_shift_open_predicted_close,)

    # debug=True members below: they never repair anything, they only
    # count occurrences of otherwise-unhandled error patterns

    OTHER_CLOSE_SHIFT = (report_close_shift, False, True)

    OTHER_CLOSE_OPEN = (report_close_open, False, True)

    OTHER_OPEN_OPEN = (report_open_open, False, True)

    OTHER_OPEN_CLOSE = (report_open_close, False, True)

    OTHER_OPEN_SHIFT = (report_open_shift, False, True)

    OTHER_SHIFT_OPEN = (report_shift_open, False, True)

    # any other open transition we get wrong, which hasn't already
    # been carved out as an exception above, we just accept the
    # incorrect Open and keep going
    #
    # TODO: check if there is a way to improve this
    # it appears to hurt scores simply by existing
    # explanation: this is wrong logic
    # Suppose the correct sequence had been
    #   T1 open(NP) T2 T3 close
    # Instead we had done
    #   T1 open(VP) T2 T3 close
    # We can recover the missing NP!
    #   T1 open(VP) T2 close open(NP) T3 close
    # Can also recover it as
    #   T1 open(VP) T2 T3 close open(NP) close
    # So this is actually an ambiguous transition
    # except in the case of
    #   T1 open(...) close
    # In this case, a unary transition can make it so we only have
    # a precision error, not also a recall error
    # Currently, the approach is to put this after the default fixes
    # and use the two & more-than-two versions of the fix above
    WRONG_OPEN_GENERAL = (fix_wrong_open_general,)
class InOrderOracle(DynamicOracle):
    """Dynamic oracle for the in-order transition scheme.

    Thin wrapper around DynamicOracle which plugs in the RepairType enum
    defined in this module as the set of available repairs.
    """
    def __init__(self, root_labels, oracle_level, additional_oracle_levels, deactivated_oracle_levels):
        super().__init__(root_labels, oracle_level, RepairType, additional_oracle_levels, deactivated_oracle_levels)
stanza/stanza/models/constituency/lstm_model.py ADDED
@@ -0,0 +1,1178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A version of the BaseModel which uses LSTMs to predict the correct next transition
3
+ based on the current known state.
4
+
5
+ The primary purpose of this class is to implement the prediction of the next
6
+ transition, which is done by concatenating the output of an LSTM operated over
7
+ previous transitions, the words, and the partially built constituents.
8
+
9
+ A complete processing of a sentence is as follows:
10
+ 1) Run the input words through an encoder.
11
+ The encoder includes some or all of the following:
12
+ pretrained word embedding
13
+ finetuned word embedding for training set words - "delta_embedding"
14
+ POS tag embedding
15
+ pretrained charlm representation
16
+ BERT or similar large language model representation
17
+ attention transformer over the previous inputs
18
+ labeled attention transformer over the first attention layer
19
+ The encoded input is then put through a bi-lstm, giving a word representation
20
+ 2) Transitions are put in an embedding, and transitions already used are tracked
21
+ in an LSTM
22
+ 3) Constituents already built are also processed in an LSTM
23
+ 4) Every transition is chosen by taking the output of the current word position,
24
+ the transition LSTM, and the constituent LSTM, and classifying the next
25
+ transition
26
+ 5) Transitions are repeated (with constraints) until the sentence is completed
27
+ """
28
+
29
+ from collections import namedtuple
30
+ import copy
31
+ from enum import Enum
32
+ import logging
33
+ import math
34
+ import random
35
+
36
+ import torch
37
+ import torch.nn as nn
38
+ from torch.nn.utils.rnn import pack_padded_sequence
39
+
40
+ from stanza.models.common.bert_embedding import extract_bert_embeddings
41
+ from stanza.models.common.maxout_linear import MaxoutLinear
42
+ from stanza.models.common.utils import attach_bert_model, unsort
43
+ from stanza.models.common.vocab import PAD_ID, UNK_ID
44
+ from stanza.models.constituency.base_model import BaseModel
45
+ from stanza.models.constituency.label_attention import LabelAttentionModule
46
+ from stanza.models.constituency.lstm_tree_stack import LSTMTreeStack
47
+ from stanza.models.constituency.parse_transitions import TransitionScheme
48
+ from stanza.models.constituency.parse_tree import Tree
49
+ from stanza.models.constituency.partitioned_transformer import PartitionedTransformerModule
50
+ from stanza.models.constituency.positional_encoding import ConcatSinusoidalEncoding
51
+ from stanza.models.constituency.transformer_tree_stack import TransformerTreeStack
52
+ from stanza.models.constituency.tree_stack import TreeStack
53
+ from stanza.models.constituency.utils import build_nonlinearity, initialize_linear
54
+
55
+ logger = logging.getLogger('stanza')
56
+ tlogger = logging.getLogger('stanza.constituency.trainer')
57
+
58
# value: the underlying word; hx: its encoded representation from the word LSTM
WordNode = namedtuple("WordNode", ['value', 'hx'])

# lstm_hx & lstm_cx are the hidden & cell states of the LSTM going across constituents
# tree_hx and tree_cx are the states of the lstm going up the constituents in the case of the tree_lstm combination method
Constituent = namedtuple("Constituent", ['value', 'tree_hx', 'tree_cx'])
63
+
64
+ # The sentence boundary vectors are marginally useful at best.
65
+ # However, they make it much easier to use non-bert layers as input to
66
+ # attention layers, as the attention layers work better when they have
67
+ # an index 0 to attend to.
68
class SentenceBoundary(Enum):
    """Which sentence boundary vectors to add to the word representations.

    Per the note above: the boundary vectors themselves are only marginally
    useful, but they give attention layers an index 0 to attend to.
    """
    NONE = 1          # no boundary vectors
    WORDS = 2         # boundary vectors around the word sequence only
    EVERYTHING = 3    # boundary vectors everywhere they apply
72
+
73
class StackHistory(Enum):
    """How to track the history of a stack (transitions or constituents): LSTM or attention."""
    LSTM = 1
    ATTN = 2
+
77
+ # How to compose constituent children into new constituents
78
+ # MAX is simply take the max value of the children
79
+ # this is surprisingly effective
80
+ # for example, a Turkish dataset went from 81-81.5 dev, 75->75.5 test
81
+ # BILSTM is the method described in the papers of making an lstm
82
+ # out of the constituents
83
+ # BILSTM_MAX is the same as BILSTM, but instead of using a Linear
84
+ # to reduce the outputs of the lstm, we first take the max
85
+ # and then use a linear to reduce the max
86
+ # BIGRAM combines pairs of children and then takes the max over those
87
+ # ATTN means to put an attention layer over the children nodes
88
+ # we then take the max of the children with their attention
89
+ #
90
+ # Experiments show that MAX is noticeably better than the other options
91
+ # On ja_alt, here are a few results after 200 iterations,
92
+ # averaged over 5 iterations:
93
+ # MAX: 0.8985
94
+ # BILSTM: 0.8964
95
+ # BILSTM_MAX: 0.8973
96
+ # BIGRAM: 0.8982
97
+ #
98
+ # The MAX method has a linear transform after the max.
99
+ # Removing that transform makes the score go down to 0.8982
100
+ #
101
+ # We tried a few varieties of BILSTM_MAX
102
+ # In particular:
103
+ # max over LSTM, combining forward & backward using the max: 0.8970
104
+ # max over forward & backward separately, then reduce: 0.8970
105
+ # max over forward & backward only over 1:-1
106
+ # (eg, leave out the node embedding): 0.8969
107
+ # same as previous, but split the reduce into 2 pieces: 0.8973
108
+ # max over forward & backward separately, then reduce as
109
+ # 1/2(F + B) + W(F,B)
110
+ # the idea being that this way F and B are guaranteed
111
+ # to be represented: 0.8971
112
+ #
113
+ # BIGRAM is an attempt to mix information from nodes
114
+ # when building constituents, but it didn't help
115
+ # The first example, just taking pairs and learning
116
+ # a transform, went to NaN. Likely the transform
117
+ # expanded the embedding too much. Switching it to
118
+ # scale the matrix by 0.5 didn't go to Nan, but only
119
+ # resulted in 0.8982
120
+ #
121
+ # A couple varieties of ATTN:
122
+ # first an input linear, then attn, then an output linear
123
+ # the upside of this would be making the dimension of the attn
124
+ # independent from the rest of the model
125
+ # however, this caused an expansion in the magnitude of the vectors,
126
+ # resulting in NaN for deep enough trees
127
+ # adding layernorm or tanh to balance this out resulted in
128
+ # disappointing performance
129
+ # tanh: 0.8972
130
+ # another alternative not tested yet: lower initialization weights
131
+ # and enforce that the norms of the matrices are low enough that
132
+ # exponential explosion up the layers of the tree doesn't happen
133
+ # just an attention layer means hidden_size % reduce_heads == 0
134
+ # that is simple enough to enforce by slightly changing hidden_size
135
+ # if needed
136
+ # appending the embedding for the open state to the start of the
137
+ # sequence of children and taking only the content nodes
138
+ # was very disappointing: 0.8967
139
+ # taking the entire sequence of children including the open state
140
+ # embedding resulted in 0.8973
141
+ # long story short, this looks like an idea that should work, but it
142
+ # doesn't help. suggestions welcome for improving these results
143
+ #
144
+ # The current TREE_LSTM_CX mechanism uses a word's embedding
145
+ # as the hx and a trained embedding over tags as the cx 0.8996
146
+ # This worked slightly better than 0s for cx (TREE_LSTM) 0.8992
147
+ # A variant of TREE_LSTM which didn't work out:
148
+ # nodes are combined with an LSTM
149
+ # hx & cx are embeddings of the node type (eg S, NP, etc)
150
+ # input is the max over children: 0.8977
151
+ # Another variant which didn't work: use the word embedding
152
+ # as input to the same LSTM to get hx & cx 0.8985
153
+ # Note that although the scores for TREE_LSTM_CX are slightly higher
154
+ # than MAX for the JA dataset, the benefit was not as clear for EN,
155
+ # so we left the default at MAX.
156
+ # For example, on English WSJ, before switching to Bert POS and
157
+ # a learned Bert mixing layer, a comparison of 5x models trained
158
+ # for 400 iterations got dev scores of:
159
+ # TREE_LSTM_CX 0.9589
160
+ # MAX 0.9593
161
+ #
162
+ # UNTIED_MAX has a different reduce_linear for each type of
163
+ # constituent in the model. Similar to the different linear
164
+ # maps used in the CVG paper from Socher, Bauer, Manning, Ng
165
+ # This is implemented as a large CxHxH parameter,
166
+ # with num_constituent layers of hidden-hidden transform,
167
+ # along with a CxH bias parameter.
168
+ # Essentially C Linears stacked on top of each other,
169
+ # but in a parameter so that indexing can be done quickly.
170
+ # Unfortunately this does not beat out MAX with one combined linear.
171
+ # On an experiment on WSJ with all the best settings as of early
172
+ # October 2022, such as a Bert model POS tagger:
173
+ # MAX 0.9597
174
+ # UNTIED_MAX 0.9592
175
+ # Furthermore, starting from a finished MAX model and restarting
176
+ # by splitting the MAX layer into multiple pieces did not improve.
177
+ #
178
+ # KEY has a single Key which is used for a facsimile of ATTN
179
+ # each incoming subtree has its values weighted by a Query
180
+ # then the Key is used to calculate a softmax
181
+ # finally, a Value is used to scale the subtrees
182
+ # reduce_heads is used to determine the number of heads
183
+ # There is an option to use or not use position information
184
+ # using a sinusoidal position embedding
185
+ # UNTIED_KEY is the same, but has a different key
186
+ # for each possible constituent
187
+ # On a VI dataset:
188
+ # MAX 0.82064
189
+ # KEY (pos, 8) 0.81739
190
+ # UNTIED_KEY (pos, 8) 0.82046
191
+ # UNTIED_KEY (pos, 4) 0.81742
192
+ # Attempted to add a linear to mix the attn heads together,
193
+ # but that was awful: 0.81567
194
+ # Adding two position vectors, one in each direction, did not help:
195
+ # UNTIED_KEY (2x pos, 8) 0.8188
196
+ # To redo that experiment, double the width of reduce_query and
197
+ # reduce_value, then call reduce_position on nhx, flip it,
198
+ # and call reduce_position again
199
+ # Evidently the experiments to try should be:
200
+ # no pos at all
201
+ # more heads
202
class ConstituencyComposition(Enum):
    """How to compose constituent children into a new constituent representation.

    See the extensive experimental notes in the comment block above this class
    for comparisons between the methods; MAX is the default as it performed
    best or near-best across datasets.
    """
    BILSTM = 1        # bi-lstm over the children, linear reduce of the outputs
    MAX = 2           # elementwise max over the children, then a linear transform
    TREE_LSTM = 3     # lstm going up the tree, cx initialized to zeros
    BILSTM_MAX = 4    # bi-lstm, then max over its outputs before the linear reduce
    BIGRAM = 5        # combine pairs of children, then max over those
    ATTN = 6          # attention layer over the children, then max
    TREE_LSTM_CX = 7  # tree lstm with word embedding as hx, tag embedding as cx
    UNTIED_MAX = 8    # MAX with a separate reduce linear per constituent type
    KEY = 9           # single learned key attention over the children
    UNTIED_KEY = 10   # KEY with a separate key per constituent type
+
214
+ class LSTMModel(BaseModel, nn.Module):
215
+ def __init__(self, pretrain, forward_charlm, backward_charlm, bert_model, bert_tokenizer, force_bert_saved, peft_name, transitions, constituents, tags, words, rare_words, root_labels, constituent_opens, unary_limit, args):
216
+ """
217
+ pretrain: a Pretrain object
218
+ transitions: a list of all possible transitions which will be
219
+ used to build trees
220
+ constituents: a list of all possible constituents in the treebank
221
+ tags: a list of all possible tags in the treebank
222
+ words: a list of all known words, used for a delta word embedding.
223
+ note that there will be an attempt made to learn UNK words as well,
224
+ and tags by themselves may help UNK words
225
+ rare_words: a list of rare words, used to occasionally replace with UNK
226
+ root_labels: probably ROOT, although apparently some treebanks like TOP or even s
227
+ constituent_opens: a list of all possible open nodes which will go on the stack
228
+ - this might be different from constituents if there are nodes
229
+ which represent multiple constituents at once
230
+ args: hidden_size, transition_hidden_size, etc as gotten from
231
+ constituency_parser.py
232
+
233
+ Note that it might look like a hassle to pass all of this in
234
+ when it can be collected directly from the trees themselves.
235
+ However, that would only work at train time. At eval or
236
+ pipeline time we will load the lists from the saved model.
237
+ """
238
+ super().__init__(transition_scheme=args['transition_scheme'], unary_limit=unary_limit, reverse_sentence=args.get('reversed', False), root_labels=root_labels)
239
+
240
+ self.args = args
241
+ self.unsaved_modules = []
242
+
243
+ emb_matrix = pretrain.emb
244
+ self.add_unsaved_module('embedding', nn.Embedding.from_pretrained(emb_matrix, freeze=True))
245
+
246
+ # replacing NBSP picks up a whole bunch of words for VI
247
+ self.vocab_map = { word.replace('\xa0', ' '): i for i, word in enumerate(pretrain.vocab) }
248
+ # precompute tensors for the word indices
249
+ # the tensors should be put on the GPU if needed by calling to(device)
250
+ self.register_buffer('vocab_tensors', torch.tensor(range(len(pretrain.vocab)), requires_grad=False))
251
+ self.vocab_size = emb_matrix.shape[0]
252
+ self.embedding_dim = emb_matrix.shape[1]
253
+
254
+ self.constituents = sorted(list(constituents))
255
+
256
+ self.hidden_size = self.args['hidden_size']
257
+ self.constituency_composition = self.args.get("constituency_composition", ConstituencyComposition.BILSTM)
258
+ if self.constituency_composition in (ConstituencyComposition.ATTN, ConstituencyComposition.KEY, ConstituencyComposition.UNTIED_KEY):
259
+ self.reduce_heads = self.args['reduce_heads']
260
+ if self.hidden_size % self.reduce_heads != 0:
261
+ self.hidden_size = self.hidden_size + self.reduce_heads - (self.hidden_size % self.reduce_heads)
262
+
263
+ if args['constituent_stack'] == StackHistory.ATTN:
264
+ self.reduce_heads = self.args['reduce_heads']
265
+ if self.hidden_size % args['constituent_heads'] != 0:
266
+ # TODO: technically we should either use the LCM of this and reduce_heads, or just have two separate fields
267
+ self.hidden_size = self.hidden_size + args['constituent_heads'] - (hidden_size % args['constituent_heads'])
268
+ if self.constituency_composition == ConstituencyComposition.ATTN and self.hidden_size % self.reduce_heads != 0:
269
+ raise ValueError("--reduce_heads and --constituent_heads not compatible!")
270
+
271
+ self.transition_hidden_size = self.args['transition_hidden_size']
272
+ if args['transition_stack'] == StackHistory.ATTN:
273
+ if self.transition_hidden_size % args['transition_heads'] > 0:
274
+ logger.warning("transition_hidden_size %d %% transition_heads %d != 0. reconfiguring", transition_hidden_size, args['transition_heads'])
275
+ self.transition_hidden_size = self.transition_hidden_size + args['transition_heads'] - (self.transition_hidden_size % args['transition_heads'])
276
+
277
+ self.tag_embedding_dim = self.args['tag_embedding_dim']
278
+ self.transition_embedding_dim = self.args['transition_embedding_dim']
279
+ self.delta_embedding_dim = self.args['delta_embedding_dim']
280
+
281
+ self.word_input_size = self.embedding_dim + self.tag_embedding_dim + self.delta_embedding_dim
282
+
283
+ if forward_charlm is not None:
284
+ self.add_unsaved_module('forward_charlm', forward_charlm)
285
+ self.word_input_size += self.forward_charlm.hidden_dim()
286
+ if not forward_charlm.is_forward_lm:
287
+ raise ValueError("Got a backward charlm as a forward charlm!")
288
+ else:
289
+ self.forward_charlm = None
290
+ if backward_charlm is not None:
291
+ self.add_unsaved_module('backward_charlm', backward_charlm)
292
+ self.word_input_size += self.backward_charlm.hidden_dim()
293
+ if backward_charlm.is_forward_lm:
294
+ raise ValueError("Got a forward charlm as a backward charlm!")
295
+ else:
296
+ self.backward_charlm = None
297
+
298
+ self.delta_words = sorted(set(words))
299
+ self.delta_word_map = { word: i+2 for i, word in enumerate(self.delta_words) }
300
+ assert PAD_ID == 0
301
+ assert UNK_ID == 1
302
+ # initialization is chosen based on the observed values of the norms
303
+ # after several long training cycles
304
+ # (this is true for other embeddings and embedding-like vectors as well)
305
+ # the experiments show this slightly helps were done with
306
+ # Adadelta and the correct initialization may be slightly
307
+ # different for a different optimizer.
308
+ # in fact, it is likely a scheme other than normal_ would
309
+ # be better - the optimizer tends to learn the weights
310
+ # rather close to 0 before learning in the direction it
311
+ # actually wants to go
312
+ self.delta_embedding = nn.Embedding(num_embeddings = len(self.delta_words)+2,
313
+ embedding_dim = self.delta_embedding_dim,
314
+ padding_idx = 0)
315
+ nn.init.normal_(self.delta_embedding.weight, std=0.05)
316
+ self.register_buffer('delta_tensors', torch.tensor(range(len(self.delta_words) + 2), requires_grad=False))
317
+
318
+ self.rare_words = set(rare_words)
319
+
320
+ self.tags = sorted(list(tags))
321
+ if self.tag_embedding_dim > 0:
322
+ self.tag_map = { t: i+2 for i, t in enumerate(self.tags) }
323
+ self.tag_embedding = nn.Embedding(num_embeddings = len(tags)+2,
324
+ embedding_dim = self.tag_embedding_dim,
325
+ padding_idx = 0)
326
+ nn.init.normal_(self.tag_embedding.weight, std=0.25)
327
+ self.register_buffer('tag_tensors', torch.tensor(range(len(self.tags) + 2), requires_grad=False))
328
+
329
+ self.num_lstm_layers = self.args['num_lstm_layers']
330
+ self.num_tree_lstm_layers = self.args['num_tree_lstm_layers']
331
+ self.lstm_layer_dropout = self.args['lstm_layer_dropout']
332
+
333
+ self.word_dropout = nn.Dropout(self.args['word_dropout'])
334
+ self.predict_dropout = nn.Dropout(self.args['predict_dropout'])
335
+ self.lstm_input_dropout = nn.Dropout(self.args['lstm_input_dropout'])
336
+
337
+ # also register a buffer of zeros so that we can always get zeros on the appropriate device
338
+ self.register_buffer('word_zeros', torch.zeros(self.hidden_size * self.num_tree_lstm_layers))
339
+ self.register_buffer('constituent_zeros', torch.zeros(self.num_lstm_layers, 1, self.hidden_size))
340
+
341
+ # possibly add a couple vectors for bookends of the sentence
342
+ # We put the word_start and word_end here, AFTER counting the
343
+ # charlm dimension, but BEFORE counting the bert dimension,
344
+ # as we want word_start and word_end to not have dimensions
345
+ # for the bert embedding. The bert model will add its own
346
+ # start and end representation.
347
+ self.sentence_boundary_vectors = self.args['sentence_boundary_vectors']
348
+ if self.sentence_boundary_vectors is not SentenceBoundary.NONE:
349
+ self.register_parameter('word_start_embedding', torch.nn.Parameter(0.2 * torch.randn(self.word_input_size, requires_grad=True)))
350
+ self.register_parameter('word_end_embedding', torch.nn.Parameter(0.2 * torch.randn(self.word_input_size, requires_grad=True)))
351
+
352
+ # we set up the bert AFTER building word_start and word_end
353
+ # so that we can use the charlm endpoint values rather than
354
+ # try to train our own
355
+ self.force_bert_saved = force_bert_saved or self.args['bert_finetune'] or self.args['stage1_bert_finetune']
356
+ attach_bert_model(self, bert_model, bert_tokenizer, self.args.get('use_peft', False), self.force_bert_saved)
357
+ self.peft_name = peft_name
358
+
359
+ if bert_model is not None:
360
+ if bert_tokenizer is None:
361
+ raise ValueError("Cannot have a bert model without a tokenizer")
362
+ self.bert_dim = self.bert_model.config.hidden_size
363
+ if args['bert_hidden_layers']:
364
+ # The average will be offset by 1/N so that the default zeros
365
+ # represents an average of the N layers
366
+ if args['bert_hidden_layers'] > bert_model.config.num_hidden_layers:
367
+ # limit ourselves to the number of layers actually available
368
+ # note that we can +1 because of the initial embedding layer
369
+ args['bert_hidden_layers'] = bert_model.config.num_hidden_layers + 1
370
+ self.bert_layer_mix = nn.Linear(args['bert_hidden_layers'], 1, bias=False)
371
+ nn.init.zeros_(self.bert_layer_mix.weight)
372
+ else:
373
+ # an average of layers 2, 3, 4 will be used
374
+ # (for historic reasons)
375
+ self.bert_layer_mix = None
376
+ self.word_input_size = self.word_input_size + self.bert_dim
377
+
378
+ self.partitioned_transformer_module = None
379
+ self.pattn_d_model = 0
380
+ if LSTMModel.uses_pattn(self.args):
381
+ # Initializations of parameters for the Partitioned Attention
382
+ # round off the size of the model so that it divides in half evenly
383
+ self.pattn_d_model = self.args['pattn_d_model'] // 2 * 2
384
+
385
+ # Initializations for the Partitioned Attention
386
+ # experiments suggest having a bias does not help here
387
+ self.partitioned_transformer_module = PartitionedTransformerModule(
388
+ self.args['pattn_num_layers'],
389
+ d_model=self.pattn_d_model,
390
+ n_head=self.args['pattn_num_heads'],
391
+ d_qkv=self.args['pattn_d_kv'],
392
+ d_ff=self.args['pattn_d_ff'],
393
+ ff_dropout=self.args['pattn_relu_dropout'],
394
+ residual_dropout=self.args['pattn_residual_dropout'],
395
+ attention_dropout=self.args['pattn_attention_dropout'],
396
+ word_input_size=self.word_input_size,
397
+ bias=self.args['pattn_bias'],
398
+ morpho_emb_dropout=self.args['pattn_morpho_emb_dropout'],
399
+ timing=self.args['pattn_timing'],
400
+ encoder_max_len=self.args['pattn_encoder_max_len']
401
+ )
402
+ self.word_input_size += self.pattn_d_model
403
+
404
+ self.label_attention_module = None
405
+ if LSTMModel.uses_lattn(self.args):
406
+ if self.partitioned_transformer_module is None:
407
+ logger.error("Not using Labeled Attention, as the Partitioned Attention module is not used")
408
+ else:
409
+ # TODO: think of a couple ways to use alternate inputs
410
+ # for example, could pass in the word inputs with a positional embedding
411
+ # that would also allow it to work in the case of no partitioned module
412
+ if self.args['lattn_combined_input']:
413
+ self.lattn_d_input = self.word_input_size
414
+ else:
415
+ self.lattn_d_input = self.pattn_d_model
416
+ self.label_attention_module = LabelAttentionModule(self.lattn_d_input,
417
+ self.args['lattn_d_input_proj'],
418
+ self.args['lattn_d_kv'],
419
+ self.args['lattn_d_kv'],
420
+ self.args['lattn_d_l'],
421
+ self.args['lattn_d_proj'],
422
+ self.args['lattn_combine_as_self'],
423
+ self.args['lattn_resdrop'],
424
+ self.args['lattn_q_as_matrix'],
425
+ self.args['lattn_residual_dropout'],
426
+ self.args['lattn_attention_dropout'],
427
+ self.pattn_d_model // 2,
428
+ self.args['lattn_d_ff'],
429
+ self.args['lattn_relu_dropout'],
430
+ self.args['lattn_partitioned'])
431
+ self.word_input_size = self.word_input_size + self.args['lattn_d_proj']*self.args['lattn_d_l']
432
+
433
+ self.word_lstm = nn.LSTM(input_size=self.word_input_size, hidden_size=self.hidden_size, num_layers=self.num_lstm_layers, bidirectional=True, dropout=self.lstm_layer_dropout)
434
+
435
+ # after putting the word_delta_tag input through the word_lstm, we get back
436
+ # hidden_size * 2 output with the front and back lstms concatenated.
437
+ # this transforms it into hidden_size with the values mixed together
438
+ self.word_to_constituent = nn.Linear(self.hidden_size * 2, self.hidden_size * self.num_tree_lstm_layers)
439
+ initialize_linear(self.word_to_constituent, self.args['nonlinearity'], self.hidden_size * 2)
440
+
441
+ self.transitions = sorted(list(transitions))
442
+ self.transition_map = { t: i for i, t in enumerate(self.transitions) }
443
+ # precompute tensors for the transitions
444
+ self.register_buffer('transition_tensors', torch.tensor(range(len(transitions)), requires_grad=False))
445
+ self.transition_embedding = nn.Embedding(num_embeddings = len(transitions),
446
+ embedding_dim = self.transition_embedding_dim)
447
+ nn.init.normal_(self.transition_embedding.weight, std=0.25)
448
+ if args['transition_stack'] == StackHistory.LSTM:
449
+ self.transition_stack = LSTMTreeStack(input_size=self.transition_embedding_dim,
450
+ hidden_size=self.transition_hidden_size,
451
+ num_lstm_layers=self.num_lstm_layers,
452
+ dropout=self.lstm_layer_dropout,
453
+ uses_boundary_vector=self.sentence_boundary_vectors is SentenceBoundary.EVERYTHING,
454
+ input_dropout=self.lstm_input_dropout)
455
+ elif args['transition_stack'] == StackHistory.ATTN:
456
+ self.transition_stack = TransformerTreeStack(input_size=self.transition_embedding_dim,
457
+ output_size=self.transition_hidden_size,
458
+ input_dropout=self.lstm_input_dropout,
459
+ use_position=True,
460
+ num_heads=args['transition_heads'])
461
+ else:
462
+ raise ValueError("Unhandled transition_stack StackHistory: {}".format(args['transition_stack']))
463
+
464
+ self.constituent_opens = sorted(list(constituent_opens))
465
+ # an embedding for the spot on the constituent LSTM taken up by the Open transitions
466
+ # the pattern when condensing constituents is embedding - con1 - con2 - con3 - embedding
467
+ # TODO: try the two ends have different embeddings?
468
+ self.constituent_open_map = { x: i for (i, x) in enumerate(self.constituent_opens) }
469
+ self.constituent_open_embedding = nn.Embedding(num_embeddings = len(self.constituent_open_map),
470
+ embedding_dim = self.hidden_size)
471
+ nn.init.normal_(self.constituent_open_embedding.weight, std=0.2)
472
+
473
+ # input_size is hidden_size - could introduce a new constituent_size instead if we liked
474
+ if args['constituent_stack'] == StackHistory.LSTM:
475
+ self.constituent_stack = LSTMTreeStack(input_size=self.hidden_size,
476
+ hidden_size=self.hidden_size,
477
+ num_lstm_layers=self.num_lstm_layers,
478
+ dropout=self.lstm_layer_dropout,
479
+ uses_boundary_vector=self.sentence_boundary_vectors is SentenceBoundary.EVERYTHING,
480
+ input_dropout=self.lstm_input_dropout)
481
+ elif args['constituent_stack'] == StackHistory.ATTN:
482
+ self.constituent_stack = TransformerTreeStack(input_size=self.hidden_size,
483
+ output_size=self.hidden_size,
484
+ input_dropout=self.lstm_input_dropout,
485
+ use_position=True,
486
+ num_heads=args['constituent_heads'])
487
+ else:
488
+ raise ValueError("Unhandled constituent_stack StackHistory: {}".format(args['transition_stack']))
489
+
490
+
491
+ if args['combined_dummy_embedding']:
492
+ self.dummy_embedding = self.constituent_open_embedding
493
+ else:
494
+ self.dummy_embedding = nn.Embedding(num_embeddings = len(self.constituent_open_map),
495
+ embedding_dim = self.hidden_size)
496
+ nn.init.normal_(self.dummy_embedding.weight, std=0.2)
497
+ self.register_buffer('constituent_open_tensors', torch.tensor(range(len(constituent_opens)), requires_grad=False))
498
+
499
+ # TODO: refactor
500
+ if (self.constituency_composition == ConstituencyComposition.BILSTM or
501
+ self.constituency_composition == ConstituencyComposition.BILSTM_MAX):
502
+ # forward and backward pieces for crunching several
503
+ # constituents into one, combined into a bi-lstm
504
+ # TODO: make the hidden size here an option?
505
+ self.constituent_reduce_lstm = nn.LSTM(input_size=self.hidden_size, hidden_size=self.hidden_size, num_layers=self.num_lstm_layers, bidirectional=True, dropout=self.lstm_layer_dropout)
506
+ # affine transformation from bi-lstm reduce to a new hidden layer
507
+ if self.constituency_composition == ConstituencyComposition.BILSTM:
508
+ self.reduce_linear = nn.Linear(self.hidden_size * 2, self.hidden_size)
509
+ initialize_linear(self.reduce_linear, self.args['nonlinearity'], self.hidden_size * 2)
510
+ else:
511
+ self.reduce_forward = nn.Linear(self.hidden_size, self.hidden_size)
512
+ self.reduce_backward = nn.Linear(self.hidden_size, self.hidden_size)
513
+ initialize_linear(self.reduce_forward, self.args['nonlinearity'], self.hidden_size)
514
+ initialize_linear(self.reduce_backward, self.args['nonlinearity'], self.hidden_size)
515
+ elif self.constituency_composition == ConstituencyComposition.MAX:
516
+ # transformation to turn several constituents into one new constituent
517
+ self.reduce_linear = nn.Linear(self.hidden_size, self.hidden_size)
518
+ initialize_linear(self.reduce_linear, self.args['nonlinearity'], self.hidden_size)
519
+ elif self.constituency_composition == ConstituencyComposition.UNTIED_MAX:
520
+ # transformation to turn several constituents into one new constituent
521
+ self.register_parameter('reduce_linear_weight', torch.nn.Parameter(torch.randn(len(constituent_opens), self.hidden_size, self.hidden_size, requires_grad=True)))
522
+ self.register_parameter('reduce_linear_bias', torch.nn.Parameter(torch.randn(len(constituent_opens), self.hidden_size, requires_grad=True)))
523
+ for layer_idx in range(len(constituent_opens)):
524
+ nn.init.kaiming_normal_(self.reduce_linear_weight[layer_idx], nonlinearity=self.args['nonlinearity'])
525
+ nn.init.uniform_(self.reduce_linear_bias, 0, 1 / (self.hidden_size * 2) ** 0.5)
526
+ elif self.constituency_composition == ConstituencyComposition.BIGRAM:
527
+ self.reduce_linear = nn.Linear(self.hidden_size, self.hidden_size)
528
+ self.reduce_bigram = nn.Linear(self.hidden_size * 2, self.hidden_size)
529
+ initialize_linear(self.reduce_linear, self.args['nonlinearity'], self.hidden_size)
530
+ initialize_linear(self.reduce_bigram, self.args['nonlinearity'], self.hidden_size)
531
+ elif self.constituency_composition == ConstituencyComposition.ATTN:
532
+ self.reduce_attn = nn.MultiheadAttention(self.hidden_size, self.reduce_heads)
533
+ elif self.constituency_composition == ConstituencyComposition.KEY or self.constituency_composition == ConstituencyComposition.UNTIED_KEY:
534
+ if self.args['reduce_position']:
535
+ # unsaved module so that if it grows, we don't save
536
+ # the larger version unnecessarily
537
+ # under any normal circumstances, the growth will
538
+ # happen early in training when the model is not
539
+ # behaving well, then will not be needed once the
540
+ # model learns not to make super degenerate
541
+ # constituents
542
+ self.add_unsaved_module("reduce_position", ConcatSinusoidalEncoding(self.args['reduce_position'], 50))
543
+ else:
544
+ self.add_unsaved_module("reduce_position", nn.Identity())
545
+ self.reduce_query = nn.Linear(self.hidden_size + self.args['reduce_position'], self.hidden_size, bias=False)
546
+ self.reduce_value = nn.Linear(self.hidden_size + self.args['reduce_position'], self.hidden_size)
547
+ if self.constituency_composition == ConstituencyComposition.KEY:
548
+ self.register_parameter('reduce_key', torch.nn.Parameter(torch.randn(self.reduce_heads, self.hidden_size // self.reduce_heads, 1, requires_grad=True)))
549
+ else:
550
+ self.register_parameter('reduce_key', torch.nn.Parameter(torch.randn(len(constituent_opens), self.reduce_heads, self.hidden_size // self.reduce_heads, 1, requires_grad=True)))
551
+ elif self.constituency_composition == ConstituencyComposition.TREE_LSTM:
552
+ self.constituent_reduce_lstm = nn.LSTM(input_size=self.hidden_size, hidden_size=self.hidden_size, num_layers=self.num_tree_lstm_layers, dropout=self.lstm_layer_dropout)
553
+ elif self.constituency_composition == ConstituencyComposition.TREE_LSTM_CX:
554
+ self.constituent_reduce_embedding = nn.Embedding(num_embeddings = len(tags)+2,
555
+ embedding_dim = self.num_tree_lstm_layers * self.hidden_size)
556
+ self.constituent_reduce_lstm = nn.LSTM(input_size=self.hidden_size, hidden_size=self.hidden_size, num_layers=self.num_tree_lstm_layers, dropout=self.lstm_layer_dropout)
557
+ else:
558
+ raise ValueError("Unhandled ConstituencyComposition: {}".format(self.constituency_composition))
559
+
560
+ self.nonlinearity = build_nonlinearity(self.args['nonlinearity'])
561
+
562
+ # matrix for predicting the next transition using word/constituent/transition queues
563
+ # word size + constituency size + transition size
564
+ # TODO: .get() is only necessary until all models rebuilt with this param
565
+ self.maxout_k = self.args.get('maxout_k', 0)
566
+ self.output_layers = self.build_output_layers(self.args['num_output_layers'], len(transitions), self.maxout_k)
567
+
568
+ @staticmethod
569
+ def uses_lattn(args):
570
+ return args.get('use_lattn', True) and args.get('lattn_d_proj', 0) > 0 and args.get('lattn_d_l', 0) > 0
571
+
572
+ @staticmethod
573
+ def uses_pattn(args):
574
+ return args['pattn_num_heads'] > 0 and args['pattn_num_layers'] > 0
575
+
576
def copy_with_new_structure(self, other):
    """
    Copy parameters from the other model to this model

    word_lstm can change size if the other model didn't use pattn / lattn and this one does.
    In that case, the new values are initialized to 0.
    This will rebuild the model in such a way that the outputs will be
    exactly the same as the previous model.
    """
    # compositions must match, with one exception: this model may use
    # UNTIED_MAX while the other used a tied reduce_linear; that case is
    # handled per-parameter in the loop below
    if self.constituency_composition != other.constituency_composition and self.constituency_composition != ConstituencyComposition.UNTIED_MAX:
        raise ValueError("Models are incompatible: self.constituency_composition == {}, other.constituency_composition == {}".format(self.constituency_composition, other.constituency_composition))
    for name, other_parameter in other.named_parameters():
        # this allows other.constituency_composition == UNTIED_MAX to fall through
        if name.startswith('reduce_linear.') and self.constituency_composition == ConstituencyComposition.UNTIED_MAX:
            if name == 'reduce_linear.weight':
                my_parameter = self.reduce_linear_weight
            elif name == 'reduce_linear.bias':
                my_parameter = self.reduce_linear_bias
            else:
                raise ValueError("Unexpected other parameter name {}".format(name))
            # broadcast the other model's single tied matrix into every
            # per-constituent slot of the untied parameter
            for idx in range(len(self.constituent_opens)):
                my_parameter[idx].data.copy_(other_parameter.data)
        elif name.startswith('word_lstm.weight_ih_l0'):
            # bottom layer shape may have changed from adding a new pattn / lattn block
            my_parameter = self.get_parameter(name)
            # -1 so that it can be converted easier to a different parameter
            copy_size = min(other_parameter.data.shape[-1], my_parameter.data.shape[-1])
            #new_values = my_parameter.data.clone().detach()
            # new columns (from the added pattn/lattn inputs) stay zero, so
            # the copied model initially produces identical outputs
            new_values = torch.zeros_like(my_parameter.data)
            new_values[..., :copy_size] = other_parameter.data[..., :copy_size]
            my_parameter.data.copy_(new_values)
        else:
            try:
                self.get_parameter(name).data.copy_(other_parameter.data)
            except AttributeError as e:
                raise AttributeError("Could not process %s" % name) from e
612
+
613
def build_output_layers(self, num_output_layers, final_layer_size, maxout_k):
    """
    Build a ModuleList of Linear transformations for the given num_output_layers

    The final layer size can be specified.
    Initial layer size is the combination of word, constituent, and transition vectors
    Middle layer sizes are self.hidden_size

    If maxout_k is truthy, MaxoutLinear layers are built instead of
    plain Linear layers (and no extra initialization is applied).
    """
    # first layer input combines:
    #   word_lstm: hidden_size * num_tree_lstm_layers
    #   transition_stack: transition_hidden_size
    #   constituent_stack: hidden_size
    first_input = self.hidden_size + self.hidden_size * self.num_tree_lstm_layers + self.transition_hidden_size
    num_middle = num_output_layers - 1
    in_sizes = [first_input] + [self.hidden_size] * num_middle
    out_sizes = [self.hidden_size] * num_middle + [final_layer_size]
    if maxout_k:
        layers = [MaxoutLinear(n_in, n_out, maxout_k)
                  for n_in, n_out in zip(in_sizes, out_sizes)]
    else:
        layers = []
        for n_in, n_out in zip(in_sizes, out_sizes):
            linear = nn.Linear(n_in, n_out)
            initialize_linear(linear, self.args['nonlinearity'], n_in)
            layers.append(linear)
    return nn.ModuleList(layers)
636
+
637
def num_words_known(self, words):
    """Count how many of the words are in the pretrain vocab, with a lowercase fallback."""
    known = self.vocab_map
    total = 0
    for word in words:
        if word in known or word.lower() in known:
            total += 1
    return total
639
+
640
@property
def retag_method(self):
    """The tagging scheme used when retagging trees (e.g. 'xpos' or 'upos'), from the model args."""
    # TODO: make the method an enum
    return self.args['retag_method']
644
+
645
def uses_xpos(self):
    """True if a retag package is configured and its retag method is 'xpos'."""
    if self.args['retag_package'] is None:
        return False
    return self.args['retag_method'] == 'xpos'
647
+
648
def add_unsaved_module(self, name, module):
    """
    Adds a module which will not be saved to disk

    Best used for large models such as pretrained word embeddings
    """
    self.unsaved_modules.append(name)
    setattr(self, name, module)
    # the charlms are pretrained and frozen: their parameters should
    # never be updated while training this model
    if name in ('forward_charlm', 'backward_charlm') and module is not None:
        for parameter in module.parameters():
            parameter.requires_grad = False
659
+
660
def is_unsaved_module(self, name):
    """True if the (possibly dotted) parameter name belongs to an unsaved module."""
    prefix, _, _ = name.partition('.')
    return prefix in self.unsaved_modules
662
+
663
def get_norms(self):
    """
    Return a list of lines describing the norms of all trainable parameters.

    Each line reports the parameter name, its L2 norm, and how many of its
    entries are (near) zero.  For UNTIED_MAX composition, the per-constituent
    reduce matrices are reported individually instead of in the generic list.
    """
    lines = []
    skip = set()
    if self.constituency_composition == ConstituencyComposition.UNTIED_MAX:
        # report these per constituent-open below, so exclude them from
        # the generic parameter listing
        skip = {'reduce_linear_weight', 'reduce_linear_bias'}
        lines.append("reduce_linear:")
        for c_idx, c_open in enumerate(self.constituent_opens):
            lines.append(" %s weight %.6g bias %.6g" % (c_open, torch.norm(self.reduce_linear_weight[c_idx]).item(), torch.norm(self.reduce_linear_bias[c_idx]).item()))
    active_params = [(name, param) for name, param in self.named_parameters() if param.requires_grad and name not in skip]
    if len(active_params) == 0:
        return lines
    # NOTE: removed a leftover debug print(len(active_params)) which wrote
    # to stdout every time norms were collected

    # pad columns so the report lines up when logged
    max_name_len = max(len(name) for name, param in active_params)
    max_norm_len = max(len("%.6g" % torch.norm(param).item()) for name, param in active_params)
    format_string = "%-" + str(max_name_len) + "s norm %" + str(max_norm_len) + "s zeros %d / %d"
    for name, param in active_params:
        zeros = torch.sum(param.abs() < 0.000001).item()
        norm = "%.6g" % torch.norm(param).item()
        lines.append(format_string % (name, norm, zeros, param.nelement()))
    return lines
684
+
685
def log_norms(self):
    """Log the norms of all trainable parameters at INFO level."""
    report = ["NORMS FOR MODEL PARAMETERS", *self.get_norms()]
    logger.info("\n".join(report))
689
+
690
def log_shapes(self):
    """Log the shapes of all trainable parameters at INFO level."""
    # header previously said "NORMS FOR MODEL PARAMETERS" - a copy/paste
    # slip from log_norms; this method reports shapes, not norms
    lines = ["SHAPES FOR MODEL PARAMETERS"]
    for name, param in self.named_parameters():
        if param.requires_grad:
            lines.append("{} {}".format(name, param.shape))
    logger.info("\n".join(lines))
696
+
697
def initial_word_queues(self, tagged_word_lists):
    """
    Produce initial word queues out of the model's LSTMs for use in the tagged word lists.

    Operates in a batched fashion to reduce the runtime for the LSTM operations

    tagged_word_lists: one list per sentence of tag nodes, where each node's
      first child holds the word label (tree structure assumed from the
      word.children[0].label accesses below - confirm against caller)

    Returns one list of WordNode per sentence.  Each queue is padded with a
    boundary WordNode(None, ...) at both ends, and reversed if the model
    parses right-to-left.
    """
    device = next(self.parameters()).device

    vocab_map = self.vocab_map
    def map_word(word):
        # exact match first, then lowercase fallback, then UNK
        idx = vocab_map.get(word, None)
        if idx is not None:
            return idx
        return vocab_map.get(word.lower(), UNK_ID)

    all_word_inputs = []
    all_word_labels = [[word.children[0].label for word in tagged_words]
                       for tagged_words in tagged_word_lists]

    # per-sentence embeddings: pretrained word + trained delta (+ tag)
    for sentence_idx, tagged_words in enumerate(tagged_word_lists):
        word_labels = all_word_labels[sentence_idx]
        word_idx = torch.stack([self.vocab_tensors[map_word(word.children[0].label)] for word in tagged_words])
        word_input = self.embedding(word_idx)

        # this occasionally learns UNK at train time
        if self.training:
            delta_labels = [None if word in self.rare_words and random.random() < self.args['rare_word_unknown_frequency'] else word
                            for word in word_labels]
        else:
            delta_labels = word_labels
        delta_idx = torch.stack([self.delta_tensors[self.delta_word_map.get(word, UNK_ID)] for word in delta_labels])

        delta_input = self.delta_embedding(delta_idx)
        word_inputs = [word_input, delta_input]

        if self.tag_embedding_dim > 0:
            # tags are also randomly dropped to UNK at train time
            if self.training:
                tag_labels = [None if random.random() < self.args['tag_unknown_frequency'] else word.label for word in tagged_words]
            else:
                tag_labels = [word.label for word in tagged_words]
            tag_idx = torch.stack([self.tag_tensors[self.tag_map.get(tag, UNK_ID)] for tag in tag_labels])
            tag_input = self.tag_embedding(tag_idx)
            word_inputs.append(tag_input)

        all_word_inputs.append(word_inputs)

    # charlm features are computed for the whole batch at once
    if self.forward_charlm is not None:
        all_forward_chars = self.forward_charlm.build_char_representation(all_word_labels)
        for word_inputs, forward_chars in zip(all_word_inputs, all_forward_chars):
            word_inputs.append(forward_chars)
    if self.backward_charlm is not None:
        all_backward_chars = self.backward_charlm.build_char_representation(all_word_labels)
        for word_inputs, backward_chars in zip(all_word_inputs, all_backward_chars):
            word_inputs.append(backward_chars)

    all_word_inputs = [torch.cat(word_inputs, dim=1) for word_inputs in all_word_inputs]
    # boundary vectors go in BEFORE bert so they do not carry bert dims
    if self.sentence_boundary_vectors is not SentenceBoundary.NONE:
        word_start = self.word_start_embedding.unsqueeze(0)
        word_end = self.word_end_embedding.unsqueeze(0)
        all_word_inputs = [torch.cat([word_start, word_inputs, word_end], dim=0) for word_inputs in all_word_inputs]

    if self.bert_model is not None:
        # BERT embedding extraction
        # result will be len+2 for each sentence
        # we will take 1:-1 if we don't care about the endpoints
        bert_embeddings = extract_bert_embeddings(self.args['bert_model'], self.bert_tokenizer, self.bert_model, all_word_labels, device,
                                                  keep_endpoints=self.sentence_boundary_vectors is not SentenceBoundary.NONE,
                                                  num_layers=self.bert_layer_mix.in_features if self.bert_layer_mix is not None else None,
                                                  detach=not self.args['bert_finetune'] and not self.args['stage1_bert_finetune'],
                                                  peft_name=self.peft_name)
        if self.bert_layer_mix is not None:
            # add the average so that the default behavior is to
            # take an average of the N layers, and anything else
            # other than that needs to be learned
            bert_embeddings = [self.bert_layer_mix(feature).squeeze(2) + feature.sum(axis=2) / self.bert_layer_mix.in_features for feature in bert_embeddings]

        all_word_inputs = [torch.cat((x, y), axis=1) for x, y in zip(all_word_inputs, bert_embeddings)]

    # Extract partitioned representation
    if self.partitioned_transformer_module is not None:
        partitioned_embeddings = self.partitioned_transformer_module(None, all_word_inputs)
        all_word_inputs = [torch.cat((x, y[:x.shape[0], :]), axis=1) for x, y in zip(all_word_inputs, partitioned_embeddings)]

    # Extract Labeled Representation
    if self.label_attention_module is not None:
        if self.args['lattn_combined_input']:
            labeled_representations = self.label_attention_module(all_word_inputs, tagged_word_lists)
        else:
            # NOTE(review): this path assumes partitioned_embeddings was set
            # above; lattn without pattn is rejected at construction time
            labeled_representations = self.label_attention_module(partitioned_embeddings, tagged_word_lists)
        all_word_inputs = [torch.cat((x, y[:x.shape[0], :]), axis=1) for x, y in zip(all_word_inputs, labeled_representations)]

    all_word_inputs = [self.word_dropout(word_inputs) for word_inputs in all_word_inputs]
    packed_word_input = torch.nn.utils.rnn.pack_sequence(all_word_inputs, enforce_sorted=False)
    word_output, _ = self.word_lstm(packed_word_input)
    # would like to do word_to_constituent here, but it seems PackedSequence doesn't support Linear
    # word_output will now be sentence x batch x 2*hidden_size
    word_output, word_output_lens = torch.nn.utils.rnn.pad_packed_sequence(word_output)
    # now sentence x batch x hidden_size

    word_queues = []
    for sentence_idx, tagged_words in enumerate(tagged_word_lists):
        # +2 accounts for the boundary vectors added before the LSTM
        if self.sentence_boundary_vectors is not SentenceBoundary.NONE:
            sentence_output = word_output[:len(tagged_words)+2, sentence_idx, :]
        else:
            sentence_output = word_output[:len(tagged_words), sentence_idx, :]
        sentence_output = self.word_to_constituent(sentence_output)
        sentence_output = self.nonlinearity(sentence_output)
        # TODO: this makes it so constituents downstream are
        # build with the outputs of the LSTM, not the word
        # embeddings themselves. It is possible we want to
        # transform the word_input to hidden_size in some way
        # and use that instead
        if self.sentence_boundary_vectors is not SentenceBoundary.NONE:
            word_queue = [WordNode(None, sentence_output[0, :])]
            word_queue += [WordNode(tag_node, sentence_output[idx+1, :])
                           for idx, tag_node in enumerate(tagged_words)]
            word_queue.append(WordNode(None, sentence_output[len(tagged_words)+1, :]))
        else:
            # no trained boundary vectors: use the zero buffer as bookends
            word_queue = [WordNode(None, self.word_zeros)]
            word_queue += [WordNode(tag_node, sentence_output[idx, :])
                           for idx, tag_node in enumerate(tagged_words)]
            word_queue.append(WordNode(None, self.word_zeros))

        if self.reverse_sentence:
            word_queue = list(reversed(word_queue))
        word_queues.append(word_queue)

    return word_queues
825
+
826
def initial_transitions(self):
    """
    Return an initial TreeStack with no transitions

    Delegates to the configured transition_stack (LSTM or attention based).
    """
    return self.transition_stack.initial_state()
831
+
832
def initial_constituents(self):
    """
    Return an initial TreeStack with no constituents

    The stack is seeded with a zero-valued Constituent so later operations
    always have a hidden state to build on.
    """
    return self.constituent_stack.initial_state(Constituent(None, self.constituent_zeros, self.constituent_zeros))
837
+
838
def get_word(self, word_node):
    """Return the tag node stored in a WordNode from the word queue."""
    return word_node.value
840
+
841
def transform_word_to_constituent(self, state):
    """
    Turn the word at the parser state's current position into a Constituent.

    The hx/cx layout depends on the composition mode: TREE_LSTM variants keep
    (num_tree_lstm_layers, hidden_size) hidden and cell states; other modes
    keep only the first hidden_size slice of the word's hx.
    """
    word_node = state.get_word(state.word_position)
    word = word_node.value
    if self.constituency_composition == ConstituencyComposition.TREE_LSTM:
        # cx starts at zero; hx comes from the word LSTM output
        return Constituent(word, word_node.hx.view(self.num_tree_lstm_layers, self.hidden_size), self.word_zeros.view(self.num_tree_lstm_layers, self.hidden_size))
    elif self.constituency_composition == ConstituencyComposition.TREE_LSTM_CX:
        # the UNK tag will be trained thanks to occasionally dropping out tags
        tag = word.label
        tree_hx = word_node.hx.view(self.num_tree_lstm_layers, self.hidden_size)
        tag_tensor = self.tag_tensors[self.tag_map.get(tag, UNK_ID)]
        # cx is a learned, tag-specific gating of the hidden state
        tree_cx = self.constituent_reduce_embedding(tag_tensor)
        tree_cx = tree_cx.view(self.num_tree_lstm_layers, self.hidden_size)
        return Constituent(word, tree_hx, tree_cx * tree_hx)
    else:
        # non-tree-lstm compositions never read cx, so it can be None
        return Constituent(word, word_node.hx[:self.hidden_size].unsqueeze(0), None)
856
+
857
    def dummy_constituent(self, dummy):
        """
        Build a Constituent for a dummy (open-constituent placeholder) node.

        The hx comes from a learned embedding keyed by the dummy's label.
        """
        label = dummy.label
        open_index = self.constituent_open_tensors[self.constituent_open_map[label]]
        hx = self.dummy_embedding(open_index)
        # the cx doesn't matter: the dummy will be discarded when building a new constituent
        return Constituent(dummy, hx.unsqueeze(0), None)
863
+
864
    def build_constituents(self, labels, children_lists):
        """
        Build new constituents with the given label from the list of children

        labels is a list of labels for each of the new nodes to construct
        children_lists is a list of children that go under each of the new nodes
        lists of each are used so that we can stack operations

        Each branch below implements one ConstituencyComposition; all of
        them must leave lstm_hx (and optionally lstm_cx) in the same
        batched layout so the final loop can slice per-constituent vectors.
        """
        # at the end of each of these operations, we expect lstm_hx.shape
        # is (L, N, hidden_size) for N lists of children
        if (self.constituency_composition == ConstituencyComposition.BILSTM or
            self.constituency_composition == ConstituencyComposition.BILSTM_MAX):
            node_hx = [[child.value.tree_hx.squeeze(0) for child in children] for children in children_lists]
            label_hx = [self.constituent_open_embedding(self.constituent_open_tensors[self.constituent_open_map[label]]) for label in labels]

            max_length = max(len(children) for children in children_lists)
            zeros = torch.zeros(self.hidden_size, device=label_hx[0].device)
            # weirdly, this is faster than using pack_sequence
            # each sequence is label, children..., label, zero-padding
            unpacked_hx = [[lhx] + nhx + [lhx] + [zeros] * (max_length - len(nhx)) for lhx, nhx in zip(label_hx, node_hx)]
            unpacked_hx = [self.lstm_input_dropout(torch.stack(nhx)) for nhx in unpacked_hx]
            packed_hx = torch.stack(unpacked_hx, axis=1)
            packed_hx = torch.nn.utils.rnn.pack_padded_sequence(packed_hx, [len(x)+2 for x in children_lists], enforce_sorted=False)
            lstm_output = self.constituent_reduce_lstm(packed_hx)
            # take just the output of the final layer
            # result of lstm is ouput, (hx, cx)
            # so [1][0] gets hx
            # [1][0][-1] is the final output
            # will be shape len(children_lists) * 2, hidden_size for bidirectional
            # where forward outputs are -2 and backwards are -1
            if self.constituency_composition == ConstituencyComposition.BILSTM:
                lstm_output = lstm_output[1][0]
                forward_hx = lstm_output[-2, :, :]
                backward_hx = lstm_output[-1, :, :]
                hx = self.reduce_linear(torch.cat((forward_hx, backward_hx), axis=1))
            else:
                # BILSTM_MAX: max-pool the per-timestep outputs (excluding
                # the label positions) instead of taking the final states
                lstm_output, lstm_lengths = torch.nn.utils.rnn.pad_packed_sequence(lstm_output[0])
                lstm_output = [lstm_output[1:length-1, x, :] for x, length in zip(range(len(lstm_lengths)), lstm_lengths)]
                lstm_output = torch.stack([torch.max(x, 0).values for x in lstm_output], axis=0)
                hx = self.reduce_forward(lstm_output[:, :self.hidden_size]) + self.reduce_backward(lstm_output[:, self.hidden_size:])
            lstm_hx = self.nonlinearity(hx).unsqueeze(0)
            lstm_cx = None
        elif self.constituency_composition == ConstituencyComposition.MAX:
            node_hx = [[child.value.tree_hx for child in children] for children in children_lists]
            unpacked_hx = [self.lstm_input_dropout(torch.max(torch.stack(nhx), 0).values) for nhx in node_hx]
            packed_hx = torch.stack(unpacked_hx, axis=1)
            hx = self.reduce_linear(packed_hx)
            lstm_hx = self.nonlinearity(hx)
            lstm_cx = None
        elif self.constituency_composition == ConstituencyComposition.UNTIED_MAX:
            # like MAX, but each label gets its own linear reduction weights
            node_hx = [[child.value.tree_hx for child in children] for children in children_lists]
            unpacked_hx = [self.lstm_input_dropout(torch.max(torch.stack(nhx), 0).values) for nhx in node_hx]
            # shape == len(labels),1,hidden_size after the stack
            #packed_hx = torch.stack(unpacked_hx, axis=0)
            label_indices = [self.constituent_open_map[label] for label in labels]
            # we would like to stack the reduce_linear_weight calculations as follows:
            #reduce_weight = self.reduce_linear_weight[label_indices]
            #reduce_bias = self.reduce_linear_bias[label_indices]
            # this would allow for faster vectorized operations.
            # however, this runs out of memory on larger training examples,
            # presumably because there are too many stacks in a row and each one
            # has its own gradient kept for the entire calculation
            # fortunately, this operation is not a huge part of the expense
            hx = [torch.matmul(self.reduce_linear_weight[label_idx], hx_layer.squeeze(0)) + self.reduce_linear_bias[label_idx]
                  for label_idx, hx_layer in zip(label_indices, unpacked_hx)]
            hx = torch.stack(hx, axis=0)
            hx = hx.unsqueeze(0)
            lstm_hx = self.nonlinearity(hx)
            lstm_cx = None
        elif self.constituency_composition == ConstituencyComposition.BIGRAM:
            node_hx = [[child.value.tree_hx for child in children] for children in children_lists]
            unpacked_hx = []
            for nhx in node_hx:
                # tanh or otherwise limit the size of the output?
                stacked_nhx = self.lstm_input_dropout(torch.cat(nhx, axis=0))
                if stacked_nhx.shape[0] > 1:
                    # augment the children with reduced adjacent-pair vectors
                    bigram_hx = torch.cat((stacked_nhx[:-1, :], stacked_nhx[1:, :]), axis=1)
                    bigram_hx = self.reduce_bigram(bigram_hx) / 2
                    stacked_nhx = torch.cat((stacked_nhx, bigram_hx), axis=0)
                unpacked_hx.append(torch.max(stacked_nhx, 0).values)
            packed_hx = torch.stack(unpacked_hx, axis=0).unsqueeze(0)
            hx = self.reduce_linear(packed_hx)
            lstm_hx = self.nonlinearity(hx)
            lstm_cx = None
        elif self.constituency_composition == ConstituencyComposition.ATTN:
            node_hx = [[child.value.tree_hx for child in children] for children in children_lists]
            label_hx = [self.constituent_open_embedding(self.constituent_open_tensors[self.constituent_open_map[label]]) for label in labels]
            unpacked_hx = [torch.stack(nhx) for nhx in node_hx]
            # prepend the label embedding before self-attention over children
            unpacked_hx = [torch.cat((lhx.unsqueeze(0).unsqueeze(0), nhx), axis=0) for lhx, nhx in zip(label_hx, unpacked_hx)]
            unpacked_hx = [self.reduce_attn(nhx, nhx, nhx)[0].squeeze(1) for nhx in unpacked_hx]
            unpacked_hx = [self.lstm_input_dropout(torch.max(nhx, 0).values) for nhx in unpacked_hx]
            hx = torch.stack(unpacked_hx, axis=0)
            lstm_hx = self.nonlinearity(hx).unsqueeze(0)
            lstm_cx = None
        elif self.constituency_composition == ConstituencyComposition.KEY or self.constituency_composition == ConstituencyComposition.UNTIED_KEY:
            node_hx = [torch.stack([child.value.tree_hx for child in children]) for children in children_lists]
            # add a position vector to each node_hx
            node_hx = [self.reduce_position(x.reshape(x.shape[0], -1)) for x in node_hx]
            query_hx = [self.reduce_query(nhx) for nhx in node_hx]
            # reshape query for MHA
            query_hx = [nhx.reshape(nhx.shape[0], self.reduce_heads, -1).transpose(0, 1) for nhx in query_hx]
            if self.constituency_composition == ConstituencyComposition.KEY:
                queries = [torch.matmul(nhx, self.reduce_key) for nhx in query_hx]
            else:
                # UNTIED_KEY: one learned key per constituent label
                label_indices = [self.constituent_open_map[label] for label in labels]
                queries = [torch.matmul(nhx, self.reduce_key[label_idx]) for nhx, label_idx in zip(query_hx, label_indices)]
            # softmax each head
            weights = [torch.nn.functional.softmax(nhx, dim=1).transpose(1, 2) for nhx in queries]
            value_hx = [self.reduce_value(nhx) for nhx in node_hx]
            value_hx = [nhx.reshape(nhx.shape[0], self.reduce_heads, -1).transpose(0, 1) for nhx in value_hx]
            # use the softmaxes to add up the heads
            unpacked_hx = [torch.matmul(weight, nhx).squeeze(1) for weight, nhx in zip(weights, value_hx)]
            unpacked_hx = [nhx.reshape(-1) for nhx in unpacked_hx]
            hx = torch.stack(unpacked_hx, axis=0).unsqueeze(0)
            lstm_hx = self.nonlinearity(hx)
            lstm_cx = None
        elif self.constituency_composition in (ConstituencyComposition.TREE_LSTM, ConstituencyComposition.TREE_LSTM_CX):
            label_hx = [self.lstm_input_dropout(self.constituent_open_embedding(self.constituent_open_tensors[self.constituent_open_map[label]])) for label in labels]
            label_hx = torch.stack(label_hx).unsqueeze(0)

            max_length = max(len(children) for children in children_lists)

            # stacking will let us do elementwise multiplication faster, hopefully
            node_hx = [[child.value.tree_hx for child in children] for children in children_lists]
            unpacked_hx = [self.lstm_input_dropout(torch.stack(nhx)) for nhx in node_hx]
            unpacked_hx = [nhx.max(dim=0) for nhx in unpacked_hx]
            packed_hx = torch.stack([nhx.values for nhx in unpacked_hx], axis=1)
            #packed_hx = packed_hx.max(dim=0).values

            # gather picks, for each hidden unit, the cx of whichever child
            # won the elementwise max over hx
            node_cx = [torch.stack([child.value.tree_cx for child in children]) for children in children_lists]
            node_cx_indices = [uhx.indices.unsqueeze(0) for uhx in unpacked_hx]
            unpacked_cx = [ncx.gather(0, nci).squeeze(0) for ncx, nci in zip(node_cx, node_cx_indices)]
            packed_cx = torch.stack(unpacked_cx, axis=1)

            _, (lstm_hx, lstm_cx) = self.constituent_reduce_lstm(label_hx, (packed_hx, packed_cx))
        else:
            raise ValueError("Unhandled ConstituencyComposition: {}".format(self.constituency_composition))

        constituents = []
        for idx, (label, children) in enumerate(zip(labels, children_lists)):
            children = [child.value.value for child in children]
            if isinstance(label, str):
                node = Tree(label=label, children=children)
            else:
                # a compound (unary chain) label: build the nested Trees inside out
                for value in reversed(label):
                    node = Tree(label=value, children=children)
                    children = node
            constituents.append(Constituent(node, lstm_hx[:, idx, :], lstm_cx[:, idx, :] if lstm_cx is not None else None))
        return constituents
1012
+
1013
+ def push_constituents(self, constituent_stacks, constituents):
1014
+ # Another possibility here would be to use output[0, i, :]
1015
+ # from the constituency lstm for the value of the new node.
1016
+ # This might theoretically make the new constituent include
1017
+ # information from neighboring constituents. However, this
1018
+ # lowers the scores of various models.
1019
+ # For example, an experiment on ja_alt built this way,
1020
+ # averaged over 5 trials, had the following loss in accuracy:
1021
+ # 150 epochs: 0.8971 to 0.8953
1022
+ # 200 epochs: 0.8985 to 0.8964
1023
+ current_nodes = [stack.value for stack in constituent_stacks]
1024
+
1025
+ constituent_input = torch.stack([x.tree_hx[-1:] for x in constituents], axis=1)
1026
+ #constituent_input = constituent_input.unsqueeze(0)
1027
+ # the constituents are already Constituent(tree, tree_hx, tree_cx)
1028
+ return self.constituent_stack.push_states(constituent_stacks, constituents, constituent_input)
1029
+
1030
+ def get_top_constituent(self, constituents):
1031
+ """
1032
+ Extract only the top constituent from a state's constituent
1033
+ sequence, even though it has multiple addition pieces of
1034
+ information
1035
+ """
1036
+ # TreeStack value -> LSTMTreeStack value -> Constituent value -> constituent
1037
+ return constituents.value.value.value
1038
+
1039
    def push_transitions(self, transition_stacks, transitions):
        """
        Push all of the given transitions on to the stack as a batch operations.

        Significantly faster than doing one transition at a time.
        """
        # one embedding index per transition, then add the LSTM time dimension
        transition_idx = torch.stack([self.transition_tensors[self.transition_map[transition]] for transition in transitions])
        transition_input = self.transition_embedding(transition_idx).unsqueeze(0)
        return self.transition_stack.push_states(transition_stacks, transitions, transition_input)
1048
+
1049
+ def get_top_transition(self, transitions):
1050
+ """
1051
+ Extract only the top transition from a state's transition
1052
+ sequence, even though it has multiple addition pieces of
1053
+ information
1054
+ """
1055
+ # TreeStack value -> LSTMTreeStack value -> transition
1056
+ return transitions.value.value
1057
+
1058
    def forward(self, states):
        """
        Return logits for a prediction of what transition to make next

        We've basically done all the work analyzing the state as
        part of applying the transitions, so this method is very simple

        return shape: (num_states, num_transitions)
        """
        word_hx = torch.stack([state.get_word(state.word_position).hx for state in states])
        transition_hx = torch.stack([self.transition_stack.output(state.transitions) for state in states])
        # this .output() is the output of the constituent stack, not the
        # constituent itself
        # this way, we can, as an option, NOT include the constituents to the left
        # when building the current vector for a constituent
        # and the vector used for inference will still incorporate the entire LSTM
        constituent_hx = torch.stack([self.constituent_stack.output(state.constituents) for state in states])

        hx = torch.cat((word_hx, transition_hx, constituent_hx), axis=1)
        for idx, output_layer in enumerate(self.output_layers):
            hx = self.predict_dropout(hx)
            # maxout layers supply their own nonlinearity; otherwise apply
            # self.nonlinearity between (but not after) the output layers
            if not self.maxout_k and idx < len(self.output_layers) - 1:
                hx = self.nonlinearity(hx)
            hx = output_layer(hx)
        return hx
1083
+
1084
    def predict(self, states, is_legal=True):
        """
        Generate and return predictions, along with the transitions those predictions represent

        If is_legal is set to True, will only return legal transitions.
        This means returning None if there are no legal transitions.
        Hopefully the constraints prevent that from happening

        Returns (raw logits, list of Transition or None, per-state scores)
        """
        predictions = self.forward(states)
        pred_max = torch.argmax(predictions, dim=1)
        scores = torch.take_along_dim(predictions, pred_max.unsqueeze(1), dim=1)
        pred_max = pred_max.detach().cpu()

        pred_trans = [self.transitions[pred_max[idx]] for idx in range(len(states))]
        if is_legal:
            for idx, (state, trans) in enumerate(zip(states, pred_trans)):
                if not trans.is_legal(state, self):
                    # fall back to the best scoring transition which IS legal
                    _, indices = predictions[idx, :].sort(descending=True)
                    for index in indices:
                        if self.transitions[index].is_legal(state, self):
                            pred_trans[idx] = self.transitions[index]
                            scores[idx] = predictions[idx, index]
                            break
                    else: # yeah, else on a for loop, deal with it
                        pred_trans[idx] = None
                        # NOTE(review): assigning None into a float tensor will
                        # likely raise at runtime; presumably this branch is
                        # unreachable thanks to the transition constraints - confirm
                        scores[idx] = None

        return predictions, pred_trans, scores.squeeze(1)
1112
+
1113
+ def weighted_choice(self, states):
1114
+ """
1115
+ Generate and return predictions, and randomly choose a prediction weighted by the scores
1116
+
1117
+ TODO: pass in a temperature
1118
+ """
1119
+ predictions = self.forward(states)
1120
+ pred_trans = []
1121
+ all_scores = []
1122
+ for state, prediction in zip(states, predictions):
1123
+ legal_idx = [idx for idx in range(prediction.shape[0]) if self.transitions[idx].is_legal(state, self)]
1124
+ if len(legal_idx) == 0:
1125
+ pred_trans.append(None)
1126
+ continue
1127
+ scores = prediction[legal_idx]
1128
+ scores = torch.softmax(scores, dim=0)
1129
+ idx = torch.multinomial(scores, 1)
1130
+ idx = legal_idx[idx]
1131
+ pred_trans.append(self.transitions[idx])
1132
+ all_scores.append(prediction[idx])
1133
+ all_scores = torch.stack(all_scores)
1134
+ return predictions, pred_trans, all_scores
1135
+
1136
    def predict_gold(self, states):
        """
        For each State, return the next item in the gold_sequence

        Returns (raw logits, gold transitions, scores of the gold transitions)
        """
        predictions = self.forward(states)
        transitions = [y.gold_sequence[y.num_transitions] for y in states]
        indices = torch.tensor([self.transition_map[t] for t in transitions], device=predictions.device)
        # pick out the logit of the gold transition for each state
        scores = torch.take_along_dim(predictions, indices.unsqueeze(1), dim=1)
        return predictions, transitions, scores.squeeze(1)
1145
+
1146
    def get_params(self, skip_modules=True):
        """
        Get a dictionary for saving the model

        skip_modules: drop parameters belonging to unsaved submodules
        (such as pretrained embeddings), which are large and saved separately
        """
        model_state = self.state_dict()
        # skip saving modules like pretrained embeddings, because they are large and will be saved in a separate file
        if skip_modules:
            skipped = [k for k in model_state.keys() if self.is_unsaved_module(k)]
            for k in skipped:
                del model_state[k]
        config = copy.deepcopy(self.args)
        # enums are serialized by name so the saved config is not tied
        # to the enums' internal values
        config['sentence_boundary_vectors'] = config['sentence_boundary_vectors'].name
        config['constituency_composition'] = config['constituency_composition'].name
        config['transition_stack'] = config['transition_stack'].name
        config['constituent_stack'] = config['constituent_stack'].name
        config['transition_scheme'] = config['transition_scheme'].name
        assert isinstance(self.rare_words, set)
        params = {
            'model': model_state,
            'model_type': "LSTM",
            'config': config,
            'transitions': [repr(x) for x in self.transitions],
            'constituents': self.constituents,
            'tags': self.tags,
            'words': self.delta_words,
            'rare_words': list(self.rare_words),
            'root_labels': self.root_labels,
            'constituent_opens': self.constituent_opens,
            'unary_limit': self.unary_limit(),
        }

        return params
1178
+
stanza/stanza/models/constituency/parse_tree.py ADDED
@@ -0,0 +1,591 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tree datastructure
3
+ """
4
+
5
+ from collections import deque, Counter
6
+ import copy
7
+ from enum import Enum
8
+ from io import StringIO
9
+ import itertools
10
+ import re
11
+ import warnings
12
+
13
+ from stanza.models.common.stanza_object import StanzaObject
14
+
15
# useful more for the "is" functionality than the time savings
CLOSE_PAREN = ')'
SPACE_SEPARATOR = ' '
OPEN_PAREN = '('

# shared immutable default for Tree.children; safe to share because it is empty
EMPTY_CHILDREN = ()

# used to split off the functional tags from various treebanks
# for example, the Icelandic treebank (which we don't currently
# incorporate) uses * to distinguish 'ADJP', 'ADJP*OC' but we treat
# those as the same
CONSTITUENT_SPLIT = re.compile("[-=#*]")

# These words occur in the VLSP dataset.
# The documentation claims there might be *O*, although those don't
# seem to exist in practice
WORDS_TO_PRUNE = ('*E*', '*T*', '*O*')
32
+
33
class TreePrintMethod(Enum):
    """
    Enumerates the supported ways of rendering a tree as text.

    Users normally pick one of these through Tree.__format__'s format
    spec rather than referencing the enum directly.
    """
    ONE_LINE = auto()        # (ROOT (S ... ))
    LABELED_PARENS = auto()  # (_ROOT (_S ... )_S )_ROOT
    PRETTY = auto()          # multiple lines
    VLSP = auto()            # <s> (S ... ) </s>
    LATEX_TREE = auto()      # \Tree [.S [.NP ... ] ]
44
+
45
+
46
+ class Tree(StanzaObject):
47
+ """
48
+ A data structure to represent a parse tree
49
+ """
50
+ def __init__(self, label=None, children=None):
51
+ if children is None:
52
+ self.children = EMPTY_CHILDREN
53
+ elif isinstance(children, Tree):
54
+ self.children = (children,)
55
+ else:
56
+ self.children = tuple(children)
57
+
58
+ self.label = label
59
+
60
+ def is_leaf(self):
61
+ return len(self.children) == 0
62
+
63
+ def is_preterminal(self):
64
+ return len(self.children) == 1 and len(self.children[0].children) == 0
65
+
66
    def yield_preterminals(self):
        """
        Yield the preterminals one at a time in order

        Uses an explicit iterator chain instead of recursion, so very
        deep trees cannot overflow the call stack.

        Raises ValueError if called on a leaf node.
        """
        if self.is_preterminal():
            yield self
            return

        if self.is_leaf():
            raise ValueError("Attempted to iterate preterminals on non-internal node")

        iterator = iter(self.children)
        node = next(iterator, None)
        while node is not None:
            if node.is_preterminal():
                yield node
            else:
                # expand an internal node in place: its children are
                # visited before the rest of the pending iterator
                iterator = itertools.chain(node.children, iterator)
            node = next(iterator, None)
85
+
86
+ def leaf_labels(self):
87
+ """
88
+ Get the labels of the leaves
89
+ """
90
+ if self.is_leaf():
91
+ return [self.label]
92
+
93
+ words = [x.children[0].label for x in self.yield_preterminals()]
94
+ return words
95
+
96
    def __len__(self):
        # the length of a tree is its number of words (leaves), not nodes
        return len(self.leaf_labels())
98
+
99
+ def all_leaves_are_preterminals(self):
100
+ """
101
+ Returns True if all leaves are under preterminals, False otherwise
102
+ """
103
+ if self.is_leaf():
104
+ return False
105
+
106
+ if self.is_preterminal():
107
+ return True
108
+
109
+ return all(t.all_leaves_are_preterminals() for t in self.children)
110
+
111
    def pretty_print(self, normalize=None):
        """
        Print with newlines & indentation on each line

        Preterminals and nodes with all preterminal children go on their own line

        You can pass in your own normalize() function. If you do,
        make sure the function updates the parens to be something
        other than () or the brackets will be broken
        """
        if normalize is None:
            normalize = lambda x: x.replace("(", "-LRB-").replace(")", "-RRB-")

        indent = 0
        with StringIO() as buf:
            # iterative traversal: the stack holds Tree nodes plus the
            # CLOSE_PAREN sentinel marking where a bracket must be closed
            stack = deque()
            stack.append(self)
            while len(stack) > 0:
                node = stack.pop()

                if node is CLOSE_PAREN:
                    # if we're trying to pretty print trees, pop all off close parens
                    # then write a newline
                    while node is CLOSE_PAREN:
                        indent -= 1
                        buf.write(CLOSE_PAREN)
                        if len(stack) == 0:
                            node = None
                            break
                        node = stack.pop()
                    buf.write("\n")
                    if node is None:
                        break
                    # put back the non-paren node we over-popped
                    stack.append(node)
                elif node.is_preterminal():
                    buf.write(" " * indent)
                    buf.write("%s%s %s%s" % (OPEN_PAREN, normalize(node.label), normalize(node.children[0].label), CLOSE_PAREN))
                    if len(stack) == 0 or stack[-1] is not CLOSE_PAREN:
                        buf.write("\n")
                elif all(x.is_preterminal() for x in node.children):
                    # a node whose children are all preterminals fits on one line
                    buf.write(" " * indent)
                    buf.write("%s%s" % (OPEN_PAREN, normalize(node.label)))
                    for child in node.children:
                        buf.write(" %s%s %s%s" % (OPEN_PAREN, normalize(child.label), normalize(child.children[0].label), CLOSE_PAREN))
                    buf.write(CLOSE_PAREN)
                    if len(stack) == 0 or stack[-1] is not CLOSE_PAREN:
                        buf.write("\n")
                else:
                    buf.write(" " * indent)
                    buf.write("%s%s\n" % (OPEN_PAREN, normalize(node.label)))
                    stack.append(CLOSE_PAREN)
                    for child in reversed(node.children):
                        stack.append(child)
                    indent += 1

            buf.seek(0)
            return buf.read()
168
+
169
    def __format__(self, spec):
        """
        Turn the tree into a string representing the tree

        Note that this is not a recursive traversal
        Otherwise, a tree too deep might blow up the call stack

        There is a type specific format:
        O -> one line PTB format, which is the default anyway
        L -> open and close brackets are labeled, spaces in the tokens are replaced with _
        P -> pretty print over multiple lines
        V -> surround lines with <s>...</s>, don't print ROOT, and turn () into L/RBKT
        ? -> spaces in the tokens are replaced with ? for any value of ? other than OLP
          warning: this may be removed in the future
        ?{OLPV} -> specific format AND a custom space replacement
        Vi -> add an ID to the <s> in the V format. Also works with ?Vi
        """
        # parse the format spec: optional single replacement char, then
        # an optional format letter (O/L/P/V/T), then optional 'i' for V
        space_replacement = " "
        print_format = TreePrintMethod.ONE_LINE
        if spec == 'L':
            print_format = TreePrintMethod.LABELED_PARENS
            space_replacement = "_"
        elif spec and spec[-1] == 'L':
            print_format = TreePrintMethod.LABELED_PARENS
            space_replacement = spec[0]
        elif spec == 'O':
            print_format = TreePrintMethod.ONE_LINE
        elif spec and spec[-1] == 'O':
            print_format = TreePrintMethod.ONE_LINE
            space_replacement = spec[0]
        elif spec == 'P':
            print_format = TreePrintMethod.PRETTY
        elif spec and spec[-1] == 'P':
            print_format = TreePrintMethod.PRETTY
            space_replacement = spec[0]
        elif spec and spec[0] == 'V':
            print_format = TreePrintMethod.VLSP
            use_tree_id = spec[-1] == 'i'
        elif spec and len(spec) > 1 and spec[1] == 'V':
            print_format = TreePrintMethod.VLSP
            space_replacement = spec[0]
            use_tree_id = spec[-1] == 'i'
        elif spec == 'T':
            print_format = TreePrintMethod.LATEX_TREE
        elif spec and len(spec) > 1 and spec[1] == 'T':
            print_format = TreePrintMethod.LATEX_TREE
            space_replacement = spec[0]
        elif spec:
            space_replacement = spec[0]
            warnings.warn("Use of a custom replacement without a format specifier is deprecated. Please use {}O instead".format(space_replacement), stacklevel=2)

        # VLSP uses LBKT/RBKT for literal parens in tokens; PTB uses -LRB-/-RRB-
        LRB = "LBKT" if print_format == TreePrintMethod.VLSP else "-LRB-"
        RRB = "RBKT" if print_format == TreePrintMethod.VLSP else "-RRB-"
        def normalize(text):
            return text.replace(" ", space_replacement).replace("(", LRB).replace(")", RRB)

        if print_format is TreePrintMethod.PRETTY:
            return self.pretty_print(normalize)

        with StringIO() as buf:
            stack = deque()
            if print_format == TreePrintMethod.VLSP:
                if use_tree_id:
                    # NOTE(review): self.tree_id is assumed to be set by the
                    # caller before using the Vi format; it is not set in
                    # __init__ - confirm
                    buf.write("<s id={}>\n".format(self.tree_id))
                else:
                    buf.write("<s>\n")
                if len(self.children) == 0:
                    raise ValueError("Cannot print an empty tree with V format")
                elif len(self.children) > 1:
                    raise ValueError("Cannot print a tree with %d branches with V format" % len(self.children))
                # skip the ROOT layer for VLSP output
                stack.append(self.children[0])
            elif print_format == TreePrintMethod.LATEX_TREE:
                buf.write("\\Tree ")
                if len(self.children) == 0:
                    raise ValueError("Cannot print an empty tree with T format")
                elif len(self.children) == 1 and len(self.children[0].children) == 0:
                    buf.write("[.? ")
                    buf.write(normalize(self.children[0].label))
                    buf.write(" ]")
                elif self.label == 'ROOT':
                    stack.append(self.children[0])
                else:
                    stack.append(self)
            else:
                stack.append(self)
            while len(stack) > 0:
                node = stack.pop()

                # plain strings on the stack are literal output (separators,
                # close brackets)
                if isinstance(node, str):
                    buf.write(node)
                    continue
                if len(node.children) == 0:
                    if node.label is not None:
                        buf.write(normalize(node.label))
                    continue

                if print_format is TreePrintMethod.LATEX_TREE:
                    if node.is_preterminal():
                        # latex trees print only the word, not the tag
                        buf.write(normalize(node.children[0].label))
                        continue
                    buf.write("[.%s" % normalize(node.label))
                    stack.append(" ]")
                elif print_format is TreePrintMethod.ONE_LINE or print_format is TreePrintMethod.VLSP:
                    buf.write(OPEN_PAREN)
                    if node.label is not None:
                        buf.write(normalize(node.label))
                    stack.append(CLOSE_PAREN)
                elif print_format is TreePrintMethod.LABELED_PARENS:
                    buf.write("%s_%s" % (OPEN_PAREN, normalize(node.label)))
                    stack.append(CLOSE_PAREN + "_" + normalize(node.label))
                    stack.append(SPACE_SEPARATOR)

                for child in reversed(node.children):
                    stack.append(child)
                    stack.append(SPACE_SEPARATOR)
            if print_format == TreePrintMethod.VLSP:
                buf.write("\n</s>")
            buf.seek(0)
            return buf.read()
288
+
289
+ def __repr__(self):
290
+ return "{}".format(self)
291
+
292
+ def __eq__(self, other):
293
+ if self is other:
294
+ return True
295
+ if not isinstance(other, Tree):
296
+ return False
297
+ if self.label != other.label:
298
+ return False
299
+ if len(self.children) != len(other.children):
300
+ return False
301
+ if any(c1 != c2 for c1, c2 in zip(self.children, other.children)):
302
+ return False
303
+ return True
304
+
305
+ def depth(self):
306
+ if not self.children:
307
+ return 0
308
+ return 1 + max(x.depth() for x in self.children)
309
+
310
    def visit_preorder(self, internal=None, preterminal=None, leaf=None):
        """
        Visit the tree in a preorder order

        Applies the given functions to each node.
        internal: if not None, applies this function to each non-leaf, non-preterminal node
        preterminal: if not None, applies this function to each preterminal
        leaf: if not None, applies this function to each leaf

        The functions should *not* destructively alter the trees.
        There is no attempt to interpret the results of calling these functions.
        Rather, you can use visit_preorder to collect stats on trees, etc.
        """
        if self.is_leaf():
            if leaf:
                leaf(self)
        elif self.is_preterminal():
            if preterminal:
                preterminal(self)
        else:
            if internal:
                internal(self)
        # recurse for every node; leaves simply have no children
        for child in self.children:
            child.visit_preorder(internal, preterminal, leaf)
334
+
335
+ @staticmethod
336
+ def get_unique_constituent_labels(trees):
337
+ """
338
+ Walks over all of the trees and gets all of the unique constituent names from the trees
339
+ """
340
+ if isinstance(trees, Tree):
341
+ trees = [trees]
342
+ constituents = Tree.get_constituent_counts(trees)
343
+ return sorted(set(constituents.keys()))
344
+
345
+ @staticmethod
346
+ def get_constituent_counts(trees):
347
+ """
348
+ Walks over all of the trees and gets the count of the unique constituent names from the trees
349
+ """
350
+ if isinstance(trees, Tree):
351
+ trees = [trees]
352
+
353
+ constituents = Counter()
354
+ for tree in trees:
355
+ tree.visit_preorder(internal = lambda x: constituents.update([x.label]))
356
+ return constituents
357
+
358
+ @staticmethod
359
+ def get_unique_tags(trees):
360
+ """
361
+ Walks over all of the trees and gets all of the unique tags from the trees
362
+ """
363
+ if isinstance(trees, Tree):
364
+ trees = [trees]
365
+
366
+ tags = set()
367
+ for tree in trees:
368
+ tree.visit_preorder(preterminal = lambda x: tags.add(x.label))
369
+ return sorted(tags)
370
+
371
+ @staticmethod
372
+ def get_unique_words(trees):
373
+ """
374
+ Walks over all of the trees and gets all of the unique words from the trees
375
+ """
376
+ if isinstance(trees, Tree):
377
+ trees = [trees]
378
+
379
+ words = set()
380
+ for tree in trees:
381
+ tree.visit_preorder(leaf = lambda x: words.add(x.label))
382
+ return sorted(words)
383
+
384
+ @staticmethod
385
+ def get_common_words(trees, num_words):
386
+ """
387
+ Walks over all of the trees and gets the most frequently occurring words.
388
+ """
389
+ if num_words == 0:
390
+ return set()
391
+
392
+ if isinstance(trees, Tree):
393
+ trees = [trees]
394
+
395
+ words = Counter()
396
+ for tree in trees:
397
+ tree.visit_preorder(leaf = lambda x: words.update([x.label]))
398
+ return sorted(x[0] for x in words.most_common()[:num_words])
399
+
400
    @staticmethod
    def get_rare_words(trees, threshold=0.05):
        """
        Walks over all of the trees and gets the least frequently occurring words.

        threshold: choose the bottom X percent
        Returns a sorted list of the rarest leaf words.
        """
        if isinstance(trees, Tree):
            trees = [trees]

        # count every leaf word across all trees
        words = Counter()
        for tree in trees:
            tree.visit_preorder(leaf = lambda x: words.update([x.label]))
        # convert the fraction into a count of word types, at least 1
        threshold = max(int(len(words) * threshold), 1)
        # most_common()[:-threshold-1:-1] walks the frequency-sorted list
        # from the rare end, taking `threshold` items
        return sorted(x[0] for x in words.most_common()[:-threshold-1:-1])
415
+
416
+ @staticmethod
417
+ def get_root_labels(trees):
418
+ return sorted(set(x.label for x in trees))
419
+
420
    @staticmethod
    def get_compound_constituents(trees, separate_root=False):
        """
        Return the sorted distinct chains of unary internal nodes, as label tuples.

        An internal node whose only child is another internal node is merged
        with that child's chain, e.g. (S (VP ...)) contributes ('S', 'VP').
        separate_root: record each tree's root label as its own (label,) tuple
        and start the traversal from the root's children instead.
        """
        constituents = set()
        stack = deque()
        for tree in trees:
            if separate_root:
                constituents.add((tree.label,))
                for child in tree.children:
                    stack.append(child)
            else:
                stack.append(tree)
        while len(stack) > 0:
            node = stack.pop()
            # tags and words never start a compound constituent
            if node.is_leaf() or node.is_preterminal():
                continue
            labels = [node.label]
            # follow the unary chain downward while the only child is
            # itself an internal node
            while len(node.children) == 1 and not node.children[0].is_preterminal():
                node = node.children[0]
                labels.append(node.label)
            constituents.add(tuple(labels))
            for child in node.children:
                stack.append(child)
        return sorted(constituents)
443
+
444
    # TODO: test different pattern
    def simplify_labels(self, pattern=CONSTITUENT_SPLIT):
        """
        Return a copy of the tree with the -=# removed

        Leaves the text of the leaves alone.
        """
        new_label = self.label
        # check len(new_label) just in case it's a tag of - or =
        # -LRB- / -RRB- are kept verbatim: their leading '-' is part of the token
        if new_label and not self.is_leaf() and len(new_label) > 1 and new_label not in ('-LRB-', '-RRB-'):
            # keep only the text before the first split character
            new_label = pattern.split(new_label)[0]
        new_children = [child.simplify_labels(pattern) for child in self.children]
        return Tree(new_label, new_children)
457
+
458
+ def reverse(self):
459
+ """
460
+ Flip a tree backwards
461
+
462
+ The intent is to train a parser backwards to see if the
463
+ forward and backwards parsers can augment each other
464
+ """
465
+ if self.is_leaf():
466
+ return Tree(self.label)
467
+
468
+ new_children = [child.reverse() for child in reversed(self.children)]
469
+ return Tree(self.label, new_children)
470
+
471
+ def remap_constituent_labels(self, label_map):
472
+ """
473
+ Copies the tree with some labels replaced.
474
+
475
+ Labels in the map are replaced with the mapped value.
476
+ Labels not in the map are unchanged.
477
+ """
478
+ if self.is_leaf():
479
+ return Tree(self.label)
480
+ if self.is_preterminal():
481
+ return Tree(self.label, Tree(self.children[0].label))
482
+ new_label = label_map.get(self.label, self.label)
483
+ return Tree(new_label, [child.remap_constituent_labels(label_map) for child in self.children])
484
+
485
+ def remap_words(self, word_map):
486
+ """
487
+ Copies the tree with some labels replaced.
488
+
489
+ Labels in the map are replaced with the mapped value.
490
+ Labels not in the map are unchanged.
491
+ """
492
+ if self.is_leaf():
493
+ new_label = word_map.get(self.label, self.label)
494
+ return Tree(new_label)
495
+ if self.is_preterminal():
496
+ return Tree(self.label, self.children[0].remap_words(word_map))
497
+ return Tree(self.label, [child.remap_words(word_map) for child in self.children])
498
+
499
+ def replace_words(self, words):
500
+ """
501
+ Replace all leaf words with the words in the given list (or iterable)
502
+
503
+ Returns a new tree
504
+ """
505
+ word_iterator = iter(words)
506
+ def recursive_replace_words(subtree):
507
+ if subtree.is_leaf():
508
+ word = next(word_iterator, None)
509
+ if word is None:
510
+ raise ValueError("Not enough words to replace all leaves")
511
+ return Tree(word)
512
+ return Tree(subtree.label, [recursive_replace_words(x) for x in subtree.children])
513
+
514
+ new_tree = recursive_replace_words(self)
515
+ if any(True for _ in word_iterator):
516
+ raise ValueError("Too many words for the given tree")
517
+ return new_tree
518
+
519
+
520
    def replace_tags(self, tags):
        """
        Return a deep copy of this tree with the preterminal labels replaced.

        tags: either another Tree (its preterminal labels are used, in order)
        or any iterable of labels.
        Raises ValueError if the tag count does not match the number of
        preterminals, or if the tree is badly structured.
        """
        if self.is_leaf():
            raise ValueError("Must call replace_tags with non-leaf")

        if isinstance(tags, Tree):
            tag_iterator = (x.label for x in tags.yield_preterminals())
        else:
            tag_iterator = iter(tags)

        new_tree = copy.deepcopy(self)
        queue = deque()
        queue.append(new_tree)
        # depth-first, left-to-right: pop from the right and push children
        # reversed, so preterminals are relabeled in sentence order
        while len(queue) > 0:
            next_node = queue.pop()
            if next_node.is_preterminal():
                try:
                    label = next(tag_iterator)
                except StopIteration:
                    raise ValueError("Not enough tags in sentence for given tree")
                next_node.label = label
            elif next_node.is_leaf():
                # a leaf should only occur under a preterminal, never be queued
                raise ValueError("Got a badly structured tree: {}".format(self))
            else:
                queue.extend(reversed(next_node.children))

        if any(True for _ in tag_iterator):
            raise ValueError("Too many tags for the given tree")

        return new_tree
549
+
550
+
551
    def prune_none(self):
        """
        Return a copy of the tree, eliminating all nodes which are in one of two categories:
            they are a preterminal -NONE-, such as appears in PTB
              *E* shows up in a VLSP dataset
            they have been pruned to 0 children by the recursive call
        """
        if self.is_leaf():
            return Tree(self.label)
        if self.is_preterminal():
            # drop empty elements: -NONE- tags and words such as *E*
            if self.label == '-NONE-' or self.children[0].label in WORDS_TO_PRUNE:
                return None
            # NOTE(review): single child passed bare, not in a list — presumably
            # the Tree constructor accepts that (same pattern as remap_constituent_labels)
            return Tree(self.label, Tree(self.children[0].label))
        # must be internal node
        new_children = [child.prune_none() for child in self.children]
        # children may themselves have been pruned away entirely
        new_children = [child for child in new_children if child is not None]
        if len(new_children) == 0:
            return None
        return Tree(self.label, new_children)
570
+
571
    def count_unary_depth(self):
        """
        Return the length of the longest chain of single-child internal
        nodes anywhere in this tree.
        """
        # tags and words contribute no unary depth
        if self.is_preterminal() or self.is_leaf():
            return 0
        if len(self.children) == 1:
            # walk down the unary chain, counting its length, then
            # compare against the deepest chain below where it ends
            t = self
            score = 0
            while not t.is_preterminal() and not t.is_leaf() and len(t.children) == 1:
                score = score + 1
                t = t.children[0]
            child_score = max(tc.count_unary_depth() for tc in t.children)
            score = max(score, child_score)
            return score
        # multiple children: take the deepest chain among the subtrees
        score = max(t.count_unary_depth() for t in self.children)
        return score
585
+
586
+ @staticmethod
587
+ def write_treebank(trees, out_file, fmt="{}"):
588
+ with open(out_file, "w", encoding="utf-8") as fout:
589
+ for tree in trees:
590
+ fout.write(fmt.format(tree))
591
+ fout.write("\n")
stanza/stanza/models/constituency/positional_encoding.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Based on
3
+ https://pytorch.org/tutorials/beginner/transformer_tutorial.html#define-the-model
4
+ """
5
+
6
+ import math
7
+
8
+ import torch
9
+ from torch import nn
10
+
11
class SinusoidalEncoding(nn.Module):
    """
    Represents positions with interleaved sine (even dims) and cosine (odd dims).

    The table is kept as a buffer and lazily rebuilt whenever an index
    beyond the current maximum length is requested.
    """
    def __init__(self, model_dim, max_len):
        super().__init__()
        self.register_buffer('pe', self.build_position(model_dim, max_len))

    @staticmethod
    def build_position(model_dim, max_len, device=None):
        """Build a (max_len, model_dim) table of sinusoidal position encodings."""
        positions = torch.arange(max_len).unsqueeze(1)
        frequencies = torch.exp(torch.arange(0, model_dim, 2) * (-math.log(10000.0) / model_dim))
        angles = positions * frequencies
        table = torch.zeros(max_len, model_dim)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        return table if device is None else table.to(device=device)

    def forward(self, x):
        # x holds position indices; grow the table if any index is out of range
        if max(x) >= self.pe.shape[0]:
            # drop the reference first so the old table can be freed before
            # the replacement is allocated, in case we are near the memory limit
            device = self.pe.device
            model_dim = self.pe.shape[1]
            self.register_buffer('pe', None)
            # TODO: this may result in very poor performance
            # in the event of a model that increases size one at a time
            self.register_buffer('pe', self.build_position(model_dim, max(x) + 1, device=device))
        return self.pe[x]

    def max_len(self):
        return self.pe.shape[0]
44
+
45
+
46
class AddSinusoidalEncoding(nn.Module):
    """
    Uses sine & cosine to represent position. Adds the position to the given matrix

    Default behavior is batch_first
    """
    def __init__(self, d_model=256, max_len=512):
        super().__init__()
        self.encoding = SinusoidalEncoding(d_model, max_len)

    def forward(self, x, scale=1.0):
        """
        Adds the positional encoding to the input tensor

        The tensor is expected to be of the shape B, N, D (or N, D)
        Properly masking the output tensor is up to the caller

        Raises ValueError for any other rank.  (Previously, an unexpected
        rank left `timing` unbound and crashed with a confusing NameError.)
        """
        if len(x.shape) == 3:
            timing = self.encoding(torch.arange(x.shape[1], device=x.device))
            timing = timing.expand(x.shape[0], -1, -1)
        elif len(x.shape) == 2:
            timing = self.encoding(torch.arange(x.shape[0], device=x.device))
        else:
            raise ValueError("AddSinusoidalEncoding expects a 2d or 3d tensor, got shape %s" % (tuple(x.shape),))
        return x + timing * scale
69
+
70
+
71
class ConcatSinusoidalEncoding(nn.Module):
    """
    Uses sine & cosine to represent position, concatenating the encoding
    onto the input and returning a wider tensor.

    Default behavior is batch_first
    """
    def __init__(self, d_model=256, max_len=512):
        super().__init__()
        self.encoding = SinusoidalEncoding(d_model, max_len)

    def forward(self, x):
        if len(x.shape) == 3:
            # batched: encode along the sequence axis and broadcast over batch
            positions = torch.arange(x.shape[1], device=x.device)
            timing = self.encoding(positions).expand(x.shape[0], -1, -1)
        else:
            positions = torch.arange(x.shape[0], device=x.device)
            timing = self.encoding(positions)

        return torch.cat((x, timing), dim=-1)
stanza/stanza/models/constituency/retagging.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Refactor a few functions specifically for retagging trees
3
+
4
+ Retagging is important because the gold tags will not be available at runtime
5
+
6
+ Note that the method which does the actual retagging is in utils.py
7
+ so as to avoid unnecessary circular imports
8
+ (eg, Pipeline imports constituency/trainer which imports this which imports Pipeline)
9
+ """
10
+
11
+ import copy
12
+ import logging
13
+
14
+ from stanza import Pipeline
15
+
16
+ from stanza.models.common.foundation_cache import FoundationCache
17
+ from stanza.models.common.vocab import VOCAB_PREFIX
18
+ from stanza.resources.common import download_resources_json, load_resources_json, get_language_resources
19
+
20
+ tlogger = logging.getLogger('stanza.constituency.trainer')
21
+
22
+ # xpos tagger doesn't produce PP tag on the turin treebank,
23
+ # so instead we use upos to avoid unknown tag errors
24
+ RETAG_METHOD = {
25
+ "da": "upos", # the DDT has no xpos tags anyway
26
+ "de": "upos", # DE GSD is also missing a few punctuation tags
27
+ "es": "upos", # AnCora has half-finished xpos tags
28
+ "id": "upos", # GSD is missing a few punctuation tags - fixed in 2.12, though
29
+ "it": "upos",
30
+ "pt": "upos", # default PT model has no xpos either
31
+ "vi": "xpos", # the new version of UD can be merged with xpos from VLSP22
32
+ }
33
+
34
def add_retag_args(parser):
    """
    Arguments specifically for retagging treebanks
    """
    # NOTE: the option names and help strings below are part of the CLI
    # surface and are matched elsewhere (eg --no_retag aliases retag_package)
    parser.add_argument('--retag_package', default="default", help='Which tagger shortname to use when retagging trees. None for no retagging. Retagging is recommended, as gold tags will not be available at pipeline time')
    parser.add_argument('--retag_method', default=None, choices=['xpos', 'upos'], help='Which tags to use when retagging. Default depends on the language')
    parser.add_argument('--retag_model_path', default=None, help='Path to a retag POS model to use. Will use a downloaded Stanza model by default. Can specify multiple taggers with ; in which case the majority vote wins')
    parser.add_argument('--retag_pretrain_path', default=None, help='Use this for a pretrain path for the retagging pipeline. Generally not needed unless using a custom POS model with a custom pretrain')
    parser.add_argument('--retag_charlm_forward_file', default=None, help='Use this for a forward charlm path for the retagging pipeline. Generally not needed unless using a custom POS model with a custom charlm')
    parser.add_argument('--retag_charlm_backward_file', default=None, help='Use this for a backward charlm path for the retagging pipeline. Generally not needed unless using a custom POS model with a custom charlm')
    # --no_retag stores None into retag_package, which disables retagging entirely
    parser.add_argument('--no_retag', dest='retag_package', action="store_const", const=None, help="Don't retag the trees")
45
+
46
def postprocess_args(args):
    """
    After parsing args, unify some settings

    Chooses a language specific default for retag_method when none was
    given, then sets args['retag_xpos'] to match.

    Raises ValueError for an unrecognized retag_method.
    """
    # use a language specific default for retag_method if we know the language
    # otherwise, use xpos
    if args['retag_method'] is None and 'lang' in args and args['lang'] in RETAG_METHOD:
        args['retag_method'] = RETAG_METHOD[args['lang']]
    if args['retag_method'] is None:
        args['retag_method'] = 'xpos'

    if args['retag_method'] == 'xpos':
        args['retag_xpos'] = True
    elif args['retag_method'] == 'upos':
        args['retag_xpos'] = False
    else:
        # bugfix: this previously formatted the undefined name `xpos`,
        # which raised NameError instead of the intended ValueError
        raise ValueError("Unknown retag method {}".format(args['retag_method']))
63
+
64
def build_retag_pipeline(args):
    """
    Builds retag pipelines based on the arguments

    May alter the arguments if the pipeline is incompatible, such as
    taggers with no xpos

    Will return a list of one or more retag pipelines.
    Multiple tagger models can be specified by having them
    semi-colon separated in retag_model_path.

    Returns None when retagging is disabled (retag_package is None)
    or when running in remove_optimizer mode.
    """
    # some argument sets might not use 'mode'
    if args['retag_package'] is not None and args.get('mode', None) != 'remove_optimizer':
        download_resources_json()
        resources = load_resources_json()

        # a package of the form lang_package usually means "language lang,
        # package package", unless the full string is itself a known pos
        # package for the current language
        if '_' in args['retag_package']:
            lang, package = args['retag_package'].split('_', 1)
            lang_resources = get_language_resources(resources, lang)
            if lang_resources is None and 'lang' in args:
                lang_resources = get_language_resources(resources, args['lang'])
            if lang_resources is not None and 'pos' in lang_resources and args['retag_package'] in lang_resources['pos']:
                lang = args['lang']
                package = args['retag_package']
        else:
            if 'lang' not in args:
                raise ValueError("Retag package %s does not specify the language, and it is not clear from the arguments" % args['retag_package'])
            lang = args.get('lang', None)
            package = args['retag_package']
        # shared across all pipelines built below so pretrains/charlms load once
        foundation_cache = FoundationCache()
        retag_args = {"lang": lang,
                      "processors": "tokenize, pos",
                      "tokenize_pretokenized": True,
                      "package": {"pos": package}}
        if args['retag_pretrain_path'] is not None:
            retag_args['pos_pretrain_path'] = args['retag_pretrain_path']
        if args['retag_charlm_forward_file'] is not None:
            retag_args['pos_forward_charlm_path'] = args['retag_charlm_forward_file']
        if args['retag_charlm_backward_file'] is not None:
            retag_args['pos_backward_charlm_path'] = args['retag_charlm_backward_file']

        def build(retag_args, path):
            # build one pipeline; path is a custom POS model path, or None
            # to use the downloaded package
            retag_args = copy.deepcopy(retag_args)
            # we just downloaded the resources a moment ago
            # no need to repeatedly download
            retag_args['download_method'] = 'reuse_resources'
            if path is not None:
                retag_args['allow_unknown_language'] = True
                retag_args['pos_model_path'] = path
                tlogger.debug('Creating retag pipeline using %s', path)
            else:
                tlogger.debug('Creating retag pipeline for %s package', package)

            retag_pipeline = Pipeline(foundation_cache=foundation_cache, **retag_args)
            # fall back to upos if the chosen tagger has an empty xpos vocab
            # (only the VOCAB_PREFIX sentinel entries)
            if args['retag_xpos'] and len(retag_pipeline.processors['pos'].vocab['xpos']) == len(VOCAB_PREFIX):
                tlogger.warning("XPOS for the %s tagger is empty. Switching to UPOS", package)
                args['retag_xpos'] = False
                args['retag_method'] = 'upos'
            return retag_pipeline

        if args['retag_model_path'] is None:
            return [build(retag_args, None)]
        paths = args['retag_model_path'].split(";")
        # can be length 1 if only one tagger to work with
        return [build(retag_args, path) for path in paths]

    return None
stanza/stanza/models/constituency/state.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import namedtuple
2
+
3
class State(namedtuple('State', ['word_queue', 'transitions', 'constituents', 'gold_tree', 'gold_sequence',
                                 'sentence_length', 'num_opens', 'word_position', 'score'])):
    """
    Represents a partially completed transition parse

    Includes stack/buffers for unused words, already executed transitions, and partially build constituents
    At training time, also keeps track of the gold data we are reparsing

    num_opens is useful for tracking
        1) if the parser is in a stuck state where it is making infinite opens
        2) if a close transition is impossible because there are no previous opens

    sentence_length tracks how long the sentence is so we abort if we go infinite

    non-stack information such as sentence_length and num_opens
    will be copied from the original_state if possible, with the
    exact arguments overriding the values in the original_state

    gold_tree: the original tree, if made from a gold tree.  might be None
    gold_sequence: the original transition sequence, if available
      Note that at runtime, gold values will not be available

    word_position tracks where in the word queue we are.  cheaper than
      manipulating the list itself.  this can be handled differently
      from transitions and constituents as it is processed once
      at the start of parsing

    The word_queue should have both a start and an end word.
    Those can be None in the case of the endpoints if they are unused.
    """
    def empty_word_queue(self):
        # the first element of each stack is a sentinel with no value
        # and no parent
        return self.word_position == self.sentence_length

    def empty_transitions(self):
        # the first element of each stack is a sentinel with no value
        # and no parent
        return self.transitions.parent is None

    def has_one_constituent(self):
        # a length of 1 represents no constituents
        return self.constituents.length == 2

    @property
    def empty_constituents(self):
        # only the sentinel remains when there is no parent link
        return self.constituents.parent is None

    def num_constituents(self):
        # -1 for the sentinel value
        return self.constituents.length - 1

    @property
    def num_transitions(self):
        # -1 for the sentinel value
        return self.transitions.length - 1

    def get_word(self, pos):
        # +1 to handle the initial sentinel value
        # (which you can actually get with pos=-1)
        return self.word_queue[pos+1]

    def finished(self, model):
        # done when all words are consumed, exactly one constituent remains,
        # and that constituent carries one of the model's root labels
        return self.empty_word_queue() and self.has_one_constituent() and model.get_top_constituent(self.constituents).label in model.root_labels

    def get_tree(self, model):
        return model.get_top_constituent(self.constituents)

    def all_transitions(self, model):
        # TODO: rewrite this to be nicer / faster?  or just refactor?
        # walks the parent links from newest to oldest, then reverses
        all_transitions = []
        transitions = self.transitions
        while transitions.parent is not None:
            all_transitions.append(model.get_top_transition(transitions))
            transitions = transitions.parent
        return list(reversed(all_transitions))

    def all_constituents(self, model):
        # TODO: rewrite this to be nicer / faster?
        # walks the parent links from newest to oldest, then reverses
        all_constituents = []
        constituents = self.constituents
        while constituents.parent is not None:
            all_constituents.append(model.get_top_constituent(constituents))
            constituents = constituents.parent
        return list(reversed(all_constituents))

    def all_words(self, model):
        return [model.get_word(x) for x in self.word_queue]

    def to_string(self, model):
        # human-readable dump, resolving the stacks through the model
        return "State(\n  buffer:%s\n  transitions:%s\n  constituents:%s\n  word_position:%d num_opens:%d)" % (str(self.all_words(model)), str(self.all_transitions(model)), str(self.all_constituents(model)), self.word_position, self.num_opens)

    def __str__(self):
        return "State(\n  buffer:%s\n  transitions:%s\n  constituents:%s)" % (str(self.word_queue), str(self.transitions), str(self.constituents))
96
+
97
class MultiState(namedtuple('MultiState', ['states', 'gold_tree', 'gold_sequence', 'score'])):
    """
    Wraps one State per ensemble model; the first state answers all
    structural queries, since the structure is the same across models.
    """
    def finished(self, ensemble):
        return self.states[0].finished(ensemble.models[0])

    def get_tree(self, ensemble):
        return self.states[0].get_tree(ensemble.models[0])

    @property
    def empty_constituents(self):
        return self.states[0].empty_constituents

    def num_constituents(self):
        # NOTE(review): State.num_constituents uses constituents.length, but
        # here len() is called on the stack — confirm the stack type defines
        # __len__ consistently with its .length attribute
        return len(self.states[0].constituents) - 1

    @property
    def num_transitions(self):
        # -1 for the sentinel value
        return len(self.states[0].transitions) - 1

    @property
    def num_opens(self):
        return self.states[0].num_opens

    @property
    def sentence_length(self):
        return self.states[0].sentence_length

    def empty_word_queue(self):
        return self.states[0].empty_word_queue()

    def empty_transitions(self):
        return self.states[0].empty_transitions()

    @property
    def constituents(self):
        # warning!  if there is information in the constituents such as
        # the embedding of the constituent, this will only contain the
        # first such embedding
        # the other models' constituent states won't be returned
        return self.states[0].constituents

    @property
    def transitions(self):
        # warning!  if there is information in the transitions such as
        # the embedding of the transition, this will only contain the
        # first such embedding
        # the other models' transition states won't be returned
        return self.states[0].transitions
stanza/stanza/models/constituency/top_down_oracle.py ADDED
@@ -0,0 +1,757 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ import random
3
+
4
+ from stanza.models.constituency.dynamic_oracle import advance_past_constituents, score_candidates, DynamicOracle, RepairEnum
5
+ from stanza.models.constituency.parse_transitions import Shift, OpenConstituent, CloseConstituent
6
+
7
def find_constituent_end(gold_sequence, cur_index):
    """
    Find the Close which ends the next constituent opened at or after cur_index
    """
    depth = 0
    for idx in range(cur_index, len(gold_sequence)):
        transition = gold_sequence[idx]
        if isinstance(transition, OpenConstituent):
            depth = depth + 1
        elif isinstance(transition, CloseConstituent):
            depth = depth - 1
            if depth == 0:
                return idx
    raise AssertionError("Open constituent not closed starting from index %d in sequence %s" % (len(gold_sequence), gold_sequence))
21
+
22
def fix_shift_close(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Predicted a close when we should have shifted

    The fix here is to remove the corresponding close from later in
    the transition sequence.  The rest of the tree building is the same,
    including doing the missing Shift immediately after

    Anything else would make the situation of one precision, one
    recall error worse
    """
    if not isinstance(pred_transition, CloseConstituent):
        return None
    if not isinstance(gold_transition, Shift):
        return None

    # the Close that would have ended the enclosing constituent is dropped;
    # everything else proceeds unchanged after the predicted Close
    close_index = advance_past_constituents(gold_sequence, gold_index)
    repaired = list(gold_sequence[:gold_index])
    repaired.append(pred_transition)
    repaired.extend(gold_sequence[gold_index:close_index])
    repaired.extend(gold_sequence[close_index+1:])
    return repaired
41
+
42
def fix_open_close(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Predicted a close when we should have opened a constituent

    In this case, the previous constituent is now a precision and
    recall error, BUT we can salvage the constituent we were about to
    open by proceeding as if everything else is still the same.

    The next thing the model should do is open the transition it forgot about
    """
    if not isinstance(pred_transition, CloseConstituent):
        return None
    if not isinstance(gold_transition, OpenConstituent):
        return None

    # drop the Close that would have ended the enclosing constituent;
    # the skipped Open still happens right after the predicted Close
    close_index = advance_past_constituents(gold_sequence, gold_index)
    repaired = list(gold_sequence[:gold_index])
    repaired.append(pred_transition)
    repaired.extend(gold_sequence[gold_index:close_index])
    repaired.extend(gold_sequence[close_index+1:])
    return repaired
60
+
61
def fix_one_open_shift(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Predicted a shift when we should have opened a constituent

    This causes a single recall error if we just pretend that
    constituent didn't exist

    Keep the shift where it was, remove the next shift
    Also, scroll ahead, find the corresponding close, cut it out

    For the corresponding multiple opens, shift error, see fix_multiple_open_shift
    """
    if not isinstance(pred_transition, Shift):
        return None
    if not isinstance(gold_transition, OpenConstituent):
        return None
    if not isinstance(gold_sequence[gold_index + 1], Shift):
        return None

    shift_index = gold_index + 1
    close_index = advance_past_constituents(gold_sequence, shift_index)
    if close_index is None:
        return None
    # gold_index held the skipped Open, close_index its matching Close,
    # shift_index the Shift replaced by the predicted one
    return (gold_sequence[:gold_index]
            + [pred_transition]
            + gold_sequence[gold_index+1:shift_index]
            + gold_sequence[shift_index+1:close_index]
            + gold_sequence[close_index+1:])
92
+
93
def fix_multiple_open_shift(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Predicted a shift when we should have opened multiple constituents instead

    This causes a single recall error per constituent if we just
    pretend those constituents don't exist

    For each open constituent, we find the corresponding close,
    then remove both the open & close
    """
    if not isinstance(pred_transition, Shift):
        return None

    if not isinstance(gold_transition, OpenConstituent):
        return None

    # advance past the whole run of Opens to the Shift that follows them
    shift_index = gold_index
    while shift_index < len(gold_sequence) and isinstance(gold_sequence[shift_index], OpenConstituent):
        shift_index += 1
    if shift_index >= len(gold_sequence):
        raise AssertionError("Found a sequence of OpenConstituent at the end of a TOP_DOWN sequence!")
    if not isinstance(gold_sequence[shift_index], Shift):
        raise AssertionError("Expected to find a Shift after a sequence of OpenConstituent.  There should not be a %s" % gold_sequence[shift_index])

    # peel off the Opens innermost-first: each pass removes the Open just
    # before shift_index together with its matching Close, so shift_index
    # shrinks by one per iteration as the sequence contracts
    updated_sequence = gold_sequence
    while shift_index > gold_index:
        close_index = advance_past_constituents(updated_sequence, shift_index)
        if close_index is None:
            raise AssertionError("Did not find a corresponding Close for this Open")
        # cut out the corresponding open and close
        updated_sequence = updated_sequence[:shift_index-1] + updated_sequence[shift_index:close_index] + updated_sequence[close_index+1:]
        shift_index -= 1

    return updated_sequence
130
+
131
def fix_nested_open_constituent(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    We were supposed to predict Open(X), then Open(Y), but predicted Open(Y) instead

    We treat this as a single recall error.

    We could even go crazy and turn it into a Unary,
    such as Open(Y), Open(X), Open(Y)...
    presumably that would be very confusing to the parser
    not to mention ambiguous as to where to close the new constituent
    """
    if not isinstance(pred_transition, OpenConstituent):
        return None
    if not isinstance(gold_transition, OpenConstituent):
        return None

    assert len(gold_sequence) > gold_index + 1

    following = gold_sequence[gold_index + 1]
    if not isinstance(following, OpenConstituent):
        return None
    # this repair only works when exactly one level was skipped
    if following.label != pred_transition.label:
        return None

    close_index = advance_past_constituents(gold_sequence, gold_index + 1)
    assert close_index is not None
    # drop the skipped Open and its matching Close
    return gold_sequence[:gold_index] + gold_sequence[gold_index+1:close_index] + gold_sequence[close_index+1:]
161
+
162
def fix_shift_open_immediate_close(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    We were supposed to Shift, but instead we Opened

    The biggest problem with this type of error is that the Close of
    the Open is ambiguous.  We could put it immediately before the
    next Close, immediately after the Shift, or anywhere in between.

    One unambiguous case would be if the proper sequence was Shift - Close.
    Then it is unambiguous that the only possible repair is Open - Shift - Close - Close.
    """
    if not isinstance(pred_transition, OpenConstituent):
        return None
    if not isinstance(gold_transition, Shift):
        return None

    assert len(gold_sequence) > gold_index + 1
    if not isinstance(gold_sequence[gold_index + 1], CloseConstituent):
        # this is the ambiguous case
        return None

    repaired = list(gold_sequence[:gold_index])
    repaired.extend([pred_transition, gold_transition, CloseConstituent()])
    repaired.extend(gold_sequence[gold_index + 1:])
    return repaired
185
+
186
def fix_shift_open_ambiguous_unary(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    We were supposed to Shift, but instead we Opened

    The biggest problem with this type of error is that the Close of
    the Open is ambiguous. We could put it immediately before the
    next Close, immediately after the Shift, or anywhere in between.

    In this fix, we are testing what happens if we treat this Open as a Unary transition.
    """
    if not isinstance(pred_transition, OpenConstituent):
        return None

    if not isinstance(gold_transition, Shift):
        return None

    # a Shift can never legally be the final transition of a gold sequence
    assert len(gold_sequence) > gold_index + 1
    if isinstance(gold_sequence[gold_index+1], CloseConstituent):
        # this is the unambiguous case, which should already be handled
        return None

    # close the spurious bracket immediately after the shifted word,
    # so the extra Open becomes a unary bracket over one word
    return gold_sequence[:gold_index] + [pred_transition, gold_transition, CloseConstituent()] + gold_sequence[gold_index+1:]
208
+
209
def fix_shift_open_ambiguous_later(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    We were supposed to Shift, but instead we Opened

    The biggest problem with this type of error is that the Close of
    the Open is ambiguous. We could put it immediately before the
    next Close, immediately after the Shift, or anywhere in between.

    In this fix, we put the corresponding Close for this Open at the end of the enclosing bracket.
    """
    if not isinstance(pred_transition, OpenConstituent):
        return None

    if not isinstance(gold_transition, Shift):
        return None

    # a Shift can never legally be the final transition of a gold sequence
    assert len(gold_sequence) > gold_index + 1
    if isinstance(gold_sequence[gold_index+1], CloseConstituent):
        # this is the unambiguous case, which should already be handled
        return None

    # the spurious bracket swallows everything up to the Close of the
    # enclosing constituent, then gets its own Close just before it
    outer_close_index = advance_past_constituents(gold_sequence, gold_index)

    return gold_sequence[:gold_index] + [pred_transition] + gold_sequence[gold_index:outer_close_index] + [CloseConstituent()] + gold_sequence[outer_close_index:]
233
+
234
def fix_shift_open_ambiguous_predicted(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    We were supposed to Shift, but instead we Opened, in the ambiguous case

    Rather than picking one fixed placement for the matching Close,
    build every legal placement (after the Shift or after any
    following constituent, up to the enclosing Close) and let the
    model score which candidate repair sequence to use.
    """
    if not isinstance(pred_transition, OpenConstituent):
        return None

    if not isinstance(gold_transition, Shift):
        return None

    # a Shift can never legally be the final transition of a gold sequence
    assert len(gold_sequence) > gold_index + 1
    if isinstance(gold_sequence[gold_index+1], CloseConstituent):
        # this is the unambiguous case, which should already be handled
        return None

    # at this point: have Opened a constituent which we don't want
    # need to figure out where to Close it
    # could close it after the shift or after any given block
    candidates = []
    current_index = gold_index
    while not isinstance(gold_sequence[current_index], CloseConstituent):
        if isinstance(gold_sequence[current_index], Shift):
            end_index = current_index
        else:
            end_index = find_constituent_end(gold_sequence, current_index)
        # each candidate is the gold sequence split into pieces, with the
        # spurious Open inserted and its Close placed after this block
        candidates.append((gold_sequence[:gold_index], [pred_transition], gold_sequence[gold_index:end_index+1], [CloseConstituent()], gold_sequence[end_index+1:]))
        current_index = end_index + 1

    # candidate_idx selects which piece of the candidate tuple is being
    # scored (here, the inserted Close) - see score_candidates
    scores, best_idx, best_candidate = score_candidates(model, state, candidates, candidate_idx=3)
    if best_idx == len(candidates) - 1:
        # record the "last possible position" choice as -1
        best_idx = -1
    repair_type = RepairEnum(name=RepairType.SHIFT_OPEN_AMBIGUOUS_PREDICTED.name,
                             value="%d.%d" % (RepairType.SHIFT_OPEN_AMBIGUOUS_PREDICTED.value, best_idx),
                             is_correct=False)
    return repair_type, best_candidate
266
+
267
+
268
def fix_close_shift_ambiguous_immediate(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Instead of a Close, we predicted a Shift. This time, we immediately close no matter what comes after the next Shift.

    An alternate strategy would be to Close at the closing of the outer constituent.
    """
    if not isinstance(pred_transition, Shift):
        return None

    if not isinstance(gold_transition, CloseConstituent):
        return None

    # count the run of Closes the gold sequence wanted here
    num_closes = 0
    while isinstance(gold_sequence[gold_index + num_closes], CloseConstituent):
        num_closes += 1

    if not isinstance(gold_sequence[gold_index + num_closes], Shift):
        # TODO: we should be able to handle this case too (an Open)
        # however, it will be rare once the parser gets going and it
        # would cause a lot of errors, anyway
        return None

    if isinstance(gold_sequence[gold_index + num_closes + 1], CloseConstituent):
        # this one should just have been satisfied in the non-ambiguous version
        return None

    # follow the erroneous Shift, then perform the delayed Closes; the
    # gold Shift itself is dropped since we have already shifted
    updated_sequence = gold_sequence[:gold_index] + [pred_transition] + gold_sequence[gold_index:gold_index+num_closes] + gold_sequence[gold_index+num_closes+1:]
    return updated_sequence
296
+
297
+
298
def fix_close_shift_ambiguous_later(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Instead of a Close, we predicted a Shift. This time, we close at the end of the outer bracket no matter what comes after the next Shift.

    An alternate strategy would be to Close as soon as possible after the Shift.
    """
    if not isinstance(pred_transition, Shift):
        return None

    if not isinstance(gold_transition, CloseConstituent):
        return None

    # count the run of Closes the gold sequence wanted here
    num_closes = 0
    while isinstance(gold_sequence[gold_index + num_closes], CloseConstituent):
        num_closes += 1

    if not isinstance(gold_sequence[gold_index + num_closes], Shift):
        # TODO: we should be able to handle this case too (an Open)
        # however, it will be rare once the parser gets going and it
        # would cause a lot of errors, anyway
        return None

    if isinstance(gold_sequence[gold_index + num_closes + 1], CloseConstituent):
        # this one should just have been satisfied in the non-ambiguous version
        return None

    # outer_close_index is now where the constituent which the broken constituent(s) reside inside gets closed
    outer_close_index = advance_past_constituents(gold_sequence, gold_index + num_closes)

    # defer the missed Closes until just before the outer constituent closes
    updated_sequence = gold_sequence[:gold_index] + gold_sequence[gold_index+num_closes:outer_close_index] + gold_sequence[gold_index:gold_index+num_closes] + gold_sequence[outer_close_index:]
    return updated_sequence
329
+
330
+
331
def fix_close_shift(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state, count_opens=False):
    """
    We were supposed to Close, but instead did a Shift

    In most cases, this will be ambiguous. There is now a constituent
    which has been missed, no matter what we do, and we are on the
    hook for eventually closing this constituent, creating a precision
    error as well. The ambiguity arises because there will be
    multiple places where the Close could occur if there are more
    constituents created between now and when the outer constituent is
    Closed.

    The non-ambiguous case is if the proper sequence was
    Close - Shift - Close
    similar cases are also non-ambiguous, such as
    Close - Close - Shift - Close
    for that matter, so is the following, although the Opens will be lost
    Close - Open - Shift - Close - Close

    count_opens is an option to make it easy to count with or without
    Open as different oracle fixes
    """
    if not isinstance(pred_transition, Shift):
        return None

    if not isinstance(gold_transition, CloseConstituent):
        return None

    # count the run of Closes the gold sequence wanted here
    num_closes = 0
    while isinstance(gold_sequence[gold_index + num_closes], CloseConstituent):
        num_closes += 1

    # We may allow unary transitions here
    # the opens will be lost in the repaired sequence
    num_opens = 0
    if count_opens:
        while isinstance(gold_sequence[gold_index + num_closes + num_opens], OpenConstituent):
            num_opens += 1

    if not isinstance(gold_sequence[gold_index + num_closes + num_opens], Shift):
        if count_opens:
            raise AssertionError("Should have found a Shift after a sequence of Opens or a Close with no Open. Started counting at %d in sequence %s" % (gold_index, gold_sequence))
        return None

    # the unambiguous case requires the Shift (and each skipped Open)
    # to be immediately followed by Closes
    if not isinstance(gold_sequence[gold_index + num_closes + num_opens + 1], CloseConstituent):
        return None
    for idx in range(num_opens):
        if not isinstance(gold_sequence[gold_index + num_closes + num_opens + idx + 1], CloseConstituent):
            return None

    # Now we know it is Close x num_closes, Shift, Close
    # Since we have erroneously predicted a Shift now, the best we can
    # do is to follow that, then add num_closes Closes
    updated_sequence = gold_sequence[:gold_index] + [pred_transition] + gold_sequence[gold_index:gold_index+num_closes] + gold_sequence[gold_index+num_closes+num_opens*2+1:]
    return updated_sequence
386
+
387
def fix_close_shift_with_opens(*args, **kwargs):
    """
    Variant of fix_close_shift which also counts (and drops) skipped Opens
    """
    return fix_close_shift(*args, count_opens=True, **kwargs)
389
+
390
def fix_close_next_correct_predicted(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    We were supposed to Close, but instead predicted Shift when the next transition is Shift

    This differs from the previous Close-Shift in that this case does
    not have an unambiguous place to put the Close. Instead, we let
    the model predict where to put the Close

    Note that this can also work for Close-Open with the next Open correct

    Not covered (yet?) is multiple Close in a row
    """
    if not isinstance(gold_transition, CloseConstituent):
        return None
    if not isinstance(pred_transition, (Shift, OpenConstituent)):
        return None
    # only applies when the prediction matches the gold transition
    # which comes right after the skipped Close
    if gold_sequence[gold_index+1] != pred_transition:
        return None

    # build one candidate per legal position of the delayed Close:
    # after each bare Shift or complete constituent, up to the
    # enclosing Close
    candidates = []
    current_index = gold_index + 1
    while not isinstance(gold_sequence[current_index], CloseConstituent):
        if isinstance(gold_sequence[current_index], Shift):
            end_index = current_index
        else:
            end_index = find_constituent_end(gold_sequence, current_index)
        candidates.append((gold_sequence[:gold_index], gold_sequence[gold_index+1:end_index+1], [CloseConstituent()], gold_sequence[end_index+1:]))
        current_index = end_index + 1

    # candidate_idx selects which piece of the candidate tuple is being
    # scored (here, the inserted Close) - see score_candidates
    scores, best_idx, best_candidate = score_candidates(model, state, candidates, candidate_idx=3)
    if best_idx == len(candidates) - 1:
        # record the "last possible position" choice as -1
        best_idx = -1
    repair_type = RepairEnum(name=RepairType.CLOSE_NEXT_CORRECT_AMBIGUOUS_PREDICTED.name,
                             value="%d.%d" % (RepairType.CLOSE_NEXT_CORRECT_AMBIGUOUS_PREDICTED.value, best_idx),
                             is_correct=False)
    return repair_type, best_candidate
426
+
427
+
428
def fix_close_open_correct_open(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state, check_close=True):
    """
    We were supposed to Close, but instead did an Open

    In general this is ambiguous (like close/shift), as we need to know when to close the incorrect constituent

    A case that is not ambiguous is when exactly one constituent was
    supposed to come after the Close and it matches the Open we just
    created. In that case, we treat that constituent as if it were
    part of the non-Closed constituent. For example,
    "ate (NP spaghetti) (PP with a fork)" ->
    "ate (NP spaghetti (PP with a fork))"
    (delicious)

    There is also an option to not check for the Close after the first
    constituent, in which case any number of constituents could have
    been predicted. This represents a solution of the ambiguous form
    of the Close/Open transition where the Close could occur in
    multiple places later in the sequence.
    """
    if not isinstance(pred_transition, OpenConstituent):
        return None

    if not isinstance(gold_transition, CloseConstituent):
        return None

    # the predicted Open must match the gold Open which was supposed
    # to come right after the missed Close
    if gold_sequence[gold_index+1] != pred_transition:
        return None

    close_index = find_constituent_end(gold_sequence, gold_index+1)
    if check_close and not isinstance(gold_sequence[close_index+1], CloseConstituent):
        return None

    # at this point, we know we can put the Close at the end of the
    # Open which was accidentally added
    updated_sequence = gold_sequence[:gold_index] + gold_sequence[gold_index+1:close_index+1] + [gold_transition] + gold_sequence[close_index+1:]
    return updated_sequence
465
+
466
def fix_close_open_correct_open_ambiguous_immediate(*args, **kwargs):
    """
    Ambiguous Close/Open variant: close the spurious bracket right after the first constituent
    """
    return fix_close_open_correct_open(*args, check_close=False, **kwargs)
468
+
469
def fix_close_open_correct_open_ambiguous_later(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state, check_close=True):
    """
    We were supposed to Close, but instead did an Open in an ambiguous context. Here we resolve it later in the tree

    check_close is accepted for signature parity with
    fix_close_open_correct_open but is not used here.
    """
    if not isinstance(pred_transition, OpenConstituent):
        return None

    if not isinstance(gold_transition, CloseConstituent):
        return None

    # the predicted Open must match the gold Open which was supposed
    # to come right after the missed Close
    if gold_sequence[gold_index+1] != pred_transition:
        return None

    # this will be the index of the Close for the surrounding constituent
    close_index = advance_past_constituents(gold_sequence, gold_index+1)
    updated_sequence = gold_sequence[:gold_index] + gold_sequence[gold_index+1:close_index] + [gold_transition] + gold_sequence[close_index:]
    return updated_sequence
486
+
487
def fix_open_open_ambiguous_unary(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    If there is an Open/Open error which is not covered by the unambiguous single recall error, we try fixing it as a Unary
    """
    if not isinstance(pred_transition, OpenConstituent):
        return None

    if not isinstance(gold_transition, OpenConstituent):
        return None

    if pred_transition == gold_transition:
        return None
    if gold_sequence[gold_index+1] == pred_transition:
        # This case is covered by the nested open repair
        return None

    # close the spurious bracket as soon as the gold constituent
    # finishes, so it becomes a unary wrapper around the gold bracket
    close_index = find_constituent_end(gold_sequence, gold_index)
    assert close_index is not None
    assert isinstance(gold_sequence[close_index], CloseConstituent)
    updated_sequence = gold_sequence[:gold_index] + [pred_transition] + gold_sequence[gold_index:close_index] + [CloseConstituent()] + gold_sequence[close_index:]
    return updated_sequence
508
+
509
def fix_open_open_ambiguous_later(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    If there is an Open/Open error which is not covered by the
    unambiguous single recall error, we try fixing it by putting the
    close at the end of the outer constituent

    """
    if not isinstance(pred_transition, OpenConstituent):
        return None

    if not isinstance(gold_transition, OpenConstituent):
        return None

    if pred_transition == gold_transition:
        return None
    if gold_sequence[gold_index+1] == pred_transition:
        # This case is covered by the nested open repair
        return None

    # let the spurious bracket stay open until the enclosing
    # constituent is about to close, then close it there
    close_index = advance_past_constituents(gold_sequence, gold_index)
    updated_sequence = gold_sequence[:gold_index] + [pred_transition] + gold_sequence[gold_index:close_index] + [CloseConstituent()] + gold_sequence[close_index:]
    return updated_sequence
531
+
532
def fix_open_open_ambiguous_random(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    If there is an Open/Open error which is not covered by the
    unambiguous single recall error, we randomly choose (50/50)
    between closing at the end of the outer constituent and treating
    the spurious Open as a Unary
    """
    if not isinstance(pred_transition, OpenConstituent):
        return None

    if not isinstance(gold_transition, OpenConstituent):
        return None

    if pred_transition == gold_transition:
        return None
    if gold_sequence[gold_index+1] == pred_transition:
        # This case is covered by the nested open repair
        return None

    # Bug fix: the delegated repairs take (..., root_labels, model, state).
    # Previously model and state were dropped from these calls, which
    # raised a TypeError whenever this repair was actually triggered.
    if random.random() < 0.5:
        return fix_open_open_ambiguous_later(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state)
    else:
        return fix_open_open_ambiguous_unary(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state)
555
+
556
+
557
def report_shift_open(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Report (without repairing) a Shift/Open confusion not handled by an earlier repair
    """
    if isinstance(gold_transition, Shift) and isinstance(pred_transition, OpenConstituent):
        return RepairType.OTHER_SHIFT_OPEN, None
    return None
564
+
565
+
566
def report_close_shift(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Report (without repairing) a Close/Shift confusion not handled by an earlier repair
    """
    if isinstance(gold_transition, CloseConstituent) and isinstance(pred_transition, Shift):
        return RepairType.OTHER_CLOSE_SHIFT, None
    return None
573
+
574
def report_close_open(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Report (without repairing) a Close/Open confusion not handled by an earlier repair
    """
    if isinstance(gold_transition, CloseConstituent) and isinstance(pred_transition, OpenConstituent):
        return RepairType.OTHER_CLOSE_OPEN, None
    return None
581
+
582
def report_open_open(gold_transition, pred_transition, gold_sequence, gold_index, root_labels, model, state):
    """
    Report (without repairing) an Open/Open confusion not handled by an earlier repair
    """
    if isinstance(gold_transition, OpenConstituent) and isinstance(pred_transition, OpenConstituent):
        return RepairType.OTHER_OPEN_OPEN, None
    return None
589
+
590
+
591
class RepairType(Enum):
    """
    Keep track of which repair is used, if any, on an incorrect transition

    A test of the top-down oracle with no charlm or transformer
    (eg, word vectors only) on EN PTB3 goes as follows.
    3x training rounds, best training parameters as of Jan. 2024
    unambiguous transitions only:
    oracle scheme          dev     test
    no oracle              0.9230  0.9194
    +shift/close           0.9224  0.9180
    +open/close            0.9225  0.9193
    +open/shift (one)      0.9245  0.9207
    +open/shift (mult)     0.9243  0.9211
    +open/open nested      0.9258  0.9213
    +shift/open            0.9266  0.9229
    +close/shift (only)    0.9270  0.9230
    +close/shift w/ opens  0.9262  0.9221
    +close/open one con    0.9273  0.9230

    Potential solutions for various ambiguous transitions:

    close/open
      can close immediately after the corresponding constituent or after any number of constituents

    close/shift
      can close immediately
      can close anywhere up to the next close
      any number of missed Opens are treated as recall errors

    open/open
      could treat as unary
      could close at any number of positions after the next structures, up to the outer open's closing

    shift/open ambiguity resolutions:
      treat as unary
      treat as wrapper around the next full constituent to build
      treat as wrapper around everything to build until the next constituent

    testing one at a time in addition to the full set of unambiguous corrections:
    +close/open immediate   0.9259  0.9225
    +close/open later       0.9258  0.9257
    +close/shift immediate  0.9261  0.9219
    +close/shift later      0.9270  0.9230
    +open/open later        0.9269  0.9239
    +open/open unary        0.9275  0.9246
    +shift/open later       0.9263  0.9253
    +shift/open unary       0.9264  0.9243

    so there is some evidence that open/open or shift/open would be beneficial

    Training by randomly choosing between the open/open, 50/50
    +open/open random       0.9257  0.9235
    so that didn't work great compared to the individual transitions

    Testing deterministic resolutions of the ambiguous transitions
    vs predicting the appropriate transition to use:
    SHIFT_OPEN_AMBIGUOUS_UNARY_ERROR,CLOSE_SHIFT_AMBIGUOUS_IMMEDIATE_ERROR,CLOSE_OPEN_AMBIGUOUS_IMMEDIATE_ERROR
    SHIFT_OPEN_AMBIGUOUS_PREDICTED,CLOSE_NEXT_CORRECT_AMBIGUOUS_PREDICTED

    EN ambiguous (no charlm or transformer)  0.9268  0.9231
    EN predicted                             0.9270  0.9257
    EN none of the above                     0.9268  0.9229

    ZH ambiguous          0.9137  0.9127
    ZH predicted          0.9148  0.9141
    ZH none of the above  0.9141  0.9143

    DE ambiguous          0.9579  0.9408
    DE predicted          0.9575  0.9406
    DE none of the above  0.9581  0.9411

    ID ambiguous          0.8889  0.8794
    ID predicted          0.8911  0.8801
    ID none of the above  0.8913  0.8822

    IT ambiguous          0.8404  0.8380
    IT predicted          0.8397  0.8398
    IT none of the above  0.8400  0.8409

    VI ambiguous          0.8290  0.7676
    VI predicted          0.8287  0.7682
    VI none of the above  0.8292  0.7691
    """
    def __new__(cls, fn, correct=False, debug=False):
        """
        Enumerate values as normal, but also keep a pointer to a function which repairs that kind of error
        """
        # members get values 1..N in declaration order
        value = len(cls.__members__)
        obj = object.__new__(cls)
        obj._value_ = value + 1
        obj.fn = fn            # repair function for this error kind, or None
        obj.correct = correct  # True only for the CORRECT pseudo-repair
        obj.debug = debug      # True for the report-only members
        return obj

    @property
    def is_correct(self):
        return self.correct

    # The parser chose to close a bracket instead of shift something
    # into the bracket
    # This causes both a precision and a recall error as there is now
    # an incorrect bracket and a missing correct bracket
    # Any bracket creation here would cause more wrong brackets, though
    SHIFT_CLOSE_ERROR = (fix_shift_close,)

    OPEN_CLOSE_ERROR = (fix_open_close,)

    # open followed by shift was instead predicted to be shift
    ONE_OPEN_SHIFT_ERROR = (fix_one_open_shift,)

    # open followed by shift was instead predicted to be shift
    MULTIPLE_OPEN_SHIFT_ERROR = (fix_multiple_open_shift,)

    # should have done Open(X), Open(Y)
    # instead just did Open(Y)
    NESTED_OPEN_OPEN_ERROR = (fix_nested_open_constituent,)

    SHIFT_OPEN_ERROR = (fix_shift_open_immediate_close,)

    CLOSE_SHIFT_ERROR = (fix_close_shift,)

    CLOSE_SHIFT_WITH_OPENS_ERROR = (fix_close_shift_with_opens,)

    CLOSE_OPEN_ONE_CON_ERROR = (fix_close_open_correct_open,)

    # pseudo-repair used when the predicted transition was correct
    CORRECT = (None, True)

    # no repair function matched the error
    UNKNOWN = None

    CLOSE_OPEN_AMBIGUOUS_IMMEDIATE_ERROR = (fix_close_open_correct_open_ambiguous_immediate,)

    CLOSE_OPEN_AMBIGUOUS_LATER_ERROR = (fix_close_open_correct_open_ambiguous_later,)

    CLOSE_SHIFT_AMBIGUOUS_IMMEDIATE_ERROR = (fix_close_shift_ambiguous_immediate,)

    CLOSE_SHIFT_AMBIGUOUS_LATER_ERROR = (fix_close_shift_ambiguous_later,)

    # can potentially fix either close/shift or close/open
    # as long as the gold transition after the close
    # was the same as the transition we just predicted
    CLOSE_NEXT_CORRECT_AMBIGUOUS_PREDICTED = (fix_close_next_correct_predicted,)

    OPEN_OPEN_AMBIGUOUS_UNARY_ERROR = (fix_open_open_ambiguous_unary,)

    OPEN_OPEN_AMBIGUOUS_LATER_ERROR = (fix_open_open_ambiguous_later,)

    OPEN_OPEN_AMBIGUOUS_RANDOM_ERROR = (fix_open_open_ambiguous_random,)

    SHIFT_OPEN_AMBIGUOUS_UNARY_ERROR = (fix_shift_open_ambiguous_unary,)

    SHIFT_OPEN_AMBIGUOUS_LATER_ERROR = (fix_shift_open_ambiguous_later,)

    SHIFT_OPEN_AMBIGUOUS_PREDICTED = (fix_shift_open_ambiguous_predicted,)

    # report-only members (debug=True) which track otherwise
    # unrepaired confusions without altering the gold sequence
    OTHER_SHIFT_OPEN = (report_shift_open, False, True)

    OTHER_CLOSE_SHIFT = (report_close_shift, False, True)

    OTHER_CLOSE_OPEN = (report_close_open, False, True)

    OTHER_OPEN_OPEN = (report_open_open, False, True)
754
+
755
class TopDownOracle(DynamicOracle):
    """
    A dynamic oracle for the top-down transition scheme, backed by the RepairType repairs
    """
    def __init__(self, root_labels, oracle_level, additional_oracle_levels, deactivated_oracle_levels):
        # RepairType supplies both the repair functions and their numbering
        super().__init__(root_labels, oracle_level, RepairType, additional_oracle_levels, deactivated_oracle_levels)
stanza/stanza/models/constituency/trainer.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file includes a variety of methods needed to train new
3
+ constituency parsers. It also includes a method to load an
4
+ already-trained parser.
5
+
6
+ See the `train` method for the code block which starts from
7
+ raw treebank and returns a new parser.
8
+ `evaluate` reads a treebank and gives a score for those trees.
9
+ """
10
+
11
+ import copy
12
+ import logging
13
+ import os
14
+
15
+ import torch
16
+
17
+ from stanza.models.common.foundation_cache import load_bert, load_bert_with_peft, load_charlm, load_pretrain, NoTransformerFoundationCache
18
+ from stanza.models.common.peft_config import build_peft_wrapper, load_peft_wrapper, pop_peft_args
19
+ from stanza.models.constituency.base_trainer import BaseTrainer, ModelType
20
+ from stanza.models.constituency.lstm_model import LSTMModel, SentenceBoundary, StackHistory, ConstituencyComposition
21
+ from stanza.models.constituency.parse_transitions import Transition, TransitionScheme
22
+ from stanza.models.constituency.utils import build_optimizer, build_scheduler
23
+ # TODO: could put find_wordvec_pretrain, choose_charlm, etc in a more central place if it becomes widely used
24
+ from stanza.utils.training.common import find_wordvec_pretrain, choose_charlm, find_charlm_file
25
+ from stanza.resources.default_packages import default_charlms, default_pretrains
26
+
27
+ logger = logging.getLogger('stanza')
28
+ tlogger = logging.getLogger('stanza.constituency.trainer')
29
+
30
+ class Trainer(BaseTrainer):
31
+ """
32
+ Stores a constituency model and its optimizer
33
+
34
+ Not inheriting from common/trainer.py because there's no concept of change_lr (yet?)
35
+ """
36
    def __init__(self, model, optimizer=None, scheduler=None, epochs_trained=0, batches_trained=0, best_f1=0.0, best_epoch=0, first_optimizer=False):
        # all bookkeeping (epochs/batches trained, best score) is
        # handled by BaseTrainer; this subclass only specializes behavior
        super().__init__(model, optimizer, scheduler, epochs_trained, batches_trained, best_f1, best_epoch, first_optimizer)
38
+
39
    def save(self, filename, save_optimizer=True):
        """
        Save the model (and by default the optimizer) to the given path
        """
        # serialization itself lives in BaseTrainer
        super().save(filename, save_optimizer)
44
+
45
+ def get_peft_params(self):
46
+ # Hide import so that peft dependency is optional
47
+ if self.model.args.get('use_peft', False):
48
+ from peft import get_peft_model_state_dict
49
+ return get_peft_model_state_dict(self.model.bert_model, adapter_name=self.model.peft_name)
50
+ return None
51
+
52
    @property
    def model_type(self):
        # identifies this trainer's architecture; presumably consumed by
        # BaseTrainer when saving/loading - confirm against base_trainer.py
        return ModelType.LSTM
55
+
56
    @staticmethod
    def find_and_load_pretrain(saved_args, foundation_cache):
        """
        Load the word vector pretrain recorded in saved_args, falling back to the language default

        Returns None if the saved model recorded no pretrain file at all.
        """
        if 'wordvec_pretrain_file' not in saved_args:
            return None
        if os.path.exists(saved_args['wordvec_pretrain_file']):
            return load_pretrain(saved_args['wordvec_pretrain_file'], foundation_cache)
        # the recorded path may not exist on this machine (eg, a model
        # trained elsewhere) - look up the default pretrain instead
        logger.info("Unable to find pretrain in %s Will try to load from the default resources instead", saved_args['wordvec_pretrain_file'])
        language = saved_args['lang']
        wordvec_pretrain = find_wordvec_pretrain(language, default_pretrains)
        return load_pretrain(wordvec_pretrain, foundation_cache)
66
+
67
    @staticmethod
    def find_and_load_charlm(charlm_file, direction, saved_args, foundation_cache):
        """
        Load the character LM from charlm_file, falling back to the default charlm for the language/dataset

        direction is which charlm ("forward" or "backward" style) to look
        up when falling back to the default resources.
        """
        try:
            return load_charlm(charlm_file, foundation_cache)
        except FileNotFoundError as e:
            # the recorded path may not exist on this machine - look up
            # the default charlm for this language and dataset instead
            logger.info("Unable to load charlm from %s Will try to load from the default resources instead", charlm_file)
            language = saved_args['lang']
            # shorthand is of the form lang_dataset
            dataset = saved_args['shorthand'].split("_")[1]
            charlm = choose_charlm(language, dataset, "default", default_charlms, {})
            charlm_file = find_charlm_file(direction, language, charlm)
            return load_charlm(charlm_file, foundation_cache)
78
+
79
    def log_num_words_known(self, words):
        # log how much of the training vocabulary the embedding covers
        tlogger.info("Number of words in the training set found in the embedding: %d out of %d", self.model.num_words_known(words), len(words))
81
+
82
    @staticmethod
    def load_optimizer(model, checkpoint, first_optimizer, filename):
        """
        Build an optimizer for the model and restore its state from the checkpoint, if one was saved

        filename is only used for error reporting.
        """
        optimizer = build_optimizer(model.args, model, first_optimizer)
        if checkpoint.get('optimizer_state_dict', None) is not None:
            try:
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            except ValueError as e:
                # typically means the saved state did not match the
                # freshly built optimizer's parameters
                raise ValueError("Failed to load optimizer from %s" % filename) from e
        else:
            logger.info("Attempted to load optimizer to resume training, but optimizer not saved. Creating new optimizer")
        return optimizer
93
+
94
    @staticmethod
    def load_scheduler(model, optimizer, checkpoint, first_optimizer):
        """
        Build an LR scheduler for the optimizer and restore its state from the checkpoint, if one was saved
        """
        scheduler = build_scheduler(model.args, optimizer, first_optimizer=first_optimizer)
        if 'scheduler_state_dict' in checkpoint:
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        return scheduler
100
+
101
    @staticmethod
    def model_from_params(params, peft_params, args, foundation_cache=None, peft_name=None):
        """
        Build a new model just from the saved params and some extra args

        Refactoring allows other processors to include a constituency parser as a module

        params: the unpickled checkpoint contents - 'config', 'model' (state dict),
          'model_type', transition/constituent/tag/word vocabularies, etc
        peft_params: saved PEFT adapter weights; only used when the saved config
          has use_peft set
        args: runtime overrides applied on top of the saved config; may be None
        foundation_cache: optional cache so pretrain / charlm / transformer modules
          can be shared between loads
        peft_name: reuse an already-registered PEFT adapter name instead of
          loading a fresh adapter from peft_params
        """
        saved_args = dict(params['config'])
        # older checkpoints serialized these enum-valued options as strings;
        # convert them back to the enum members the model code expects
        if isinstance(saved_args['sentence_boundary_vectors'], str):
            saved_args['sentence_boundary_vectors'] = SentenceBoundary[saved_args['sentence_boundary_vectors']]
        if isinstance(saved_args['constituency_composition'], str):
            saved_args['constituency_composition'] = ConstituencyComposition[saved_args['constituency_composition']]
        if isinstance(saved_args['transition_stack'], str):
            saved_args['transition_stack'] = StackHistory[saved_args['transition_stack']]
        if isinstance(saved_args['constituent_stack'], str):
            saved_args['constituent_stack'] = StackHistory[saved_args['constituent_stack']]
        if isinstance(saved_args['transition_scheme'], str):
            saved_args['transition_scheme'] = TransitionScheme[saved_args['transition_scheme']]

        # some parameters which change the structure of a model have
        # to be ignored, or the model will not function when it is
        # reloaded from disk
        if args is None: args = {}
        update_args = copy.deepcopy(args)
        pop_peft_args(update_args)
        update_args.pop("bert_hidden_layers", None)
        update_args.pop("bert_model", None)
        update_args.pop("constituency_composition", None)
        update_args.pop("constituent_stack", None)
        update_args.pop("num_tree_lstm_layers", None)
        update_args.pop("transition_scheme", None)
        update_args.pop("transition_stack", None)
        update_args.pop("maxout_k", None)
        # if the pretrain or charlms are not specified, don't override the values in the model
        # (if any), since the model won't even work without loading the same charlm
        if 'wordvec_pretrain_file' in update_args and update_args['wordvec_pretrain_file'] is None:
            update_args.pop('wordvec_pretrain_file')
        if 'charlm_forward_file' in update_args and update_args['charlm_forward_file'] is None:
            update_args.pop('charlm_forward_file')
        if 'charlm_backward_file' in update_args and update_args['charlm_backward_file'] is None:
            update_args.pop('charlm_backward_file')
        # we don't pop bert_finetune, with the theory being that if
        # the saved model has bert_finetune==True we can load the bert
        # weights but then not further finetune if bert_finetune==False
        saved_args.update(update_args)

        # TODO: not needed if we rebuild the models
        # default the finetune flags for checkpoints saved before they existed
        if saved_args.get("bert_finetune", None) is None:
            saved_args["bert_finetune"] = False
        if saved_args.get("stage1_bert_finetune", None) is None:
            saved_args["stage1_bert_finetune"] = False

        model_type = params['model_type']
        if model_type == 'LSTM':
            pt = Trainer.find_and_load_pretrain(saved_args, foundation_cache)
            if saved_args.get('use_peft', False):
                # if loading a peft model, we first load the base transformer
                # then we load the weights using the saved weights in the file
                if peft_name is None:
                    bert_model, bert_tokenizer, peft_name = load_bert_with_peft(saved_args.get('bert_model', None), "constituency", foundation_cache)
                else:
                    bert_model, bert_tokenizer = load_bert(saved_args.get('bert_model', None), foundation_cache)
                    bert_model = load_peft_wrapper(bert_model, peft_params, saved_args, logger, peft_name)
                bert_saved = True
            elif saved_args['bert_finetune'] or saved_args['stage1_bert_finetune'] or any(x.startswith("bert_model.") for x in params['model'].keys()):
                # if bert_finetune is True, don't use the cached model!
                # otherwise, other uses of the cached model will be ruined
                bert_model, bert_tokenizer = load_bert(saved_args.get('bert_model', None))
                bert_saved = True
            else:
                bert_model, bert_tokenizer = load_bert(saved_args.get('bert_model', None), foundation_cache)
                bert_saved = False
            forward_charlm = Trainer.find_and_load_charlm(saved_args["charlm_forward_file"], "forward", saved_args, foundation_cache)
            backward_charlm = Trainer.find_and_load_charlm(saved_args["charlm_backward_file"], "backward", saved_args, foundation_cache)

            # TODO: the isinstance will be unnecessary after 1.10.0
            # older checkpoints stored transitions as their repr strings
            transitions = params['transitions']
            if all(isinstance(x, str) for x in transitions):
                transitions = [Transition.from_repr(x) for x in transitions]

            model = LSTMModel(pretrain=pt,
                              forward_charlm=forward_charlm,
                              backward_charlm=backward_charlm,
                              bert_model=bert_model,
                              bert_tokenizer=bert_tokenizer,
                              force_bert_saved=bert_saved,
                              peft_name=peft_name,
                              transitions=transitions,
                              constituents=params['constituents'],
                              tags=params['tags'],
                              words=params['words'],
                              rare_words=set(params['rare_words']),
                              root_labels=params['root_labels'],
                              constituent_opens=params['constituent_opens'],
                              unary_limit=params['unary_limit'],
                              args=saved_args)
        else:
            raise ValueError("Unknown model type {}".format(model_type))
        # strict=False: the state dict may omit frozen foundation modules
        model.load_state_dict(params['model'], strict=False)
        # model will stay on CPU if device==None
        # can be moved elsewhere later, of course
        model = model.to(args.get('device', None))
        return model
204
+
205
    @staticmethod
    def build_trainer(args, train_transitions, train_constituents, tags, words, rare_words, root_labels, open_nodes, unary_limit, foundation_cache, model_load_file):
        """
        Create a Trainer for the given training configuration.

        Four paths, checked in order:
          checkpoint: resume from a previous run's checkpoint
          finetune: load an existing model and continue training it
          relearn_structure: load an existing model, build a model with a new
            structure, and copy over whatever weights still fit
          otherwise: build a brand new model, possibly as the first stage of a
            multistage run
        """
        # TODO: turn finetune, relearn_structure, multistage into an enum?
        # finetune just means continue learning, so checkpoint is sufficient
        # relearn_structure is essentially a one stage multistage
        # multistage with a checkpoint will have the proper optimizer for that epoch
        # and no special learning mode means we are training a new model and should continue
        if args['checkpoint'] and args['checkpoint_save_name'] and os.path.exists(args['checkpoint_save_name']):
            tlogger.info("Found checkpoint to continue training: %s", args['checkpoint_save_name'])
            trainer = Trainer.load(args['checkpoint_save_name'], args, load_optimizer=True, foundation_cache=foundation_cache)
            return trainer

        # in the 'finetune' case, this will preload the models into foundation_cache,
        # so the effort is not wasted
        pt = foundation_cache.load_pretrain(args['wordvec_pretrain_file'])
        forward_charlm = foundation_cache.load_charlm(args['charlm_forward_file'])
        backward_charlm = foundation_cache.load_charlm(args['charlm_backward_file'])

        if args['finetune']:
            tlogger.info("Loading model to finetune: %s", model_load_file)
            trainer = Trainer.load(model_load_file, args, load_optimizer=True, foundation_cache=NoTransformerFoundationCache(foundation_cache))
            # a new finetuning will start with a new epochs_trained count
            trainer.epochs_trained = 0
            return trainer

        if args['relearn_structure']:
            tlogger.info("Loading model to continue training with new structure from %s", model_load_file)
            temp_args = dict(args)
            # remove the pattn & lattn layers unless the saved model had them
            temp_args.pop('pattn_num_layers', None)
            temp_args.pop('lattn_d_proj', None)
            trainer = Trainer.load(model_load_file, temp_args, load_optimizer=False, foundation_cache=NoTransformerFoundationCache(foundation_cache))

            # using the model's current values works for if the new
            # dataset is the same or smaller
            # TODO: handle a larger dataset as well
            model = LSTMModel(pt,
                              forward_charlm,
                              backward_charlm,
                              trainer.model.bert_model,
                              trainer.model.bert_tokenizer,
                              trainer.model.force_bert_saved,
                              trainer.model.peft_name,
                              trainer.model.transitions,
                              trainer.model.constituents,
                              trainer.model.tags,
                              trainer.model.delta_words,
                              trainer.model.rare_words,
                              trainer.model.root_labels,
                              trainer.model.constituent_opens,
                              trainer.model.unary_limit(),
                              args)
            model = model.to(args['device'])
            # copy over whichever weights still fit the new structure
            model.copy_with_new_structure(trainer.model)
            optimizer = build_optimizer(args, model, False)
            scheduler = build_scheduler(args, optimizer)
            trainer = Trainer(model, optimizer, scheduler)
            return trainer

        if args['multistage']:
            # run adadelta over the model for half the time with no pattn or lattn
            # training then switches to a different optimizer for the rest
            # this works surprisingly well
            tlogger.info("Warming up model for %d iterations using AdaDelta to train the embeddings", args['epochs'] // 2)
            temp_args = dict(args)
            # remove the attention layers for the temporary model
            temp_args['pattn_num_layers'] = 0
            temp_args['lattn_d_proj'] = 0
            args = temp_args

        peft_name = None
        if args['use_peft']:
            peft_name = "constituency"
            bert_model, bert_tokenizer = load_bert(args['bert_model'])
            # NOTE(review): temp_args is only bound inside the multistage (or
            # relearn_structure, which returns early) branch above; use_peft
            # without multistage would raise NameError here - presumably this
            # should be args. TODO confirm
            bert_model = build_peft_wrapper(bert_model, temp_args, tlogger, adapter_name=peft_name)
        elif args['bert_finetune'] or args['stage1_bert_finetune']:
            # finetuning modifies the transformer weights, so do not share a
            # cached copy with other users of the foundation cache
            bert_model, bert_tokenizer = load_bert(args['bert_model'])
        else:
            bert_model, bert_tokenizer = load_bert(args['bert_model'], foundation_cache)
        model = LSTMModel(pt,
                          forward_charlm,
                          backward_charlm,
                          bert_model,
                          bert_tokenizer,
                          False,
                          peft_name,
                          train_transitions,
                          train_constituents,
                          tags,
                          words,
                          rare_words,
                          root_labels,
                          open_nodes,
                          unary_limit,
                          args)
        model = model.to(args['device'])

        # multistage starts with a plain AdaDelta optimizer for the first half
        optimizer = build_optimizer(args, model, build_simple_adadelta=args['multistage'])
        scheduler = build_scheduler(args, optimizer, first_optimizer=args['multistage'])

        trainer = Trainer(model, optimizer, scheduler, first_optimizer=args['multistage'])
        return trainer
stanza/stanza/models/constituency/transformer_tree_stack.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Based on
3
+
4
+ Transition-based Parsing with Stack-Transformers
5
+ Ramon Fernandez Astudillo, Miguel Ballesteros, Tahira Naseem,
6
+ Austin Blodget, and Radu Florian
7
+ https://aclanthology.org/2020.findings-emnlp.89.pdf
8
+ """
9
+
10
+ from collections import namedtuple
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+
15
+ from stanza.models.constituency.positional_encoding import SinusoidalEncoding
16
+ from stanza.models.constituency.tree_stack import TreeStack
17
+
18
# One stack entry: the caller's value plus the cached attention key/value rows
# for this stack prefix and the attention output computed at this position
Node = namedtuple("Node", ['value', 'key_stack', 'value_stack', 'output'])
19
+
20
class TransformerTreeStack(nn.Module):
    """
    A stack whose state summary is computed with a single self-attention layer.

    Each TreeStack node caches the attention keys/values for its prefix, so a
    push only has to compute one new key/value row and rerun attention.
    """
    def __init__(self, input_size, output_size, input_dropout, length_limit=None, use_position=False, num_heads=1):
        """
        Builds the internal matrices and start parameter

        input_size: dimension of the vectors pushed onto the stack
        output_size: dimension of the attention output; the reshapes in
          attention() assume it is divisible by num_heads
        input_dropout: either a dropout probability or an nn.Module to apply
        length_limit: if set, a push beyond this depth drops the oldest
          non-start entry from the cached key/value rows
        use_position: add a sinusoidal position encoding to each pushed input

        TODO: currently only one attention head, implement MHA
        """
        super().__init__()

        self.input_size = input_size
        self.output_size = output_size
        # standard 1/sqrt(d) scaling applied to the attention logits
        self.inv_sqrt_output_size = 1 / output_size ** 0.5
        self.num_heads = num_heads

        self.w_query = nn.Linear(input_size, output_size)
        self.w_key = nn.Linear(input_size, output_size)
        self.w_value = nn.Linear(input_size, output_size)

        # learned embedding standing in for the empty stack
        self.register_parameter('start_embedding', torch.nn.Parameter(0.2 * torch.randn(input_size, requires_grad=True)))
        if isinstance(input_dropout, nn.Module):
            self.input_dropout = input_dropout
        else:
            self.input_dropout = nn.Dropout(input_dropout)

        if length_limit is not None and length_limit < 1:
            raise ValueError("length_limit < 1 makes no sense")
        self.length_limit = length_limit

        self.use_position = use_position
        if use_position:
            self.position_encoding = SinusoidalEncoding(model_dim=self.input_size, max_len=512)

    def attention(self, key, query, value, mask=None):
        """
        Calculate attention for the given key, query value

        Where B is the number of items stacked together, N is the length:
        The key should be BxNxD
        The query is BxD
        The value is BxNxD

        If mask is specified, it should be BxN of True/False values,
        where True means that location is masked out

        Reshapes and reorders are used to handle num_heads

        Return will be softmax(query x key^T) * value
        of size BxD
        """
        B = key.shape[0]
        N = key.shape[1]
        D = key.shape[2]

        H = self.num_heads

        # query is now BxDx1
        query = query.unsqueeze(2)
        # BxHxD/Hx1
        query = query.reshape((B, H, -1, 1))

        # BxNxHxD/H
        key = key.reshape((B, N, H, -1))
        # BxHxNxD/H
        key = key.transpose(1, 2)

        # BxNxHxD/H
        value = value.reshape((B, N, H, -1))
        # BxHxNxD/H
        value = value.transpose(1, 2)

        # BxHxNxD/H x BxHxD/Hx1
        # result shape: BxHxN
        attn = torch.matmul(key, query).squeeze(3) * self.inv_sqrt_output_size
        if mask is not None:
            # mask goes from BxN -> Bx1xN, then broadcast over heads
            mask = mask.unsqueeze(1)
            mask = mask.expand(-1, H, -1)
            # -inf logits softmax to zero weight at masked positions
            attn.masked_fill_(mask, float('-inf'))
        # attn shape will now be BxHx1xN
        attn = torch.softmax(attn, dim=2).unsqueeze(2)
        # BxHx1xN x BxHxNxD/H -> BxHxD/H
        output = torch.matmul(attn, value).squeeze(2)
        # merge the heads back into a single D-dim vector
        output = output.reshape(B, -1)
        return output

    def initial_state(self, initial_value=None):
        """
        Return an initial state based on a single layer of attention

        Running attention might be overkill, but it is the simplest
        way to put the Linears and start_embedding in the computation graph
        """
        start = self.start_embedding
        if self.use_position:
            position = self.position_encoding([0]).squeeze(0)
            start = start + position

        # N=1
        # shape: 1xD
        key = self.w_key(start).unsqueeze(0)

        # shape: D
        query = self.w_query(start)

        # shape: 1xD
        value = self.w_value(start).unsqueeze(0)

        # unsqueeze to make it look like we are part of a batch of size 1
        output = self.attention(key.unsqueeze(0), query.unsqueeze(0), value.unsqueeze(0)).squeeze(0)
        return TreeStack(value=Node(initial_value, key, value, output), parent=None, length=1)

    def push_states(self, stacks, values, inputs):
        """
        Push new inputs to the stacks and rerun attention on them

        Where B is the number of items stacked together, I is input_size
        stacks: B TreeStacks such as produced by initial_state and/or push_states
        values: the new items to push on the stacks such as tree nodes or anything
        inputs: BxI for the new input items

        Runs attention starting from the existing keys & values

        Returns a list of B new TreeStacks with the pushed Nodes.
        """
        device = self.w_key.weight.device

        batch_len = len(stacks) # B
        # current depth of each stack's cached key rows (may be capped by length_limit)
        positions = [x.value.key_stack.shape[0] for x in stacks]
        max_len = max(positions) # N

        if self.use_position:
            position_encodings = self.position_encoding(positions)
            inputs = inputs + position_encodings

        inputs = self.input_dropout(inputs)
        if len(inputs.shape) == 3:
            if inputs.shape[0] == 1:
                inputs = inputs.squeeze(0)
            else:
                raise ValueError("Expected the inputs to be of shape 1xBxI, got {}".format(inputs.shape))

        # shorter stacks are right-aligned: padding at the front, new row last
        new_keys = self.w_key(inputs)
        key_stack = torch.zeros(batch_len, max_len+1, self.output_size, device=device)
        key_stack[:, -1, :] = new_keys
        for stack_idx, stack in enumerate(stacks):
            key_stack[stack_idx, -positions[stack_idx]-1:-1, :] = stack.value.key_stack

        new_values = self.w_value(inputs)
        value_stack = torch.zeros(batch_len, max_len+1, self.output_size, device=device)
        value_stack[:, -1, :] = new_values
        for stack_idx, stack in enumerate(stacks):
            value_stack[stack_idx, -positions[stack_idx]-1:-1, :] = stack.value.value_stack

        query = self.w_query(inputs)

        # mask out the front padding of the shorter stacks
        mask = torch.zeros(batch_len, max_len+1, device=device, dtype=torch.bool)
        for stack_idx, stack in enumerate(stacks):
            # NOTE(review): this compares the TreeStack length, but `positions`
            # comes from key_stack, which can be shorter once length_limit has
            # trimmed entries - confirm len(stack) and positions stay in sync
            if len(stack) < max_len:
                masked = max_len - positions[stack_idx]
                mask[stack_idx, :masked] = True

        batched_output = self.attention(key_stack, query, value_stack, mask)

        new_stacks = []
        for stack_idx, (stack, node_value, new_key, new_value, output) in enumerate(zip(stacks, values, key_stack, value_stack, batched_output)):
            # max_len-len(stack) so that we ignore the padding at the start of shorter stacks
            new_key_stack = new_key[max_len-positions[stack_idx]:, :]
            new_value_stack = new_value[max_len-positions[stack_idx]:, :]
            if self.length_limit is not None and new_key_stack.shape[0] > self.length_limit + 1:
                # keep the start row (index 0) and drop the oldest real entry
                new_key_stack = torch.cat([new_key_stack[:1, :], new_key_stack[2:, :]], axis=0)
                new_value_stack = torch.cat([new_value_stack[:1, :], new_value_stack[2:, :]], axis=0)
            new_stacks.append(stack.push(value=Node(node_value, new_key_stack, new_value_stack, output)))
        return new_stacks

    def output(self, stack):
        """
        Return the last layer of the lstm_hx as the output from a stack

        Refactored so that alternate structures have an easy way of getting the output
        """
        return stack.value.output
stanza/stanza/models/constituency/transition_sequence.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Build a transition sequence from parse trees.
3
+
4
+ Supports multiple transition schemes - TOP_DOWN and variants, IN_ORDER
5
+ """
6
+
7
+ import logging
8
+
9
+ from stanza.models.common import utils
10
+ from stanza.models.constituency.parse_transitions import Shift, CompoundUnary, OpenConstituent, CloseConstituent, TransitionScheme, Finalize
11
+ from stanza.models.constituency.tree_reader import read_trees
12
+ from stanza.utils.get_tqdm import get_tqdm
13
+
14
+ tqdm = get_tqdm()
15
+
16
+ logger = logging.getLogger('stanza.constituency.trainer')
17
+
18
def yield_top_down_sequence(tree, transition_scheme=TransitionScheme.TOP_DOWN_UNARY):
    """
    For tree (X A B C D), yield Open(X) A B C D Close

    The details are in how to treat unary transitions
    Three possibilities handled by this method:
      TOP_DOWN_UNARY: (Y (X ...)) -> Open(X) ... Close Unary(Y)
      TOP_DOWN_COMPOUND: (Y (X ...)) -> Open(Y, X) ... Close
      TOP_DOWN: (Y (X ...)) -> Open(Y) Open(X) ... Close Close
    """
    if tree.is_preterminal():
        yield Shift()
        return

    if tree.is_leaf():
        return

    if transition_scheme is TransitionScheme.TOP_DOWN_UNARY and len(tree.children) == 1:
        # collapse the whole unary chain, then tack a CompoundUnary on at the end
        unary_labels = []
        while not tree.is_preterminal() and len(tree.children) == 1:
            unary_labels.append(tree.label)
            tree = tree.children[0]
        yield from yield_top_down_sequence(tree, transition_scheme)
        yield CompoundUnary(*unary_labels)
        return

    if transition_scheme is TransitionScheme.TOP_DOWN_COMPOUND:
        # fold unary chains directly into a single compound Open
        open_labels = [tree.label]
        while len(tree.children) == 1 and not tree.children[0].is_preterminal():
            tree = tree.children[0]
            open_labels.append(tree.label)
        yield OpenConstituent(*open_labels)
    else:
        yield OpenConstituent(tree.label)
    for child in tree.children:
        yield from yield_top_down_sequence(child, transition_scheme)
    yield CloseConstituent()
+
59
def yield_in_order_sequence(tree):
    """
    For tree (X A B C D), yield A Open(X) B C D Close

    The constituent is opened only after its first child has been produced.
    """
    if tree.is_preterminal():
        yield Shift()
        return

    if tree.is_leaf():
        return

    children = tree.children
    # first child comes before the Open
    yield from yield_in_order_sequence(children[0])
    yield OpenConstituent(tree.label)
    # remaining children come after it
    for child in children[1:]:
        yield from yield_in_order_sequence(child)
    yield CloseConstituent()
80
+
81
+
82
+
83
def yield_in_order_compound_sequence(tree, transition_scheme):
    """
    Yield an in-order transition sequence where unary chains are collapsed.

    Two schemes are handled:
      IN_ORDER_COMPOUND: a unary chain (Y (X ...)) is folded into a single
        compound OpenConstituent, eg Open(Y, X)
      IN_ORDER_UNARY: the chain's constituent is opened normally and a
        CompoundUnary with the chain labels is emitted after the subtree
        closes (or directly after the Shift for preterminals)

    The sequence ends with a Finalize carrying the root label.

    Raises ValueError if the tree has no children or more than one top
    level node, since Finalize expects a single node under the root.
    """
    def helper(tree):
        # recursively emit transitions for one subtree
        if tree.is_leaf():
            return

        # collect the labels of the unary chain above the current node
        labels = []
        while len(tree.children) == 1 and not tree.is_preterminal():
            labels.append(tree.label)
            tree = tree.children[0]

        if tree.is_preterminal():
            yield Shift()
            if len(labels) > 0:
                yield CompoundUnary(*labels)
            return

        # in-order: first child before the Open
        for transition in helper(tree.children[0]):
            yield transition

        if transition_scheme is TransitionScheme.IN_ORDER_UNARY:
            yield OpenConstituent(tree.label)
        else:
            # IN_ORDER_COMPOUND: fold the unary chain into the Open itself
            labels.append(tree.label)
            yield OpenConstituent(*labels)

        for child in tree.children[1:]:
            for transition in helper(child):
                yield transition

        yield CloseConstituent()

        # for IN_ORDER_UNARY the chain is reapplied after the Close
        if transition_scheme is TransitionScheme.IN_ORDER_UNARY and len(labels) > 0:
            yield CompoundUnary(*labels)

    if len(tree.children) == 0:
        raise ValueError("Cannot build {} on an empty tree".format(transition_scheme))
    if len(tree.children) != 1:
        raise ValueError("Cannot build {} with a tree that has two top level nodes: {}".format(transition_scheme, tree))

    for t in helper(tree.children[0]):
        yield t

    yield Finalize(tree.label)
126
+
127
def build_sequence(tree, transition_scheme=TransitionScheme.TOP_DOWN_UNARY):
    """
    Turn a single tree into a list of transitions based on the TransitionScheme
    """
    if transition_scheme is TransitionScheme.IN_ORDER:
        generator = yield_in_order_sequence(tree)
    elif transition_scheme in (TransitionScheme.IN_ORDER_COMPOUND, TransitionScheme.IN_ORDER_UNARY):
        generator = yield_in_order_compound_sequence(tree, transition_scheme)
    else:
        generator = yield_top_down_sequence(tree, transition_scheme)
    return list(generator)
138
+
139
def build_treebank(trees, transition_scheme=TransitionScheme.TOP_DOWN_UNARY, reverse=False):
    """
    Turn each of the trees in the treebank into a list of transitions based on the TransitionScheme

    If reverse is set, each tree is reversed before building its sequence.
    """
    if reverse:
        trees = (tree.reverse() for tree in trees)
    return [build_sequence(tree, transition_scheme) for tree in trees]
147
+
148
def all_transitions(transition_lists):
    """
    Given a list of transition lists, combine them all into a list of unique transitions.

    The result is sorted for determinism.
    """
    unique = set()
    for sequence in transition_lists:
        unique.update(sequence)
    return sorted(unique)
156
+
157
def convert_trees_to_sequences(trees, treebank_name, transition_scheme, reverse=False):
    """
    Wrap both build_treebank and all_transitions, possibly with a tqdm

    Converts trees to a list of sequences, then returns the list of known transitions
    """
    if len(trees) == 0:
        return [], []
    logger.info("Building %s transition sequences", treebank_name)
    # only show a progress bar when INFO logging is visible
    if logger.getEffectiveLevel() <= logging.INFO:
        trees = tqdm(trees)
    sequences = build_treebank(trees, transition_scheme, reverse)
    return sequences, all_transitions(sequences)
171
+
172
def main():
    """
    Convert a sample tree and print its transitions
    """
    text="( (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"

    sample = read_trees(text)[0]
    print(sample)
    print(build_sequence(sample))
184
+
185
+ if __name__ == '__main__':
186
+ main()
stanza/stanza/models/constituency/tree_embedding.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A module to use a Constituency Parser to make an embedding for a tree
3
+
4
+ The embedding can be produced just from the words and the top of the
5
+ tree, or it can be done with a form of attention over the nodes
6
+
7
+ Can be done over an existing parse tree or unparsed text
8
+ """
9
+
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+ from stanza.models.constituency.trainer import Trainer
15
+
16
class TreeEmbedding(nn.Module):
    """
    Uses a constituency parser to build an embedding for a parse tree.

    The embedding concatenates the word LSTM boundary states, the
    transition stack state, and the constituent stack state; optionally an
    attention layer over the tree's internal nodes is applied on top.
    """
    def __init__(self, constituency_parser, args):
        """
        constituency_parser: an LSTMModel (or compatible) providing
          hidden_size, transition_hidden_size, num_tree_lstm_layers,
          analyze_trees, and the transition/constituent stacks
        args: dict with the config keys read below (all_words, backprop,
          node_attn, top_layer)
        """
        super(TreeEmbedding, self).__init__()

        # keep only the config values this module uses so get_params()
        # saves a minimal, reloadable config
        self.config = {
            "all_words": args["all_words"],
            "backprop": args["backprop"],
            #"batch_norm": args["batch_norm"],
            "node_attn": args["node_attn"],
            "top_layer": args["top_layer"],
        }

        self.constituency_parser = constituency_parser

        # word_lstm: hidden_size * num_tree_lstm_layers * 2 (start & end)
        # transition_stack: transition_hidden_size
        # constituent_stack: hidden_size
        self.hidden_size = self.constituency_parser.hidden_size + self.constituency_parser.transition_hidden_size
        if self.config["all_words"]:
            self.hidden_size += self.constituency_parser.hidden_size * self.constituency_parser.num_tree_lstm_layers
        else:
            self.hidden_size += self.constituency_parser.hidden_size * self.constituency_parser.num_tree_lstm_layers * 2

        if self.config["node_attn"]:
            self.query = nn.Linear(self.constituency_parser.hidden_size, self.constituency_parser.hidden_size)
            self.key = nn.Linear(self.hidden_size, self.constituency_parser.hidden_size)
            self.value = nn.Linear(self.constituency_parser.hidden_size, self.constituency_parser.hidden_size)

            # TODO: cat transition and constituent hx as well?
            self.output_size = self.constituency_parser.hidden_size * self.constituency_parser.num_tree_lstm_layers
        else:
            self.output_size = self.hidden_size

        # TODO: maybe have batch_norm, maybe use Identity
        #if self.config["batch_norm"]:
        #    self.input_norm = nn.BatchNorm1d(self.output_size)

    def embed_trees(self, inputs):
        """
        Run the parser over the inputs and build one embedding per tree.

        Returns either a batched tensor of concatenated states (no node
        attention) or a list of per-tree attention outputs.
        """
        if self.config["backprop"]:
            states = self.constituency_parser.analyze_trees(inputs)
        else:
            # freeze the parser: no gradients flow into it
            with torch.no_grad():
                states = self.constituency_parser.analyze_trees(inputs)

        constituent_lists = [x.constituents for x in states]
        states = [x.state for x in states]

        word_begin_hx = torch.stack([state.word_queue[0].hx for state in states])
        word_end_hx = torch.stack([state.word_queue[state.word_position].hx for state in states])
        transition_hx = torch.stack([self.constituency_parser.transition_stack.output(state.transitions) for state in states])
        # go down one layer to get the embedding off the top of the S, not the ROOT
        # (in terms of the typical treebank)
        # the idea being that the ROOT has no additional information
        # and may even have 0s for the embedding in certain circumstances,
        # such as after learning UNTIED_MAX long enough
        if self.config["top_layer"]:
            constituent_hx = torch.stack([self.constituency_parser.constituent_stack.output(state.constituents) for state in states])
        else:
            constituent_hx = torch.cat([constituents[-2].tree_hx for constituents in constituent_lists], dim=0)

        if self.config["all_words"]:
            # need B matrices of N x hidden_size
            key = [torch.stack([torch.cat([word.hx, thx, chx]) for word in state.word_queue], dim=0)
                   for state, thx, chx in zip(states, transition_hx, constituent_hx)]
        else:
            key = torch.cat((word_begin_hx, word_end_hx, transition_hx, constituent_hx), dim=1).unsqueeze(1)

        if not self.config["node_attn"]:
            return key
        key = [self.key(x) for x in key]

        node_hx = [torch.stack([con.tree_hx for con in constituents], dim=0) for constituents in constituent_lists]
        queries = [self.query(nhx).reshape(nhx.shape[0], -1) for nhx in node_hx]
        values = [self.value(nhx).reshape(nhx.shape[0], -1) for nhx in node_hx]
        # TODO: could pad to make faster here
        attn = [torch.matmul(q, k.transpose(0, 1)) for q, k in zip(queries, key)]
        attn = [torch.softmax(x, dim=0) for x in attn]
        previous_layer = [torch.matmul(weight.transpose(0, 1), value) for weight, value in zip(attn, values)]
        return previous_layer

    def forward(self, inputs):
        # BUGFIX: embed_trees is a method; the previous bare call
        # `embed_trees(self, inputs)` raised NameError at runtime
        return self.embed_trees(inputs)

    def get_norms(self):
        """Return human-readable parameter norm lines for logging."""
        lines = ["constituency_parser." + x for x in self.constituency_parser.get_norms()]
        for name, param in self.named_parameters():
            if param.requires_grad and not name.startswith('constituency_parser.'):
                lines.append("%s %.6g" % (name, torch.norm(param).item()))
        return lines


    def get_params(self, skip_modules=True):
        """Return a picklable dict of this module's weights, the parser's params, and the config."""
        model_state = self.state_dict()
        # skip all of the constituency parameters here -
        # we will add them by calling the model's get_params()
        skipped = [k for k in model_state.keys() if k.startswith("constituency_parser.")]
        for k in skipped:
            del model_state[k]

        parser = self.constituency_parser.get_params(skip_modules)

        params = {
            'model': model_state,
            'constituency': parser,
            'config': self.config,
        }
        return params

    @staticmethod
    def from_parser_file(args, foundation_cache=None):
        """Build a TreeEmbedding around a parser loaded from args['model']."""
        # BUGFIX: pass foundation_cache by keyword - positionally it would
        # land in Trainer.load's load_optimizer parameter
        constituency_parser = Trainer.load(args['model'], args, foundation_cache=foundation_cache)
        return TreeEmbedding(constituency_parser.model, args)

    @staticmethod
    def model_from_params(params, args, foundation_cache=None):
        """Rebuild a TreeEmbedding from the dict produced by get_params()."""
        # TODO: integrate with peft
        constituency_parser = Trainer.model_from_params(params['constituency'], None, args, foundation_cache)
        model = TreeEmbedding(constituency_parser, params['config'])
        # strict=False: the state dict deliberately omits the parser weights
        model.load_state_dict(params['model'], strict=False)
        return model
stanza/stanza/models/coref/config.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Describes Config, a simple namespace for config values.
2
+
3
+ For description of all config values, refer to config.toml.
4
+ """
5
+
6
+ from dataclasses import dataclass
7
+ from typing import Dict, List
8
+
9
+
10
+ @dataclass
11
+ class Config: # pylint: disable=too-many-instance-attributes, too-few-public-methods
12
+ """ Contains values needed to set up the coreference model. """
13
+ section: str
14
+
15
+ # TODO: can either eliminate data_dir or use it for the train/dev/test data
16
+ data_dir: str
17
+ save_dir: str
18
+ save_name: str
19
+
20
+ train_data: str
21
+ dev_data: str
22
+ test_data: str
23
+
24
+ device: str
25
+
26
+ bert_model: str
27
+ bert_window_size: int
28
+
29
+ embedding_size: int
30
+ sp_embedding_size: int
31
+ a_scoring_batch_size: int
32
+ hidden_size: int
33
+ n_hidden_layers: int
34
+
35
+ max_span_len: int
36
+
37
+ rough_k: int
38
+
39
+ lora: bool
40
+ lora_alpha: int
41
+ lora_rank: int
42
+ lora_dropout: float
43
+
44
+ full_pairwise: bool
45
+
46
+ lora_target_modules: List[str]
47
+ lora_modules_to_save: List[str]
48
+
49
+ clusters_starts_are_singletons: bool
50
+ bert_finetune: bool
51
+ dropout_rate: float
52
+ learning_rate: float
53
+ bert_learning_rate: float
54
+ # we find that setting this to a small but non-zero number
55
+ # makes the model less likely to forget how to do anything
56
+ bert_finetune_begin_epoch: float
57
+ train_epochs: int
58
+ bce_loss_weight: float
59
+
60
+ tokenizer_kwargs: Dict[str, dict]
61
+ conll_log_dir: str
62
+
63
+ save_each_checkpoint: bool
64
+ log_norms: bool
65
+ singletons: bool
66
+
stanza/stanza/models/coref/coref_config.toml ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # Before you start changing anything here, read the comments.
3
+ # All of them can be found below in the "DEFAULT" section
4
+
5
+ [DEFAULT]
6
+
7
+ # The directory that contains extracted files of everything you've downloaded.
8
+ data_dir = "data/coref"
9
+
10
+ # where to put checkpoints and final models
11
+ save_dir = "saved_models/coref"
12
+ save_name = "bert-large-cased"
13
+
14
+ # Train, dev and test jsonlines
15
+ # train_data = "data/coref/en_gum-ud.train.nosgl.json"
16
+ # dev_data = "data/coref/en_gum-ud.dev.nosgl.json"
17
+ # test_data = "data/coref/en_gum-ud.test.nosgl.json"
18
+
19
+ train_data = "data/coref/corefud_concat_v1_0_langid.train.json"
20
+ dev_data = "data/coref/corefud_concat_v1_0_langid.dev.json"
21
+ test_data = "data/coref/corefud_concat_v1_0_langid.dev.json"
22
+
23
+ #train_data = "data/coref/english_train_head.jsonlines"
24
+ #dev_data = "data/coref/english_development_head.jsonlines"
25
+ #test_data = "data/coref/english_test_head.jsonlines"
26
+
27
+ # do not use the full pairwise encoding scheme
28
+ full_pairwise = false
29
+
30
+ # The device where everything is to be placed. "cuda:N"/"cpu" are supported.
31
+ device = "cuda:0"
32
+
33
+ save_each_checkpoint = false
34
+ log_norms = false
35
+
36
+ # Bert settings ======================
37
+
38
+ # Base bert model architecture and tokenizer
39
+ bert_model = "bert-large-cased"
40
+
41
+ # Controls max length of sequences passed through bert to obtain its
42
+ # contextual embeddings
43
+ # Must be less than or equal to 512
44
+ bert_window_size = 512
45
+
46
+ # General model settings =============
47
+
48
+ # Controls the dimensionality of feature embeddings
49
+ embedding_size = 20
50
+
51
+ # Controls the dimensionality of distance embeddings used by SpanPredictor
52
+ sp_embedding_size = 64
53
+
54
+ # Controls the number of spans for which anaphoricity can be scored in one
55
+ # batch. Only affects final scoring; mention extraction and rough scoring
56
+ # are less memory intensive, so they are always done in just one batch.
57
+ a_scoring_batch_size = 128
58
+
59
+ # AnaphoricityScorer FFNN parameters
60
+ hidden_size = 1024
61
+ n_hidden_layers = 1
62
+
63
+ # Do you want to support singletons?
64
+ singletons = true
65
+
66
+
67
+ # Mention extraction settings ========
68
+
69
+ # Mention extractor will check spans up to max_span_len words
70
+ # The default value is chosen to be big enough to hold any dev data span
71
+ max_span_len = 64
72
+
73
+
74
+ # Pruning settings ===================
75
+
76
+ # Controls how many pairs should be preserved per mention
77
+ # after applying rough scoring.
78
+ rough_k = 50
79
+
80
+
81
+ # Lora settings ===================
82
+
83
+ # LoRA settings
84
+ lora = false
85
+ lora_alpha = 128
86
+ lora_dropout = 0.1
87
+ lora_rank = 64
88
+ lora_target_modules = []
89
+ lora_modules_to_save = []
90
+
91
+
92
+ # Training settings ==================
93
+
94
+ # Controls whether the first dummy node predicts cluster starts or singletons
95
+ clusters_starts_are_singletons = true
96
+
97
+ # Controls whether to fine-tune bert_model
98
+ bert_finetune = true
99
+
100
+ # Controls the dropout rate throughout all models
101
+ dropout_rate = 0.3
102
+
103
+ # Bert learning rate (only used if bert_finetune is set)
104
+ bert_learning_rate = 1e-6
105
+ bert_finetune_begin_epoch = 0.5
106
+
107
+ # Task learning rate
108
+ learning_rate = 3e-4
109
+
110
+ # For how many epochs the training is done
111
+ train_epochs = 32
112
+
113
+ # Controls the weight of binary cross entropy loss added to nlml loss
114
+ bce_loss_weight = 0.5
115
+
116
+ # The directory that will contain conll prediction files
117
+ conll_log_dir = "data/conll_logs"
118
+
119
+ # =============================================================================
120
+ # Extra keyword arguments to be passed to bert tokenizers of specified models
121
+ [DEFAULT.tokenizer_kwargs]
122
+ [DEFAULT.tokenizer_kwargs.roberta-large]
123
+ "add_prefix_space" = true
124
+
125
+ [DEFAULT.tokenizer_kwargs.xlm-roberta-large]
126
+ "add_prefix_space" = true
127
+
128
+ [DEFAULT.tokenizer_kwargs.spanbert-large-cased]
129
+ "do_lower_case" = false
130
+
131
+ [DEFAULT.tokenizer_kwargs.bert-large-cased]
132
+ "do_lower_case" = false
133
+
134
+ # =============================================================================
135
+ # The sections listed here do not need to make use of all config variables
136
+ # If a variable is omitted, its default value will be used instead
137
+
138
+ [roberta]
139
+ bert_model = "roberta-large"
140
+
141
+ [roberta_lora]
142
+ bert_model = "roberta-large"
143
+ bert_learning_rate = 0.00005
144
+ lora = true
145
+ lora_target_modules = [ "query", "value", "output.dense", "intermediate.dense" ]
146
+ lora_modules_to_save = [ "pooler" ]
147
+
148
+ [scandibert_lora]
149
+ bert_model = "vesteinn/ScandiBERT"
150
+ bert_learning_rate = 0.0002
151
+ lora = true
152
+ lora_target_modules = [ "query", "value", "output.dense", "intermediate.dense" ]
153
+ lora_modules_to_save = [ "pooler" ]
154
+
155
+ [xlm_roberta]
156
+ bert_model = "FacebookAI/xlm-roberta-large"
157
+ bert_learning_rate = 0.00001
158
+ bert_finetune = true
159
+
160
+ [xlm_roberta_lora]
161
+ bert_model = "FacebookAI/xlm-roberta-large"
162
+ bert_learning_rate = 0.000025
163
+ lora = true
164
+ lora_target_modules = [ "query", "value", "output.dense", "intermediate.dense" ]
165
+ lora_modules_to_save = [ "pooler" ]
166
+
167
+ [deeppavlov_slavic_bert_lora]
168
+ bert_model = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"
169
+ bert_learning_rate = 0.000025
170
+ lora = true
171
+ lora_target_modules = [ "query", "value", "output.dense", "intermediate.dense" ]
172
+ lora_modules_to_save = [ "pooler" ]
173
+
174
+ [deberta_lora]
175
+ bert_model = "microsoft/deberta-v3-large"
176
+ bert_learning_rate = 0.00001
177
+ lora = true
178
+ lora_target_modules = [ "query_proj", "value_proj", "output.dense" ]
179
+ lora_modules_to_save = [ ]
180
+
181
+ [electra]
182
+ bert_model = "google/electra-large-discriminator"
183
+ bert_learning_rate = 0.00002
184
+
185
+ [electra_lora]
186
+ bert_model = "google/electra-large-discriminator"
187
+ bert_learning_rate = 0.000025
188
+ lora = true
189
+ lora_target_modules = [ "query", "value", "output.dense", "intermediate.dense" ]
190
+ lora_modules_to_save = [ ]
191
+
192
+ [hungarian_electra_lora]
193
+ # TODO: experiment with tokenizer options for this to see if that's
194
+ # why the results are so low using this transformer
195
+ bert_model = "NYTK/electra-small-discriminator-hungarian"
196
+ bert_learning_rate = 0.000025
197
+ lora = true
198
+ lora_target_modules = [ "query", "value", "output.dense", "intermediate.dense" ]
199
+ lora_modules_to_save = [ ]
200
+
201
+ [muril_large_cased_lora]
202
+ bert_model = "google/muril-large-cased"
203
+ bert_learning_rate = 0.000025
204
+ lora = true
205
+ lora_target_modules = [ "query", "value", "output.dense", "intermediate.dense" ]
206
+ lora_modules_to_save = [ "pooler" ]
207
+
208
+ [indic_bert_lora]
209
+ bert_model = "ai4bharat/indic-bert"
210
+ bert_learning_rate = 0.0005
211
+ lora = true
212
+ # indic-bert is an albert with repeating layers of different names
213
+ lora_target_modules = [ "query", "value", "dense", "ffn", "full_layer" ]
214
+ lora_modules_to_save = [ "pooler" ]
215
+
216
+ [bert_multilingual_cased_lora]
217
+ # LR sweep on a Hindi dataset
218
+ # 0.00001: 0.53238
219
+ # 0.00002: 0.54012
220
+ # 0.000025: 0.54206
221
+ # 0.00003: 0.54050
222
+ # 0.00004: 0.55081
223
+ # 0.00005: 0.55135
224
+ # 0.000075: 0.54482
225
+ # 0.0001: 0.53888
226
+ bert_model = "google-bert/bert-base-multilingual-cased"
227
+ bert_learning_rate = 0.00005
228
+ lora = true
229
+ lora_target_modules = [ "query", "value", "output.dense", "intermediate.dense" ]
230
+ lora_modules_to_save = [ "pooler" ]
231
+
232
+ [t5_lora]
233
+ bert_model = "google-t5/t5-large"
234
+ bert_learning_rate = 0.000025
235
+ bert_window_size = 1024
236
+ lora = true
237
+ lora_target_modules = [ "q", "v", "o", "wi", "wo" ]
238
+ lora_modules_to_save = [ ]
239
+
240
+ [mt5_lora]
241
+ bert_model = "google/mt5-base"
242
+ bert_learning_rate = 0.000025
243
+ lora_alpha = 64
244
+ lora_rank = 32
245
+ lora = true
246
+ lora_target_modules = [ "q", "v", "o", "wi", "wo" ]
247
+ lora_modules_to_save = [ ]
248
+
249
+ [deepnarrow_t5_xl_lora]
250
+ bert_model = "google/t5-efficient-xl"
251
+ bert_learning_rate = 0.00025
252
+ lora = true
253
+ lora_target_modules = [ "q", "v", "o", "wi", "wo" ]
254
+ lora_modules_to_save = [ ]
255
+
256
+ [roberta_no_finetune]
257
+ bert_model = "roberta-large"
258
+ bert_finetune = false
259
+
260
+ [roberta_no_bce]
261
+ bert_model = "roberta-large"
262
+ bce_loss_weight = 0.0
263
+
264
+ [spanbert]
265
+ bert_model = "SpanBERT/spanbert-large-cased"
266
+
267
+ [spanbert_no_bce]
268
+ bert_model = "SpanBERT/spanbert-large-cased"
269
+ bce_loss_weight = 0.0
270
+
271
+ [bert]
272
+ bert_model = "bert-large-cased"
273
+
274
+ [longformer]
275
+ bert_model = "allenai/longformer-large-4096"
276
+ bert_window_size = 2048
277
+
278
+ [debug]
279
+ bert_window_size = 384
280
+ bert_finetune = false
281
+ device = "cpu:0"
282
+
283
+ [debug_gpu]
284
+ bert_window_size = 384
285
+ bert_finetune = false
stanza/stanza/models/coref/dataset.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from torch.utils.data import Dataset
4
+
5
+ from stanza.models.coref.tokenizer_customization import TOKENIZER_FILTERS, TOKENIZER_MAPS
6
+
7
+ logger = logging.getLogger('stanza')
8
+
9
class CorefDataset(Dataset):
    """Torch Dataset of coreference documents read from a json file.

    Falls back to the older jsonlines format if the file is not valid json.
    Each document dict is augmented in place with subword tokenization info:
    ``word2subword`` (per-word (start, end) subword span), ``subwords``
    (flat subword list), and ``word_id`` (subword index -> word index).
    """

    def __init__(self, path, config, tokenizer):
        self.config = config
        self.tokenizer = tokenizer

        # by default, this doesn't filter anything (see lambda _: True);
        # however, there are some subword symbols which are standalone
        # tokens which we don't want on models like Albert; hence we
        # pass along a filter if needed.
        self.__filter_func = TOKENIZER_FILTERS.get(self.config.bert_model,
                                                   lambda _: True)
        # optional word -> pre-tokenized subwords override for this bert model
        self.__token_map = TOKENIZER_MAPS.get(self.config.bert_model, {})

        try:
            with open(path, encoding="utf-8") as fin:
                data_f = json.load(fin)
        except json.decoder.JSONDecodeError:
            # read the old jsonlines format if necessary
            with open(path, encoding="utf-8") as fin:
                text = "[" + ",\n".join(fin) + "]"
            data_f = json.loads(text)
        logger.info("Processing %d docs from %s...", len(data_f), path)
        self.__raw = data_f
        # average number of head->span mappings per document
        self.__avg_span = sum(len(doc["head2span"]) for doc in self.__raw) / len(self.__raw)
        self.__out = []
        for doc in self.__raw:
            # json gives lists; downstream code expects hashable (start, end) tuples
            doc["span_clusters"] = [[tuple(mention) for mention in cluster]
                                    for cluster in doc["span_clusters"]]
            word2subword = []
            subwords = []
            word_id = []
            for i, word in enumerate(doc["cased_words"]):
                # use the precomputed mapping if available, otherwise tokenize
                tokenized_word = self.__token_map.get(word, self.tokenizer.tokenize(word))
                tokenized_word = list(filter(self.__filter_func, tokenized_word))
                word2subword.append((len(subwords), len(subwords) + len(tokenized_word)))
                subwords.extend(tokenized_word)
                word_id.extend([i] * len(tokenized_word))
            doc["word2subword"] = word2subword
            doc["subwords"] = subwords
            doc["word_id"] = word_id
            self.__out.append(doc)
        logger.info("Loaded %d docs from %s.", len(data_f), path)

    @property
    def avg_span(self):
        # mean head2span count per document, computed once at load time
        return self.__avg_span

    def __getitem__(self, x):
        return self.__out[x]

    def __len__(self):
        return len(self.__out)
stanza/stanza/models/coref/pairwise_encoder.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Describes PairwiseEncodes, that transforms pairwise features, such as
2
+ distance between the mentions, same/different speaker into feature embeddings
3
+ """
4
+ from typing import List
5
+
6
+ import torch
7
+
8
+ from stanza.models.coref.config import Config
9
+ from stanza.models.coref.const import Doc
10
+
11
+
12
+ class PairwiseEncoder(torch.nn.Module):
13
+ """ A Pytorch module to obtain feature embeddings for pairwise features
14
+
15
+ Usage:
16
+ encoder = PairwiseEncoder(config)
17
+ pairwise_features = encoder(pair_indices, doc)
18
+ """
19
+ def __init__(self, config: Config):
20
+ super().__init__()
21
+ emb_size = config.embedding_size
22
+
23
+ self.genre2int = {g: gi for gi, g in enumerate(["bc", "bn", "mz", "nw",
24
+ "pt", "tc", "wb"])}
25
+ self.genre_emb = torch.nn.Embedding(len(self.genre2int), emb_size)
26
+
27
+ # each position corresponds to a bucket:
28
+ # [(0, 2), (2, 3), (3, 4), (4, 5), (5, 8),
29
+ # (8, 16), (16, 32), (32, 64), (64, float("inf"))]
30
+ self.distance_emb = torch.nn.Embedding(9, emb_size)
31
+
32
+ # two possibilities: same vs different speaker
33
+ self.speaker_emb = torch.nn.Embedding(2, emb_size)
34
+
35
+ self.dropout = torch.nn.Dropout(config.dropout_rate)
36
+
37
+ self.__full_pw = config.full_pairwise
38
+
39
+ if self.__full_pw:
40
+ self.shape = emb_size * 3 # genre, distance, speaker\
41
+ else:
42
+ self.shape = emb_size # distance only
43
+
44
+ @property
45
+ def device(self) -> torch.device:
46
+ """ A workaround to get current device (which is assumed to be the
47
+ device of the first parameter of one of the submodules) """
48
+ return next(self.genre_emb.parameters()).device
49
+
50
+ def forward(self, # type: ignore # pylint: disable=arguments-differ #35566 in pytorch
51
+ top_indices: torch.Tensor,
52
+ doc: Doc) -> torch.Tensor:
53
+ word_ids = torch.arange(0, len(doc["cased_words"]), device=self.device)
54
+
55
+ # bucketing the distance (see __init__())
56
+ distance = (word_ids.unsqueeze(1) - word_ids[top_indices]
57
+ ).clamp_min_(min=1)
58
+ log_distance = distance.to(torch.float).log2().floor_()
59
+ log_distance = log_distance.clamp_max_(max=6).to(torch.long)
60
+ distance = torch.where(distance < 5, distance - 1, log_distance + 2)
61
+ distance = self.distance_emb(distance)
62
+
63
+ if not self.__full_pw:
64
+ return self.dropout(distance)
65
+
66
+ # calculate speaker embeddings
67
+ speaker_map = torch.tensor(self._speaker_map(doc), device=self.device)
68
+ same_speaker = (speaker_map[top_indices] == speaker_map.unsqueeze(1))
69
+ same_speaker = self.speaker_emb(same_speaker.to(torch.long))
70
+
71
+
72
+ # if there is no genre information, use "wb" as the genre (which is what the
73
+ # Pipeline does
74
+ genre = torch.tensor(self.genre2int.get(doc["document_id"][:2], self.genre2int["wb"]),
75
+ device=self.device).expand_as(top_indices)
76
+ genre = self.genre_emb(genre)
77
+
78
+ return self.dropout(torch.cat((same_speaker, distance, genre), dim=2))
79
+
80
+ @staticmethod
81
+ def _speaker_map(doc: Doc) -> List[int]:
82
+ """
83
+ Returns a tensor where i-th element is the speaker id of i-th word.
84
+ """
85
+ # if speaker is not found in the doc, simply return "speaker#1" for all the speakers
86
+ # and embed them using the same ID
87
+
88
+ # speaker string -> speaker id
89
+ str2int = {s: i for i, s in enumerate(set(doc.get("speaker", ["speaker#1"
90
+ for _ in range(len(doc["deprel"]))])))}
91
+
92
+ # word id -> speaker id
93
+ return [str2int[s] for s in doc.get("speaker", ["speaker#1"
94
+ for _ in range(len(doc["deprel"]))])]
stanza/stanza/models/coref/rough_scorer.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Describes RoughScorer, a simple bilinear module to calculate rough
2
+ anaphoricity scores.
3
+ """
4
+
5
+ from typing import Tuple
6
+
7
+ import torch
8
+
9
+ from stanza.models.coref.config import Config
10
+
11
+
12
+ class RoughScorer(torch.nn.Module):
13
+ """
14
+ Is needed to give a roughly estimate of the anaphoricity of two candidates,
15
+ only top scoring candidates are considered on later steps to reduce
16
+ computational complexity.
17
+ """
18
+ def __init__(self, features: int, config: Config):
19
+ super().__init__()
20
+ self.dropout = torch.nn.Dropout(config.dropout_rate)
21
+ self.bilinear = torch.nn.Linear(features, features)
22
+
23
+ self.k = config.rough_k
24
+
25
+ def forward(self, # type: ignore # pylint: disable=arguments-differ #35566 in pytorch
26
+ mentions: torch.Tensor,
27
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
28
+ """
29
+ Returns rough anaphoricity scores for candidates, which consist of
30
+ the bilinear output of the current model summed with mention scores.
31
+ """
32
+ # [n_mentions, n_mentions]
33
+ pair_mask = torch.arange(mentions.shape[0])
34
+ pair_mask = pair_mask.unsqueeze(1) - pair_mask.unsqueeze(0)
35
+ pair_mask = torch.log((pair_mask > 0).to(torch.float))
36
+ pair_mask = pair_mask.to(mentions.device)
37
+
38
+ bilinear_scores = self.dropout(self.bilinear(mentions)).mm(mentions.T)
39
+
40
+ rough_scores = pair_mask + bilinear_scores
41
+
42
+ return self._prune(rough_scores)
43
+
44
+ def _prune(self,
45
+ rough_scores: torch.Tensor
46
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
47
+ """
48
+ Selects top-k rough antecedent scores for each mention.
49
+
50
+ Args:
51
+ rough_scores: tensor of shape [n_mentions, n_mentions], containing
52
+ rough antecedent scores of each mention-antecedent pair.
53
+
54
+ Returns:
55
+ FloatTensor of shape [n_mentions, k], top rough scores
56
+ LongTensor of shape [n_mentions, k], top indices
57
+ """
58
+ top_scores, indices = torch.topk(rough_scores,
59
+ k=min(self.k, len(rough_scores)),
60
+ dim=1, sorted=False)
61
+ return top_scores, indices, rough_scores
stanza/stanza/models/coref/utils.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Contains functions not directly linked to coreference resolution """
2
+
3
+ from typing import List, Set
4
+
5
+ import torch
6
+
7
+ from stanza.models.coref.const import EPSILON
8
+
9
+
10
class GraphNode:
    """A node in an undirected graph, identified by an integer id.

    Keeps the set of directly connected nodes in ``links`` and a
    ``visited`` flag for use by traversal code.
    """

    def __init__(self, node_id: int):
        # public attributes read by traversal code elsewhere
        self.id = node_id
        self.visited = False
        self.links: Set["GraphNode"] = set()

    def link(self, another: "GraphNode"):
        """Add a symmetric (undirected) edge between this node and *another*."""
        another.links.add(self)
        self.links.add(another)

    def __repr__(self) -> str:
        return str(self.id)
22
+
23
+
24
def add_dummy(tensor: torch.Tensor, eps: bool = False):
    """ Prepends zeros (or a very small value if eps is True)
    to the first (not zeroth) dimension of tensor.
    """
    dummy_shape = list(tensor.shape)
    dummy_shape[1] = 1
    # the dummy column is all zeros, or all EPSILON when requested
    fill_value = EPSILON if eps else 0
    dummy = torch.full(dummy_shape, fill_value,
                       device=tensor.device, dtype=tensor.dtype)  # type: ignore
    return torch.cat((dummy, tensor), dim=1)
stanza/stanza/models/depparse/model.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pack_sequence, pad_sequence, PackedSequence
9
+
10
+ from stanza.models.common.bert_embedding import extract_bert_embeddings
11
+ from stanza.models.common.biaffine import DeepBiaffineScorer
12
+ from stanza.models.common.foundation_cache import load_charlm
13
+ from stanza.models.common.hlstm import HighwayLSTM
14
+ from stanza.models.common.dropout import WordDropout
15
+ from stanza.models.common.utils import attach_bert_model
16
+ from stanza.models.common.vocab import CompositeVocab
17
+ from stanza.models.common.char_model import CharacterModel, CharacterLanguageModel
18
+ from stanza.models.common import utils
19
+
20
+ logger = logging.getLogger('stanza')
21
+
22
+ class Parser(nn.Module):
23
    def __init__(self, args, vocab, emb_matrix=None, share_hid=False, foundation_cache=None, bert_model=None, bert_tokenizer=None, force_bert_saved=False, peft_name=None):
        """Build the biaffine dependency parser network.

        Args:
            args: dict of hyperparameters (embedding dims, layer sizes, dropout, ...)
            vocab: dict of vocabularies ('word', 'lemma', 'upos', 'xpos', 'feats', 'deprel')
            emb_matrix: pretrained embedding matrix, used when args['pretrain'] is set
            share_hid: stored on the instance; not otherwise used in this method
            foundation_cache: cache used when loading charlm models
            bert_model, bert_tokenizer: optional transformer attached via attach_bert_model
            force_bert_saved: whether the transformer weights must be saved with the model
            peft_name: name of the PEFT adapter to use for transformer embeddings, if any
        """
        super().__init__()

        self.vocab = vocab
        self.args = args
        self.share_hid = share_hid
        # names of submodules excluded when the model is saved (see add_unsaved_module)
        self.unsaved_modules = []

        # input layers
        # input_size accumulates the width of every enabled input feature
        input_size = 0
        if self.args['word_emb_dim'] > 0:
            # frequent word embeddings
            self.word_emb = nn.Embedding(len(vocab['word']), self.args['word_emb_dim'], padding_idx=0)
            self.lemma_emb = nn.Embedding(len(vocab['lemma']), self.args['word_emb_dim'], padding_idx=0)
            input_size += self.args['word_emb_dim'] * 2

        if self.args['tag_emb_dim'] > 0:
            if self.args.get('use_upos', True):
                self.upos_emb = nn.Embedding(len(vocab['upos']), self.args['tag_emb_dim'], padding_idx=0)
            if self.args.get('use_xpos', True):
                if not isinstance(vocab['xpos'], CompositeVocab):
                    self.xpos_emb = nn.Embedding(len(vocab['xpos']), self.args['tag_emb_dim'], padding_idx=0)
                else:
                    # composite xpos: one embedding per component, summed in forward
                    self.xpos_emb = nn.ModuleList()
                    for l in vocab['xpos'].lens():
                        self.xpos_emb.append(nn.Embedding(l, self.args['tag_emb_dim'], padding_idx=0))
            # upos/xpos embeddings are summed in forward, so they share one slot
            if self.args.get('use_upos', True) or self.args.get('use_xpos', True):
                input_size += self.args['tag_emb_dim']

            if self.args.get('use_ufeats', True):
                # one embedding per universal feature, summed in forward
                self.ufeats_emb = nn.ModuleList()
                for l in vocab['feats'].lens():
                    self.ufeats_emb.append(nn.Embedding(l, self.args['tag_emb_dim'], padding_idx=0))
                input_size += self.args['tag_emb_dim']

        if self.args['char'] and self.args['char_emb_dim'] > 0:
            if self.args.get('charlm', None):
                # pretrained character language models (forward and backward)
                if args['charlm_forward_file'] is None or not os.path.exists(args['charlm_forward_file']):
                    raise FileNotFoundError('Could not find forward character model: {} Please specify with --charlm_forward_file'.format(args['charlm_forward_file']))
                if args['charlm_backward_file'] is None or not os.path.exists(args['charlm_backward_file']):
                    raise FileNotFoundError('Could not find backward character model: {} Please specify with --charlm_backward_file'.format(args['charlm_backward_file']))
                logger.debug("Depparse model loading charmodels: %s and %s", args['charlm_forward_file'], args['charlm_backward_file'])
                # charlms are pretrained and frozen, so they are not saved with this model
                self.add_unsaved_module('charmodel_forward', load_charlm(args['charlm_forward_file'], foundation_cache=foundation_cache))
                self.add_unsaved_module('charmodel_backward', load_charlm(args['charlm_backward_file'], foundation_cache=foundation_cache))
                input_size += self.charmodel_forward.hidden_dim() + self.charmodel_backward.hidden_dim()
            else:
                # train a character model from scratch instead
                self.charmodel = CharacterModel(args, vocab)
                self.trans_char = nn.Linear(self.args['char_hidden_dim'], self.args['transformed_dim'], bias=False)
                input_size += self.args['transformed_dim']

        self.peft_name = peft_name
        attach_bert_model(self, bert_model, bert_tokenizer, self.args.get('use_peft', False), force_bert_saved)
        if self.args.get('bert_model', None):
            # TODO: refactor bert_hidden_layers between the different models
            if args.get('bert_hidden_layers', False):
                # The average will be offset by 1/N so that the default zeros
                # represents an average of the N layers
                self.bert_layer_mix = nn.Linear(args['bert_hidden_layers'], 1, bias=False)
                nn.init.zeros_(self.bert_layer_mix.weight)
            else:
                # an average of layers 2, 3, 4 will be used
                # (for historic reasons)
                self.bert_layer_mix = None
            input_size += self.bert_model.config.hidden_size

        if self.args['pretrain']:
            # pretrained embeddings, by default this won't be saved into model file
            self.add_unsaved_module('pretrained_emb', nn.Embedding.from_pretrained(emb_matrix, freeze=True))
            self.trans_pretrained = nn.Linear(emb_matrix.shape[1], self.args['transformed_dim'], bias=False)
            input_size += self.args['transformed_dim']

        # recurrent layers
        self.parserlstm = HighwayLSTM(input_size, self.args['hidden_dim'], self.args['num_layers'], batch_first=True, bidirectional=True, dropout=self.args['dropout'], rec_dropout=self.args['rec_dropout'], highway_func=torch.tanh)
        # learned replacement vector used by word dropout
        self.drop_replacement = nn.Parameter(torch.randn(input_size) / np.sqrt(input_size))
        # learned initial LSTM states
        self.parserlstm_h_init = nn.Parameter(torch.zeros(2 * self.args['num_layers'], 1, self.args['hidden_dim']))
        self.parserlstm_c_init = nn.Parameter(torch.zeros(2 * self.args['num_layers'], 1, self.args['hidden_dim']))

        # classifiers
        # biaffine scorers over LSTM outputs: arc scores, relation labels,
        # and optional linearization / distance auxiliary scorers
        self.unlabeled = DeepBiaffineScorer(2 * self.args['hidden_dim'], 2 * self.args['hidden_dim'], self.args['deep_biaff_hidden_dim'], 1, pairwise=True, dropout=args['dropout'])
        self.deprel = DeepBiaffineScorer(2 * self.args['hidden_dim'], 2 * self.args['hidden_dim'], self.args['deep_biaff_hidden_dim'], len(vocab['deprel']), pairwise=True, dropout=args['dropout'])
        if args['linearization']:
            self.linearization = DeepBiaffineScorer(2 * self.args['hidden_dim'], 2 * self.args['hidden_dim'], self.args['deep_biaff_hidden_dim'], 1, pairwise=True, dropout=args['dropout'])
        if args['distance']:
            self.distance = DeepBiaffineScorer(2 * self.args['hidden_dim'], 2 * self.args['hidden_dim'], self.args['deep_biaff_hidden_dim'], 1, pairwise=True, dropout=args['dropout'])

        # criterion
        self.crit = nn.CrossEntropyLoss(ignore_index=-1, reduction='sum') # ignore padding

        self.drop = nn.Dropout(args['dropout'])
        self.worddrop = WordDropout(args['word_dropout'])
+ self.worddrop = WordDropout(args['word_dropout'])
116
+
117
+ def add_unsaved_module(self, name, module):
118
+ self.unsaved_modules += [name]
119
+ setattr(self, name, module)
120
+
121
+ def log_norms(self):
122
+ utils.log_norms(self)
123
+
124
+ def forward(self, word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, lemma, head, deprel, word_orig_idx, sentlens, wordlens, text):
125
+ def pack(x):
126
+ return pack_padded_sequence(x, sentlens, batch_first=True)
127
+
128
+ inputs = []
129
+ if self.args['pretrain']:
130
+ pretrained_emb = self.pretrained_emb(pretrained)
131
+ pretrained_emb = self.trans_pretrained(pretrained_emb)
132
+ pretrained_emb = pack(pretrained_emb)
133
+ inputs += [pretrained_emb]
134
+
135
+ #def pad(x):
136
+ # return pad_packed_sequence(PackedSequence(x, pretrained_emb.batch_sizes), batch_first=True)[0]
137
+
138
+ if self.args['word_emb_dim'] > 0:
139
+ word_emb = self.word_emb(word)
140
+ word_emb = pack(word_emb)
141
+ lemma_emb = self.lemma_emb(lemma)
142
+ lemma_emb = pack(lemma_emb)
143
+ inputs += [word_emb, lemma_emb]
144
+
145
+ if self.args['tag_emb_dim'] > 0:
146
+ if self.args.get('use_upos', True):
147
+ pos_emb = self.upos_emb(upos)
148
+ else:
149
+ pos_emb = 0
150
+
151
+ if self.args.get('use_xpos', True):
152
+ if isinstance(self.vocab['xpos'], CompositeVocab):
153
+ for i in range(len(self.vocab['xpos'])):
154
+ pos_emb += self.xpos_emb[i](xpos[:, :, i])
155
+ else:
156
+ pos_emb += self.xpos_emb(xpos)
157
+
158
+ if self.args.get('use_upos', True) or self.args.get('use_xpos', True):
159
+ pos_emb = pack(pos_emb)
160
+ inputs += [pos_emb]
161
+
162
+ if self.args.get('use_ufeats', True):
163
+ feats_emb = 0
164
+ for i in range(len(self.vocab['feats'])):
165
+ feats_emb += self.ufeats_emb[i](ufeats[:, :, i])
166
+ feats_emb = pack(feats_emb)
167
+
168
+ inputs += [pos_emb]
169
+
170
+ if self.args['char'] and self.args['char_emb_dim'] > 0:
171
+ if self.args.get('charlm', None):
172
+ # \n is to add a somewhat neutral "word" for the ROOT
173
+ charlm_text = [["\n"] + x for x in text]
174
+ all_forward_chars = self.charmodel_forward.build_char_representation(charlm_text)
175
+ all_forward_chars = pack(pad_sequence(all_forward_chars, batch_first=True))
176
+ all_backward_chars = self.charmodel_backward.build_char_representation(charlm_text)
177
+ all_backward_chars = pack(pad_sequence(all_backward_chars, batch_first=True))
178
+ inputs += [all_forward_chars, all_backward_chars]
179
+ else:
180
+ char_reps = self.charmodel(wordchars, wordchars_mask, word_orig_idx, sentlens, wordlens)
181
+ char_reps = PackedSequence(self.trans_char(self.drop(char_reps.data)), char_reps.batch_sizes)
182
+ inputs += [char_reps]
183
+
184
+ if self.bert_model is not None:
185
+ device = next(self.parameters()).device
186
+ processed_bert = extract_bert_embeddings(self.args['bert_model'], self.bert_tokenizer, self.bert_model, text, device, keep_endpoints=True,
187
+ num_layers=self.bert_layer_mix.in_features if self.bert_layer_mix is not None else None,
188
+ detach=not self.args.get('bert_finetune', False) or not self.training,
189
+ peft_name=self.peft_name)
190
+ if self.bert_layer_mix is not None:
191
+ # use a linear layer to weighted average the embedding dynamically
192
+ processed_bert = [self.bert_layer_mix(feature).squeeze(2) + feature.sum(axis=2) / self.bert_layer_mix.in_features for feature in processed_bert]
193
+
194
+ # we are using the first endpoint from the transformer as the "word" for ROOT
195
+ processed_bert = [x[:-1, :] for x in processed_bert]
196
+ processed_bert = pad_sequence(processed_bert, batch_first=True)
197
+ inputs += [pack(processed_bert)]
198
+
199
+ lstm_inputs = torch.cat([x.data for x in inputs], 1)
200
+
201
+ lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
202
+ lstm_inputs = self.drop(lstm_inputs)
203
+
204
+ lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)
205
+
206
+ lstm_outputs, _ = self.parserlstm(lstm_inputs, sentlens, hx=(self.parserlstm_h_init.expand(2 * self.args['num_layers'], word.size(0), self.args['hidden_dim']).contiguous(), self.parserlstm_c_init.expand(2 * self.args['num_layers'], word.size(0), self.args['hidden_dim']).contiguous()))
207
+ lstm_outputs, _ = pad_packed_sequence(lstm_outputs, batch_first=True)
208
+
209
+ unlabeled_scores = self.unlabeled(self.drop(lstm_outputs), self.drop(lstm_outputs)).squeeze(3)
210
+ deprel_scores = self.deprel(self.drop(lstm_outputs), self.drop(lstm_outputs))
211
+
212
+ #goldmask = head.new_zeros(*head.size(), head.size(-1)+1, dtype=torch.uint8)
213
+ #goldmask.scatter_(2, head.unsqueeze(2), 1)
214
+
215
+ if self.args['linearization'] or self.args['distance']:
216
+ head_offset = torch.arange(word.size(1), device=head.device).view(1, 1, -1).expand(word.size(0), -1, -1) - torch.arange(word.size(1), device=head.device).view(1, -1, 1).expand(word.size(0), -1, -1)
217
+
218
+ if self.args['linearization']:
219
+ lin_scores = self.linearization(self.drop(lstm_outputs), self.drop(lstm_outputs)).squeeze(3)
220
+ unlabeled_scores += F.logsigmoid(lin_scores * torch.sign(head_offset).float()).detach()
221
+
222
+ if self.args['distance']:
223
+ dist_scores = self.distance(self.drop(lstm_outputs), self.drop(lstm_outputs)).squeeze(3)
224
+ dist_pred = 1 + F.softplus(dist_scores)
225
+ dist_target = torch.abs(head_offset)
226
+ dist_kld = -torch.log((dist_target.float() - dist_pred)**2/2 + 1)
227
+ unlabeled_scores += dist_kld.detach()
228
+
229
+ diag = torch.eye(head.size(-1)+1, dtype=torch.bool, device=head.device).unsqueeze(0)
230
+ unlabeled_scores.masked_fill_(diag, -float('inf'))
231
+
232
+ preds = []
233
+
234
+ if self.training:
235
+ unlabeled_scores = unlabeled_scores[:, 1:, :] # exclude attachment for the root symbol
236
+ unlabeled_scores = unlabeled_scores.masked_fill(word_mask.unsqueeze(1), -float('inf'))
237
+ unlabeled_target = head.masked_fill(word_mask[:, 1:], -1)
238
+ loss = self.crit(unlabeled_scores.contiguous().view(-1, unlabeled_scores.size(2)), unlabeled_target.view(-1))
239
+
240
+ deprel_scores = deprel_scores[:, 1:] # exclude attachment for the root symbol
241
+ #deprel_scores = deprel_scores.masked_select(goldmask.unsqueeze(3)).view(-1, len(self.vocab['deprel']))
242
+ deprel_scores = torch.gather(deprel_scores, 2, head.unsqueeze(2).unsqueeze(3).expand(-1, -1, -1, len(self.vocab['deprel']))).view(-1, len(self.vocab['deprel']))
243
+ deprel_target = deprel.masked_fill(word_mask[:, 1:], -1)
244
+ loss += self.crit(deprel_scores.contiguous(), deprel_target.view(-1))
245
+
246
+ if self.args['linearization']:
247
+ #lin_scores = lin_scores[:, 1:].masked_select(goldmask)
248
+ lin_scores = torch.gather(lin_scores[:, 1:], 2, head.unsqueeze(2)).view(-1)
249
+ lin_scores = torch.cat([-lin_scores.unsqueeze(1)/2, lin_scores.unsqueeze(1)/2], 1)
250
+ #lin_target = (head_offset[:, 1:] > 0).long().masked_select(goldmask)
251
+ lin_target = torch.gather((head_offset[:, 1:] > 0).long(), 2, head.unsqueeze(2))
252
+ loss += self.crit(lin_scores.contiguous(), lin_target.view(-1))
253
+
254
+ if self.args['distance']:
255
+ #dist_kld = dist_kld[:, 1:].masked_select(goldmask)
256
+ dist_kld = torch.gather(dist_kld[:, 1:], 2, head.unsqueeze(2))
257
+ loss -= dist_kld.sum()
258
+
259
+ loss /= wordchars.size(0) # number of words
260
+ else:
261
+ loss = 0
262
+ preds.append(F.log_softmax(unlabeled_scores, 2).detach().cpu().numpy())
263
+ preds.append(deprel_scores.max(3)[1].detach().cpu().numpy())
264
+
265
+ return loss, preds