bowphs committed on
Commit
ba68d3c
·
verified ·
1 Parent(s): 9634055

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. stanza/stanza/models/classifiers/base_classifier.py +65 -0
  2. stanza/stanza/models/classifiers/cnn_classifier.py +547 -0
  3. stanza/stanza/models/classifiers/iterate_test.py +64 -0
  4. stanza/stanza/models/classifiers/trainer.py +304 -0
  5. stanza/stanza/models/constituency/__init__.py +0 -0
  6. stanza/stanza/models/constituency/evaluate_treebanks.py +36 -0
  7. stanza/stanza/models/constituency/label_attention.py +726 -0
  8. stanza/stanza/models/constituency/lstm_tree_stack.py +91 -0
  9. stanza/stanza/models/constituency/score_converted_dependencies.py +65 -0
  10. stanza/stanza/models/constituency/text_processing.py +166 -0
  11. stanza/stanza/models/constituency/tree_reader.py +274 -0
  12. stanza/stanza/models/constituency/tree_stack.py +57 -0
  13. stanza/stanza/models/constituency/utils.py +375 -0
  14. stanza/stanza/models/coref/predict.py +55 -0
  15. stanza/stanza/models/coref/span_predictor.py +146 -0
  16. stanza/stanza/models/coref/tokenizer_customization.py +18 -0
  17. stanza/stanza/models/coref/word_encoder.py +108 -0
  18. stanza/stanza/models/depparse/data.py +233 -0
  19. stanza/stanza/models/lemma/attach_lemma_classifier.py +25 -0
  20. stanza/stanza/models/lemma/scorer.py +13 -0
  21. stanza/stanza/models/lemma/vocab.py +18 -0
  22. stanza/stanza/models/lemma_classifier/base_trainer.py +114 -0
  23. stanza/stanza/models/lemma_classifier/constants.py +14 -0
  24. stanza/stanza/models/lemma_classifier/evaluate_many.py +68 -0
  25. stanza/stanza/models/lemma_classifier/evaluate_models.py +228 -0
  26. stanza/stanza/models/lemma_classifier/prepare_dataset.py +125 -0
  27. stanza/stanza/models/lemma_classifier/train_lstm_model.py +147 -0
  28. stanza/stanza/models/lemma_classifier/train_many.py +155 -0
  29. stanza/stanza/models/lemma_classifier/train_transformer_model.py +130 -0
  30. stanza/stanza/models/lemma_classifier/transformer_model.py +89 -0
  31. stanza/stanza/models/lemma_classifier/utils.py +173 -0
  32. stanza/stanza/models/mwt/character_classifier.py +65 -0
  33. stanza/stanza/models/mwt/trainer.py +218 -0
  34. stanza/stanza/models/mwt/vocab.py +19 -0
  35. stanza/stanza/models/ner/vocab.py +56 -0
  36. stanza/stanza/models/pos/__init__.py +0 -0
  37. stanza/stanza/models/pos/build_xpos_vocab_factory.py +144 -0
  38. stanza/stanza/models/pos/data.py +387 -0
  39. stanza/stanza/models/pos/model.py +256 -0
  40. stanza/stanza/models/pos/trainer.py +179 -0
  41. stanza/stanza/models/pos/xpos_vocab_factory.py +200 -0
  42. stanza/stanza/models/pos/xpos_vocab_utils.py +48 -0
  43. stanza/stanza/models/tokenization/__init__.py +0 -0
  44. stanza/stanza/models/tokenization/data.py +432 -0
  45. stanza/stanza/models/tokenization/model.py +101 -0
  46. stanza/stanza/models/tokenization/tokenize_files.py +83 -0
  47. stanza/stanza/models/tokenization/trainer.py +102 -0
  48. stanza/stanza/utils/datasets/constituency/convert_ctb.py +224 -0
  49. stanza/stanza/utils/datasets/constituency/extract_silver_dataset.py +47 -0
  50. stanza/stanza/utils/datasets/coref/balance_languages.py +60 -0
stanza/stanza/models/classifiers/base_classifier.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
+ import logging
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from stanza.models.common.utils import split_into_batches, sort_with_indices, unsort
9
+
10
+ """
11
+ A base classifier type
12
+
13
+ Currently, has the ability to process text or other inputs in a manner
14
+ suitable for the particular model type.
15
+ In other words, the CNNClassifier processes lists of words,
16
+ and the ConstituencyClassifier processes trees
17
+ """
18
+
19
+ logger = logging.getLogger('stanza')
20
+
21
class BaseClassifier(ABC, nn.Module):
    """Shared interface for the sentence classifiers.

    Subclasses decide how a document is converted into model inputs
    (words, trees, ...); this base provides the common batched
    labeling loop over those inputs.
    """
    @abstractmethod
    def extract_sentences(self, doc):
        """
        Extract the sentences or the relevant information in the sentences from a document
        """

    def preprocess_sentences(self, sentences):
        """
        By default, don't do anything
        """
        return sentences

    def label_sentences(self, sentences, batch_size=None):
        """
        Given a list of sentences, return the model's results on that text.
        """
        self.eval()

        sentences = self.preprocess_sentences(sentences)

        if batch_size is None:
            # a single interval spanning everything; no reordering needed
            orig_idx = None
            intervals = [(0, len(sentences))]
        else:
            # sort longest first so each batch holds similar lengths,
            # remembering the permutation so results can be unsorted later
            sentences, orig_idx = sort_with_indices(sentences, key=len, reverse=True)
            intervals = split_into_batches(sentences, batch_size)

        labels = []
        for start, end in intervals:
            if start == end:
                # this can happen for empty text
                continue
            logits = self(sentences[start:end])
            labels.extend(torch.argmax(logits, dim=1).tolist())

        if orig_idx:
            sentences = unsort(sentences, orig_idx)
            labels = unsort(labels, orig_idx)

        logger.debug("Found labels")
        for label, sentence in zip(labels, sentences):
            logger.debug((label, sentence))

        return labels
stanza/stanza/models/classifiers/cnn_classifier.py ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import logging
3
+ import math
4
+ import os
5
+ import random
6
+ import re
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ import stanza.models.classifiers.data as data
14
+ from stanza.models.classifiers.base_classifier import BaseClassifier
15
+ from stanza.models.classifiers.config import CNNConfig
16
+ from stanza.models.classifiers.data import SentimentDatum
17
+ from stanza.models.classifiers.utils import ExtraVectors, ModelType, build_output_layers
18
+ from stanza.models.common.bert_embedding import extract_bert_embeddings
19
+ from stanza.models.common.data import get_long_tensor, sort_all
20
+ from stanza.models.common.utils import attach_bert_model
21
+ from stanza.models.common.vocab import PAD_ID, UNK_ID
22
+
23
+ """
24
+ The CNN classifier is based on Yoon Kim's work:
25
+
26
+ https://arxiv.org/abs/1408.5882
27
+
28
+ Also included are maxpool 2d, conv 2d, and a bilstm, as in
29
+
30
+ Text Classification Improved by Integrating Bidirectional LSTM
31
+ with Two-dimensional Max Pooling
32
+ https://aclanthology.org/C16-1329.pdf
33
+
34
+ The architecture is simple:
35
+
36
+ - Embedding at the bottom layer
37
+ - separate learnable entry for UNK, since many of the embeddings we have use 0 for UNK
38
+ - maybe a bilstm layer, as per a command line flag
39
+ - Some number of conv2d layers over the embedding
40
+ - Maxpool layers over small windows, window size being a parameter
41
+ - FC layer to the classification layer
42
+
43
+ One experiment which was run and found to be a bit of a negative was
44
+ putting a layer on top of the pretrain. You would think that might
45
+ help, but dev performance went down for each variation of
46
+ - trans(emb)
47
+ - relu(trans(emb))
48
+ - dropout(trans(emb))
49
+ - dropout(relu(trans(emb)))
50
+ """
51
+
52
+ logger = logging.getLogger('stanza')
53
+ tlogger = logging.getLogger('stanza.classifiers.trainer')
54
+
55
class CNNClassifier(BaseClassifier):
    """CNN sentence classifier in the style of Yoon Kim (2014).

    Word embeddings (plus optional extra/delta embeddings, charlm,
    elmo and bert features) are concatenated, optionally run through a
    bilstm, then passed through conv + maxpool layers and FC layers to
    produce raw class logits.
    """
    def __init__(self, pretrain, extra_vocab, labels,
                 charmodel_forward, charmodel_backward, elmo_model, bert_model, bert_tokenizer, force_bert_saved, peft_name,
                 args):
        """
        pretrain is a pretrained word embedding. should have .emb and .vocab

        extra_vocab is a collection of words in the training data to
        be used for the delta word embedding, if used. can be set to
        None if delta word embedding is not used.

        labels is the list of labels we expect in the training data.
        Used to derive the number of classes. Saving it in the model
        will let us check that test data has the same labels

        args is either the complete arguments when training, or the
        subset of arguments stored in the model save file
        """
        super(CNNClassifier, self).__init__()
        self.labels = labels
        bert_finetune = args.bert_finetune
        use_peft = args.use_peft
        # finetuning the transformer implies its weights must be saved
        force_bert_saved = force_bert_saved or bert_finetune
        logger.debug("bert_finetune %s / force_bert_saved %s", bert_finetune, force_bert_saved)

        # this may change when loaded in a new Pipeline, so it's not part of the config
        self.peft_name = peft_name

        # we build a separate config out of the args so that we can easily save it in torch
        self.config = CNNConfig(filter_channels = args.filter_channels,
                                filter_sizes = args.filter_sizes,
                                fc_shapes = args.fc_shapes,
                                dropout = args.dropout,
                                num_classes = len(labels),
                                wordvec_type = args.wordvec_type,
                                extra_wordvec_method = args.extra_wordvec_method,
                                extra_wordvec_dim = args.extra_wordvec_dim,
                                extra_wordvec_max_norm = args.extra_wordvec_max_norm,
                                char_lowercase = args.char_lowercase,
                                charlm_projection = args.charlm_projection,
                                has_charlm_forward = charmodel_forward is not None,
                                has_charlm_backward = charmodel_backward is not None,
                                use_elmo = args.use_elmo,
                                elmo_projection = args.elmo_projection,
                                bert_model = args.bert_model,
                                bert_finetune = bert_finetune,
                                bert_hidden_layers = args.bert_hidden_layers,
                                force_bert_saved = force_bert_saved,

                                use_peft = use_peft,
                                lora_rank = args.lora_rank,
                                lora_alpha = args.lora_alpha,
                                lora_dropout = args.lora_dropout,
                                lora_modules_to_save = args.lora_modules_to_save,
                                lora_target_modules = args.lora_target_modules,

                                bilstm = args.bilstm,
                                bilstm_hidden_dim = args.bilstm_hidden_dim,
                                maxpool_width = args.maxpool_width,
                                model_type = ModelType.CNN)

        self.char_lowercase = args.char_lowercase

        # names of attributes that must not go into the checkpoint
        # (pretrained embedding, charlms, elmo, possibly bert)
        self.unsaved_modules = []

        emb_matrix = pretrain.emb
        self.add_unsaved_module('embedding', nn.Embedding.from_pretrained(emb_matrix, freeze=True))
        self.add_unsaved_module('elmo_model', elmo_model)
        self.vocab_size = emb_matrix.shape[0]
        self.embedding_dim = emb_matrix.shape[1]

        self.add_unsaved_module('forward_charlm', charmodel_forward)
        if charmodel_forward is not None:
            tlogger.debug("Got forward char model of dimension {}".format(charmodel_forward.hidden_dim()))
            if not charmodel_forward.is_forward_lm:
                raise ValueError("Got a backward charlm as a forward charlm!")
        self.add_unsaved_module('backward_charlm', charmodel_backward)
        if charmodel_backward is not None:
            tlogger.debug("Got backward char model of dimension {}".format(charmodel_backward.hidden_dim()))
            if charmodel_backward.is_forward_lm:
                raise ValueError("Got a forward charlm as a backward charlm!")

        # sets self.bert_model / self.bert_tokenizer (used below)
        attach_bert_model(self, bert_model, bert_tokenizer, self.config.use_peft, force_bert_saved)

        # The Pretrain has PAD and UNK already (indices 0 and 1), but we
        # possibly want to train UNK while freezing the rest of the embedding
        # note that the /10.0 operation has to be inside nn.Parameter unless
        # you want to spend a long time debugging this
        self.unk = nn.Parameter(torch.randn(self.embedding_dim) / np.sqrt(self.embedding_dim) / 10.0)

        # replacing NBSP picks up a whole bunch of words for VI
        self.vocab_map = { word.replace('\xa0', ' '): i for i, word in enumerate(pretrain.vocab) }

        if self.config.extra_wordvec_method is not ExtraVectors.NONE:
            if not extra_vocab:
                raise ValueError("Should have had extra_vocab set for extra_wordvec_method {}".format(self.config.extra_wordvec_method))
            if not args.extra_wordvec_dim:
                # default the delta embedding to the same width as the pretrain
                self.config.extra_wordvec_dim = self.embedding_dim
            if self.config.extra_wordvec_method is ExtraVectors.SUM:
                if self.config.extra_wordvec_dim != self.embedding_dim:
                    raise ValueError("extra_wordvec_dim must equal embedding_dim for {}".format(self.config.extra_wordvec_method))

            self.extra_vocab = list(extra_vocab)
            self.extra_vocab_map = { word: i for i, word in enumerate(self.extra_vocab) }
            # TODO: possibly add regularization specifically on the extra embedding?
            # note: it looks like a bug that this doesn't add UNK or PAD, but actually
            # those are expected to already be the first two entries
            self.extra_embedding = nn.Embedding(num_embeddings = len(extra_vocab),
                                                embedding_dim = self.config.extra_wordvec_dim,
                                                max_norm = self.config.extra_wordvec_max_norm,
                                                padding_idx = 0)
            tlogger.debug("Extra embedding size: {}".format(self.extra_embedding.weight.shape))
        else:
            self.extra_vocab = None
            self.extra_vocab_map = None
            self.config.extra_wordvec_dim = 0
            self.extra_embedding = None

        # Pytorch is "aware" of the existence of the nn.Modules inside
        # an nn.ModuleList in terms of parameters() etc
        if self.config.extra_wordvec_method is ExtraVectors.NONE:
            total_embedding_dim = self.embedding_dim
        elif self.config.extra_wordvec_method is ExtraVectors.SUM:
            total_embedding_dim = self.embedding_dim
        elif self.config.extra_wordvec_method is ExtraVectors.CONCAT:
            total_embedding_dim = self.embedding_dim + self.config.extra_wordvec_dim
        else:
            raise ValueError("unable to handle {}".format(self.config.extra_wordvec_method))

        if charmodel_forward is not None:
            if args.charlm_projection:
                self.charmodel_forward_projection = nn.Linear(charmodel_forward.hidden_dim(), args.charlm_projection)
                total_embedding_dim += args.charlm_projection
            else:
                self.charmodel_forward_projection = None
                total_embedding_dim += charmodel_forward.hidden_dim()

        if charmodel_backward is not None:
            if args.charlm_projection:
                self.charmodel_backward_projection = nn.Linear(charmodel_backward.hidden_dim(), args.charlm_projection)
                total_embedding_dim += args.charlm_projection
            else:
                self.charmodel_backward_projection = None
                total_embedding_dim += charmodel_backward.hidden_dim()

        if self.config.use_elmo:
            if elmo_model is None:
                raise ValueError("Model requires elmo, but elmo_model not passed in")
            # probe the elmo model with a dummy sentence to learn its width
            elmo_dim = elmo_model.sents2elmo([["Test"]])[0].shape[1]

            # this mapping will combine 3 layers of elmo to 1 layer of features
            self.elmo_combine_layers = nn.Linear(in_features=3, out_features=1, bias=False)
            if self.config.elmo_projection:
                self.elmo_projection = nn.Linear(in_features=elmo_dim, out_features=self.config.elmo_projection)
                total_embedding_dim = total_embedding_dim + self.config.elmo_projection
            else:
                total_embedding_dim = total_embedding_dim + elmo_dim

        if bert_model is not None:
            if self.config.bert_hidden_layers:
                # The average will be offset by 1/N so that the default zeros
                # represents an average of the N layers
                if self.config.bert_hidden_layers > bert_model.config.num_hidden_layers:
                    # limit ourselves to the number of layers actually available
                    # note that we can +1 because of the initial embedding layer
                    self.config.bert_hidden_layers = bert_model.config.num_hidden_layers + 1
                self.bert_layer_mix = nn.Linear(self.config.bert_hidden_layers, 1, bias=False)
                nn.init.zeros_(self.bert_layer_mix.weight)
            else:
                # an average of layers 2, 3, 4 will be used
                # (for historic reasons)
                self.bert_layer_mix = None

            if bert_tokenizer is None:
                raise ValueError("Cannot have a bert model without a tokenizer")
            self.bert_dim = self.bert_model.config.hidden_size
            total_embedding_dim += self.bert_dim

        if self.config.bilstm:
            # bidirectional, so the conv layers see 2x the hidden dim
            conv_input_dim = self.config.bilstm_hidden_dim * 2
            self.bilstm = nn.LSTM(batch_first=True,
                                  input_size=total_embedding_dim,
                                  hidden_size=self.config.bilstm_hidden_dim,
                                  num_layers=2,
                                  bidirectional=True,
                                  dropout=0.2)
        else:
            conv_input_dim = total_embedding_dim
            self.bilstm = None

        self.fc_input_size = 0
        self.conv_layers = nn.ModuleList()
        # widest filter; phrases shorter than this get padded up to it
        self.max_window = 0
        for filter_idx, filter_size in enumerate(self.config.filter_sizes):
            if isinstance(filter_size, int):
                # full-width filter: spans the whole embedding dimension
                self.max_window = max(self.max_window, filter_size)
                if isinstance(self.config.filter_channels, int):
                    filter_channels = self.config.filter_channels
                else:
                    filter_channels = self.config.filter_channels[filter_idx]
                fc_delta = filter_channels // self.config.maxpool_width
                tlogger.debug("Adding full width filter %d. Output channels: %d -> %d", filter_size, filter_channels, fc_delta)
                self.fc_input_size += fc_delta
                self.conv_layers.append(nn.Conv2d(in_channels=1,
                                                  out_channels=filter_channels,
                                                  kernel_size=(filter_size, conv_input_dim)))
            elif isinstance(filter_size, tuple) and len(filter_size) == 2:
                # 2d filter: strides across the embedding dimension as well
                filter_height, filter_width = filter_size
                self.max_window = max(self.max_window, filter_width)
                if isinstance(self.config.filter_channels, int):
                    # scale channels down so the total output stays comparable
                    filter_channels = max(1, self.config.filter_channels // (conv_input_dim // filter_width))
                else:
                    filter_channels = self.config.filter_channels[filter_idx]
                fc_delta = filter_channels * (conv_input_dim // filter_width) // self.config.maxpool_width
                tlogger.debug("Adding filter %s. Output channels: %d -> %d", filter_size, filter_channels, fc_delta)
                self.fc_input_size += fc_delta
                self.conv_layers.append(nn.Conv2d(in_channels=1,
                                                  out_channels=filter_channels,
                                                  stride=(1, filter_width),
                                                  kernel_size=(filter_height, filter_width)))
            else:
                raise ValueError("Expected int or 2d tuple for conv size")

        tlogger.debug("Input dim to FC layers: %d", self.fc_input_size)
        self.fc_layers = build_output_layers(self.fc_input_size, self.config.fc_shapes, self.config.num_classes)

        self.dropout = nn.Dropout(self.config.dropout)

    def add_unsaved_module(self, name, module):
        """Attach a module that is excluded from the checkpoint.

        charlms and (when not using peft) the bert model are also
        frozen here, since they are never trained by this classifier.
        """
        self.unsaved_modules += [name]
        setattr(self, name, module)

        if module is not None and (name in ('forward_charlm', 'backward_charlm') or
                                   (name == 'bert_model' and not self.config.use_peft)):
            # if we are using peft, we should not save the transformer directly
            # instead, the peft parameters only will be saved later
            for _, parameter in module.named_parameters():
                parameter.requires_grad = False

    def is_unsaved_module(self, name):
        """Return True if a state_dict key belongs to an unsaved module."""
        return name.split('.')[0] in self.unsaved_modules

    def log_configuration(self):
        """
        Log some essential information about the model configuration to the training logger
        """
        tlogger.info("Filter sizes: %s" % str(self.config.filter_sizes))
        tlogger.info("Filter channels: %s" % str(self.config.filter_channels))
        tlogger.info("Intermediate layers: %s" % str(self.config.fc_shapes))

    def log_norms(self):
        """Log the norm of each trainable parameter (charlms excluded)."""
        lines = ["NORMS FOR MODEL PARAMTERS"]
        for name, param in self.named_parameters():
            if param.requires_grad and name.split(".")[0] not in ('forward_charlm', 'backward_charlm'):
                lines.append("%s %.6g" % (name, torch.norm(param).item()))
        logger.info("\n".join(lines))

    def build_char_reps(self, inputs, max_phrase_len, charlm, projection, begin_paddings, device):
        """Run a charlm over the inputs and pad to (batch, max_phrase_len, dim).

        Each phrase's representation is placed at its begin_paddings
        offset so it lines up with the padded word embeddings.
        """
        char_reps = charlm.build_char_representation(inputs)
        if projection is not None:
            char_reps = [projection(x) for x in char_reps]
        char_inputs = torch.zeros((len(inputs), max_phrase_len, char_reps[0].shape[-1]), device=device)
        for idx, rep in enumerate(char_reps):
            start = begin_paddings[idx]
            end = start + rep.shape[0]
            char_inputs[idx, start:end, :] = rep
        return char_inputs

    def extract_bert_embeddings(self, inputs, max_phrase_len, begin_paddings, device):
        """Embed the inputs with bert and pad to (batch, max_phrase_len, dim).

        When bert_layer_mix is set, a learned combination of the hidden
        layers is used; otherwise the extraction's default layers apply.
        """
        bert_embeddings = extract_bert_embeddings(self.config.bert_model, self.bert_tokenizer, self.bert_model, inputs, device,
                                                  keep_endpoints=False,
                                                  num_layers=self.bert_layer_mix.in_features if self.bert_layer_mix is not None else None,
                                                  detach=not self.config.bert_finetune,
                                                  peft_name=self.peft_name)
        if self.bert_layer_mix is not None:
            # add the average so that the default behavior is to
            # take an average of the N layers, and anything else
            # other than that needs to be learned
            bert_embeddings = [self.bert_layer_mix(feature).squeeze(2) + feature.sum(axis=2) / self.bert_layer_mix.in_features for feature in bert_embeddings]
        bert_inputs = torch.zeros((len(inputs), max_phrase_len, bert_embeddings[0].shape[-1]), device=device)
        for idx, rep in enumerate(bert_embeddings):
            start = begin_paddings[idx]
            end = start + rep.shape[0]
            bert_inputs[idx, start:end, :] = rep
        return bert_inputs

    def forward(self, inputs):
        """Return raw logits for a batch of phrases.

        inputs may be lists of words or SentimentDatum objects
        (in which case their .text is used).
        """
        # assume all pieces are on the same device
        device = next(self.parameters()).device

        vocab_map = self.vocab_map
        def map_word(word):
            # look up the word as-is, then without a trailing
            # apostrophe, then lowercased, before giving up as UNK
            idx = vocab_map.get(word, None)
            if idx is not None:
                return idx
            if word[-1] == "'":
                idx = vocab_map.get(word[:-1], None)
                if idx is not None:
                    return idx
            return vocab_map.get(word.lower(), UNK_ID)

        inputs = [x.text if isinstance(x, SentimentDatum) else x for x in inputs]
        # we will pad each phrase so either it matches the longest
        # conv or the longest phrase in the input, whichever is longer
        max_phrase_len = max(len(x) for x in inputs)
        if self.max_window > max_phrase_len:
            max_phrase_len = self.max_window

        batch_indices = []
        batch_unknowns = []
        extra_batch_indices = []
        begin_paddings = []
        end_paddings = []

        elmo_batch_words = []

        for phrase in inputs:
            # we use random at training time to try to learn different
            # positions of padding. at test time, though, we want to
            # have consistent results, so we set that to 0 begin_pad
            if self.training:
                begin_pad_width = random.randint(0, max_phrase_len - len(phrase))
            else:
                begin_pad_width = 0
            end_pad_width = max_phrase_len - begin_pad_width - len(phrase)

            begin_paddings.append(begin_pad_width)
            end_paddings.append(end_pad_width)

            # the initial lists are the length of the begin padding
            sentence_indices = [PAD_ID] * begin_pad_width
            sentence_indices.extend([map_word(x) for x in phrase])
            sentence_indices.extend([PAD_ID] * end_pad_width)

            # the "unknowns" will be the locations of the unknown words.
            # these locations will get the specially trained unknown vector
            # TODO: split UNK based on part of speech? might be an interesting experiment
            sentence_unknowns = [idx for idx, word in enumerate(sentence_indices) if word == UNK_ID]

            batch_indices.append(sentence_indices)
            batch_unknowns.append(sentence_unknowns)

            if self.extra_vocab:
                extra_sentence_indices = [PAD_ID] * begin_pad_width
                for word in phrase:
                    if word in self.extra_vocab_map:
                        # the extra vocab is initialized from the
                        # words in the training set, which means there
                        # would be no unknown words. to occasionally
                        # train the extra vocab's unknown words, we
                        # replace 1% of the words with UNK
                        # we don't do that for the original embedding
                        # on the assumption that there may be some
                        # unknown words in the training set anyway
                        # TODO: maybe train unk for the original embedding?
                        if self.training and random.random() < 0.01:
                            extra_sentence_indices.append(UNK_ID)
                        else:
                            extra_sentence_indices.append(self.extra_vocab_map[word])
                    else:
                        extra_sentence_indices.append(UNK_ID)
                extra_sentence_indices.extend([PAD_ID] * end_pad_width)
                extra_batch_indices.append(extra_sentence_indices)

            if self.config.use_elmo:
                elmo_phrase_words = [""] * begin_pad_width
                for word in phrase:
                    elmo_phrase_words.append(word)
                elmo_phrase_words.extend([""] * end_pad_width)
                elmo_batch_words.append(elmo_phrase_words)

        # creating a single large list with all the indices lets us
        # create a single tensor, which is much faster than creating
        # many tiny tensors
        # we can convert this to the input to the CNN
        # it is padded at one or both ends so that it is now num_phrases x max_len x emb_size
        # there are two ways in which this padding is suboptimal
        # the first is that for short sentences, smaller windows will
        # be padded to the point that some windows are entirely pad
        # the second is that a sentence S will have more or less padding
        # depending on what other sentences are in its batch
        # we assume these effects are pretty minimal
        batch_indices = torch.tensor(batch_indices, requires_grad=False, device=device)
        input_vectors = self.embedding(batch_indices)
        # we use the random unk so that we are not necessarily
        # learning to match 0s for unk
        for phrase_num, sentence_unknowns in enumerate(batch_unknowns):
            input_vectors[phrase_num][sentence_unknowns] = self.unk

        if self.extra_vocab:
            extra_batch_indices = torch.tensor(extra_batch_indices, requires_grad=False, device=device)
            extra_input_vectors = self.extra_embedding(extra_batch_indices)
            if self.config.extra_wordvec_method is ExtraVectors.CONCAT:
                all_inputs = [input_vectors, extra_input_vectors]
            elif self.config.extra_wordvec_method is ExtraVectors.SUM:
                all_inputs = [input_vectors + extra_input_vectors]
            else:
                raise ValueError("unable to handle {}".format(self.config.extra_wordvec_method))
        else:
            all_inputs = [input_vectors]

        if self.forward_charlm is not None:
            char_reps_forward = self.build_char_reps(inputs, max_phrase_len, self.forward_charlm, self.charmodel_forward_projection, begin_paddings, device)
            all_inputs.append(char_reps_forward)

        if self.backward_charlm is not None:
            char_reps_backward = self.build_char_reps(inputs, max_phrase_len, self.backward_charlm, self.charmodel_backward_projection, begin_paddings, device)
            all_inputs.append(char_reps_backward)

        if self.config.use_elmo:
            # this will be N arrays of 3xMx1024 where M is the number of words
            # and N is the number of sentences (and 1024 is actually the number of weights)
            elmo_arrays = self.elmo_model.sents2elmo(elmo_batch_words, output_layer=-2)
            elmo_tensors = [torch.tensor(x).to(device=device) for x in elmo_arrays]
            # elmo_tensor will now be Nx3xMx1024
            elmo_tensor = torch.stack(elmo_tensors)
            # Nx1024xMx3
            elmo_tensor = torch.transpose(elmo_tensor, 1, 3)
            # NxMx1024x3
            elmo_tensor = torch.transpose(elmo_tensor, 1, 2)
            # NxMx1024x1
            elmo_tensor = self.elmo_combine_layers(elmo_tensor)
            # NxMx1024
            elmo_tensor = elmo_tensor.squeeze(3)
            if self.config.elmo_projection:
                elmo_tensor = self.elmo_projection(elmo_tensor)
            all_inputs.append(elmo_tensor)

        if self.bert_model is not None:
            bert_embeddings = self.extract_bert_embeddings(inputs, max_phrase_len, begin_paddings, device)
            all_inputs.append(bert_embeddings)

        # still works even if there's just one item
        input_vectors = torch.cat(all_inputs, dim=2)

        if self.config.bilstm:
            input_vectors, _ = self.bilstm(self.dropout(input_vectors))

        # reshape to fit the input tensors
        x = input_vectors.unsqueeze(1)

        conv_outs = []
        for conv, filter_size in zip(self.conv_layers, self.config.filter_sizes):
            if isinstance(filter_size, int):
                conv_out = self.dropout(F.relu(conv(x).squeeze(3)))
                conv_outs.append(conv_out)
            else:
                # 2d filters leave an extra width axis; fold it into the channels
                conv_out = conv(x).transpose(2, 3).flatten(1, 2)
                conv_out = self.dropout(F.relu(conv_out))
                conv_outs.append(conv_out)
        pool_outs = [F.max_pool2d(out, (self.config.maxpool_width, out.shape[2])).squeeze(2) for out in conv_outs]
        pooled = torch.cat(pool_outs, dim=1)

        previous_layer = pooled
        for fc in self.fc_layers[:-1]:
            previous_layer = self.dropout(F.relu(fc(previous_layer)))
        out = self.fc_layers[-1](previous_layer)
        # note that we return the raw logits rather than use a softmax
        # https://discuss.pytorch.org/t/multi-class-cross-entropy-loss-and-softmax-in-pytorch/24920/4
        return out

    def get_params(self, skip_modules=True):
        """Build the dict of state to checkpoint (model weights, config, labels)."""
        model_state = self.state_dict()
        # skip saving modules like pretrained embeddings, because they are large and will be saved in a separate file
        if skip_modules:
            skipped = [k for k in model_state.keys() if self.is_unsaved_module(k)]
            for k in skipped:
                del model_state[k]

        # enums are stored by name so the config survives serialization
        config = dataclasses.asdict(self.config)
        config['wordvec_type'] = config['wordvec_type'].name
        config['extra_wordvec_method'] = config['extra_wordvec_method'].name
        config['model_type'] = config['model_type'].name

        params = {
            'model': model_state,
            'config': config,
            'labels': self.labels,
            'extra_vocab': self.extra_vocab,
        }
        if self.config.use_peft:
            # Hide import so that peft dependency is optional
            from peft import get_peft_model_state_dict
            params["bert_lora"] = get_peft_model_state_dict(self.bert_model, adapter_name=self.peft_name)
        return params

    def preprocess_data(self, sentences):
        # NOTE(review): BaseClassifier.label_sentences calls
        # preprocess_sentences(), not preprocess_data() — confirm whether
        # this was meant to override that hook, since as named it is not
        # invoked by the shared labeling loop
        sentences = [data.update_text(s, self.config.wordvec_type) for s in sentences]
        return sentences

    def extract_sentences(self, doc):
        # TODO: tokens or words better here?
        return [[token.text for token in sentence.tokens] for sentence in doc.sentences]
stanza/stanza/models/classifiers/iterate_test.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Iterate test."""
2
+ import argparse
3
+ import glob
4
+ import logging
5
+
6
+ import stanza.models.classifier as classifier
7
+ import stanza.models.classifiers.cnn_classifier as cnn_classifier
8
+ from stanza.models.common import utils
9
+
10
+ from stanza.utils.confusion import format_confusion, confusion_to_accuracy
11
+
12
+ """
13
+ A script for running the same test file on several different classifiers.
14
+
15
+ For each one, it will output the accuracy and, if possible, the confusion matrix.
16
+
17
+ Includes the arguments for pretrain, which allows for passing in a
18
+ different directory for the pretrain file.
19
+
20
+ Example command line:
21
+ python3 -m stanza.models.classifiers.iterate_test --test_file extern_data/sentiment/sst-processed/threeclass/test-threeclass-roots.txt --glob "saved_models/classifier/FC41_3class_en_ewt_FS*ACC66*"
22
+ """
23
+
24
+ logger = logging.getLogger('stanza')
25
+
26
+
27
+ def parse_args():
28
+ """Add and parse arguments."""
29
+ parser = classifier.build_parser()
30
+
31
+ parser.add_argument('--glob', type=str, default='saved_models/classifier/*classifier*pt', help='Model file(s) to test.')
32
+
33
+ args = parser.parse_args()
34
+ return args
35
+
36
+ args = parse_args()
37
+ seed = utils.set_random_seed(args.seed)
38
+
39
+ model_files = []
40
+ for glob_piece in args.glob.split():
41
+ model_files.extend(glob.glob(glob_piece))
42
+ model_files = sorted(set(model_files))
43
+
44
+ test_set = data.read_dataset(args.test_file, args.wordvec_type, min_len=None)
45
+ logger.info("Using test set: %s" % args.test_file)
46
+
47
+ device = None
48
+ for load_name in model_files:
49
+ args.load_name = load_name
50
+ model = classifier.load_model(args)
51
+
52
+ logger.info("Testing %s" % load_name)
53
+ model = cnn_classifier.load(load_name, pretrain)
54
+ if device is None:
55
+ device = next(model.parameters()).device
56
+ logger.info("Current device: %s" % device)
57
+
58
+ labels = model.labels
59
+ classifier.check_labels(labels, test_set)
60
+
61
+ confusion = classifier.confusion_dataset(model, test_set, device=device)
62
+ correct, total = confusion_to_accuracy(confusion)
63
+ logger.info(" Results: %d correct of %d examples. Accuracy: %f" % (correct, total, correct / total))
64
+ logger.info("Confusion matrix:\n{}".format(format_confusion(confusion, model.labels)))
stanza/stanza/models/classifiers/trainer.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Organizes the model itself and its optimizer in one place
3
+
4
+ Saving the optimizer allows for easy restarting of training
5
+ """
6
+
7
+ import logging
8
+ import os
9
+ import torch
10
+ import torch.optim as optim
11
+ from types import SimpleNamespace
12
+
13
+ import stanza.models.classifiers.data as data
14
+ import stanza.models.classifiers.cnn_classifier as cnn_classifier
15
+ import stanza.models.classifiers.constituency_classifier as constituency_classifier
16
+ from stanza.models.classifiers.config import CNNConfig, ConstituencyConfig
17
+ from stanza.models.classifiers.utils import ModelType, WVType, ExtraVectors
18
+ from stanza.models.common.foundation_cache import load_bert, load_bert_with_peft, load_charlm, load_pretrain
19
+ from stanza.models.common.peft_config import build_peft_wrapper, load_peft_wrapper
20
+ from stanza.models.common.pretrain import Pretrain
21
+ from stanza.models.common.utils import get_split_optimizer
22
+ from stanza.models.constituency.tree_embedding import TreeEmbedding
23
+
24
+ from pickle import UnpicklingError
25
+ import warnings
26
+
27
+ logger = logging.getLogger('stanza')
28
+
29
+ class Trainer:
30
+ """
31
+ Stores a constituency model and its optimizer
32
+ """
33
+
34
+ def __init__(self, model, optimizer=None, epochs_trained=0, global_step=0, best_score=None):
35
+ self.model = model
36
+ self.optimizer = optimizer
37
+ # we keep track of position in the learning so that we can
38
+ # checkpoint & restart if needed without restarting the epoch count
39
+ self.epochs_trained = epochs_trained
40
+ self.global_step = global_step
41
+ # save the best dev score so that when reloading a checkpoint
42
+ # of a model, we know how far we got
43
+ self.best_score = best_score
44
+
45
+ def save(self, filename, epochs_trained=None, skip_modules=True, save_optimizer=True):
46
+ """
47
+ save the current model, optimizer, and other state to filename
48
+
49
+ epochs_trained can be passed as a parameter to handle saving at the end of an epoch
50
+ """
51
+ if epochs_trained is None:
52
+ epochs_trained = self.epochs_trained
53
+ save_dir = os.path.split(filename)[0]
54
+ os.makedirs(save_dir, exist_ok=True)
55
+ model_params = self.model.get_params(skip_modules)
56
+ params = {
57
+ 'params': model_params,
58
+ 'epochs_trained': epochs_trained,
59
+ 'global_step': self.global_step,
60
+ 'best_score': self.best_score,
61
+ }
62
+ if save_optimizer and self.optimizer is not None:
63
+ params['optimizer_state_dict'] = {opt_name: opt.state_dict() for opt_name, opt in self.optimizer.items()}
64
+ torch.save(params, filename, _use_new_zipfile_serialization=False)
65
+ logger.info("Model saved to {}".format(filename))
66
+
67
+ @staticmethod
68
+ def load(filename, args, foundation_cache=None, load_optimizer=False):
69
+ if not os.path.exists(filename):
70
+ if args.save_dir is None:
71
+ raise FileNotFoundError("Cannot find model in {} and args.save_dir is None".format(filename))
72
+ elif os.path.exists(os.path.join(args.save_dir, filename)):
73
+ filename = os.path.join(args.save_dir, filename)
74
+ else:
75
+ raise FileNotFoundError("Cannot find model in {} or in {}".format(filename, os.path.join(args.save_dir, filename)))
76
+ try:
77
+ # TODO: can remove the try/except once the new version is out
78
+ #checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
79
+ try:
80
+ checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
81
+ except UnpicklingError as e:
82
+ checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=False)
83
+ warnings.warn("The saved classifier has an old format using SimpleNamespace and/or Enum instead of a dict to store config. This version of Stanza can support reading both the new and the old formats. Future versions will only allow loading with weights_only=True. Please resave the pretrained classifier using this version ASAP.")
84
+ except BaseException:
85
+ logger.exception("Cannot load model from {}".format(filename))
86
+ raise
87
+ logger.debug("Loaded model {}".format(filename))
88
+
89
+ epochs_trained = checkpoint.get('epochs_trained', 0)
90
+ global_step = checkpoint.get('global_step', 0)
91
+ best_score = checkpoint.get('best_score', None)
92
+
93
+ # TODO: can remove this block once all models are retrained
94
+ if 'params' not in checkpoint:
95
+ model_params = {
96
+ 'model': checkpoint['model'],
97
+ 'config': checkpoint['config'],
98
+ 'labels': checkpoint['labels'],
99
+ 'extra_vocab': checkpoint['extra_vocab'],
100
+ }
101
+ else:
102
+ model_params = checkpoint['params']
103
+ # TODO: this can be removed once v1.10.0 is out
104
+ if isinstance(model_params['config'], SimpleNamespace):
105
+ model_params['config'] = vars(model_params['config'])
106
+ # TODO: these isinstance can go away after 1.10.0
107
+ model_type = model_params['config']['model_type']
108
+ if isinstance(model_type, str):
109
+ model_type = ModelType[model_type]
110
+ model_params['config']['model_type'] = model_type
111
+
112
+ if model_type == ModelType.CNN:
113
+ # TODO: these updates are only necessary during the
114
+ # transition to the @dataclass version of the config
115
+ # Once those are all saved, it is no longer necessary
116
+ # to patch existing models (since they will all be patched)
117
+ if 'has_charlm_forward' not in model_params['config']:
118
+ model_params['config']['has_charlm_forward'] = args.charlm_forward_file is not None
119
+ if 'has_charlm_backward' not in model_params['config']:
120
+ model_params['config']['has_charlm_backward'] = args.charlm_backward_file is not None
121
+ for argname in ['bert_hidden_layers', 'bert_finetune', 'force_bert_saved', 'use_peft',
122
+ 'lora_rank', 'lora_alpha', 'lora_dropout', 'lora_modules_to_save', 'lora_target_modules']:
123
+ model_params['config'][argname] = model_params['config'].get(argname, None)
124
+ # TODO: these isinstance can go away after 1.10.0
125
+ if isinstance(model_params['config']['wordvec_type'], str):
126
+ model_params['config']['wordvec_type'] = WVType[model_params['config']['wordvec_type']]
127
+ if isinstance(model_params['config']['extra_wordvec_method'], str):
128
+ model_params['config']['extra_wordvec_method'] = ExtraVectors[model_params['config']['extra_wordvec_method']]
129
+ model_params['config'] = CNNConfig(**model_params['config'])
130
+
131
+ pretrain = Trainer.load_pretrain(args, foundation_cache)
132
+ elmo_model = utils.load_elmo(args.elmo_model) if args.use_elmo else None
133
+
134
+ if model_params['config'].has_charlm_forward:
135
+ charmodel_forward = load_charlm(args.charlm_forward_file, foundation_cache)
136
+ else:
137
+ charmodel_forward = None
138
+ if model_params['config'].has_charlm_backward:
139
+ charmodel_backward = load_charlm(args.charlm_backward_file, foundation_cache)
140
+ else:
141
+ charmodel_backward = None
142
+
143
+ bert_model = model_params['config'].bert_model
144
+ # TODO: can get rid of the getattr after rebuilding all models
145
+ use_peft = getattr(model_params['config'], 'use_peft', False)
146
+ force_bert_saved = getattr(model_params['config'], 'force_bert_saved', False)
147
+ peft_name = None
148
+ if use_peft:
149
+ # if loading a peft model, we first load the base transformer
150
+ # the CNNClassifier code wraps the transformer in peft
151
+ # after creating the CNNClassifier with the peft wrapper,
152
+ # we *then* load the weights
153
+ bert_model, bert_tokenizer, peft_name = load_bert_with_peft(bert_model, "classifier", foundation_cache)
154
+ bert_model = load_peft_wrapper(bert_model, model_params['bert_lora'], vars(model_params['config']), logger, peft_name)
155
+ elif force_bert_saved:
156
+ bert_model, bert_tokenizer = load_bert(bert_model)
157
+ else:
158
+ bert_model, bert_tokenizer = load_bert(bert_model, foundation_cache)
159
+ model = cnn_classifier.CNNClassifier(pretrain=pretrain,
160
+ extra_vocab=model_params['extra_vocab'],
161
+ labels=model_params['labels'],
162
+ charmodel_forward=charmodel_forward,
163
+ charmodel_backward=charmodel_backward,
164
+ elmo_model=elmo_model,
165
+ bert_model=bert_model,
166
+ bert_tokenizer=bert_tokenizer,
167
+ force_bert_saved=force_bert_saved,
168
+ peft_name=peft_name,
169
+ args=model_params['config'])
170
+ elif model_type == ModelType.CONSTITUENCY:
171
+ # the constituency version doesn't have a peft feature yet
172
+ use_peft = False
173
+ pretrain_args = {
174
+ 'wordvec_pretrain_file': args.wordvec_pretrain_file,
175
+ 'charlm_forward_file': args.charlm_forward_file,
176
+ 'charlm_backward_file': args.charlm_backward_file,
177
+ }
178
+ # TODO: integrate with peft for the constituency version
179
+ tree_embedding = TreeEmbedding.model_from_params(model_params['tree_embedding'], pretrain_args, foundation_cache)
180
+ model_params['config'] = ConstituencyConfig(**model_params['config'])
181
+ model = constituency_classifier.ConstituencyClassifier(tree_embedding=tree_embedding,
182
+ labels=model_params['labels'],
183
+ args=model_params['config'])
184
+ else:
185
+ raise ValueError("Unknown model type {}".format(model_type))
186
+ model.load_state_dict(model_params['model'], strict=False)
187
+ model = model.to(args.device)
188
+
189
+ logger.debug("-- MODEL CONFIG --")
190
+ for k in model.config.__dict__:
191
+ logger.debug(" --{}: {}".format(k, model.config.__dict__[k]))
192
+
193
+ logger.debug("-- MODEL LABELS --")
194
+ logger.debug(" {}".format(" ".join(model.labels)))
195
+
196
+ optimizer = None
197
+ if load_optimizer:
198
+ optimizer = Trainer.build_optimizer(model, args)
199
+ if checkpoint.get('optimizer_state_dict', None) is not None:
200
+ for opt_name, opt_state_dict in checkpoint['optimizer_state_dict'].items():
201
+ optimizer[opt_name].load_state_dict(opt_state_dict)
202
+ else:
203
+ logger.info("Attempted to load optimizer to resume training, but optimizer not saved. Creating new optimizer")
204
+
205
+ trainer = Trainer(model, optimizer, epochs_trained, global_step, best_score)
206
+
207
+ return trainer
208
+
209
+
210
+ def load_pretrain(args, foundation_cache):
211
+ if args.wordvec_pretrain_file:
212
+ pretrain_file = args.wordvec_pretrain_file
213
+ elif args.wordvec_type:
214
+ pretrain_file = '{}/{}.{}.pretrain.pt'.format(args.save_dir, args.shorthand, args.wordvec_type.name.lower())
215
+ else:
216
+ raise RuntimeError("TODO: need to get the wv type back from get_wordvec_file")
217
+
218
+ logger.debug("Looking for pretrained vectors in {}".format(pretrain_file))
219
+ if os.path.exists(pretrain_file):
220
+ return load_pretrain(pretrain_file, foundation_cache)
221
+ elif args.wordvec_raw_file:
222
+ vec_file = args.wordvec_raw_file
223
+ logger.debug("Pretrain not found. Looking in {}".format(vec_file))
224
+ else:
225
+ vec_file = utils.get_wordvec_file(args.wordvec_dir, args.shorthand, args.wordvec_type.name.lower())
226
+ logger.debug("Pretrain not found. Looking in {}".format(vec_file))
227
+ pretrain = Pretrain(pretrain_file, vec_file, args.pretrain_max_vocab)
228
+ logger.debug("Embedding shape: %s" % str(pretrain.emb.shape))
229
+ return pretrain
230
+
231
+
232
+ @staticmethod
233
+ def build_new_model(args, train_set):
234
+ """
235
+ Load pretrained pieces and then build a new model
236
+ """
237
+ if train_set is None:
238
+ raise ValueError("Must have a train set to build a new model - needed for labels and delta word vectors")
239
+
240
+ labels = data.dataset_labels(train_set)
241
+
242
+ if args.model_type == ModelType.CNN:
243
+ pretrain = Trainer.load_pretrain(args, foundation_cache=None)
244
+ elmo_model = utils.load_elmo(args.elmo_model) if args.use_elmo else None
245
+ charmodel_forward = load_charlm(args.charlm_forward_file)
246
+ charmodel_backward = load_charlm(args.charlm_backward_file)
247
+ peft_name = None
248
+ bert_model, bert_tokenizer = load_bert(args.bert_model)
249
+
250
+ use_peft = getattr(args, "use_peft", False)
251
+ if use_peft:
252
+ peft_name = "sentiment"
253
+ bert_model = build_peft_wrapper(bert_model, vars(args), logger, adapter_name=peft_name)
254
+
255
+ extra_vocab = data.dataset_vocab(train_set)
256
+ force_bert_saved = args.bert_finetune
257
+ model = cnn_classifier.CNNClassifier(pretrain=pretrain,
258
+ extra_vocab=extra_vocab,
259
+ labels=labels,
260
+ charmodel_forward=charmodel_forward,
261
+ charmodel_backward=charmodel_backward,
262
+ elmo_model=elmo_model,
263
+ bert_model=bert_model,
264
+ bert_tokenizer=bert_tokenizer,
265
+ force_bert_saved=force_bert_saved,
266
+ peft_name=peft_name,
267
+ args=args)
268
+ model = model.to(args.device)
269
+ elif args.model_type == ModelType.CONSTITUENCY:
270
+ # this passes flags such as "constituency_backprop" from
271
+ # the classifier to the TreeEmbedding as the "backprop" flag
272
+ parser_args = { x[len("constituency_"):]: y for x, y in vars(args).items() if x.startswith("constituency_") }
273
+ parser_args.update({
274
+ "wordvec_pretrain_file": args.wordvec_pretrain_file,
275
+ "charlm_forward_file": args.charlm_forward_file,
276
+ "charlm_backward_file": args.charlm_backward_file,
277
+ "bert_model": args.bert_model,
278
+ # we found that finetuning from the classifier output
279
+ # all the way to the bert layers caused the bert model
280
+ # to go astray
281
+ # could make this an option... but it is much less accurate
282
+ # with the Bert finetuning
283
+ # noting that the constituency parser itself works better
284
+ # after finetuning, of course
285
+ "bert_finetune": False,
286
+ "stage1_bert_finetune": False,
287
+ })
288
+ logger.info("Building constituency classifier using %s as the base model" % args.constituency_model)
289
+ tree_embedding = TreeEmbedding.from_parser_file(parser_args)
290
+ model = constituency_classifier.ConstituencyClassifier(tree_embedding=tree_embedding,
291
+ labels=labels,
292
+ args=args)
293
+ model = model.to(args.device)
294
+ else:
295
+ raise ValueError("Unhandled model type {}".format(args.model_type))
296
+
297
+ optimizer = Trainer.build_optimizer(model, args)
298
+
299
+ return Trainer(model, optimizer)
300
+
301
+
302
+ @staticmethod
303
+ def build_optimizer(model, args):
304
+ return get_split_optimizer(args.optim.lower(), model, args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay, bert_learning_rate=args.bert_learning_rate, bert_weight_decay=args.weight_decay * args.bert_weight_decay, is_peft=args.use_peft)
stanza/stanza/models/constituency/__init__.py ADDED
File without changes
stanza/stanza/models/constituency/evaluate_treebanks.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Read multiple treebanks, score the results.
3
+
4
+ Reports the k-best score if multiple predicted treebanks are given.
5
+ """
6
+
7
+ import argparse
8
+
9
+ from stanza.models.constituency import tree_reader
10
+ from stanza.server.parser_eval import EvaluateParser, ParseResult
11
+
12
+
13
+ def main():
14
+ parser = argparse.ArgumentParser(description='Get scores for one or more treebanks against the gold')
15
+ parser.add_argument('gold', type=str, help='Which file to load as the gold trees')
16
+ parser.add_argument('pred', type=str, nargs='+', help='Which file(s) are the predictions. If more than one is given, the evaluation will be "k-best" with the first prediction treated as the canonical')
17
+ args = parser.parse_args()
18
+
19
+ print("Loading gold treebank: " + args.gold)
20
+ gold = tree_reader.read_treebank(args.gold)
21
+ print("Loading predicted treebanks: " + args.pred)
22
+ pred = [tree_reader.read_treebank(x) for x in args.pred]
23
+
24
+ full_results = [ParseResult(parses[0], [*parses[1:]])
25
+ for parses in zip(gold, *pred)]
26
+
27
+ if len(pred) <= 1:
28
+ kbest = None
29
+ else:
30
+ kbest = len(pred)
31
+
32
+ with EvaluateParser(kbest=kbest) as evaluator:
33
+ response = evaluator.process(full_results)
34
+
35
+ if __name__ == '__main__':
36
+ main()
stanza/stanza/models/constituency/label_attention.py ADDED
@@ -0,0 +1,726 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import functools
3
+ import sys
4
+ import torch
5
+ from torch.autograd import Variable
6
+ import torch.nn as nn
7
+ import torch.nn.init as init
8
+
9
+ # publicly available versions alternate between torch.uint8 and torch.bool,
10
+ # but that is for older versions of torch anyway
11
+ DTYPE = torch.bool
12
+
13
+ class BatchIndices:
14
+ """
15
+ Batch indices container class (used to implement packed batches)
16
+ """
17
+ def __init__(self, batch_idxs_np, device):
18
+ self.batch_idxs_np = batch_idxs_np
19
+ self.batch_idxs_torch = torch.as_tensor(batch_idxs_np, dtype=torch.long, device=device)
20
+
21
+ self.batch_size = int(1 + np.max(batch_idxs_np))
22
+
23
+ batch_idxs_np_extra = np.concatenate([[-1], batch_idxs_np, [-1]])
24
+ self.boundaries_np = np.nonzero(batch_idxs_np_extra[1:] != batch_idxs_np_extra[:-1])[0]
25
+
26
+ #print(f"boundaries_np: {self.boundaries_np}")
27
+ #print(f"boundaries_np[1:]: {self.boundaries_np[1:]}")
28
+ #print(f"boundaries_np[:-1]: {self.boundaries_np[:-1]}")
29
+ self.seq_lens_np = self.boundaries_np[1:] - self.boundaries_np[:-1]
30
+ #print(f"seq_lens_np: {self.seq_lens_np}")
31
+ #print(f"batch_size: {self.batch_size}")
32
+ assert len(self.seq_lens_np) == self.batch_size
33
+ self.max_len = int(np.max(self.boundaries_np[1:] - self.boundaries_np[:-1]))
34
+
35
+
36
+ class FeatureDropoutFunction(torch.autograd.function.InplaceFunction):
37
+ @classmethod
38
+ def forward(cls, ctx, input, batch_idxs, p=0.5, train=False, inplace=False):
39
+ if p < 0 or p > 1:
40
+ raise ValueError("dropout probability has to be between 0 and 1, "
41
+ "but got {}".format(p))
42
+
43
+ ctx.p = p
44
+ ctx.train = train
45
+ ctx.inplace = inplace
46
+
47
+ if ctx.inplace:
48
+ ctx.mark_dirty(input)
49
+ output = input
50
+ else:
51
+ output = input.clone()
52
+
53
+ if ctx.p > 0 and ctx.train:
54
+ ctx.noise = input.new().resize_(batch_idxs.batch_size, input.size(1))
55
+ if ctx.p == 1:
56
+ ctx.noise.fill_(0)
57
+ else:
58
+ ctx.noise.bernoulli_(1 - ctx.p).div_(1 - ctx.p)
59
+ ctx.noise = ctx.noise[batch_idxs.batch_idxs_torch, :]
60
+ output.mul_(ctx.noise)
61
+
62
+ return output
63
+
64
+ @staticmethod
65
+ def backward(ctx, grad_output):
66
+ if ctx.p > 0 and ctx.train:
67
+ return grad_output.mul(ctx.noise), None, None, None, None
68
+ else:
69
+ return grad_output, None, None, None, None
70
+
71
+ #
72
+ class FeatureDropout(nn.Module):
73
+ """
74
+ Feature-level dropout: takes an input of size len x num_features and drops
75
+ each feature with probabibility p. A feature is dropped across the full
76
+ portion of the input that corresponds to a single batch element.
77
+ """
78
+ def __init__(self, p=0.5, inplace=False):
79
+ super().__init__()
80
+ if p < 0 or p > 1:
81
+ raise ValueError("dropout probability has to be between 0 and 1, "
82
+ "but got {}".format(p))
83
+ self.p = p
84
+ self.inplace = inplace
85
+
86
+ def forward(self, input, batch_idxs):
87
+ return FeatureDropoutFunction.apply(input, batch_idxs, self.p, self.training, self.inplace)
88
+
89
+
90
+
91
+ class LayerNormalization(nn.Module):
92
+ def __init__(self, d_hid, eps=1e-3, affine=True):
93
+ super(LayerNormalization, self).__init__()
94
+
95
+ self.eps = eps
96
+ self.affine = affine
97
+ if self.affine:
98
+ self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True)
99
+ self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True)
100
+
101
+ def forward(self, z):
102
+ if z.size(-1) == 1:
103
+ return z
104
+
105
+ mu = torch.mean(z, keepdim=True, dim=-1)
106
+ sigma = torch.std(z, keepdim=True, dim=-1)
107
+ ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps)
108
+ if self.affine:
109
+ ln_out = ln_out * self.a_2.expand_as(ln_out) + self.b_2.expand_as(ln_out)
110
+
111
+ return ln_out
112
+
113
+
114
+
115
+ class ScaledDotProductAttention(nn.Module):
116
+ def __init__(self, d_model, attention_dropout=0.1):
117
+ super(ScaledDotProductAttention, self).__init__()
118
+ self.temper = d_model ** 0.5
119
+ self.dropout = nn.Dropout(attention_dropout)
120
+ self.softmax = nn.Softmax(dim=-1)
121
+
122
+ def forward(self, q, k, v, attn_mask=None):
123
+ # q: [batch, slot, feat] or (batch * d_l) x max_len x d_k
124
+ # k: [batch, slot, feat] or (batch * d_l) x max_len x d_k
125
+ # v: [batch, slot, feat] or (batch * d_l) x max_len x d_v
126
+ # q in LAL is (batch * d_l) x 1 x d_k
127
+
128
+ attn = torch.bmm(q, k.transpose(1, 2)) / self.temper # (batch * d_l) x max_len x max_len
129
+ # in LAL, gives: (batch * d_l) x 1 x max_len
130
+ # attention weights from each word to each word, for each label
131
+ # in best model (repeated q): attention weights from label (as vector weights) to each word
132
+
133
+ if attn_mask is not None:
134
+ assert attn_mask.size() == attn.size(), \
135
+ 'Attention mask shape {} mismatch ' \
136
+ 'with Attention logit tensor shape ' \
137
+ '{}.'.format(attn_mask.size(), attn.size())
138
+
139
+ attn.data.masked_fill_(attn_mask, -float('inf'))
140
+
141
+ attn = self.softmax(attn)
142
+ # Note that this makes the distribution not sum to 1. At some point it
143
+ # may be worth researching whether this is the right way to apply
144
+ # dropout to the attention.
145
+ # Note that the t2t code also applies dropout in this manner
146
+ attn = self.dropout(attn)
147
+ output = torch.bmm(attn, v) # (batch * d_l) x max_len x d_v
148
+ # in LAL, gives: (batch * d_l) x 1 x d_v
149
+
150
+ return output, attn
151
+
152
+
153
+ class MultiHeadAttention(nn.Module):
154
+ """
155
+ Multi-head attention module
156
+ """
157
+
158
+ def __init__(self, n_head, d_model, d_k, d_v, residual_dropout=0.1, attention_dropout=0.1, d_positional=None):
159
+ super(MultiHeadAttention, self).__init__()
160
+
161
+ self.n_head = n_head
162
+ self.d_k = d_k
163
+ self.d_v = d_v
164
+
165
+ if not d_positional:
166
+ self.partitioned = False
167
+ else:
168
+ self.partitioned = True
169
+
170
+ if self.partitioned:
171
+ self.d_content = d_model - d_positional
172
+ self.d_positional = d_positional
173
+
174
+ self.w_qs1 = nn.Parameter(torch.FloatTensor(n_head, self.d_content, d_k // 2))
175
+ self.w_ks1 = nn.Parameter(torch.FloatTensor(n_head, self.d_content, d_k // 2))
176
+ self.w_vs1 = nn.Parameter(torch.FloatTensor(n_head, self.d_content, d_v // 2))
177
+
178
+ self.w_qs2 = nn.Parameter(torch.FloatTensor(n_head, self.d_positional, d_k // 2))
179
+ self.w_ks2 = nn.Parameter(torch.FloatTensor(n_head, self.d_positional, d_k // 2))
180
+ self.w_vs2 = nn.Parameter(torch.FloatTensor(n_head, self.d_positional, d_v // 2))
181
+
182
+ init.xavier_normal_(self.w_qs1)
183
+ init.xavier_normal_(self.w_ks1)
184
+ init.xavier_normal_(self.w_vs1)
185
+
186
+ init.xavier_normal_(self.w_qs2)
187
+ init.xavier_normal_(self.w_ks2)
188
+ init.xavier_normal_(self.w_vs2)
189
+ else:
190
+ self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
191
+ self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
192
+ self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v))
193
+
194
+ init.xavier_normal_(self.w_qs)
195
+ init.xavier_normal_(self.w_ks)
196
+ init.xavier_normal_(self.w_vs)
197
+
198
+ self.attention = ScaledDotProductAttention(d_model, attention_dropout=attention_dropout)
199
+ self.layer_norm = LayerNormalization(d_model)
200
+
201
+ if not self.partitioned:
202
+ # The lack of a bias term here is consistent with the t2t code, though
203
+ # in my experiments I have never observed this making a difference.
204
+ self.proj = nn.Linear(n_head*d_v, d_model, bias=False)
205
+ else:
206
+ self.proj1 = nn.Linear(n_head*(d_v//2), self.d_content, bias=False)
207
+ self.proj2 = nn.Linear(n_head*(d_v//2), self.d_positional, bias=False)
208
+
209
+ self.residual_dropout = FeatureDropout(residual_dropout)
210
+
211
+ def split_qkv_packed(self, inp, qk_inp=None):
212
+ v_inp_repeated = inp.repeat(self.n_head, 1).view(self.n_head, -1, inp.size(-1)) # n_head x len_inp x d_model
213
+ if qk_inp is None:
214
+ qk_inp_repeated = v_inp_repeated
215
+ else:
216
+ qk_inp_repeated = qk_inp.repeat(self.n_head, 1).view(self.n_head, -1, qk_inp.size(-1))
217
+
218
+ if not self.partitioned:
219
+ q_s = torch.bmm(qk_inp_repeated, self.w_qs) # n_head x len_inp x d_k
220
+ k_s = torch.bmm(qk_inp_repeated, self.w_ks) # n_head x len_inp x d_k
221
+ v_s = torch.bmm(v_inp_repeated, self.w_vs) # n_head x len_inp x d_v
222
+ else:
223
+ q_s = torch.cat([
224
+ torch.bmm(qk_inp_repeated[:,:,:self.d_content], self.w_qs1),
225
+ torch.bmm(qk_inp_repeated[:,:,self.d_content:], self.w_qs2),
226
+ ], -1)
227
+ k_s = torch.cat([
228
+ torch.bmm(qk_inp_repeated[:,:,:self.d_content], self.w_ks1),
229
+ torch.bmm(qk_inp_repeated[:,:,self.d_content:], self.w_ks2),
230
+ ], -1)
231
+ v_s = torch.cat([
232
+ torch.bmm(v_inp_repeated[:,:,:self.d_content], self.w_vs1),
233
+ torch.bmm(v_inp_repeated[:,:,self.d_content:], self.w_vs2),
234
+ ], -1)
235
+ return q_s, k_s, v_s
236
+
237
+ def pad_and_rearrange(self, q_s, k_s, v_s, batch_idxs):
238
+ # Input is padded representation: n_head x len_inp x d
239
+ # Output is packed representation: (n_head * mb_size) x len_padded x d
240
+ # (along with masks for the attention and output)
241
+ n_head = self.n_head
242
+ d_k, d_v = self.d_k, self.d_v
243
+
244
+ len_padded = batch_idxs.max_len
245
+ mb_size = batch_idxs.batch_size
246
+ q_padded = q_s.new_zeros((n_head, mb_size, len_padded, d_k))
247
+ k_padded = k_s.new_zeros((n_head, mb_size, len_padded, d_k))
248
+ v_padded = v_s.new_zeros((n_head, mb_size, len_padded, d_v))
249
+ invalid_mask = q_s.new_ones((mb_size, len_padded), dtype=DTYPE)
250
+
251
+ for i, (start, end) in enumerate(zip(batch_idxs.boundaries_np[:-1], batch_idxs.boundaries_np[1:])):
252
+ q_padded[:,i,:end-start,:] = q_s[:,start:end,:]
253
+ k_padded[:,i,:end-start,:] = k_s[:,start:end,:]
254
+ v_padded[:,i,:end-start,:] = v_s[:,start:end,:]
255
+ invalid_mask[i, :end-start].fill_(False)
256
+
257
+ return(
258
+ q_padded.view(-1, len_padded, d_k),
259
+ k_padded.view(-1, len_padded, d_k),
260
+ v_padded.view(-1, len_padded, d_v),
261
+ invalid_mask.unsqueeze(1).expand(mb_size, len_padded, len_padded).repeat(n_head, 1, 1),
262
+ (~invalid_mask).repeat(n_head, 1),
263
+ )
264
+
265
+ def combine_v(self, outputs):
266
+ # Combine attention information from the different heads
267
+ n_head = self.n_head
268
+ outputs = outputs.view(n_head, -1, self.d_v) # n_head x len_inp x d_kv
269
+
270
+ if not self.partitioned:
271
+ # Switch from n_head x len_inp x d_v to len_inp x (n_head * d_v)
272
+ outputs = torch.transpose(outputs, 0, 1).contiguous().view(-1, n_head * self.d_v)
273
+
274
+ # Project back to residual size
275
+ outputs = self.proj(outputs)
276
+ else:
277
+ d_v1 = self.d_v // 2
278
+ outputs1 = outputs[:,:,:d_v1]
279
+ outputs2 = outputs[:,:,d_v1:]
280
+ outputs1 = torch.transpose(outputs1, 0, 1).contiguous().view(-1, n_head * d_v1)
281
+ outputs2 = torch.transpose(outputs2, 0, 1).contiguous().view(-1, n_head * d_v1)
282
+ outputs = torch.cat([
283
+ self.proj1(outputs1),
284
+ self.proj2(outputs2),
285
+ ], -1)
286
+
287
+ return outputs
288
+
289
def forward(self, inp, batch_idxs, qk_inp=None):
    """
    Multi-head attention over a packed batch of sentences.

    inp: packed representation, len_inp x d_model
    batch_idxs: sentence boundary bookkeeping used for padding/unpadding
    qk_inp: optional separate input for queries/keys (values come from inp)

    Returns (layer_norm(attention output + residual), padded attention weights).
    """
    residual = inp

    # While still using a packed representation, project to obtain the
    # query/key/value for each head
    q_s, k_s, v_s = self.split_qkv_packed(inp, qk_inp=qk_inp)
    # n_head x len_inp x d_kv

    # Switch to padded representation, perform attention, then switch back
    q_padded, k_padded, v_padded, attn_mask, output_mask = self.pad_and_rearrange(q_s, k_s, v_s, batch_idxs)
    # (n_head * batch) x len_padded x d_kv

    outputs_padded, attns_padded = self.attention(
        q_padded, k_padded, v_padded,
        attn_mask=attn_mask,
    )
    # output_mask selects only the real (non-padding) positions
    outputs = outputs_padded[output_mask]
    # (n_head * len_inp) x d_kv
    outputs = self.combine_v(outputs)
    # len_inp x d_model

    outputs = self.residual_dropout(outputs, batch_idxs)

    return self.layer_norm(outputs + residual), attns_padded
313
+
314
+ #
315
class PositionwiseFeedForward(nn.Module):
    """
    Position-wise feed forward layer.

    Expands each position to d_ff, applies ReLU (with feature dropout),
    contracts back to d_hid, then adds the residual and layer-normalizes.
    """

    def __init__(self, d_hid, d_ff, relu_dropout=0.1, residual_dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        # expansion / contraction projections
        self.w_1 = nn.Linear(d_hid, d_ff)
        self.w_2 = nn.Linear(d_ff, d_hid)
        self.layer_norm = LayerNormalization(d_hid)
        self.relu_dropout = FeatureDropout(relu_dropout)
        self.residual_dropout = FeatureDropout(residual_dropout)
        self.relu = nn.ReLU()

    def forward(self, x, batch_idxs):
        hidden = self.relu_dropout(self.relu(self.w_1(x)), batch_idxs)
        projected = self.residual_dropout(self.w_2(hidden), batch_idxs)
        # residual connection followed by layer normalization
        return self.layer_norm(projected + x)
343
+
344
+ #
345
class PartitionedPositionwiseFeedForward(nn.Module):
    """
    Feed-forward layer that keeps content and positional features separate.

    The leading d_content features and the trailing d_positional features are
    each processed by their own linear/ReLU/linear stack (each d_ff//2 wide),
    then re-joined before the residual add and layer norm.
    """

    def __init__(self, d_hid, d_ff, d_positional, relu_dropout=0.1, residual_dropout=0.1):
        super().__init__()
        self.d_content = d_hid - d_positional
        # content half
        self.w_1c = nn.Linear(self.d_content, d_ff//2)
        self.w_2c = nn.Linear(d_ff//2, self.d_content)
        # positional half
        self.w_1p = nn.Linear(d_positional, d_ff//2)
        self.w_2p = nn.Linear(d_ff//2, d_positional)
        self.layer_norm = LayerNormalization(d_hid)
        self.relu_dropout = FeatureDropout(relu_dropout)
        self.residual_dropout = FeatureDropout(residual_dropout)
        self.relu = nn.ReLU()

    def forward(self, x, batch_idxs):
        content = x[:, :self.d_content]
        positional = x[:, self.d_content:]

        # content path first, then positional, to keep dropout RNG order stable
        hidden_c = self.relu_dropout(self.relu(self.w_1c(content)), batch_idxs)
        out_c = self.w_2c(hidden_c)

        hidden_p = self.relu_dropout(self.relu(self.w_1p(positional)), batch_idxs)
        out_p = self.w_2p(hidden_p)

        combined = self.residual_dropout(torch.cat([out_c, out_p], -1), batch_idxs)
        return self.layer_norm(combined + x)
375
+
376
class LabelAttention(nn.Module):
    """
    Single-head Attention layer for label-specific representations

    Each of the d_l labels attends over the input sequence with its own
    key/value projections; queries are either learned per-label vectors
    (the default) or per-label matrices applied to the input (q_as_matrix).
    """

    def __init__(self, d_model, d_k, d_v, d_l, d_proj, combine_as_self, use_resdrop=True, q_as_matrix=False, residual_dropout=0.1, attention_dropout=0.1, d_positional=None):
        super(LabelAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v
        self.d_l = d_l # Number of Labels
        self.d_model = d_model # Model Dimensionality
        self.d_proj = d_proj # Projection dimension of each label output
        self.use_resdrop = use_resdrop # Using Residual Dropout?
        self.q_as_matrix = q_as_matrix # Using a Matrix of Q to be multiplied with input instead of learned q vectors
        self.combine_as_self = combine_as_self # Using the Combination Method of Self-Attention

        # partitioned: split features into content / positional halves,
        # each with separate projections (as in the partitioned transformer)
        if not d_positional:
            self.partitioned = False
        else:
            self.partitioned = True

        if self.partitioned:
            if d_model <= d_positional:
                raise ValueError("Unable to build LabelAttention. d_model %d <= d_positional %d" % (d_model, d_positional))
            self.d_content = d_model - d_positional
            self.d_positional = d_positional

            # "1" parameters act on the content half, "2" on the positional half
            if self.q_as_matrix:
                self.w_qs1 = nn.Parameter(torch.FloatTensor(self.d_l, self.d_content, d_k // 2), requires_grad=True)
            else:
                self.w_qs1 = nn.Parameter(torch.FloatTensor(self.d_l, d_k // 2), requires_grad=True)
            self.w_ks1 = nn.Parameter(torch.FloatTensor(self.d_l, self.d_content, d_k // 2), requires_grad=True)
            self.w_vs1 = nn.Parameter(torch.FloatTensor(self.d_l, self.d_content, d_v // 2), requires_grad=True)

            if self.q_as_matrix:
                self.w_qs2 = nn.Parameter(torch.FloatTensor(self.d_l, self.d_positional, d_k // 2), requires_grad=True)
            else:
                self.w_qs2 = nn.Parameter(torch.FloatTensor(self.d_l, d_k // 2), requires_grad=True)
            self.w_ks2 = nn.Parameter(torch.FloatTensor(self.d_l, self.d_positional, d_k // 2), requires_grad=True)
            self.w_vs2 = nn.Parameter(torch.FloatTensor(self.d_l, self.d_positional, d_v // 2), requires_grad=True)

            init.xavier_normal_(self.w_qs1)
            init.xavier_normal_(self.w_ks1)
            init.xavier_normal_(self.w_vs1)

            init.xavier_normal_(self.w_qs2)
            init.xavier_normal_(self.w_ks2)
            init.xavier_normal_(self.w_vs2)
        else:
            if self.q_as_matrix:
                self.w_qs = nn.Parameter(torch.FloatTensor(self.d_l, d_model, d_k), requires_grad=True)
            else:
                self.w_qs = nn.Parameter(torch.FloatTensor(self.d_l, d_k), requires_grad=True)
            self.w_ks = nn.Parameter(torch.FloatTensor(self.d_l, d_model, d_k), requires_grad=True)
            self.w_vs = nn.Parameter(torch.FloatTensor(self.d_l, d_model, d_v), requires_grad=True)

            init.xavier_normal_(self.w_qs)
            init.xavier_normal_(self.w_ks)
            init.xavier_normal_(self.w_vs)

        self.attention = ScaledDotProductAttention(d_model, attention_dropout=attention_dropout)
        if self.combine_as_self:
            self.layer_norm = LayerNormalization(d_model)
        else:
            self.layer_norm = LayerNormalization(self.d_proj)

        if not self.partitioned:
            # The lack of a bias term here is consistent with the t2t code, though
            # in my experiments I have never observed this making a difference.
            if self.combine_as_self:
                self.proj = nn.Linear(self.d_l * d_v, d_model, bias=False)
            else:
                self.proj = nn.Linear(d_v, d_model, bias=False) # input dimension does not match, should be d_l * d_v
        else:
            if self.combine_as_self:
                self.proj1 = nn.Linear(self.d_l*(d_v//2), self.d_content, bias=False)
                self.proj2 = nn.Linear(self.d_l*(d_v//2), self.d_positional, bias=False)
            else:
                self.proj1 = nn.Linear(d_v//2, self.d_content, bias=False)
                self.proj2 = nn.Linear(d_v//2, self.d_positional, bias=False)
        if not self.combine_as_self:
            # compresses each per-label d_model vector down to d_proj
            self.reduce_proj = nn.Linear(d_model, self.d_proj, bias=False)

        self.residual_dropout = FeatureDropout(residual_dropout)

    def split_qkv_packed(self, inp, k_inp=None):
        """
        Project the packed input into per-label queries, keys and values.

        inp: len_inp x d_model packed representation (values always come from inp)
        k_inp: optional alternate input for keys (and matrix queries)

        Returns (q_s, k_s, v_s); k_s and v_s are d_l x len_inp x d,
        q_s is d_l x 1 x d_k for learned query vectors (d_l x len_inp x d_k
        when q_as_matrix).
        """
        len_inp = inp.size(0)
        v_inp_repeated = inp.repeat(self.d_l, 1).view(self.d_l, -1, inp.size(-1)) # d_l x len_inp x d_model
        if k_inp is None:
            k_inp_repeated = v_inp_repeated
        else:
            k_inp_repeated = k_inp.repeat(self.d_l, 1).view(self.d_l, -1, k_inp.size(-1)) # d_l x len_inp x d_model

        if not self.partitioned:
            if self.q_as_matrix:
                q_s = torch.bmm(k_inp_repeated, self.w_qs) # d_l x len_inp x d_k
            else:
                q_s = self.w_qs.unsqueeze(1) # d_l x 1 x d_k
            k_s = torch.bmm(k_inp_repeated, self.w_ks) # d_l x len_inp x d_k
            v_s = torch.bmm(v_inp_repeated, self.w_vs) # d_l x len_inp x d_v
        else:
            # content and positional halves are projected separately, then
            # concatenated along the feature dimension
            if self.q_as_matrix:
                q_s = torch.cat([
                    torch.bmm(k_inp_repeated[:,:,:self.d_content], self.w_qs1),
                    torch.bmm(k_inp_repeated[:,:,self.d_content:], self.w_qs2),
                ], -1)
            else:
                q_s = torch.cat([
                    self.w_qs1.unsqueeze(1),
                    self.w_qs2.unsqueeze(1),
                ], -1)
            k_s = torch.cat([
                torch.bmm(k_inp_repeated[:,:,:self.d_content], self.w_ks1),
                torch.bmm(k_inp_repeated[:,:,self.d_content:], self.w_ks2),
            ], -1)
            v_s = torch.cat([
                torch.bmm(v_inp_repeated[:,:,:self.d_content], self.w_vs1),
                torch.bmm(v_inp_repeated[:,:,self.d_content:], self.w_vs2),
            ], -1)
        return q_s, k_s, v_s

    def pad_and_rearrange(self, q_s, k_s, v_s, batch_idxs):
        # Input is padded representation: n_head x len_inp x d
        # Output is packed representation: (n_head * mb_size) x len_padded x d
        # (along with masks for the attention and output)
        n_head = self.d_l
        d_k, d_v = self.d_k, self.d_v

        len_padded = batch_idxs.max_len
        mb_size = batch_idxs.batch_size
        if self.q_as_matrix:
            q_padded = q_s.new_zeros((n_head, mb_size, len_padded, d_k))
        else:
            # learned query vectors: just tile them per batch element
            q_padded = q_s.repeat(mb_size, 1, 1) # (d_l * mb_size) x 1 x d_k
        k_padded = k_s.new_zeros((n_head, mb_size, len_padded, d_k))
        v_padded = v_s.new_zeros((n_head, mb_size, len_padded, d_v))
        invalid_mask = q_s.new_ones((mb_size, len_padded), dtype=DTYPE)

        # copy each sentence's span out of the packed representation
        for i, (start, end) in enumerate(zip(batch_idxs.boundaries_np[:-1], batch_idxs.boundaries_np[1:])):
            if self.q_as_matrix:
                q_padded[:,i,:end-start,:] = q_s[:,start:end,:]
            k_padded[:,i,:end-start,:] = k_s[:,start:end,:]
            v_padded[:,i,:end-start,:] = v_s[:,start:end,:]
            invalid_mask[i, :end-start].fill_(False)

        if self.q_as_matrix:
            q_padded = q_padded.view(-1, len_padded, d_k)
            attn_mask = invalid_mask.unsqueeze(1).expand(mb_size, len_padded, len_padded).repeat(n_head, 1, 1)
        else:
            attn_mask = invalid_mask.unsqueeze(1).repeat(n_head, 1, 1)

        output_mask = (~invalid_mask).repeat(n_head, 1)

        return(
            q_padded,
            k_padded.view(-1, len_padded, d_k),
            v_padded.view(-1, len_padded, d_v),
            attn_mask,
            output_mask,
        )

    def combine_v(self, outputs):
        """
        Combine the per-label attention outputs back toward model space.

        With combine_as_self, behaves like multi-head attention (flatten
        labels into the feature dim); otherwise keeps a separate d_model
        vector per label: len_inp x d_l x d_model.
        """
        # Combine attention information from the different labels
        d_l = self.d_l
        outputs = outputs.view(d_l, -1, self.d_v) # d_l x len_inp x d_v

        if not self.partitioned:
            # Switch from d_l x len_inp x d_v to len_inp x d_l x d_v
            if self.combine_as_self:
                outputs = torch.transpose(outputs, 0, 1).contiguous().view(-1, d_l * self.d_v)
            else:
                outputs = torch.transpose(outputs, 0, 1)#.contiguous() #.view(-1, d_l * self.d_v)
            # Project back to residual size
            outputs = self.proj(outputs) # Becomes len_inp x d_l x d_model
        else:
            d_v1 = self.d_v // 2
            outputs1 = outputs[:,:,:d_v1]
            outputs2 = outputs[:,:,d_v1:]
            if self.combine_as_self:
                outputs1 = torch.transpose(outputs1, 0, 1).contiguous().view(-1, d_l * d_v1)
                outputs2 = torch.transpose(outputs2, 0, 1).contiguous().view(-1, d_l * d_v1)
            else:
                outputs1 = torch.transpose(outputs1, 0, 1)#.contiguous() #.view(-1, d_l * d_v1)
                outputs2 = torch.transpose(outputs2, 0, 1)#.contiguous() #.view(-1, d_l * d_v1)
            outputs = torch.cat([
                self.proj1(outputs1),
                self.proj2(outputs2),
            ], -1)#.contiguous()

        return outputs

    def forward(self, inp, batch_idxs, k_inp=None):
        """
        Attend over the packed input once per label.

        Returns (outputs, attns_padded).  With combine_as_self the output is
        len_inp x d_model; otherwise each label's vector is reduced to d_proj
        and flattened, giving len_inp x (d_l * d_proj).
        """
        residual = inp # len_inp x d_model
        #print()
        #print(f"inp.shape: {inp.shape}")
        len_inp = inp.size(0)
        #print(f"len_inp: {len_inp}")

        # While still using a packed representation, project to obtain the
        # query/key/value for each head
        q_s, k_s, v_s = self.split_qkv_packed(inp, k_inp=k_inp)
        # d_l x len_inp x d_k
        # q_s is d_l x 1 x d_k

        # Switch to padded representation, perform attention, then switch back
        q_padded, k_padded, v_padded, attn_mask, output_mask = self.pad_and_rearrange(q_s, k_s, v_s, batch_idxs)
        # q_padded, k_padded, v_padded: (d_l * batch_size) x max_len x d_kv
        # q_s is (d_l * batch_size) x 1 x d_kv

        outputs_padded, attns_padded = self.attention(
            q_padded, k_padded, v_padded,
            attn_mask=attn_mask,
        )
        # outputs_padded: (d_l * batch_size) x max_len x d_kv
        # in LAL: (d_l * batch_size) x 1 x d_kv
        # on the best model, this is one value vector per label that is repeated max_len times
        if not self.q_as_matrix:
            outputs_padded = outputs_padded.repeat(1,output_mask.size(-1),1)
        outputs = outputs_padded[output_mask]
        # outputs: (d_l * len_inp) x d_kv or LAL: (d_l * len_inp) x d_kv
        # output_mask: (d_l * batch_size) x max_len
        outputs = self.combine_v(outputs)
        #print(f"outputs shape: {outputs.shape}")
        # outputs: len_inp x d_l x d_model, whereas a normal self-attention layer gets len_inp x d_model
        if self.use_resdrop:
            if self.combine_as_self:
                outputs = self.residual_dropout(outputs, batch_idxs)
            else:
                # apply residual dropout per label slice
                outputs = torch.cat([self.residual_dropout(outputs[:,i,:], batch_idxs).unsqueeze(1) for i in range(self.d_l)], 1)
        if self.combine_as_self:
            outputs = self.layer_norm(outputs + inp)
        else:
            # add the residual to every label's representation
            for l in range(self.d_l):
                outputs[:, l, :] = outputs[:, l, :] + inp

            outputs = self.reduce_proj(outputs) # len_inp x d_l x d_proj
            outputs = self.layer_norm(outputs) # len_inp x d_l x d_proj
            outputs = outputs.view(len_inp, -1).contiguous() # len_inp x (d_l * d_proj)

        return outputs, attns_padded
616
+
617
+
618
+ #
619
class LabelAttentionModule(nn.Module):
    """
    Label Attention Module for label-specific representations
    The module can be used right after the Partitioned Attention, or it can be experimented with for the transition stack
    """
    #
    def __init__(self,
                 d_model,
                 d_input_proj,
                 d_k,
                 d_v,
                 d_l,
                 d_proj,
                 combine_as_self,
                 use_resdrop=True,
                 q_as_matrix=False,
                 residual_dropout=0.1,
                 attention_dropout=0.1,
                 d_positional=None,
                 d_ff=2048,
                 relu_dropout=0.2,
                 lattn_partitioned=True):
        """
        d_model: width of the incoming word embeddings
        d_input_proj: if set, project the content part of the input down to
            this width before attention (positional features pass through)
        d_k, d_v, d_l, d_proj, combine_as_self, use_resdrop, q_as_matrix,
        residual_dropout, attention_dropout: forwarded to LabelAttention
        d_positional / lattn_partitioned: partitioning of the feature space;
            when not partitioned, the positional width is treated as 0
        d_ff, relu_dropout: settings for the trailing feed-forward layer
        """
        super().__init__()
        # width of the concatenated per-label outputs
        self.ff_dim = d_proj * d_l

        if not lattn_partitioned:
            self.d_positional = 0
        else:
            self.d_positional = d_positional if d_positional else 0

        if d_input_proj:
            if d_input_proj <= self.d_positional:
                raise ValueError("Illegal argument for d_input_proj: d_input_proj %d is smaller than d_positional %d" % (d_input_proj, self.d_positional))
            # only the content portion is projected; positional stays intact
            self.input_projection = nn.Linear(d_model - self.d_positional, d_input_proj - self.d_positional, bias=False)
            d_input = d_input_proj
        else:
            self.input_projection = None
            d_input = d_model

        self.label_attention = LabelAttention(d_input,
                                              d_k,
                                              d_v,
                                              d_l,
                                              d_proj,
                                              combine_as_self,
                                              use_resdrop,
                                              q_as_matrix,
                                              residual_dropout,
                                              attention_dropout,
                                              self.d_positional)

        if not lattn_partitioned:
            self.lal_ff = PositionwiseFeedForward(self.ff_dim,
                                                  d_ff,
                                                  relu_dropout,
                                                  residual_dropout)
        else:
            self.lal_ff = PartitionedPositionwiseFeedForward(self.ff_dim,
                                                             d_ff,
                                                             self.d_positional,
                                                             relu_dropout,
                                                             residual_dropout)

    def forward(self, word_embeddings, tagged_word_lists):
        """
        word_embeddings: list of per-sentence tensors (words x features)
        tagged_word_lists: unused here -- kept for interface compatibility
            with other modules called the same way

        Returns a list (one entry per sentence) of stacked label-specific
        representations produced by the label attention + feed-forward stack.
        """
        if self.input_projection:
            if self.d_positional > 0:
                # project only the content features, keep positional tail
                word_embeddings = [torch.cat((self.input_projection(sentence[:, :-self.d_positional]),
                                              sentence[:, -self.d_positional:]), dim=1)
                                   for sentence in word_embeddings]
            else:
                word_embeddings = [self.input_projection(sentence) for sentence in word_embeddings]
        # Extract Labeled Representation
        packed_len = sum(sentence.shape[0] for sentence in word_embeddings)
        # map each packed word position back to its sentence index
        batch_idxs = np.zeros(packed_len, dtype=int)

        batch_size = len(word_embeddings)
        i = 0

        sentence_lengths = [0] * batch_size
        for sentence_idx, sentence in enumerate(word_embeddings):
            sentence_lengths[sentence_idx] = len(sentence)
            for word in sentence:
                batch_idxs[i] = sentence_idx
                i += 1

        # keep the raw numpy mapping before wrapping it in BatchIndices
        batch_indices = batch_idxs
        batch_idxs = BatchIndices(batch_idxs, word_embeddings[0].device)

        # repack all sentences into one flat sequence of word embeddings
        new_embeds = []
        for sentence_idx, batch in enumerate(word_embeddings):
            for word_idx, embed in enumerate(batch):
                if word_idx < sentence_lengths[sentence_idx]:
                    new_embeds.append(embed)

        new_word_embeddings = torch.stack(new_embeds)

        labeled_representations, _ = self.label_attention(new_word_embeddings, batch_idxs)
        labeled_representations = self.lal_ff(labeled_representations, batch_idxs)
        final_labeled_representations = [[] for i in range(batch_size)]

        # scatter the packed outputs back to per-sentence lists
        for idx, embed in enumerate(labeled_representations):
            final_labeled_representations[batch_indices[idx]].append(embed)

        for idx, representation in enumerate(final_labeled_representations):
            final_labeled_representations[idx] = torch.stack(representation)

        return final_labeled_representations
726
+
stanza/stanza/models/constituency/lstm_tree_stack.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Keeps an LSTM in TreeStack form.
3
+
4
+ The TreeStack nodes keep the hx and cx for the LSTM, along with a
5
+ "value" which represents whatever the user needs to store.
6
+
7
+ The TreeStacks can be popped to get back to the previous LSTM state.
8
+
9
+ The module itself implements three methods: initial_state, push_states, output
10
+ """
11
+
12
+ from collections import namedtuple
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+
17
+ from stanza.models.constituency.tree_stack import TreeStack
18
+
19
+ Node = namedtuple("Node", ['value', 'lstm_hx', 'lstm_cx'])
20
+
21
class LSTMTreeStack(nn.Module):
    """
    Maintains an LSTM state inside a persistent TreeStack.

    Each stack node stores a user value alongside the LSTM hx/cx at that
    point, so popping a node restores the previous LSTM state for free.
    """
    def __init__(self, input_size, hidden_size, num_lstm_layers, dropout, uses_boundary_vector, input_dropout):
        """
        Prepare LSTM and parameters

        input_size: dimension of the inputs to the LSTM
        hidden_size: LSTM internal & output dimension
        num_lstm_layers: how many layers of LSTM to use
        dropout: value of the LSTM dropout
        uses_boundary_vector: if set, learn a start_embedding parameter. otherwise, use zeros
        input_dropout: an nn.Module to dropout inputs. TODO: allow a float parameter as well
        """
        super().__init__()

        self.uses_boundary_vector = uses_boundary_vector

        # The start embedding needs to be input_size as we put it through the LSTM
        if uses_boundary_vector:
            self.register_parameter('start_embedding', torch.nn.Parameter(0.2 * torch.randn(input_size, requires_grad=True)))
        else:
            # registered as buffers so they follow the module across devices
            self.register_buffer('input_zeros', torch.zeros(num_lstm_layers, 1, input_size))
            self.register_buffer('hidden_zeros', torch.zeros(num_lstm_layers, 1, hidden_size))

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_lstm_layers, dropout=dropout)
        self.input_dropout = input_dropout


    def initial_state(self, initial_value=None):
        """
        Return an initial state, either based on zeros or based on the initial embedding and LSTM

        Note that LSTM start operation is already batched, in a sense
        The subsequent batch built this way will be used for batch_size trees

        Returns a stack with None value, hx & cx either based on the
        start_embedding or zeros, and no parent.
        """
        if self.uses_boundary_vector:
            # run the learned boundary embedding through the LSTM once to
            # produce the starting hidden/cell state
            start = self.start_embedding.unsqueeze(0).unsqueeze(0)
            output, (hx, cx) = self.lstm(start)
            start = output[0, 0, :]
        else:
            start = self.input_zeros
            hx = self.hidden_zeros
            cx = self.hidden_zeros
        return TreeStack(value=Node(initial_value, hx, cx), parent=None, length=1)

    def push_states(self, stacks, values, inputs):
        """
        Starting from a list of current stacks, put the inputs through the LSTM and build new stack nodes.

        B = stacks.len() = values.len()

        inputs must be of shape 1 x B x input_size
        """
        inputs = self.input_dropout(inputs)

        # gather the per-stack LSTM states into one batched state
        hx = torch.cat([t.value.lstm_hx for t in stacks], axis=1)
        cx = torch.cat([t.value.lstm_cx for t in stacks], axis=1)
        output, (hx, cx) = self.lstm(inputs, (hx, cx))
        # split the batched state back out, one slice per stack
        new_stacks = [stack.push(Node(transition, hx[:, i:i+1, :], cx[:, i:i+1, :]))
                      for i, (stack, transition) in enumerate(zip(stacks, values))]
        return new_stacks

    def output(self, stack):
        """
        Return the last layer of the lstm_hx as the output from a stack

        Refactored so that alternate structures have an easy way of getting the output
        """
        return stack.value.lstm_hx[-1, 0, :]
stanza/stanza/models/constituency/score_converted_dependencies.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script which processes a dependency file by using the constituency parser, then converting with the CoreNLP converter
3
+
4
+ Currently this does not have the constituency parser as an option,
5
+ although that is easy to add.
6
+
7
+ Only English is supported, as only English is available in the CoreNLP converter
8
+ """
9
+
10
+ import argparse
11
+ import os
12
+ import tempfile
13
+
14
+ import stanza
15
+ from stanza.models.constituency import retagging
16
+ from stanza.models.depparse import scorer
17
+ from stanza.utils.conll import CoNLL
18
+
19
def score_converted_dependencies(args):
    """
    Parse args['eval_file'] with a constituency parser, convert the trees to
    dependencies via the CoreNLP converter, and score against the gold file.

    Only English is supported, since only English is available in the converter.
    """
    if args['lang'] != 'en':
        raise ValueError("Converting and scoring dependencies is currently only supported for English")

    constituency_package = args['constituency_package']
    pipeline = stanza.Pipeline(lang=args['lang'],
                               tokenize_pretokenized=True,
                               package={'pos': args['retag_package'], 'depparse': 'converter', 'constituency': constituency_package},
                               processors='tokenize, pos, constituency, depparse')

    gold_doc = CoNLL.conll2doc(args['eval_file'])
    predicted_doc = pipeline(gold_doc)
    print("Processed %d sentences" % len(predicted_doc.sentences))
    # reload - the pipeline clobbered the gold values
    gold_doc = CoNLL.conll2doc(args['eval_file'])

    scorer.score_named_dependencies(predicted_doc, gold_doc)
    with tempfile.TemporaryDirectory() as tempdir:
        converted_path = os.path.join(tempdir, "converted.conll")
        CoNLL.write_doc2conll(predicted_doc, converted_path)
        _, _, score = scorer.score(converted_path, args['eval_file'])

    print("Parser score:")
    print("{} {:.2f}".format(constituency_package, score*100))
46
+
47
+
48
def main():
    """Command-line entry point: build args, then convert and score."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--lang', default='en', type=str, help='Language')
    parser.add_argument('--eval_file', default="extern_data/ud2/ud-treebanks-v2.13/UD_English-EWT/en_ewt-ud-test.conllu", help='Input file for data loader.')
    parser.add_argument('--constituency_package', default="ptb3-revised_electra-large", help='Which constituency parser to use for converting')
    retagging.add_retag_args(parser)

    args = vars(parser.parse_args())
    retagging.postprocess_args(args)

    score_converted_dependencies(args)
62
+
63
+ if __name__ == '__main__':
64
+ main()
65
+
stanza/stanza/models/constituency/text_processing.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import logging
4
+
5
+ from stanza.models.common import utils
6
+ from stanza.models.constituency.utils import retag_tags
7
+ from stanza.models.constituency.trainer import Trainer
8
+ from stanza.models.constituency.tree_reader import read_trees
9
+ from stanza.utils.get_tqdm import get_tqdm
10
+
11
+ logger = logging.getLogger('stanza')
12
+ tqdm = get_tqdm()
13
+
14
def read_tokenized_file(tokenized_file):
    """
    Read one pretokenized sentence per non-blank line of a file.

    Words composed entirely of underscores are kept as-is; in any other word,
    underscores become spaces (used for languages such as VI where syllables
    of a word are joined with _).

    Returns (sentences, ids): each sentence is a list of words, and ids is a
    matching list of None placeholders.
    """
    with open(tokenized_file, encoding='utf-8') as fin:
        stripped = [line.strip() for line in fin]

    docs = []
    for sentence in stripped:
        if not sentence:
            continue
        words = []
        for word in sentence.split():
            if all(ch == '_' for ch in word):
                words.append(word)
            else:
                words.append(word.replace("_", " "))
        docs.append(words)
    return docs, [None] * len(docs)
25
+
26
def read_xml_tree_file(tree_file):
    """
    Read sentences from a file of the format unique to VLSP test sets

    in particular, it should be multiple blocks of

    <s id=1>
    (tree ...)
    </s>

    Returns (docs, ids): for each block, docs holds the tree's leaf words
    (with _ converted to space, except words that are all underscores) and
    ids holds the integer id from the <s> tag, or None if there was none.
    """
    with open(tree_file, encoding='utf-8') as fin:
        lines = fin.readlines()
    lines = [x.strip() for x in lines]
    lines = [x for x in lines if x]
    docs = []
    ids = []
    # state while scanning: id of the current block and its accumulated lines
    tree_id = None
    tree_text = []
    for line in lines:
        if line.startswith("<s"):
            # pull the id out of a tag such as <s id=1>
            tree_id = line.split("=")
            if len(tree_id) > 1:
                tree_id = tree_id[1]
                if tree_id.endswith(">"):
                    tree_id = tree_id[:-1]
                tree_id = int(tree_id)
            else:
                tree_id = None
        elif line.startswith("</s"):
            # end of a block: parse the collected lines as one tree
            if len(tree_text) == 0:
                raise ValueError("Found a blank tree in %s" % tree_file)
            ids.append(tree_id)
            tree_text = "\n".join(tree_text)
            trees = read_trees(tree_text)
            # TODO: perhaps the processing can be put into read_trees instead
            trees = [t.prune_none().simplify_labels() for t in trees]
            if len(trees) != 1:
                raise ValueError("Found a tree with %d trees in %s" % (len(trees), tree_file))
            tree = trees[0]
            text = tree.leaf_labels()
            text = [word if all(x == '_' for x in word) else word.replace("_", " ") for word in text]
            docs.append(text)
            tree_text = []
            tree_id = None
        else:
            tree_text.append(line)

    return docs, ids
74
+
75
+
76
def parse_tokenized_sentences(args, model, retag_pipeline, sentences):
    """
    Retag the given tokenized sentences, then parse them.

    Returns a list of ParseResult objects, one per input sentence.
    """
    tags = retag_tags(sentences, retag_pipeline, model.uses_xpos())
    tagged_sentences = [list(zip(s_words, s_tags)) for s_words, s_tags in zip(sentences, tags)]
    logger.info("Retagging finished. Parsing tagged text")
    assert len(tagged_sentences) == len(sentences)

    return model.parse_sentences_no_grad(iter(tqdm(tagged_sentences)),
                                         model.build_batch_from_tagged_words,
                                         args['eval_batch_size'],
                                         model.predict,
                                         keep_scores=False)
87
+
88
def parse_text(args, model, retag_pipeline, tokenized_file=None, predict_file=None):
    """
    Use the given model to parse text and write it

    refactored so it can be used elsewhere, such as Ensemble

    tokenized_file / predict_file override args['tokenized_file'] and
    args['predict_file'] when given.  Input can alternatively come from
    args['xml_tree_file'].  Output goes to predict_file, or stdout when
    no output file is configured (via utils.output_stream).
    """
    model.eval()

    if predict_file is None:
        if args['predict_file']:
            predict_file = args['predict_file']
            if args['predict_dir']:
                predict_file = os.path.join(args['predict_dir'], predict_file)

    if tokenized_file is None:
        tokenized_file = args['tokenized_file']

    docs, ids = None, None
    if tokenized_file is not None:
        docs, ids = read_tokenized_file(tokenized_file)
    elif args['xml_tree_file']:
        logger.info("Reading trees from %s" % args['xml_tree_file'])
        docs, ids = read_xml_tree_file(args['xml_tree_file'])

    if not docs:
        logger.error("No sentences to process!")
        return

    logger.info("Processing %d sentences", len(docs))

    with utils.output_stream(predict_file) as fout:
        # parse in chunks so output is produced incrementally for large inputs
        chunk_size = 10000
        for chunk_start in range(0, len(docs), chunk_size):
            chunk = docs[chunk_start:chunk_start+chunk_size]
            ids_chunk = ids[chunk_start:chunk_start+chunk_size]
            logger.info("Processing trees %d to %d", chunk_start, chunk_start+len(chunk))
            treebank = parse_tokenized_sentences(args, model, retag_pipeline, chunk)

            for result, tree_id in zip(treebank, ids_chunk):
                # keep the best prediction for each sentence
                tree = result.predictions[0].tree
                if tree_id is not None:
                    tree.tree_id = tree_id
                fout.write(args['predict_format'].format(tree))
                fout.write("\n")
132
+
133
def parse_dir(args, model, retag_pipeline, tokenized_dir, predict_dir):
    """
    Parse every file in tokenized_dir, writing one .mrg file per input into predict_dir.
    """
    os.makedirs(predict_dir, exist_ok=True)
    for filename in os.listdir(tokenized_dir):
        input_path = os.path.join(tokenized_dir, filename)
        base_name, _ = os.path.splitext(filename)
        output_path = os.path.join(predict_dir, base_name + ".mrg")
        logger.info("Processing %s to %s", input_path, output_path)
        parse_text(args, model, retag_pipeline, tokenized_file=input_path, predict_file=output_path)
140
+
141
+
142
def load_model_parse_text(args, model_file, retag_pipeline):
    """
    Load a model, then parse text and write it to stdout or args['predict_file']

    args: dict of options (pretrain/charlm paths, device, input/output settings)
    model_file: path to the saved constituency model
    retag_pipeline: a list of Pipeline meant to use for retagging; may be empty

    Raises ValueError if --tokenized_dir is given without --predict_dir.
    """
    # Imported at function scope: FoundationCache was previously referenced
    # without any import, which raised NameError whenever retag_pipeline was
    # empty.  TODO confirm this module path against the rest of the package.
    from stanza.models.common.foundation_cache import FoundationCache

    # reuse the retag pipeline's cache when available so the pretrained
    # resources are only loaded once
    foundation_cache = retag_pipeline[0].foundation_cache if retag_pipeline else FoundationCache()
    load_args = {
        'wordvec_pretrain_file': args['wordvec_pretrain_file'],
        'charlm_forward_file': args['charlm_forward_file'],
        'charlm_backward_file': args['charlm_backward_file'],
        'device': args['device'],
    }
    trainer = Trainer.load(model_file, args=load_args, foundation_cache=foundation_cache)
    model = trainer.model
    model.eval()
    logger.info("Loaded model from %s", model_file)

    if args['tokenized_dir']:
        if not args['predict_dir']:
            raise ValueError("Must specify --predict_dir to go with --tokenized_dir")
        parse_dir(args, model, retag_pipeline, args['tokenized_dir'], args['predict_dir'])
    else:
        parse_text(args, model, retag_pipeline)
166
+
stanza/stanza/models/constituency/tree_reader.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reads ParseTree objects from a file, string, or similar input
3
+
4
+ Works by first splitting the input into (, ), and all other tokens,
5
+ then recursively processing those tokens into trees.
6
+ """
7
+
8
+ from collections import deque
9
+ import logging
10
+ import os
11
+ import re
12
+
13
+ from stanza.models.constituency.parse_tree import Tree
14
+ from stanza.utils.get_tqdm import get_tqdm
15
+
16
+ tqdm = get_tqdm()
17
+
18
+ OPEN_PAREN = "("
19
+ CLOSE_PAREN = ")"
20
+
21
+ logger = logging.getLogger('stanza.constituency')
22
+
23
+ # A few specific exception types to clarify parsing errors
24
+ # They store the line number where the error occurred
25
+
26
class UnclosedTreeError(ValueError):
    """
    Raised when a tree is missing close brackets, e.g.  (Foo
    """
    def __init__(self, line_num):
        message = "Found an unfinished tree (missing close brackets). Tree started on line %d" % line_num
        super().__init__(message)
        # line where the broken tree began, for error reporting
        self.line_num = line_num
33
+
34
class ExtraCloseTreeError(ValueError):
    """
    Raised when a tree has extra close brackets, e.g.  (Foo))
    """
    def __init__(self, line_num):
        message = "Found a broken tree (extra close brackets). Tree started on line %d" % line_num
        super().__init__(message)
        # line where the broken tree began, for error reporting
        self.line_num = line_num
41
+
42
class UnlabeledTreeError(ValueError):
    """
    Raised when a tree has a node with no label, such as ((Foo) (Bar))

    This does not actually happen at the root, btw, as ROOT is silently added
    """
    def __init__(self, line_num):
        message = "Found a tree with no label on a node! Line number %d" % line_num
        super().__init__(message)
        # line where the unlabeled node was found
        self.line_num = line_num
51
+
52
class MixedTreeError(ValueError):
    """
    Raised when leaf and constituent children are mixed in the same node
    """
    def __init__(self, line_num, child_label, children):
        message = "Found a tree with both text children and bracketed children! Line number {} Child label {} Children {}".format(line_num, child_label, children)
        super().__init__(message)
        # context kept for callers which want to recover or report details
        self.line_num = line_num
        self.child_label = child_label
        self.children = children
61
+
62
def normalize(text):
    """Convert PTB bracket escapes -LRB- / -RRB- back to literal parentheses."""
    for escape, char in (("-LRB-", "("), ("-RRB-", ")")):
        text = text.replace(escape, char)
    return text
64
+
65
def read_single_tree(token_iterator, broken_ok):
    """
    Build a tree from the tokens in the token_iterator

    token_iterator: a TokenIterator positioned just after an open paren
    broken_ok: if True, recover from unlabeled or mixed nodes instead of raising

    Returns the finished Tree (with a ROOT silently added if the outermost
    bracket had no label).  Raises UnclosedTreeError, UnlabeledTreeError,
    or MixedTreeError for malformed input.
    """
    # we were called here at a open paren, so start the stack of
    # children with one empty list already on it
    children_stack = deque()
    children_stack.append([])
    text_stack = deque()
    text_stack.append([])

    token = next(token_iterator, None)
    # remember which line this tree started on, for error messages
    token_iterator.set_mark()
    while token is not None:
        if token == OPEN_PAREN:
            # new bracketed node: start fresh child / text accumulators
            children_stack.append([])
            text_stack.append([])
        elif token == CLOSE_PAREN:
            # finish the current node using whatever text and children accumulated
            text = text_stack.pop()
            children = children_stack.pop()
            if text:
                pieces = " ".join(text).split()
                if len(pieces) == 1:
                    # label only, e.g. "(NP ...)" -- attach the bracketed children
                    child = Tree(pieces[0], children)
                else:
                    # the assumption here is that a language such as VI may
                    # have spaces in the words, but it still represents
                    # just one child
                    label = pieces[0]
                    child_label = " ".join(pieces[1:])
                    if children:
                        if broken_ok:
                            # text and brackets mixed: keep both, text last
                            child = Tree(label, children + [Tree(normalize(child_label))])
                        else:
                            raise MixedTreeError(token_iterator.line_num, child_label, children)
                    else:
                        # NOTE(review): passes a single Tree rather than a list --
                        # presumably the Tree constructor accepts either; confirm
                        child = Tree(label, Tree(normalize(child_label)))
                if not children_stack:
                    # closed the outermost bracket: the tree is complete
                    return child
            else:
                if not children_stack:
                    # outermost bracket had no label: silently add ROOT
                    return Tree("ROOT", children)
                elif broken_ok:
                    # tolerate a labelless interior node
                    child = Tree(None, children)
                else:
                    raise UnlabeledTreeError(token_iterator.line_num)
            children_stack[-1].append(child)
        else:
            # plain text token: accumulate for the current node
            text_stack[-1].append(token)
        token = next(token_iterator, None)
    # ran out of tokens with brackets still open
    raise UnclosedTreeError(token_iterator.get_mark())
116
+
117
+ LINE_SPLIT_RE = re.compile(r"([()])")
118
+
119
+
120
class TokenIterator:
    """
    A specific iterator for reading trees from a tree file

    The idea is that this will keep track of which line
    we are processing, so that an error can be logged
    from the correct line

    Subclasses are responsible for setting self.line_iterator (an iterator
    of raw text lines) before iteration begins -- see TextTokenIterator
    and FileTokenIterator.
    """
    def __init__(self):
        # current per-line token stream; starts empty so the first __next__
        # immediately pulls a line from line_iterator
        self.token_iterator = iter([])
        # 0-based line counter, -1 before the first line is read
        self.line_num = -1
        # line number remembered via set_mark(), used in error messages
        self.mark = None

    def set_mark(self):
        """
        The mark is used for determining where the start of a tree occurs for an error
        """
        self.mark = self.line_num

    def get_mark(self):
        if self.mark is None:
            raise ValueError("No mark set!")
        return self.mark

    def __iter__(self):
        return self

    def __next__(self):
        # exhaust the tokens of the current line, then advance to the
        # next non-empty line and split it into (, ), and text tokens
        n = next(self.token_iterator, None)
        while n is None:
            self.line_num = self.line_num + 1
            # NOTE(review): a plain file/list iterator signals exhaustion by
            # raising StopIteration; the None check below only matters for
            # line iterators which can yield None -- confirm intent
            line = next(self.line_iterator)
            if line is None:
                raise StopIteration
            line = line.strip()
            if not line:
                continue

            # split around parens, keeping them as their own tokens
            pieces = LINE_SPLIT_RE.split(line)
            pieces = [x.strip() for x in pieces]
            pieces = [x for x in pieces if x]
            self.token_iterator = iter(pieces)
            n = next(self.token_iterator, None)

        return n
165
+
166
+
167
class TextTokenIterator(TokenIterator):
    """TokenIterator which reads from an in-memory string, one line per newline split."""
    def __init__(self, text, use_tqdm=True):
        super().__init__()

        self.lines = text.split("\n")
        self.num_lines = len(self.lines)
        # only show a progress bar for large inputs
        show_progress = use_tqdm and self.num_lines > 1000
        source = tqdm(self.lines) if show_progress else self.lines
        self.line_iterator = iter(source)
177
+
178
+
179
class FileTokenIterator(TokenIterator):
    """
    TokenIterator which reads lines from a file; use as a context manager.

    Counts the lines first so that tqdm can display a total for large files.
    """
    def __init__(self, filename):
        super().__init__()
        self.filename = filename
        # set in __enter__; initialized here so __exit__ is safe even if
        # __enter__ was never called or failed before opening the file
        self.file_obj = None

    def __enter__(self):
        # TODO: use the file_size instead of counting the lines
        # file_size = Path(self.filename).stat().st_size
        with open(self.filename) as fin:
            num_lines = sum(1 for _ in fin)

        self.file_obj = open(self.filename)
        if num_lines > 1000:
            self.line_iterator = iter(tqdm(self.file_obj, total=num_lines))
        else:
            self.line_iterator = iter(self.file_obj)
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        if self.file_obj:
            self.file_obj.close()
            # drop the handle so a second __exit__ is harmless
            self.file_obj = None
200
+
201
def read_token_iterator(token_iterator, broken_ok, tree_callback):
    """
    Read every tree available from token_iterator.

    broken_ok: passed through to read_single_tree
    tree_callback: optional transform applied to each tree; trees for which
      it returns None are dropped

    Returns a list of trees.  Raises on stray close parens or bare text
    between trees.
    """
    trees = []
    token = next(token_iterator, None)
    while token:
        if token == CLOSE_PAREN:
            raise ExtraCloseTreeError(token_iterator.line_num)
        if token != OPEN_PAREN:
            raise ValueError("Tree document had text between trees! Line number %d" % token_iterator.line_num)
        next_tree = read_single_tree(token_iterator, broken_ok=broken_ok)
        if next_tree is None:
            raise ValueError("Tree reader somehow created a None tree! Line number %d" % token_iterator.line_num)
        if tree_callback is None:
            trees.append(next_tree)
        else:
            transformed = tree_callback(next_tree)
            if transformed is not None:
                trees.append(transformed)
        token = next(token_iterator, None)

    return trees
222
+
223
+
224
def read_trees(text, broken_ok=False, tree_callback=None, use_tqdm=True):
    """
    Reads multiple trees from the text

    TODO: some of the error cases we hit can be recovered from
    """
    return read_token_iterator(TextTokenIterator(text, use_tqdm),
                               broken_ok=broken_ok,
                               tree_callback=tree_callback)
232
+
233
def read_tree_file(filename, broken_ok=False, tree_callback=None):
    """
    Read all of the trees in the given file
    """
    with FileTokenIterator(filename) as token_iterator:
        return read_token_iterator(token_iterator,
                                   broken_ok=broken_ok,
                                   tree_callback=tree_callback)
240
+
241
def read_directory(dirname, broken_ok=False, tree_callback=None):
    """
    Read all of the trees in all of the files in a directory

    Files are processed in sorted filename order.
    """
    all_trees = []
    for entry in sorted(os.listdir(dirname)):
        path = os.path.join(dirname, entry)
        all_trees.extend(read_tree_file(path, broken_ok, tree_callback))
    return all_trees
250
+
251
def read_treebank(filename, tree_callback=None):
    """
    Read a treebank and alter the trees to be a simpler format for learning to parse
    """
    logger.info("Reading trees from %s", filename)
    raw_trees = read_tree_file(filename, tree_callback=tree_callback)
    trees = [tree.prune_none().simplify_labels() for tree in raw_trees]

    # the parser assumes a unary transition at the ROOT
    bad_trees = [tree for tree in trees if len(tree.children) > 1]
    if bad_trees:
        raise ValueError("Found {} tree(s) which had non-unary transitions at the ROOT. First illegal tree: {:P}".format(len(bad_trees), bad_trees[0]))

    return trees
264
+
265
def main():
    """
    Reads a sample tree and prints the result
    """
    sample = "( (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
    print(read_trees(sample))
272
+
273
+ if __name__ == '__main__':
274
+ main()
stanza/stanza/models/constituency/tree_stack.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A utility class for keeping track of intermediate parse states
3
+ """
4
+
5
+ from collections import namedtuple
6
+
7
class TreeStack(namedtuple('TreeStack', ['value', 'parent', 'length'])):
    """
    An immutable, persistent stack node: (value, parent, length).

    Because nodes never change, many stacks can share a common tail.
    That lets a beam-search parser branch several ways from the same
    base stack, and lets oracle training explore both a gold and a
    non-gold transition from one state without copying anything.

    An example usage is when K constituents are removed at once to
    create a new constituent: the LSTM tracking constituent values can
    resume from the Kth output with the new value.

    value can be a transition, a word, or a partially built constituent.
    Implemented as a namedtuple to make it a bit more efficient.
    """
    def pop(self):
        """Return the stack with the top element removed."""
        return self.parent

    def push(self, value):
        """Return a new stack node holding value, whose parent is this node."""
        return TreeStack(value, self, self.length + 1)

    def __iter__(self):
        """Yield values from the top of the stack down to the bottom."""
        node = self
        while node.parent is not None:
            yield node.value
            node = node.parent
        yield node.value

    def __reversed__(self):
        """Yield values from the bottom of the stack up to the top."""
        yield from reversed(list(self))

    def __str__(self):
        return "TreeStack(%s)" % ", ".join(str(item) for item in self)

    def __len__(self):
        return self.length
stanza/stanza/models/constituency/utils.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Collects a few of the conparser utility methods which don't belong elsewhere
3
+ """
4
+
5
+ from collections import Counter
6
+ import logging
7
+ import warnings
8
+
9
+ import torch.nn as nn
10
+ from torch import optim
11
+
12
+ from stanza.models.common.doc import TEXT, Document
13
+ from stanza.models.common.utils import get_optimizer
14
+ from stanza.models.constituency.base_model import SimpleModel
15
+ from stanza.models.constituency.parse_transitions import TransitionScheme
16
+ from stanza.models.constituency.parse_tree import Tree
17
+ from stanza.utils.get_tqdm import get_tqdm
18
+
19
+ tqdm = get_tqdm()
20
+
21
+ DEFAULT_LEARNING_RATES = { "adamw": 0.0002, "adadelta": 1.0, "sgd": 0.001, "adabelief": 0.00005, "madgrad": 0.0000007 , "mirror_madgrad": 0.00005 }
22
+ DEFAULT_LEARNING_EPS = { "adabelief": 1e-12, "adadelta": 1e-6, "adamw": 1e-8 }
23
+ DEFAULT_LEARNING_RHO = 0.9
24
+ DEFAULT_MOMENTUM = { "madgrad": 0.9, "mirror_madgrad": 0.9, "sgd": 0.9 }
25
+
26
+ tlogger = logging.getLogger('stanza.constituency.trainer')
27
+
28
+ # madgrad experiment for weight decay
29
+ # with learning_rate set to 0.0000007 and momentum 0.9
30
+ # on en_wsj, with a baseline model trained on adadela for 200,
31
+ # then madgrad used to further improve that model
32
+ # 0.00000002.out: 0.9590347746438835
33
+ # 0.00000005.out: 0.9591378819960182
34
+ # 0.0000001.out: 0.9595450596319405
35
+ # 0.0000002.out: 0.9594603134479271
36
+ # 0.0000005.out: 0.9591317672706594
37
+ # 0.000001.out: 0.9592548741021389
38
+ # 0.000002.out: 0.9598395477013945
39
+ # 0.000003.out: 0.9594974271553495
40
+ # 0.000004.out: 0.9596665982603754
41
+ # 0.000005.out: 0.9591620720706487
42
+ DEFAULT_WEIGHT_DECAY = { "adamw": 0.05, "adadelta": 0.02, "sgd": 0.01, "adabelief": 1.2e-6, "madgrad": 2e-6, "mirror_madgrad": 2e-6 }
43
+
44
def retag_tags(doc, pipelines, xpos):
    """
    Returns a list of list of tags for the items in doc

    doc can be anything which feeds into the pipeline(s)
    pipelines are a list of 1 or more retag pipelines
    if multiple pipelines are given, majority vote wins
    xpos: if True use xpos tags, otherwise upos
    """
    tag_lists = []
    for pipeline in pipelines:
        # NOTE(review): each pipeline's output Document is fed to the next
        # pipeline -- presumably they share tokenization; confirm
        doc = pipeline(doc)
        tag_lists.append([[x.xpos if xpos else x.upos for x in sentence.words] for sentence in doc.sentences])
    # tag_lists: for N pipeline, S sentences
    # we now have N lists of S sentences each
    # for sentence in zip(*tag_lists): N lists of |s| tags for this given sentence s
    # for tag in zip(*sentence): N predicted tags.
    # most common one in the Counter will be chosen
    tag_lists = [[Counter(tag).most_common(1)[0][0] for tag in zip(*sentence)]
                 for sentence in zip(*tag_lists)]
    return tag_lists
64
+
65
def retag_trees(trees, pipelines, xpos=True):
    """
    Retag all of the trees using the given processor

    trees: list of parse trees whose preterminal tags will be replaced
    pipelines: retag pipelines, majority vote if more than one
    xpos: use xpos tags if True, otherwise upos

    Returns a list of new trees
    """
    if len(trees) == 0:
        return trees

    new_trees = []
    # process in chunks so the pipeline sees batches of bounded size
    chunk_size = 1000
    with tqdm(total=len(trees)) as pbar:
        for chunk_start in range(0, len(trees), chunk_size):
            chunk_end = min(chunk_start + chunk_size, len(trees))
            chunk = trees[chunk_start:chunk_end]
            sentences = []
            try:
                # one pretokenized sentence per tree, words taken from the leaves
                for idx, tree in enumerate(chunk):
                    tokens = [{TEXT: pt.children[0].label} for pt in tree.yield_preterminals()]
                    sentences.append(tokens)
            except ValueError as e:
                raise ValueError("Unable to process tree %d" % (idx + chunk_start)) from e

            doc = Document(sentences)
            tag_lists = retag_tags(doc, pipelines, xpos)

            for tree_idx, (tree, tags) in enumerate(zip(chunk, tag_lists)):
                try:
                    if any(tag is None for tag in tags):
                        raise RuntimeError("Tagged tree #{} with a None tag!\n{}\n{}".format(tree_idx, tree, tags))
                    new_tree = tree.replace_tags(tags)
                    new_trees.append(new_tree)
                    pbar.update(1)
                except ValueError as e:
                    raise ValueError("Failed to properly retag tree #{}: {}".format(tree_idx, tree)) from e
    # sanity check: every input tree must have produced exactly one output tree
    if len(new_trees) != len(trees):
        raise AssertionError("Retagged tree counts did not match: {} vs {}".format(len(new_trees), len(trees)))
    return new_trees
103
+
104
+
105
+ # experimental results on nonlinearities
106
+ # this is on a VI dataset, VLSP_22, using 1/10th of the data as a dev set
107
+ # (no released test set at the time of the experiment)
108
+ # original non-Bert tagger, with 1 iteration each instead of averaged over 5
109
+ # considering the number of experiments and the length of time they would take
110
+ #
111
+ # Gelu had the highest score, which tracks with other experiments run.
112
+ # Note that publicly released models have typically used Relu
113
+ # on account of the runtime speed improvement
114
+ #
115
+ # Anyway, a larger experiment of 5x models on gelu or relu, using the
116
+ # Roberta POS tagger and a corpus of silver trees, resulted in 0.8270
117
+ # for relu and 0.8248 for gelu. So it is not even clear that
118
+ # switching to gelu would be an accuracy improvement.
119
+ #
120
+ # Gelu: 82.32
121
+ # Relu: 82.14
122
+ # Mish: 81.95
123
+ # Relu6: 81.91
124
+ # Silu: 81.90
125
+ # ELU: 81.73
126
+ # Hardswish: 81.67
127
+ # Softsign: 81.63
128
+ # Hardtanh: 81.44
129
+ # Celu: 81.43
130
+ # Selu: 81.17
131
+ # TODO: need to redo the prelu experiment with
132
+ # possibly different numbers of parameters
133
+ # and proper weight decay
134
+ # Prelu: 80.95 (terminated early)
135
+ # Softplus: 80.94
136
+ # Logsigmoid: 80.91
137
+ # Hardsigmoid: 79.03
138
+ # RReLU: 77.00
139
+ # Hardshrink: failed
140
+ # Softshrink: failed
141
+ NONLINEARITY = {
142
+ 'celu': nn.CELU,
143
+ 'elu': nn.ELU,
144
+ 'gelu': nn.GELU,
145
+ 'hardshrink': nn.Hardshrink,
146
+ 'hardtanh': nn.Hardtanh,
147
+ 'leaky_relu': nn.LeakyReLU,
148
+ 'logsigmoid': nn.LogSigmoid,
149
+ 'prelu': nn.PReLU,
150
+ 'relu': nn.ReLU,
151
+ 'relu6': nn.ReLU6,
152
+ 'rrelu': nn.RReLU,
153
+ 'selu': nn.SELU,
154
+ 'softplus': nn.Softplus,
155
+ 'softshrink': nn.Softshrink,
156
+ 'softsign': nn.Softsign,
157
+ 'tanhshrink': nn.Tanhshrink,
158
+ 'tanh': nn.Tanh,
159
+ }
160
+
161
+ # separating these out allows for backwards compatibility with earlier versions of pytorch
162
+ # NOTE torch compatibility: if we ever *release* models with these
163
+ # activation functions, we will need to break that compatibility
164
+
165
+ nonlinearity_list = [
166
+ 'GLU',
167
+ 'Hardsigmoid',
168
+ 'Hardswish',
169
+ 'Mish',
170
+ 'SiLU',
171
+ ]
172
+
173
+ for nonlinearity in nonlinearity_list:
174
+ if hasattr(nn, nonlinearity):
175
+ NONLINEARITY[nonlinearity.lower()] = getattr(nn, nonlinearity)
176
+
177
def build_nonlinearity(nonlinearity):
    """
    Look up "nonlinearity" in a map from function name to function, build the appropriate layer.

    Raises ValueError for names not present in NONLINEARITY.
    """
    try:
        layer_type = NONLINEARITY[nonlinearity]
    except KeyError:
        raise ValueError('Chosen value of nonlinearity, "%s", not handled' % nonlinearity) from None
    return layer_type()
184
+
185
def build_optimizer(args, model, build_simple_adadelta=False):
    """
    Build an optimizer based on the arguments given

    If we are "multistage" training and epochs_trained < epochs // 2,
    we build an AdaDelta optimizer instead of whatever was requested
    The build_simple_adadelta parameter controls this
    """
    # bert learning rate stays 0 unless finetuning is requested
    bert_learning_rate = 0.0
    bert_weight_decay = args['bert_weight_decay']
    if build_simple_adadelta:
        # stage 1 of multistage training: fixed AdaDelta settings
        optim_type = 'adadelta'
        bert_finetune = args.get('stage1_bert_finetune', False)
        if bert_finetune:
            bert_learning_rate = args['stage1_bert_learning_rate']
        learning_beta2 = 0.999 # doesn't matter for AdaDelta
        learning_eps = DEFAULT_LEARNING_EPS['adadelta']
        learning_rate = args['stage1_learning_rate']
        learning_rho = DEFAULT_LEARNING_RHO
        momentum = None # also doesn't matter for AdaDelta
        weight_decay = DEFAULT_WEIGHT_DECAY['adadelta']
    else:
        # normal path: take all hyperparameters from args
        optim_type = args['optim'].lower()
        bert_finetune = args.get('bert_finetune', False)
        if bert_finetune:
            bert_learning_rate = args['bert_learning_rate']
        learning_beta2 = args['learning_beta2']
        learning_eps = args['learning_eps']
        learning_rate = args['learning_rate']
        learning_rho = args['learning_rho']
        momentum = args['learning_momentum']
        weight_decay = args['learning_weight_decay']

    # TODO: allow rho as an arg for AdaDelta
    return get_optimizer(name=optim_type,
                         model=model,
                         lr=learning_rate,
                         betas=(0.9, learning_beta2),
                         eps=learning_eps,
                         momentum=momentum,
                         weight_decay=weight_decay,
                         bert_learning_rate=bert_learning_rate,
                         # bert weight decay is expressed as a multiplier
                         # on the base weight decay
                         bert_weight_decay=weight_decay*bert_weight_decay,
                         is_peft=args.get('use_peft', False),
                         bert_finetune_layers=args['bert_finetune_layers'],
                         opt_logger=tlogger)
231
+
232
def build_scheduler(args, optimizer, first_optimizer=False):
    """
    Build the scheduler for the conparser based on its args

    Used to use a warmup for learning rate, but that wasn't working very well
    Now, we just use a ReduceLROnPlateau, which does quite well

    first_optimizer: use the stage 1 min_lr instead of the regular one
    """
    min_lr = args['stage1_learning_rate_min_lr'] if first_optimizer else args['learning_rate_min_lr']
    return optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                mode='max',
                                                factor=args['learning_rate_factor'],
                                                patience=args['learning_rate_patience'],
                                                cooldown=args['learning_rate_cooldown'],
                                                min_lr=min_lr)
256
+
257
def initialize_linear(linear, nonlinearity, bias):
    """
    Initializes the bias to a positive value, hopefully preventing dead neurons

    Only applies to relu-family nonlinearities; other activations are untouched.
    """
    if nonlinearity not in ('relu', 'leaky_relu'):
        return
    nn.init.kaiming_normal_(linear.weight, nonlinearity=nonlinearity)
    upper = 1 / (bias * 2) ** 0.5
    nn.init.uniform_(linear.bias, 0, upper)
264
+
265
def add_predict_output_args(parser):
    """
    Args specifically for the output location of data
    """
    parser.add_argument(
        '--predict_dir', type=str, default=".",
        help='Where to write the predictions during --mode predict. Pred and orig files will be written - the orig file will be retagged if that is requested. Writing the orig file is useful for removing None and retagging')
    parser.add_argument(
        '--predict_file', type=str, default=None,
        help='Base name for writing predictions')
    parser.add_argument(
        '--predict_format', type=str, default="{:_O}",
        help='Format to use when writing predictions')
    parser.add_argument(
        '--predict_output_gold_tags', default=False, action='store_true',
        help='Output gold tags as part of the evaluation - useful for putting the trees through EvalB')
274
+
275
def postprocess_predict_output_args(args):
    """Wrap a bare format spec (such as "_O" or "_OVi") in braces so it works with str.format."""
    fmt = args['predict_format']
    is_bare = len(fmt) <= 2
    is_bare_vi = len(fmt) <= 3 and fmt.endswith("Vi")
    if is_bare or is_bare_vi:
        args['predict_format'] = "{:" + fmt + "}"
278
+
279
+
280
def get_open_nodes(trees, transition_scheme):
    """
    Return a list of all open nodes in the given dataset.
    Depending on the parameters, may be single or compound open transitions.
    """
    if transition_scheme is TransitionScheme.IN_ORDER_COMPOUND:
        return Tree.get_compound_constituents(trees, separate_root=True)
    if transition_scheme is TransitionScheme.TOP_DOWN_COMPOUND:
        return Tree.get_compound_constituents(trees)
    # non-compound schemes: each label is its own singleton transition
    return [(label,) for label in Tree.get_unique_constituent_labels(trees)]
291
+
292
+
293
def verify_transitions(trees, sequences, transition_scheme, unary_limit, reverse, name, root_labels):
    """
    Given a list of trees and their transition sequences, verify that the sequences rebuild the trees

    trees: gold trees
    sequences: one transition sequence per tree
    name: dataset name used in error messages
    reverse: if True, the rebuilt tree is reversed before comparison

    Raises RuntimeError on the first illegal transition or mismatched rebuild.
    """
    model = SimpleModel(transition_scheme, unary_limit, reverse, root_labels)
    tlogger.info("Verifying the transition sequences for %d trees", len(trees))

    data = zip(trees, sequences)
    # only pay for a progress bar when INFO logging is on
    if tlogger.getEffectiveLevel() <= logging.INFO:
        data = tqdm(zip(trees, sequences), total=len(trees))

    for tree_idx, (tree, sequence) in enumerate(data):
        # TODO: make the SimpleModel have a parse operation?
        state = model.initial_state_from_gold_trees([tree])[0]
        for idx, trans in enumerate(sequence):
            # check legality before applying, so the error points at the bad step
            if not trans.is_legal(state, model):
                raise RuntimeError("Tree {} of {} failed: transition {}:{} was not legal in a transition sequence:\nOriginal tree: {}\nTransitions: {}".format(tree_idx, name, idx, trans, tree, sequence))
            state = trans.apply(state, model)
        result = model.get_top_constituent(state.constituents)
        if reverse:
            result = result.reverse()
        # the rebuilt tree must match the gold tree exactly
        if tree != result:
            raise RuntimeError("Tree {} of {} failed: transition sequence did not match for a tree!\nOriginal tree:{}\nTransitions: {}\nResult tree:{}".format(tree_idx, name, tree, sequence, result))
316
+
317
def check_constituents(train_constituents, trees, treebank_name, fail=True):
    """
    Check that all the constituents in the other dataset are known in the train set

    train_constituents: constituent labels seen in training
    trees: the dataset to check
    treebank_name: used in the error message
    fail: if True, raise RuntimeError on an unknown label; otherwise warn
    """
    constituents = Tree.get_unique_constituent_labels(trees)
    for con in constituents:
        if con in train_constituents:
            continue
        # count how many trees contain the unknown label and find the first one
        first_error = None
        num_errors = 0
        for tree_idx, tree in enumerate(trees):
            # separate name so we don't shadow the outer loop's label list
            tree_constituents = Tree.get_unique_constituent_labels(tree)
            if con in tree_constituents:
                num_errors += 1
                if first_error is None:
                    first_error = tree_idx
        # fixed message grammar: "don't exist" -> "does not exist", "occured" -> "occurred"
        error = "Found constituent label {} in the {} set which does not exist in the train set. This constituent label occurred in {} trees, with the first tree index at {} counting from 1\nThe error tree (which may have POS tags changed from the retagger and may be missing functional tags or empty nodes) is:\n{:P}".format(con, treebank_name, num_errors, (first_error+1), trees[first_error])
        if fail:
            raise RuntimeError(error)
        warnings.warn(error)
337
+
338
def check_root_labels(root_labels, other_trees, treebank_name):
    """
    Check that all the root states in the other dataset are known in the train set
    """
    unknown = [state for state in Tree.get_root_labels(other_trees) if state not in root_labels]
    if unknown:
        raise RuntimeError("Found root state {} in the {} set which is not a ROOT state in the train set".format(unknown[0], treebank_name))
345
+
346
def remove_duplicate_trees(trees, treebank_name):
    """
    Filter duplicates from the given dataset, keeping the first occurrence of each tree

    Trees are compared by their string representation.
    """
    seen = set()
    unique_trees = []
    for tree in trees:
        key = "{}".format(tree)
        if key not in seen:
            seen.add(key)
            unique_trees.append(tree)
    if len(unique_trees) < len(trees):
        tlogger.info("Filtered %d duplicates from %s dataset", (len(trees) - len(unique_trees)), treebank_name)
    return unique_trees
361
+
362
def remove_singleton_trees(trees):
    """
    remove trees which are just a root and a single word

    TODO: remove these trees in the conversion instead of here
    """
    def has_structure(tree):
        # more than one child at the root: definitely structure
        if len(tree.children) > 1:
            return True
        if len(tree.children) != 1:
            return False
        child = tree.children[0]
        if len(child.children) > 1:
            return True
        # unary chain at least three levels deep is also kept
        return len(child.children) == 1 and len(child.children[0].children) >= 1

    filtered = [tree for tree in trees if has_structure(tree)]
    if len(trees) - len(filtered) > 0:
        tlogger.info("Eliminated %d trees with missing structure", (len(trees) - len(filtered)))
    return filtered
375
+
stanza/stanza/models/coref/predict.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import json
4
+ import torch
5
+ from tqdm import tqdm
6
+
7
+ from stanza.models.coref.model import CorefModel
8
+
9
+
10
if __name__ == "__main__":
    # Command line: experiment name, input json(lines) file, output file.
    argparser = argparse.ArgumentParser()
    argparser.add_argument("experiment")
    argparser.add_argument("input_file")
    argparser.add_argument("output_file")
    argparser.add_argument("--config-file", default="config.toml")
    argparser.add_argument("--batch-size", type=int,
                           help="Adjust to override the config value if you're"
                                " experiencing out-of-memory issues")
    argparser.add_argument("--weights",
                           help="Path to file with weights to load."
                                " If not supplied, in the latest"
                                " weights of the experiment will be loaded;"
                                " if there aren't any, an error is raised.")
    args = argparser.parse_args()

    # load the model on CPU, skipping the optimizer/scheduler state
    # which is only needed for training
    model = CorefModel.load_model(path=args.weights,
                                  map_location="cpu",
                                  ignore={"bert_optimizer", "general_optimizer",
                                          "bert_scheduler", "general_scheduler"})
    if args.batch_size:
        model.config.a_scoring_batch_size = args.batch_size
    # put the model in inference mode
    model.training = False

    # input may be a single JSON array or the older jsonlines format
    try:
        with open(args.input_file, encoding="utf-8") as fin:
            input_data = json.load(fin)
    except json.decoder.JSONDecodeError:
        # read the old jsonlines format if necessary
        with open(args.input_file, encoding="utf-8") as fin:
            text = "[" + ",\n".join(fin) + "]"
        input_data = json.loads(text)
    docs = [model.build_doc(doc) for doc in input_data]

    with torch.no_grad():
        for doc in tqdm(docs, unit="docs"):
            result = model.run(doc)
            doc["span_clusters"] = result.span_clusters
            doc["word_clusters"] = result.word_clusters

            # drop intermediate tokenization fields before writing output
            for key in ("word2subword", "subwords", "word_id", "head2span"):
                del doc[key]

    # NOTE(review): documents are dumped back-to-back with no separator --
    # presumably a downstream reader tolerates concatenated JSON, but this is
    # not valid jsonlines; confirm whether a "\n" should be written per doc
    with open(args.output_file, mode="w") as fout:
        for doc in docs:
            json.dump(doc, fout)
stanza/stanza/models/coref/span_predictor.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Describes SpanPredictor which aims to predict spans by taking as input
2
+ head word and context embeddings.
3
+ """
4
+
5
+ from typing import List, Optional, Tuple
6
+
7
+ from stanza.models.coref.const import Doc, Span
8
+ import torch
9
+
10
+
11
class SpanPredictor(torch.nn.Module):
    """Scores, for each span head word, which words start and end its span.

    forward() produces a [n_heads, n_words, 2] start/end score tensor;
    candidate words are restricted to the head's own sentence.
    """

    def __init__(self, input_size: int, distance_emb_size: int):
        super().__init__()
        # NOTE(review): the hard-coded "+ 64" assumes
        # distance_emb_size == 64 -- confirm against the calling config.
        self.ffnn = torch.nn.Sequential(
            torch.nn.Linear(input_size * 2 + 64, input_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(input_size, 256),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(256, 64),
        )
        # Two stacked 1-d convs (kernel 3, stride 1, padding 1) let a word's
        # score depend on neighbors; final 2 channels are (start, end).
        self.conv = torch.nn.Sequential(
            torch.nn.Conv1d(64, 4, 3, 1, 1),
            torch.nn.Conv1d(4, 2, 3, 1, 1)
        )
        self.emb = torch.nn.Embedding(128, distance_emb_size) # [-63, 63] + too_far

    @property
    def device(self) -> torch.device:
        """ A workaround to get current device (which is assumed to be the
        device of the first parameter of one of the submodules) """
        return next(self.ffnn.parameters()).device

    def forward(self,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
                doc: Doc,
                words: torch.Tensor,
                heads_ids: torch.Tensor) -> torch.Tensor:
        """
        Calculates span start/end scores of words for each span head in
        heads_ids

        Args:
            doc (Doc): the document data
            words (torch.Tensor): contextual embeddings for each word in the
                document, [n_words, emb_size]
            heads_ids (torch.Tensor): word indices of span heads

        Returns:
            torch.Tensor: span start/end scores, [n_heads, n_words, 2]
        """
        # Obtain distance embedding indices, [n_heads, n_words]
        relative_positions = (heads_ids.unsqueeze(1) - torch.arange(words.shape[0], device=words.device).unsqueeze(0))
        emb_ids = relative_positions + 63   # make all valid distances positive
        # "+" on bool tensors acts as elementwise OR here: anything outside
        # [0, 126] shares the single "too_far" embedding slot
        emb_ids[(emb_ids < 0) + (emb_ids > 126)] = 127   # "too_far"

        # Obtain "same sentence" boolean mask, [n_heads, n_words]
        sent_id = torch.tensor(doc["sent_id"], device=words.device)
        same_sent = (sent_id[heads_ids].unsqueeze(1) == sent_id.unsqueeze(0))

        # To save memory, only pass candidates from one sentence for each head
        # pair_matrix contains concatenated span_head_emb + candidate_emb + distance_emb
        # for each candidate among the words in the same sentence as span_head
        # [n_heads, input_size * 2 + distance_emb_size]
        rows, cols = same_sent.nonzero(as_tuple=True)
        pair_matrix = torch.cat((
            words[heads_ids[rows]],
            words[cols],
            self.emb(emb_ids[rows, cols]),
        ), dim=1)

        lengths = same_sent.sum(dim=1)
        padding_mask = torch.arange(0, lengths.max(), device=words.device).unsqueeze(0)
        padding_mask = (padding_mask < lengths.unsqueeze(1))  # [n_heads, max_sent_len]

        # [n_heads, max_sent_len, input_size * 2 + distance_emb_size]
        # This is necessary to allow the convolution layer to look at several
        # word scores
        padded_pairs = torch.zeros(*padding_mask.shape, pair_matrix.shape[-1], device=words.device)
        padded_pairs[padding_mask] = pair_matrix

        res = self.ffnn(padded_pairs) # [n_heads, n_candidates, last_layer_output]
        res = self.conv(res.permute(0, 2, 1)).permute(0, 2, 1) # [n_heads, n_candidates, 2]

        # scatter the per-sentence scores back into the full word grid;
        # words outside the head's sentence keep -inf
        scores = torch.full((heads_ids.shape[0], words.shape[0], 2), float('-inf'), device=words.device)
        scores[rows, cols] = res[padding_mask]

        # Make sure that start <= head <= end during inference
        # (log of a {0,1} mask adds 0 for valid positions, -inf otherwise)
        if not self.training:
            valid_starts = torch.log((relative_positions >= 0).to(torch.float))
            valid_ends = torch.log((relative_positions <= 0).to(torch.float))
            valid_positions = torch.stack((valid_starts, valid_ends), dim=2)
            return scores + valid_positions
        return scores

    def get_training_data(self,
                          doc: Doc,
                          words: torch.Tensor
                          ) -> Tuple[Optional[torch.Tensor],
                                     Optional[Tuple[torch.Tensor, torch.Tensor]]]:
        """ Returns span starts/ends for gold mentions in the document.

        Returns (None, None) when the document has no gold head2span data.
        """
        head2span = sorted(doc["head2span"])
        if not head2span:
            return None, None
        heads, starts, ends = zip(*head2span)
        heads = torch.tensor(heads, device=self.device)
        starts = torch.tensor(starts, device=self.device)
        # assumes head2span stores exclusive end indices; "- 1" makes them
        # inclusive to match the per-word scores -- TODO confirm upstream
        ends = torch.tensor(ends, device=self.device) - 1
        return self(doc, words, heads), (starts, ends)

    def predict(self,
                doc: Doc,
                words: torch.Tensor,
                clusters: List[List[int]]) -> List[List[Span]]:
        """
        Predicts span clusters based on the word clusters.

        Args:
            doc (Doc): the document data
            words (torch.Tensor): [n_words, emb_size] matrix containing
                embeddings for each of the words in the text
            clusters (List[List[int]]): a list of clusters where each cluster
                is a list of word indices

        Returns:
            List[List[Span]]: span clusters
        """
        if not clusters:
            return []

        heads_ids = torch.tensor(
            sorted(i for cluster in clusters for i in cluster),
            device=self.device
        )

        scores = self(doc, words, heads_ids)
        # argmax over words; "+ 1" turns the inclusive end back exclusive
        starts = scores[:, :, 0].argmax(dim=1).tolist()
        ends = (scores[:, :, 1].argmax(dim=1) + 1).tolist()

        head2span = {
            head: (start, end)
            for head, start, end in zip(heads_ids.tolist(), starts, ends)
        }

        return [[head2span[head] for head in cluster]
                for cluster in clusters]
stanza/stanza/models/coref/tokenizer_customization.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Customization hooks for transformers.AutoTokenizer.

Some tokenizers are designed for raw text, but the OntoNotes documents
arrive already split into words; the tables below patch over that
mismatch.  Everything here is consumed by
coref_model.CorefModel._get_docs.
"""


def _keep_nonempty_subword(token):
    """Reject the bare SentencePiece underline (U+2581, not an ASCII "_")."""
    return token != "▁"


# Per-model predicates that filter unwanted tokens out of tokenizer output
TOKENIZER_FILTERS = {
    "albert-xxlarge-v2": _keep_nonempty_subword,
    "albert-large-v2": _keep_nonempty_subword,
}

# Per-model hard-coded word -> token-list mappings that bypass the tokenizer
TOKENIZER_MAPS = {
    "roberta-large": {
        ".": ["."],
        ",": [","],
        "!": ["!"],
        "?": ["?"],
        ":": [":"],
        ";": [";"],
        "'s": ["'s"],
    },
}
stanza/stanza/models/coref/word_encoder.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Describes WordEncoder. Extracts mention vectors from bert-encoded text.
2
+ """
3
+
4
+ from typing import Tuple
5
+
6
+ import torch
7
+
8
+ from stanza.models.coref.config import Config
9
+ from stanza.models.coref.const import Doc
10
+
11
+
12
class WordEncoder(torch.nn.Module): # pylint: disable=too-many-instance-attributes
    """ Receives bert contextual embeddings of a text, extracts all the
    possible mentions in that text.

    Each word's representation is an attention-weighted sum of the bert
    embeddings of its own subtokens. """

    def __init__(self, features: int, config: Config):
        """
        Args:
            features (int): the number of featues in the input embeddings
            config (Config): the configuration of the current session
        """
        super().__init__()
        # one scalar attention score per subtoken
        self.attn = torch.nn.Linear(in_features=features, out_features=1)
        self.dropout = torch.nn.Dropout(config.dropout_rate)

    @property
    def device(self) -> torch.device:
        """ A workaround to get current device (which is assumed to be the
        device of the first parameter of one of the submodules) """
        return next(self.attn.parameters()).device

    def forward(self,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
                doc: Doc,
                x: torch.Tensor,
                ) -> Tuple[torch.Tensor, ...]:
        """
        Extracts word representations from text.

        Args:
            doc: the document data
            x: a tensor containing bert output, shape (n_subtokens, bert_dim)

        Returns:
            words: a Tensor of shape [n_words, mention_emb];
                mention representations
            cluster_ids: tensor of shape [n_words], containing cluster indices
                for each word. Non-coreferent words have cluster id of zero.
        """
        # doc["word2subword"] maps each word to its (start, end) subword range
        word_boundaries = torch.tensor(doc["word2subword"], device=self.device)
        starts = word_boundaries[:, 0]
        ends = word_boundaries[:, 1]

        # [n_mentions, features]
        # attention-weighted sum of subword embeddings for each word
        words = self._attn_scores(x, starts, ends).mm(x)

        words = self.dropout(words)

        return (words, self._cluster_ids(doc))

    def _attn_scores(self,
                     bert_out: torch.Tensor,
                     word_starts: torch.Tensor,
                     word_ends: torch.Tensor) -> torch.Tensor:
        """ Calculates attention scores for each of the mentions.

        Args:
            bert_out (torch.Tensor): [n_subwords, bert_emb], bert embeddings
                for each of the subwords in the document
            word_starts (torch.Tensor): [n_words], start indices of words
            word_ends (torch.Tensor): [n_words], end indices of words
                (exclusive, given the "< word_ends" comparison below)

        Returns:
            torch.Tensor: [n_words, n_subtokens] attention weights; each row
                sums to 1 and is non-zero only on that word's own subtokens
        """
        n_subtokens = len(bert_out)
        n_words = len(word_starts)

        # [n_mentions, n_subtokens]
        # with 0 at positions belonging to the words and -inf elsewhere
        # (log of a {0,1} mask yields exactly that)
        attn_mask = torch.arange(0, n_subtokens, device=self.device).expand((n_words, n_subtokens))
        attn_mask = ((attn_mask >= word_starts.unsqueeze(1))
                     * (attn_mask < word_ends.unsqueeze(1)))
        attn_mask = torch.log(attn_mask.to(torch.float))

        attn_scores = self.attn(bert_out).T # [1, n_subtokens]
        attn_scores = attn_scores.expand((n_words, n_subtokens))
        attn_scores = attn_mask + attn_scores
        del attn_mask
        # softmax over masked scores: out-of-word positions get weight 0
        return torch.softmax(attn_scores, dim=1) # [n_words, n_subtokens]

    def _cluster_ids(self, doc: Doc) -> torch.Tensor:
        """
        Args:
            doc: document information

        Returns:
            torch.Tensor of shape [n_word], containing cluster indices for
                each word. Non-coreferent words have cluster id of zero.
        """
        # cluster ids start at 1 so that 0 can mean "not in any cluster"
        word2cluster = {word_i: i
                        for i, cluster in enumerate(doc["word_clusters"], start=1)
                        for word_i in cluster}

        return torch.tensor(
            [word2cluster.get(word_i, 0)
             for word_i in range(len(doc["cased_words"]))],
            device=self.device
        )
stanza/stanza/models/depparse/data.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import logging
3
+ import torch
4
+
5
+ from stanza.models.common.bert_embedding import filter_data, needs_length_filter
6
+ from stanza.models.common.data import map_to_ids, get_long_tensor, get_float_tensor, sort_all
7
+ from stanza.models.common.vocab import PAD_ID, VOCAB_PREFIX, ROOT_ID, CompositeVocab, CharVocab
8
+ from stanza.models.pos.vocab import WordVocab, XPOSVocab, FeatureVocab, MultiVocab
9
+ from stanza.models.pos.xpos_vocab_factory import xpos_vocab_factory
10
+ from stanza.models.common.doc import *
11
+
12
+ logger = logging.getLogger('stanza')
13
+
14
def data_to_batches(data, batch_size, eval_mode, sort_during_eval, min_length_to_batch_separately):
    """Group sentences (each ``x`` with ``x[0]`` the token sequence) into batches.

    In training mode the data is roughly length-sorted (ascending or
    descending at random, for variety).  In eval mode it is length-sorted
    only when sort_during_eval is true.  Sentences longer than
    min_length_to_batch_separately each get a batch of their own; otherwise
    sentences accumulate into a batch until its total token count would
    exceed batch_size.

    Returns (batches, original_order); original_order is None unless the
    data was sorted for eval, in which case it records each sentence's
    position before sorting.
    """
    batches = []
    batch = []
    batch_len = 0

    def flush():
        # close the current batch if it holds anything
        nonlocal batch, batch_len
        if batch_len > 0:
            batches.append(batch)
            batch = []
            batch_len = 0

    if not eval_mode:
        # sort sentences (roughly) by length for better memory utilization
        data = sorted(data, key=lambda item: len(item[0]), reverse=random.random() > .5)
        data_orig_idx = None
    elif sort_during_eval:
        (data, ), data_orig_idx = sort_all([data], [len(item[0]) for item in data])
    else:
        data_orig_idx = None

    for item in data:
        sent_len = len(item[0])
        if min_length_to_batch_separately is not None and sent_len > min_length_to_batch_separately:
            flush()
            batches.append([item])
        else:
            if batch_len > 0 and sent_len + batch_len > batch_size:
                flush()
            batch.append(item)
            batch_len += sent_len

    flush()

    return batches, data_orig_idx
62
+
63
+
64
class DataLoader:
    """Loads, preprocesses, and batches dependency-parsing data from a doc.

    Each batch produced by __getitem__ is a tuple of padded id tensors plus
    bookkeeping (original sort order, sentence/word lengths, raw text); see
    __getitem__ for the exact layout.
    """

    def __init__(self, doc, batch_size, args, pretrain, vocab=None, evaluation=False, sort_during_eval=False, min_length_to_batch_separately=None, bert_tokenizer=None):
        self.batch_size = batch_size
        self.min_length_to_batch_separately=min_length_to_batch_separately
        self.args = args
        self.eval = evaluation
        self.shuffled = not self.eval
        self.sort_during_eval = sort_during_eval
        self.doc = doc
        data = self.load_doc(doc)

        # handle vocab
        if vocab is None:
            self.vocab = self.init_vocab(data)
        else:
            self.vocab = vocab

        # filter out the long sentences if bert is used
        if self.args.get('bert_model', None) and needs_length_filter(self.args['bert_model']):
            data = filter_data(self.args['bert_model'], data, bert_tokenizer)

        # handle pretrain; pretrain vocab is used when args['pretrain'] == True and pretrain is not None
        self.pretrain_vocab = None
        if pretrain is not None and args['pretrain']:
            self.pretrain_vocab = pretrain.vocab

        # filter and sample data
        if args.get('sample_train', 1.0) < 1.0 and not self.eval:
            keep = int(args['sample_train'] * len(data))
            data = random.sample(data, keep)
            logger.debug("Subsample training set with rate {:g}".format(args['sample_train']))

        data = self.preprocess(data, self.vocab, self.pretrain_vocab, args)
        # shuffle for training
        if self.shuffled:
            random.shuffle(data)
        self.num_examples = len(data)

        # chunk into batches
        self.data = self.chunk_batches(data)
        logger.debug("{} batches created.".format(len(self.data)))

    def init_vocab(self, data):
        """Build the MultiVocab (char/word/upos/xpos/feats/lemma/deprel) from training data."""
        assert self.eval == False # for eval vocab must exist
        charvocab = CharVocab(data, self.args['shorthand'])
        wordvocab = WordVocab(data, self.args['shorthand'], cutoff=7, lower=True)
        uposvocab = WordVocab(data, self.args['shorthand'], idx=1)
        xposvocab = xpos_vocab_factory(data, self.args['shorthand'])
        featsvocab = FeatureVocab(data, self.args['shorthand'], idx=3)
        lemmavocab = WordVocab(data, self.args['shorthand'], cutoff=7, idx=4, lower=True)
        deprelvocab = WordVocab(data, self.args['shorthand'], idx=6)
        vocab = MultiVocab({'char': charvocab,
                            'word': wordvocab,
                            'upos': uposvocab,
                            'xpos': xposvocab,
                            'feats': featsvocab,
                            'lemma': lemmavocab,
                            'deprel': deprelvocab})
        return vocab

    def preprocess(self, data, vocab, pretrain_vocab, args):
        """Convert each sentence to id sequences, prepending a ROOT token.

        Per sentence the output fields are: word ids, char ids, upos, xpos,
        feats, pretrained-embedding ids (PAD if no pretrain vocab), lemma
        ids, head indices, deprel ids, and the raw word text.
        """
        processed = []
        # composite vocabs need one ROOT_ID per sub-field
        xpos_replacement = [[ROOT_ID] * len(vocab['xpos'])] if isinstance(vocab['xpos'], CompositeVocab) else [ROOT_ID]
        feats_replacement = [[ROOT_ID] * len(vocab['feats'])]
        for sent in data:
            processed_sent = [[ROOT_ID] + vocab['word'].map([w[0] for w in sent])]
            processed_sent += [[[ROOT_ID]] + [vocab['char'].map([x for x in w[0]]) for w in sent]]
            processed_sent += [[ROOT_ID] + vocab['upos'].map([w[1] for w in sent])]
            processed_sent += [xpos_replacement + vocab['xpos'].map([w[2] for w in sent])]
            processed_sent += [feats_replacement + vocab['feats'].map([w[3] for w in sent])]
            if pretrain_vocab is not None:
                # always use lowercase lookup in pretrained vocab
                processed_sent += [[ROOT_ID] + pretrain_vocab.map([w[0].lower() for w in sent])]
            else:
                processed_sent += [[ROOT_ID] + [PAD_ID] * len(sent)]
            processed_sent += [[ROOT_ID] + vocab['lemma'].map([w[4] for w in sent])]
            # head indices; during eval a malformed head becomes 0 rather than crashing
            processed_sent += [[to_int(w[5], ignore_error=self.eval) for w in sent]]
            processed_sent += [vocab['deprel'].map([w[6] for w in sent])]
            processed_sent.append([w[0] for w in sent])
            processed.append(processed_sent)
        return processed

    def __len__(self):
        # number of batches, not number of sentences
        return len(self.data)

    def __getitem__(self, key):
        """ Get a batch with index. """
        if not isinstance(key, int):
            raise TypeError
        if key < 0 or key >= len(self.data):
            raise IndexError
        batch = self.data[key]
        batch_size = len(batch)
        batch = list(zip(*batch))
        assert len(batch) == 10

        # sort sentences by lens for easy RNN operations
        lens = [len(x) for x in batch[0]]
        batch, orig_idx = sort_all(batch, lens)

        # sort words by lens for easy char-RNN operations
        batch_words = [w for sent in batch[1] for w in sent]
        word_lens = [len(x) for x in batch_words]
        batch_words, word_orig_idx = sort_all([batch_words], word_lens)
        batch_words = batch_words[0]
        word_lens = [len(x) for x in batch_words]

        # convert to tensors
        words = batch[0]
        words = get_long_tensor(words, batch_size)
        words_mask = torch.eq(words, PAD_ID)
        wordchars = get_long_tensor(batch_words, len(word_lens))
        wordchars_mask = torch.eq(wordchars, PAD_ID)

        upos = get_long_tensor(batch[2], batch_size)
        xpos = get_long_tensor(batch[3], batch_size)
        ufeats = get_long_tensor(batch[4], batch_size)
        pretrained = get_long_tensor(batch[5], batch_size)
        sentlens = [len(x) for x in batch[0]]
        lemma = get_long_tensor(batch[6], batch_size)
        head = get_long_tensor(batch[7], batch_size)
        deprel = get_long_tensor(batch[8], batch_size)
        text = batch[9]
        return words, words_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, lemma, head, deprel, orig_idx, word_orig_idx, sentlens, word_lens, text

    def load_doc(self, doc):
        """Extract per-token fields from the Document as sentence lists."""
        data = doc.get([TEXT, UPOS, XPOS, FEATS, LEMMA, HEAD, DEPREL], as_sentences=True)
        data = self.resolve_none(data)
        return data

    def resolve_none(self, data):
        # replace None to '_'
        for sent_idx in range(len(data)):
            for tok_idx in range(len(data[sent_idx])):
                for feat_idx in range(len(data[sent_idx][tok_idx])):
                    if data[sent_idx][tok_idx][feat_idx] is None:
                        data[sent_idx][tok_idx][feat_idx] = '_'
        return data

    def __iter__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)

    def set_batch_size(self, batch_size):
        # NOTE: only takes effect at the next chunk_batches/reshuffle call;
        # existing batches are not re-chunked here
        self.batch_size = batch_size

    def reshuffle(self):
        """Re-chunk the flattened data into batches and shuffle batch order."""
        data = [y for x in self.data for y in x]
        self.data = self.chunk_batches(data)
        random.shuffle(self.data)

    def chunk_batches(self, data):
        """Delegate to data_to_batches, remembering the pre-sort order for eval."""
        batches, data_orig_idx = data_to_batches(data=data, batch_size=self.batch_size,
                                                 eval_mode=self.eval, sort_during_eval=self.sort_during_eval,
                                                 min_length_to_batch_separately=self.min_length_to_batch_separately)
        # data_orig_idx might be None at train time, since we don't anticipate unsorting
        self.data_orig_idx = data_orig_idx
        return batches
223
+
224
def to_int(string, ignore_error=False):
    """Parse *string* as an int.

    With ignore_error=True a non-numeric string yields 0 instead of
    raising ValueError.
    """
    try:
        return int(string)
    except ValueError:
        if not ignore_error:
            raise
        return 0
233
+
stanza/stanza/models/lemma/attach_lemma_classifier.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ from stanza.models.lemma.trainer import Trainer
4
+ from stanza.models.lemma_classifier.base_model import LemmaClassifier
5
+
6
def attach_classifier(input_filename, output_filename, classifiers):
    """Load a saved lemmatizer, append the given contextual lemma
    classifiers to it, and save the combined model."""
    trainer = Trainer(model_file=input_filename)
    for path in classifiers:
        trainer.contextual_lemmatizers.append(LemmaClassifier.load(path))
    trainer.save(output_filename)
14
+
15
def main(args=None):
    """Command line entry point: attach lemma classifiers to a lemmatizer."""
    parser = argparse.ArgumentParser()
    # all three options are required strings; --classifier takes one or more
    for flag, extra, description in (
            ('--input', {}, 'Which lemmatizer to start from'),
            ('--output', {}, 'Where to save the lemmatizer'),
            ('--classifier', {'nargs': '+'}, 'Lemma classifier to attach'),
    ):
        parser.add_argument(flag, type=str, required=True, help=description, **extra)
    parsed = parser.parse_args(args)

    attach_classifier(parsed.input, parsed.output, parsed.classifier)

if __name__ == '__main__':
    main()
stanza/stanza/models/lemma/scorer.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utils and wrappers for scoring lemmatizers.
3
+ """
4
+
5
+ from stanza.models.common.utils import ud_scores
6
+
7
def score(system_conllu_file, gold_conllu_file):
    """Score lemmatizer output against gold; returns (precision, recall, f1)."""
    lemma_eval = ud_scores(gold_conllu_file, system_conllu_file)["Lemmas"]
    return lemma_eval.precision, lemma_eval.recall, lemma_eval.f1
13
+
stanza/stanza/models/lemma/vocab.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+
3
+ from stanza.models.common.vocab import BaseVocab, BaseMultiVocab
4
+ from stanza.models.common.seq2seq_constant import VOCAB_PREFIX
5
+
6
class Vocab(BaseVocab):
    """Lemma vocabulary whose units are ordered by descending frequency."""

    def build_vocab(self):
        """Index units by descending count, after the special-token prefix.

        Counter.most_common() sorts by count descending with ties kept in
        first-seen order, matching a stable sort of the counter's keys.
        """
        counts = Counter(self.data)
        by_frequency = [unit for unit, _ in counts.most_common()]
        self._id2unit = VOCAB_PREFIX + by_frequency
        self._unit2id = {unit: idx for idx, unit in enumerate(self._id2unit)}
11
+
12
class MultiVocab(BaseMultiVocab):
    """A named collection of Vocab objects."""

    @classmethod
    def load_state_dict(cls, state_dict):
        """Rebuild a MultiVocab from serialized state, one Vocab per entry."""
        loaded = cls()
        for name, vocab_state in state_dict.items():
            loaded[name] = Vocab.load_state_dict(vocab_state)
        return loaded
stanza/stanza/models/lemma_classifier/base_trainer.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from abc import ABC, abstractmethod
3
+ import logging
4
+ import os
5
+ from typing import List, Tuple, Any, Mapping
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.optim as optim
10
+
11
+ from stanza.models.common.utils import default_device
12
+ from stanza.models.lemma_classifier import utils
13
+ from stanza.models.lemma_classifier.constants import DEFAULT_BATCH_SIZE
14
+ from stanza.models.lemma_classifier.evaluate_models import evaluate_model
15
+ from stanza.utils.get_tqdm import get_tqdm
16
+
17
+ tqdm = get_tqdm()
18
+ logger = logging.getLogger('stanza.lemmaclassifier')
19
+
20
class BaseLemmaClassifierTrainer(ABC):
    """Shared training loop for lemma classifier models.

    NOTE(review): subclasses are expected to initialize self.weighted_loss,
    self.lr, and self.criterion before train() is called -- those attributes
    are read but never set here; confirm in the concrete trainers.
    """

    def configure_weighted_loss(self, label_decoder: Mapping, counts: Mapping):
        """
        If applicable, this function will update the loss function of the LemmaClassifierLSTM model to become BCEWithLogitsLoss.
        The weights are determined by the counts of the classes in the dataset. The weights are inversely proportional to the
        frequency of the class in the set. E.g. classes with lower frequency will have higher weight.
        """
        weights = [0 for _ in label_decoder.keys()]  # each key in the label decoder is one class, we have one weight per class
        total_samples = sum(counts.values())
        # assumes counts is keyed by contiguous class indices 0..n-1 -- a
        # gap would leave a zero weight; TODO confirm with Dataset.counts
        for class_idx in counts:
            weights[class_idx] = total_samples / (counts[class_idx] * len(counts))  # weight_i = total / (# examples in class i * num classes)
        weights = torch.tensor(weights)
        logger.info(f"Using weights {weights} for weighted loss.")
        self.criterion = nn.BCEWithLogitsLoss(weight=weights)

    @abstractmethod
    def build_model(self, label_decoder, upos_to_id, known_words, target_words, target_upos):
        """
        Build a model using pieces of the dataset to determine some of the model shape
        """

    def train(self, num_epochs: int, save_name: str, args: Mapping, eval_file: str, train_file: str) -> None:
        """
        Trains a model on batches of texts, position indices of the target token, and labels (lemma annotation) for the target token.

        Args:
            num_epochs (int): Number of training epochs
            save_name (str): Path to file where trained model should be saved.
            args (Mapping): extra options; 'batch_size' and 'force' are read here.
            eval_file (str): Path to the dev set file for evaluating model checkpoints each epoch.
            train_file (str): Path to data file, containing tokenized text sentences, token index and true label for token lemma on each line.

        Raises:
            ValueError: if train_file is empty/None.
            FileExistsError: if save_name exists and args['force'] is not set.
        """
        # Put model on GPU (if possible)
        device = default_device()

        if not train_file:
            raise ValueError("Cannot train model - no train_file supplied!")

        dataset = utils.Dataset(train_file, get_counts=self.weighted_loss, batch_size=args.get("batch_size", DEFAULT_BATCH_SIZE))
        label_decoder = dataset.label_decoder
        upos_to_id = dataset.upos_to_id
        self.output_dim = len(label_decoder)
        logger.info(f"Loaded dataset successfully from {train_file}")
        logger.info(f"Using label decoder: {label_decoder}  Output dimension: {self.output_dim}")
        logger.info(f"Target words: {dataset.target_words}")

        self.model = self.build_model(label_decoder, upos_to_id, dataset.known_words, dataset.target_words, set(dataset.target_upos))
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        self.model.to(device)
        logger.info(f"Training model on device: {device}. {next(self.model.parameters()).device}")

        # refuse to clobber an existing model unless explicitly forced
        if os.path.exists(save_name) and not args.get('force', False):
            raise FileExistsError(f"Save name {save_name} already exists; training would overwrite previous file contents. Aborting...")

        if self.weighted_loss:
            self.configure_weighted_loss(label_decoder, dataset.counts)

        # Put the criterion on GPU too
        logger.debug(f"Criterion on {next(self.model.parameters()).device}")
        self.criterion = self.criterion.to(next(self.model.parameters()).device)

        # NOTE(review): best_model is never assigned after this; only best_f1
        # is used for checkpoint selection
        best_model, best_f1 = None, float("-inf")  # Used for saving checkpoints of the model
        for epoch in range(num_epochs):
            # go over entire dataset with each epoch
            for sentences, positions, upos_tags, labels in tqdm(dataset):
                assert len(sentences) == len(positions) == len(labels), f"Input sentences, positions, and labels are of unequal length ({len(sentences), len(positions), len(labels)})"

                self.optimizer.zero_grad()
                outputs = self.model(positions, sentences, upos_tags)

                # Compute loss, which is different if using CE or BCEWithLogitsLoss
                if self.weighted_loss:  # BCEWithLogitsLoss requires a vector for target where probability is 1 on the true label class, and 0 on others.
                    # TODO: three classes?  (this one-hot construction is hard-coded for the binary case)
                    targets = torch.stack([torch.tensor([1, 0]) if label == 0 else torch.tensor([0, 1]) for label in labels]).to(dtype=torch.float32).to(device)
                    # should be shape size (batch_size, 2)
                else:  # CELoss accepts target as just raw label
                    targets = labels.to(device)

                loss = self.criterion(outputs, targets)

                loss.backward()
                self.optimizer.step()

            logger.info(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}")
            if eval_file:
                # Evaluate model on dev set to see if it should be saved.
                _, _, _, f1 = evaluate_model(self.model, eval_file, is_training=True)
                logger.info(f"Weighted f1 for model: {f1}")
                if f1 > best_f1:
                    best_f1 = f1
                    self.model.save(save_name)
                    logger.info(f"New best model: weighted f1 score of {f1}.")
            else:
                # no dev set: save unconditionally at the end of each epoch
                self.model.save(save_name)
114
+
stanza/stanza/models/lemma_classifier/constants.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from enum import Enum

UNKNOWN_TOKEN = "unk" # token name for unknown tokens
UNKNOWN_TOKEN_IDX = -1 # custom index we apply to unknown tokens

# TODO: ModelType could just be LSTM and TRANSFORMER
# and then the transformer baseline would have the transformer as another argument
class ModelType(Enum):
    """Which architecture a lemma classifier uses (see TODO above)."""
    LSTM = 1
    TRANSFORMER = 2
    BERT = 3
    ROBERTA = 4

# default number of examples per batch during training/evaluation
DEFAULT_BATCH_SIZE = 16
stanza/stanza/models/lemma_classifier/evaluate_many.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utils to evaluate many models of the same type at once
3
+ """
4
+ import argparse
5
+ import os
6
+ import logging
7
+
8
+ from stanza.models.lemma_classifier.evaluate_models import main as evaluate_main
9
+
10
+
11
+ logger = logging.getLogger('stanza.lemmaclassifier')
12
+
13
def evaluate_n_models(path_to_models_dir, args):
    """Evaluate every model file in a directory and average their metrics.

    Args:
        path_to_models_dir: directory whose entries are each a saved model file
        args: parsed argparse namespace forwarded to evaluate_models.main;
            args.save_name is overwritten with each model path in turn

    Returns:
        dict mapping each lemma reported by the evaluations to its average
        f1 (as a percentage), plus averaged "accuracy" and "weighted_f1"
        (as fractions).

    Raises:
        ValueError: if the directory contains no models (previously this
            crashed with ZeroDivisionError).
    """
    total_results = {
        "be": 0.0,
        "have": 0.0,
        "accuracy": 0.0,
        "weighted_f1": 0.0
    }
    paths = os.listdir(path_to_models_dir)
    num_models = len(paths)
    if num_models == 0:
        raise ValueError(f"No models found to evaluate in {path_to_models_dir}")
    for model_path in paths:
        full_path = os.path.join(path_to_models_dir, model_path)
        args.save_name = full_path
        mcc_results, confusion, acc, weighted_f1 = evaluate_main(predefined_args=args)

        # Accumulate every lemma the evaluation reports.  The previous
        # version only pre-seeded "be" and "have" and raised KeyError for
        # any other lemma in mcc_results.
        for lemma, lemma_results in mcc_results.items():
            lemma_f1 = lemma_results["f1"] * 100
            total_results[lemma] = total_results.get(lemma, 0.0) + lemma_f1

        total_results["accuracy"] += acc
        total_results["weighted_f1"] += weighted_f1

    for key in total_results:
        total_results[key] /= num_models

    # NOTE: per-lemma f1s are already percentages; accuracy and weighted_f1
    # are fractions and get scaled by 100 only for this log line.
    logger.info(f"Models in {path_to_models_dir} had average weighted f1 of {100 * total_results['weighted_f1']}.\nLemma 'be' had f1: {total_results['be']}\nLemma 'have' had f1: {total_results['have']}.\nAccuracy: {100 * total_results['accuracy']}.\n ({num_models} models evaluated).")
    return total_results
43
+
44
+
45
def main():
    """CLI entry point: parse shared evaluation settings, then average
    metrics over all model files found under --base_path."""
    here = os.path.dirname(__file__)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--vocab_size", type=int, default=10000,
        help="Number of tokens in vocab")
    parser.add_argument(
        "--embedding_dim", type=int, default=100,
        help="Number of dimensions in word embeddings (currently using GloVe)")
    parser.add_argument(
        "--hidden_dim", type=int, default=256,
        help="Size of hidden layer")
    parser.add_argument(
        '--wordvec_pretrain_file', type=str, default=None,
        help='Exact name of the pretrain file to read')
    parser.add_argument(
        "--charlm", action='store_true', default=False,
        help="Whether not to use the charlm embeddings")
    parser.add_argument(
        '--charlm_shorthand', type=str, default=None,
        help="Shorthand for character-level language model training corpus.")
    parser.add_argument(
        "--charlm_forward_file", type=str,
        default=os.path.join(here, "charlm_files", "1billion_forward.pt"),
        help="Path to forward charlm file")
    parser.add_argument(
        "--charlm_backward_file", type=str,
        default=os.path.join(here, "charlm_files", "1billion_backwards.pt"),
        help="Path to backward charlm file")
    parser.add_argument(
        "--save_name", type=str,
        default=os.path.join(here, "saved_models", "lemma_classifier_model.pt"),
        help="Path to model save file")
    parser.add_argument(
        "--model_type", type=str, default="roberta",
        help="Which transformer to use ('bert' or 'roberta' or 'lstm')")
    parser.add_argument(
        "--bert_model", type=str, default=None,
        help="Use a specific transformer instead of the default bert/roberta")
    parser.add_argument(
        "--eval_file", type=str,
        help="path to evaluation file")

    # Args specific to several model eval
    parser.add_argument(
        "--base_path", type=str, default=None,
        help="path to dir for eval")

    args = parser.parse_args()
    evaluate_n_models(args.base_path, args)


if __name__ == "__main__":
    main()
stanza/stanza/models/lemma_classifier/evaluate_models.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ parentdir = os.path.dirname(__file__)
5
+ parentdir = os.path.dirname(parentdir)
6
+ parentdir = os.path.dirname(parentdir)
7
+ sys.path.append(parentdir)
8
+
9
+ import logging
10
+ import argparse
11
+ import os
12
+
13
+ from typing import Any, List, Tuple, Mapping
14
+ from collections import defaultdict
15
+ from numpy import random
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+ import stanza
21
+
22
+ from stanza.models.common.utils import default_device
23
+ from stanza.models.lemma_classifier import utils
24
+ from stanza.models.lemma_classifier.base_model import LemmaClassifier
25
+ from stanza.models.lemma_classifier.lstm_model import LemmaClassifierLSTM
26
+ from stanza.models.lemma_classifier.transformer_model import LemmaClassifierWithTransformer
27
+ from stanza.utils.confusion import format_confusion
28
+ from stanza.utils.get_tqdm import get_tqdm
29
+
30
+ tqdm = get_tqdm()
31
+
32
+ logger = logging.getLogger('stanza.lemmaclassifier')
33
+
34
+
35
def get_weighted_f1(mcc_results: Mapping[int, Mapping[str, float]], confusion: Mapping[int, Mapping[int, int]]) -> float:
    """
    Compute the weighted F1 score across an evaluation set.

    Each class's F1 is weighted by the number of gold examples of that class
    found in `confusion`, so classes with more evaluation examples contribute
    proportionally more to the aggregate score.
    """
    # number of gold examples per class, taken from the confusion matrix rows
    class_counts = {class_id: sum(confusion.get(class_id).values()) for class_id in mcc_results}
    total_examples = sum(class_counts.values())
    weighted_sum = sum(mcc_results.get(class_id).get("f1") * count
                       for class_id, count in class_counts.items())
    return weighted_sum / total_examples
52
+
53
+
54
def evaluate_sequences(gold_tag_sequences: List[Any], pred_tag_sequences: List[Any], label_decoder: Mapping, verbose=True):
    """
    Score a model's predicted tags against gold tags, computing precision, recall
    and F1 for every class observed in the gold data.

    Precision = true positives / (true positives + false positives)
    Recall = true positives / (true positives + false negatives)
    F1 = 2 * (Precision * Recall) / (Precision + Recall)

    Returns:
        1. Multi class result dictionary, mapping each class to a map of its
           "f1", "precision", and "recall" scores.
        2. Confusion matrix: confusion[gold][pred] is the count of that (gold, pred) pair.
        3. The weighted F1 across all classes (see get_weighted_f1).
    """
    assert len(gold_tag_sequences) == len(pred_tag_sequences), \
        f"Length of gold tag sequences is {len(gold_tag_sequences)}, while length of predicted tag sequence is {len(pred_tag_sequences)}"

    # map label ids back to label strings so the confusion matrix is readable
    id_to_label = {idx: label for label, idx in label_decoder.items()}
    confusion = defaultdict(lambda: defaultdict(int))
    for gold, pred in zip(gold_tag_sequences, pred_tag_sequences):
        confusion[id_to_label[gold]][id_to_label[pred]] += 1

    multi_class_result = defaultdict(lambda: defaultdict(float))
    for gold_tag in confusion.keys():
        true_pos = confusion.get(gold_tag, {}).get(gold_tag, 0)
        # column sum: everything the model labeled as gold_tag
        predicted_total = sum(confusion.get(other, {}).get(gold_tag, 0) for other in confusion.keys())
        # row sum: every gold occurrence of gold_tag
        gold_total = sum(confusion.get(gold_tag, {}).values())

        prec = true_pos / predicted_total if predicted_total else 0.0
        recall = true_pos / gold_total if gold_total else 0.0
        f1 = 2 * (prec * recall) / (prec + recall) if (prec + recall) else 0.0

        multi_class_result[gold_tag] = {
            "precision": prec,
            "recall": recall,
            "f1": f1
        }

    if verbose:
        for lemma in multi_class_result:
            logger.info(f"Lemma '{lemma}' had precision {100 * multi_class_result[lemma]['precision']}, recall {100 * multi_class_result[lemma]['recall']} and F1 score of {100 * multi_class_result[lemma]['f1']}")

    weighted_f1 = get_weighted_f1(multi_class_result, confusion)

    return multi_class_result, confusion, weighted_f1
109
+
110
+
111
def model_predict(model: nn.Module, position_indices: torch.Tensor, sentences: List[List[str]], upos_tags: List[List[int]] = None) -> torch.Tensor:
    """
    Runs a lemma classifier on a batch of examples, given the position index of the
    target token in each sentence, and returns the predicted class per example.

    Args:
        model (LemmaClassifier): A trained classifier able to predict on a target token.
        position_indices (Tensor[int]): zero-indexed position of the target token for
            each example in the batch.
        sentences (List[List[str]]): tokenized input sentences, one list per example.
        upos_tags (List[List[int]], optional): UPOS tag ids per sentence; defaults to
            an empty list when omitted.

    Returns:
        torch.Tensor: the predicted class index for each example, shape (batch_size,).
    """
    # Use a None sentinel instead of a mutable default argument ([] would be
    # shared across calls and could be mutated downstream by the model).
    if upos_tags is None:
        upos_tags = []
    with torch.no_grad():
        logits = model(position_indices, sentences, upos_tags)  # (batch_size, output_size)
        predicted_class = torch.argmax(logits, dim=1)  # (batch_size,)

    return predicted_class
128
+
129
+
130
def evaluate_model(model: nn.Module, eval_path: str, verbose: bool = True, is_training: bool = False) -> Tuple[Mapping, Mapping, float, float]:
    """
    Run a trained lemma classifier over an evaluation file and score it.

    Args:
        model (LemmaClassifierLSTM or LemmaClassifierWithTransformer): the model to evaluate.
        eval_path (str): Path to the saved evaluation dataset.
        verbose (bool, optional): True if `evaluate_sequences()` should log per-class
            F1, precision and recall. Defaults to True.
        is_training (bool, optional): When True the model is left in training mode
            instead of being switched to eval mode.

    Returns:
        1. Multi-class results: class id -> {"f1"/"precision"/"recall": value}.
        2. Confusion matrix: gold tag -> {predicted tag: count}.
        3. Accuracy (num correct / total examples) over the evaluation set.
        4. Weighted F1 over the evaluation set.
    """
    device = default_device()
    model.to(device)

    # keep training mode intact when evaluating mid-training
    if not is_training:
        model.eval()

    dataset = utils.Dataset(eval_path, label_decoder=model.label_decoder, shuffle=False)

    logger.info(f"Evaluating on evaluation file {eval_path}")

    gold_tags = dataset.labels
    pred_tags = []
    correct = 0
    total = 0

    for sentences, pos_indices, upos_tags, labels in tqdm(dataset, "Evaluating examples from data file"):
        batch_preds = model_predict(model, pos_indices, sentences, upos_tags)  # (batch_size,)
        matches = batch_preds == labels.to(device)
        correct += torch.sum(matches)
        total += len(matches)
        pred_tags.extend(batch_preds.tolist())

    logger.info("Finished evaluating on dataset. Computing scores...")
    accuracy = correct / total

    mc_results, confusion, weighted_f1 = evaluate_sequences(gold_tags, pred_tags, dataset.label_decoder, verbose=verbose)
    if verbose:
        logger.info(f"Accuracy: {accuracy} ({correct}/{total})")
        logger.info(f"Label decoder: {dataset.label_decoder}")

    return mc_results, confusion, accuracy, weighted_f1
181
+
182
+
183
def main(args=None, predefined_args=None):
    """
    Parse arguments, load a saved LemmaClassifier, and evaluate it on a dataset.

    Args:
        args: optional argv-style list passed to argparse.
        predefined_args: a pre-parsed namespace; when given, argv parsing is skipped.

    Returns:
        (multi-class results, confusion matrix, accuracy, weighted F1)
    """
    # TODO: can unify this script with train_lstm_model.py?
    # TODO: can save the model type in the model .pt, then
    # automatically figure out what type of model we are using by
    # looking in the file
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_size", type=int, default=10000, help="Number of tokens in vocab")
    parser.add_argument("--embedding_dim", type=int, default=100, help="Number of dimensions in word embeddings (currently using GloVe)")
    parser.add_argument("--hidden_dim", type=int, default=256, help="Size of hidden layer")
    parser.add_argument('--wordvec_pretrain_file', type=str, default=None, help='Exact name of the pretrain file to read')
    parser.add_argument("--charlm", action='store_true', default=False, help="Whether not to use the charlm embeddings")
    parser.add_argument('--charlm_shorthand', type=str, default=None, help="Shorthand for character-level language model training corpus.")
    parser.add_argument("--charlm_forward_file", type=str, default=os.path.join(os.path.dirname(__file__), "charlm_files", "1billion_forward.pt"), help="Path to forward charlm file")
    parser.add_argument("--charlm_backward_file", type=str, default=os.path.join(os.path.dirname(__file__), "charlm_files", "1billion_backwards.pt"), help="Path to backward charlm file")
    parser.add_argument("--save_name", type=str, default=os.path.join(os.path.dirname(__file__), "saved_models", "lemma_classifier_model.pt"), help="Path to model save file")
    parser.add_argument("--model_type", type=str, default="roberta", help="Which transformer to use ('bert' or 'roberta' or 'lstm')")
    parser.add_argument("--bert_model", type=str, default=None, help="Use a specific transformer instead of the default bert/roberta")
    parser.add_argument("--eval_file", type=str, help="path to evaluation file")

    args = parser.parse_args(args) if not predefined_args else predefined_args

    # BUGFIX: this is the evaluation script, not the training script
    logger.info("Running evaluation script with the following args:")
    args = vars(args)
    for arg in args:
        logger.info(f"{arg}: {args[arg]}")
    logger.info("------------------------------------------------------------")

    logger.info(f"Attempting evaluation of model from {args['save_name']} on file {args['eval_file']}")
    model = LemmaClassifier.load(args['save_name'], args)

    mcc_results, confusion, acc, weighted_f1 = evaluate_model(model, args['eval_file'])

    logger.info(f"MCC Results: {dict(mcc_results)}")
    logger.info("______________________________________________")
    # lazy %-formatting (no stray f-string prefix) so the table is only rendered when emitted
    logger.info("Confusion:\n%s", format_confusion(confusion))
    logger.info("______________________________________________")
    logger.info(f"Accuracy: {acc}")
    logger.info("______________________________________________")
    logger.info(f"Weighted f1: {weighted_f1}")

    return mcc_results, confusion, acc, weighted_f1


if __name__ == "__main__":
    main()
stanza/stanza/models/lemma_classifier/prepare_dataset.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import re
5
+
6
+ import stanza
7
+ from stanza.models.lemma_classifier import utils
8
+
9
+ from typing import List, Tuple, Any
10
+
11
+ """
12
+ The code in this file processes a CoNLL dataset by taking its sentences and filtering out all sentences that do not contain the target token.
13
+ Furthermore, it will store tuples of the Stanza document object, the position index of the target token, and its lemma.
14
+ """
15
+
16
+
17
def load_doc_from_conll_file(path: str):
    """
    Load a Stanza Document object from a CoNLL file of annotated sentences.

    Args:
        path (str): path to the CoNLL file to read

    Returns:
        The parsed stanza Document.
    """
    # Import the submodule explicitly: this file only does `import stanza`, which
    # does not guarantee `stanza.utils.conll` is reachable as an attribute.
    from stanza.utils.conll import CoNLL
    return CoNLL.conll2doc(path)
22
+
23
+
24
class DataProcessor:
    """
    Filters a CoNLL document down to sentences that contain a target token and
    collects one example per occurrence: the sentence tokens, their UPOS tags,
    the occurrence's index, and its gold lemma.
    """

    def __init__(self, target_word: str, target_upos: List[str], allowed_lemmas: str):
        """
        Args:
            target_word (str): regex matched (fullmatch) against each token's text
            target_upos (List[str]): UPOS tags the target token must carry
            allowed_lemmas (str): regex of lemmas to keep; ".*" keeps all lemmas
        """
        self.target_word = target_word
        self.target_word_regex = re.compile(target_word)
        self.target_upos = target_upos
        self.allowed_lemmas = re.compile(allowed_lemmas)

    def keep_sentence(self, sentence):
        """Return True if the sentence contains at least one target token with a target UPOS."""
        for word in sentence.words:
            if self.target_word_regex.fullmatch(word.text) and word.upos in self.target_upos:
                return True
        return False

    def find_all_occurrences(self, sentence) -> List[int]:
        """
        Finds all occurrences of self.target_word in the sentence's words and
        returns the index(es) of such occurrences.
        """
        occurrences = []
        for idx, token in enumerate(sentence.words):
            if self.target_word_regex.fullmatch(token.text) and token.upos in self.target_upos:
                occurrences.append(idx)
        return occurrences

    @staticmethod
    def write_output_file(save_name, target_upos, sentences):
        """Write the collected examples as a JSON object with "upos" and "sentences" keys."""
        with open(save_name, "w+", encoding="utf-8") as output_f:
            output_f.write("{\n")
            output_f.write(' "upos": %s,\n' % json.dumps(target_upos))
            output_f.write(' "sentences": [')
            wrote_sentence = False
            for sentence in sentences:
                if not wrote_sentence:
                    output_f.write("\n ")
                    wrote_sentence = True
                else:
                    output_f.write(",\n ")
                output_f.write(json.dumps(sentence))
            output_f.write("\n ]\n}\n")

    def process_document(self, doc, save_name: str) -> List[dict]:
        """
        Takes any sentence from `doc` that meets `keep_sentence` and records its
        tokens, UPOS tags, the target word's index, and its lemma.

        Sentences containing `self.target_word` multiple times produce one
        example per occurrence.

        Args:
            doc (Stanza.doc): Document object that represents the file to be analyzed
            save_name (str): Path to the output file; falsy to skip writing

        Returns:
            The list of example dicts that were collected.
        """
        sentences = []
        for sentence in doc.sentences:
            if not self.keep_sentence(sentence):
                continue
            tokens = [token.text for token in sentence.words]
            # the tags are the same for every occurrence, so compute them once per sentence
            upos_tags = [word.upos for word in sentence.words]
            for idx in self.find_all_occurrences(sentence):
                lemma = sentence.words[idx].lemma
                # skip tokens without an annotated lemma: re.fullmatch(None) raises TypeError
                if lemma is None or not self.allowed_lemmas.fullmatch(lemma):
                    continue
                sentences.append({
                    "words": tokens,
                    "upos_tags": upos_tags,
                    "index": idx,
                    "lemma": lemma
                })

        if save_name:
            self.write_output_file(save_name, self.target_upos, sentences)
        return sentences
98
+
99
def main(args=None):
    """Command line entry point: parse arguments, then extract target-token examples from a CoNLL file."""
    parser = argparse.ArgumentParser()

    parser.add_argument("--conll_path", type=str, default=os.path.join(os.path.dirname(__file__), "en_gum-ud-train.conllu"), help="path to the conll file to translate")
    parser.add_argument("--target_word", type=str, default="'s", help="Token to classify on, e.g. 's.")
    parser.add_argument("--target_upos", type=str, default="AUX", help="upos on target token")
    parser.add_argument("--output_path", type=str, default="test_output.txt", help="Path for output file")
    parser.add_argument("--allowed_lemmas", type=str, default=".*", help="A regex for allowed lemmas. If not set, all lemmas are allowed")

    parsed = parser.parse_args(args)

    for name, value in vars(parsed).items():
        print(f"{name}: {value}")

    doc = load_doc_from_conll_file(parsed.conll_path)
    processor = DataProcessor(target_word=parsed.target_word, target_upos=[parsed.target_upos], allowed_lemmas=parsed.allowed_lemmas)

    return processor.process_document(doc, parsed.output_path)

if __name__ == "__main__":
    main()
stanza/stanza/models/lemma_classifier/train_lstm_model.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The code in this file works to train a lemma classifier for 's
3
+ """
4
+
5
+ import argparse
6
+ import logging
7
+ import os
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ from stanza.models.common.foundation_cache import load_pretrain
13
+ from stanza.models.lemma_classifier.base_trainer import BaseLemmaClassifierTrainer
14
+ from stanza.models.lemma_classifier.constants import DEFAULT_BATCH_SIZE
15
+ from stanza.models.lemma_classifier.lstm_model import LemmaClassifierLSTM
16
+
17
+ logger = logging.getLogger('stanza.lemmaclassifier')
18
+
19
class LemmaClassifierTrainer(BaseLemmaClassifierTrainer):
    """
    Class to assist with training a LemmaClassifierLSTM
    """

    def __init__(self, model_args: dict, embedding_file: str, use_charlm: bool = False, charlm_forward_file: str = None, charlm_backward_file: str = None, lr: float = 0.001, loss_func: str = None):
        """
        Initializes the LemmaClassifierTrainer class.

        Args:
            model_args (dict): Various model shape parameters
            embedding_file (str): What word embeddings file to use.  Use a Stanza pretrain .pt
            use_charlm (bool, optional): Whether to use charlm embeddings as well. Defaults to False.
            charlm_forward_file (str): Path to the forward pass embeddings for the charlm
            charlm_backward_file (str): Path to the backward pass embeddings for the charlm
            lr (float): Learning rate, defaults to 0.001.
            loss_func (str): Which loss function to use (either 'ce' or 'weighted_bce')

        Raises:
            FileNotFoundError: If the forward charlm file is given but not present
            FileNotFoundError: If the backward charlm file is given but not present
            ValueError: If loss_func is not one of 'ce' or 'weighted_bce'
        """
        super().__init__()

        self.model_args = model_args

        # Load word embeddings
        pt = load_pretrain(embedding_file)
        self.pt_embedding = pt

        # Load CharLM embeddings
        # only validate the charlm paths when charlm use is actually requested
        if use_charlm and charlm_forward_file is not None and not os.path.exists(charlm_forward_file):
            raise FileNotFoundError(f"Could not find forward charlm file: {charlm_forward_file}")
        if use_charlm and charlm_backward_file is not None and not os.path.exists(charlm_backward_file):
            raise FileNotFoundError(f"Could not find backward charlm file: {charlm_backward_file}")

        # TODO: just pass around the args instead
        self.use_charlm = use_charlm
        self.charlm_forward_file = charlm_forward_file
        self.charlm_backward_file = charlm_backward_file
        self.lr = lr

        # Find loss function
        if loss_func == "ce":
            self.criterion = nn.CrossEntropyLoss()
            self.weighted_loss = False
            logger.debug("Using CE loss")
        elif loss_func == "weighted_bce":
            self.criterion = nn.BCEWithLogitsLoss()
            self.weighted_loss = True # used to add weights during train time.
            logger.debug("Using Weighted BCE loss")
        else:
            raise ValueError("Must enter a valid loss function (e.g. 'ce' or 'weighted_bce')")

    def build_model(self, label_decoder, upos_to_id, known_words, target_words, target_upos):
        # Build the LSTM classifier from the stored shape args and embeddings.
        # NOTE(review): self.output_dim is presumably set by BaseLemmaClassifierTrainer
        # before this is called — confirm in base_trainer.py
        return LemmaClassifierLSTM(self.model_args, self.output_dim, self.pt_embedding, label_decoder, upos_to_id, known_words, target_words, target_upos,
                                   use_charlm=self.use_charlm, charlm_forward_file=self.charlm_forward_file, charlm_backward_file=self.charlm_backward_file)
78
+
79
def build_argparse():
    """Construct the argument parser for the LSTM lemma classifier training script."""
    here = os.path.dirname(__file__)
    parser = argparse.ArgumentParser()
    parser.add_argument("--hidden_dim", type=int, default=256, help="Size of hidden layer")
    parser.add_argument('--wordvec_pretrain_file', type=str, default=os.path.join(here, "pretrain", "glove.pt"), help='Exact name of the pretrain file to read')
    parser.add_argument("--charlm", action='store_true', dest='use_charlm', default=False, help="Whether not to use the charlm embeddings")
    parser.add_argument('--charlm_shorthand', type=str, default=None, help="Shorthand for character-level language model training corpus.")
    parser.add_argument("--charlm_forward_file", type=str, default=os.path.join(here, "charlm_files", "1billion_forward.pt"), help="Path to forward charlm file")
    parser.add_argument("--charlm_backward_file", type=str, default=os.path.join(here, "charlm_files", "1billion_backwards.pt"), help="Path to backward charlm file")
    parser.add_argument("--upos_emb_dim", type=int, default=20, help="Dimension size for UPOS tag embeddings.")
    parser.add_argument("--use_attn", action='store_true', dest='attn', default=False, help='Whether to use multihead attention instead of LSTM.')
    parser.add_argument("--num_heads", type=int, default=0, help="Number of heads to use for multihead attention.")
    parser.add_argument("--save_name", type=str, default=os.path.join(here, "saved_models", "lemma_classifier_model_weighted_loss_charlm_new.pt"), help="Path to model save file")
    parser.add_argument("--lr", type=float, default=0.001, help="learning rate")
    parser.add_argument("--num_epochs", type=float, default=10, help="Number of training epochs")
    parser.add_argument("--batch_size", type=int, default=DEFAULT_BATCH_SIZE, help="Number of examples to include in each batch")
    parser.add_argument("--train_file", type=str, default=os.path.join(here, "data", "processed_ud_en", "combined_train.txt"), help="Full path to training file")
    parser.add_argument("--weighted_loss", action='store_true', dest='weighted_loss', default=False, help="Whether to use weighted loss during training.")
    parser.add_argument("--eval_file", type=str, default=os.path.join(here, "data", "processed_ud_en", "combined_dev.txt"), help="Path to dev file used to evaluate model for saves")
    parser.add_argument("--force", action='store_true', default=False, help='Whether or not to clobber an existing save file')
    return parser
99
+
100
def main(args=None, predefined_args=None):
    """
    Command line entry point for training a LemmaClassifierLSTM.

    predefined_args, when given, takes precedence over argv parsing.
    Returns the trainer after training completes.
    """
    parser = build_argparse()
    args = parser.parse_args(args) if predefined_args is None else predefined_args
    opts = vars(args)

    save_name = opts['save_name']
    train_file = opts['train_file']

    # refuse to clobber an existing model unless --force was given
    if os.path.exists(save_name) and not opts.get('force', False):
        raise FileExistsError(f"Save name {save_name} already exists. Training would override existing data. Aborting...")
    if not os.path.exists(train_file):
        raise FileNotFoundError(f"Training file {train_file} not found. Try again with a valid path.")

    logger.info("Running training script with the following args:")
    for name in opts:
        logger.info(f"{name}: {opts[name]}")
    logger.info("------------------------------------------------------------")

    trainer = LemmaClassifierTrainer(model_args=opts,
                                     embedding_file=opts['wordvec_pretrain_file'],
                                     use_charlm=opts['use_charlm'],
                                     charlm_forward_file=opts['charlm_forward_file'],
                                     charlm_backward_file=opts['charlm_backward_file'],
                                     lr=opts['lr'],
                                     loss_func="weighted_bce" if opts['weighted_loss'] else "ce",
                                     )

    trainer.train(
        num_epochs=opts['num_epochs'], save_name=save_name, args=opts, eval_file=opts['eval_file'], train_file=train_file
    )

    return trainer

if __name__ == "__main__":
    main()
147
+
stanza/stanza/models/lemma_classifier/train_many.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utils for training and evaluating multiple models simultaneously
3
+ """
4
+
5
+ import argparse
6
+ import os
7
+
8
+ from stanza.models.lemma_classifier.train_lstm_model import main as train_lstm_main
9
+ from stanza.models.lemma_classifier.train_transformer_model import main as train_tfmr_main
10
+ from stanza.models.lemma_classifier.constants import DEFAULT_BATCH_SIZE
11
+
12
+
13
# Hyperparameter sweep values: each key names the tunable selected via
# --change_param, mapped to the list of values tried by train_n_models.
change_params_map = {
    "lstm_layer": [16, 32, 64, 128, 256, 512],
    "upos_emb_dim": [5, 10, 20, 30],
    "training_size": [150, 300, 450, 600, 'full'],
} # TODO: Add attention
18
+
19
def train_n_models(num_models: int, base_path: str, args):
    """
    Train `num_models` LSTM lemma classifiers for each value of the hyperparameter
    named by args.change_param, saving each model under base_path.

    Args:
        num_models (int): how many models to train per configuration
        base_path (str): directory where model save files are written
        args: the parsed argparse namespace; mutated per run (save_name etc.)
    """
    if args.change_param == "lstm_layer":
        for num_layers in change_params_map.get("lstm_layer", None):
            for i in range(num_models):
                args.save_name = os.path.join(base_path, f"{num_layers}_{i}.pt")
                args.hidden_dim = num_layers
                train_lstm_main(predefined_args=args)

    if args.change_param == "upos_emb_dim":
        # BUGFIX: change_params_map is a dict and was previously *called* here
        # (change_params_map("upos_emb_dim", None)), raising TypeError
        for upos_dim in change_params_map.get("upos_emb_dim", None):
            for i in range(num_models):
                args.save_name = os.path.join(base_path, f"dim_{upos_dim}_{i}.pt")
                args.upos_emb_dim = upos_dim
                train_lstm_main(predefined_args=args)

    if args.change_param == "training_size":
        for size in change_params_map.get("training_size", None):
            for i in range(num_models):
                args.save_name = os.path.join(base_path, f"{size}_examples_{i}.pt")
                # NOTE(review): the training file does not vary with `size`;
                # presumably it should point at a size-specific split — confirm
                args.train_file = os.path.join(os.path.dirname(__file__), "data", "processed_ud_en", "combined_train.txt")
                train_lstm_main(predefined_args=args)

    if args.change_param == "base":
        for i in range(num_models):
            args.save_name = os.path.join(base_path, f"lstm_model_{i}.pt")
            args.weighted_loss = False
            train_lstm_main(predefined_args=args)

            # weighted_loss was just cleared above, so also train the weighted variant
            if not args.weighted_loss:
                args.weighted_loss = True
                args.save_name = os.path.join(base_path, f"lstm_model_wloss_{i}.pt")
                train_lstm_main(predefined_args=args)

    if args.change_param == "base_charlm":
        for i in range(num_models):
            args.save_name = os.path.join(base_path, f"lstm_charlm_{i}.pt")
            train_lstm_main(predefined_args=args)

    if args.change_param == "base_charlm_upos":
        for i in range(num_models):
            args.save_name = os.path.join(base_path, f"lstm_charlm_upos_{i}.pt")
            train_lstm_main(predefined_args=args)

    if args.change_param == "base_upos":
        for i in range(num_models):
            args.save_name = os.path.join(base_path, f"lstm_upos_{i}.pt")
            train_lstm_main(predefined_args=args)

    if args.change_param == "attn_model":
        for i in range(num_models):
            args.save_name = os.path.join(base_path, f"attn_model_{args.num_heads}_heads_{i}.pt")
            train_lstm_main(predefined_args=args)
82
+
83
def train_n_tfmrs(num_models: int, base_path: str, args):
    """
    Train `num_models` transformer baselines of the kind named by
    args.change_param ("bert" or "roberta"), each once with CE loss and
    once with weighted BCE loss.
    """
    if args.multi_train_type != "tfmr":
        return
    if args.change_param not in ("bert", "roberta"):
        return

    prefix = args.change_param
    for i in range(num_models):
        # one plain cross-entropy run, then one weighted-BCE run per model index
        for loss_fn, filename in (("ce", f"{prefix}_{i}.pt"), ("weighted_bce", f"{prefix}_wloss_{i}.pt")):
            args.save_name = os.path.join(base_path, filename)
            args.loss_fn = loss_fn
            train_tfmr_main(predefined_args=args)
110
+
111
+
112
def main():
    """Entry point: parse the shared arguments and dispatch to LSTM or transformer multi-training."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--hidden_dim", type=int, default=256, help="Size of hidden layer")
    parser.add_argument('--wordvec_pretrain_file', type=str, default=os.path.join(os.path.dirname(__file__), "pretrain", "glove.pt"), help='Exact name of the pretrain file to read')
    parser.add_argument("--charlm", action='store_true', dest='use_charlm', default=False, help="Whether not to use the charlm embeddings")
    parser.add_argument('--charlm_shorthand', type=str, default=None, help="Shorthand for character-level language model training corpus.")
    parser.add_argument("--charlm_forward_file", type=str, default=os.path.join(os.path.dirname(__file__), "charlm_files", "1billion_forward.pt"), help="Path to forward charlm file")
    parser.add_argument("--charlm_backward_file", type=str, default=os.path.join(os.path.dirname(__file__), "charlm_files", "1billion_backwards.pt"), help="Path to backward charlm file")
    parser.add_argument("--upos_emb_dim", type=int, default=20, help="Dimension size for UPOS tag embeddings.")
    parser.add_argument("--use_attn", action='store_true', dest='attn', default=False, help='Whether to use multihead attention instead of LSTM.')
    parser.add_argument("--num_heads", type=int, default=0, help="Number of heads to use for multihead attention.")
    parser.add_argument("--save_name", type=str, default=os.path.join(os.path.dirname(__file__), "saved_models", "lemma_classifier_model_weighted_loss_charlm_new.pt"), help="Path to model save file")
    parser.add_argument("--lr", type=float, default=0.001, help="learning rate")
    parser.add_argument("--num_epochs", type=float, default=10, help="Number of training epochs")
    parser.add_argument("--batch_size", type=int, default=DEFAULT_BATCH_SIZE, help="Number of examples to include in each batch")
    parser.add_argument("--train_file", type=str, default=os.path.join(os.path.dirname(__file__), "data", "processed_ud_en", "combined_train.txt"), help="Full path to training file")
    parser.add_argument("--weighted_loss", action='store_true', dest='weighted_loss', default=False, help="Whether to use weighted loss during training.")
    parser.add_argument("--eval_file", type=str, default=os.path.join(os.path.dirname(__file__), "data", "processed_ud_en", "combined_dev.txt"), help="Path to dev file used to evaluate model for saves")
    # Tfmr-specific args
    parser.add_argument("--model_type", type=str, default="roberta", help="Which transformer to use ('bert' or 'roberta')")
    parser.add_argument("--bert_model", type=str, default=None, help="Use a specific transformer instead of the default bert/roberta")
    parser.add_argument("--loss_fn", type=str, default="weighted_bce", help="Which loss function to train with (e.g. 'ce' or 'weighted_bce')")
    # Multi-model train args
    parser.add_argument("--multi_train_type", type=str, default="lstm", help="Whether you are attempting to multi-train an LSTM or transformer")
    parser.add_argument("--multi_train_count", type=int, default=5, help="Number of each model to build")
    parser.add_argument("--base_path", type=str, default=None, help="Path to start generating model type for.")
    parser.add_argument("--change_param", type=str, default=None, help="Which hyperparameter to change when training")

    args = parser.parse_args()

    # dispatch table instead of an if/elif chain
    trainers = {
        "lstm": train_n_models,
        "tfmr": train_n_tfmrs,
    }
    train_fn = trainers.get(args.multi_train_type)
    if train_fn is None:
        raise ValueError(f"Improper input {args.multi_train_type}")
    train_fn(num_models=args.multi_train_count,
             base_path=args.base_path,
             args=args)

if __name__ == "__main__":
    main()
stanza/stanza/models/lemma_classifier/train_transformer_model.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file contains code used to train a baseline transformer model to classify on a lemma of a particular token.
3
+ """
4
+
5
+ import argparse
6
+ import os
7
+ import sys
8
+ import logging
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.optim as optim
13
+
14
+ from stanza.models.lemma_classifier.base_trainer import BaseLemmaClassifierTrainer
15
+ from stanza.models.lemma_classifier.constants import DEFAULT_BATCH_SIZE
16
+ from stanza.models.lemma_classifier.transformer_model import LemmaClassifierWithTransformer
17
+ from stanza.models.common.utils import default_device
18
+
19
+ logger = logging.getLogger('stanza.lemmaclassifier')
20
+
21
class TransformerBaselineTrainer(BaseLemmaClassifierTrainer):
    """
    Trainer for a baseline transformer model that classifies token lemmas.

    The model architecture itself is defined in `transformer_model.py`
    in this directory.
    """

    def __init__(self, model_args: dict, transformer_name: str = "roberta", loss_func: str = "ce", lr: float = 0.001):
        """
        Creates the Trainer object.

        Args:
            model_args (dict): arguments forwarded to the underlying model
            transformer_name (str, optional): Which transformer to use for embeddings. Defaults to "roberta".
            loss_func (str, optional): Which loss function to use (either 'ce' or 'weighted_bce'). Defaults to "ce".
            lr (float, optional): Learning rate for the optimizer. Defaults to 0.001.

        Raises:
            ValueError: if loss_func is not one of the supported names
        """
        super().__init__()

        self.model_args = model_args

        # Select the training criterion; weighted_bce adds class weights at train time
        if loss_func == "ce":
            self.weighted_loss = False
            self.criterion = nn.CrossEntropyLoss()
        elif loss_func == "weighted_bce":
            self.weighted_loss = True  # weights are attached during training
            self.criterion = nn.BCEWithLogitsLoss()
        else:
            raise ValueError("Must enter a valid loss function (e.g. 'ce' or 'weighted_bce')")

        self.transformer_name = transformer_name
        self.lr = lr

    def set_layer_learning_rates(self, transformer_lr: float, mlp_lr: float) -> torch.optim:
        """
        Build an Adam optimizer with separate learning rates for the
        transformer layer and the MLP layer of the model.

        Returns (torch.optim): An Adam optimizer with per-layer learning rates.

        Currently unused - could be refactored into the parent class's train method,
        or the parent class could call a build_optimizer and this subclass would use the optimizer.
        """
        groups = {"transformer": [], "mlp": []}
        for param_name, param in self.model.named_parameters():
            if "transformer" in param_name:
                groups["transformer"].append(param)
            elif "mlp" in param_name:
                groups["mlp"].append(param)
        return optim.Adam([
            {"params": groups["transformer"], "lr": transformer_lr},
            {"params": groups["mlp"], "lr": mlp_lr},
        ])

    def build_model(self, label_decoder, upos_to_id, known_words, target_words, target_upos):
        # upos_to_id and known_words are unused by the transformer baseline;
        # they exist for interface parity with the LSTM trainer
        model = LemmaClassifierWithTransformer(model_args=self.model_args,
                                               output_dim=self.output_dim,
                                               transformer_name=self.transformer_name,
                                               label_decoder=label_decoder,
                                               target_words=target_words,
                                               target_upos=target_upos)
        return model
77
+
78
+
79
def main(args=None, predefined_args=None):
    """
    Parse command-line arguments (unless predefined_args is supplied) and
    train a transformer-based lemma classifier.

    Returns:
        The trainer after training completes.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--save_name", type=str, default=os.path.join(os.path.dirname(os.path.dirname(__file__)), "saved_models", "big_model_roberta_weighted_loss.pt"), help="Path to model save file")
    parser.add_argument("--num_epochs", type=float, default=10, help="Number of training epochs")
    parser.add_argument("--train_file", type=str, default=os.path.join(os.path.dirname(os.path.dirname(__file__)), "test_sets", "combined_train.txt"), help="Full path to training file")
    parser.add_argument("--model_type", type=str, default="roberta", help="Which transformer to use ('bert' or 'roberta')")
    parser.add_argument("--bert_model", type=str, default=None, help="Use a specific transformer instead of the default bert/roberta")
    parser.add_argument("--loss_fn", type=str, default="weighted_bce", help="Which loss function to train with (e.g. 'ce' or 'weighted_bce')")
    parser.add_argument("--batch_size", type=int, default=DEFAULT_BATCH_SIZE, help="Number of examples to include in each batch")
    parser.add_argument("--eval_file", type=str, default=os.path.join(os.path.dirname(os.path.dirname(__file__)), "test_sets", "combined_dev.txt"), help="Path to dev file used to evaluate model for saves")
    parser.add_argument("--lr", type=float, default=0.001, help="Learning rate for the optimizer.")
    parser.add_argument("--force", action='store_true', default=False, help='Whether or not to clobber an existing save file')

    if predefined_args is None:
        args = parser.parse_args(args)
    else:
        args = predefined_args

    save_name = args.save_name
    num_epochs = args.num_epochs
    train_file = args.train_file
    loss_fn = args.loss_fn
    eval_file = args.eval_file
    lr = args.lr

    args = vars(args)

    # Resolve the shorthand model_type into a concrete HF transformer name
    default_transformers = {'bert': 'bert-base-uncased', 'roberta': 'roberta-base'}
    model_type = args['model_type']
    if model_type in default_transformers:
        args['bert_model'] = default_transformers[model_type]
    elif model_type == 'transformer':
        if args['bert_model'] is None:
            raise ValueError("Need to specify a bert_model for model_type transformer!")
    else:
        raise ValueError("Unknown model type " + model_type)

    # Refuse to clobber an existing model unless --force was given
    if os.path.exists(save_name) and not args.get('force', False):
        raise FileExistsError(f"Save name {save_name} already exists. Training would override existing data. Aborting...")
    if not os.path.exists(train_file):
        raise FileNotFoundError(f"Training file {train_file} not found. Try again with a valid path.")

    logger.info("Running training script with the following args:")
    for key, value in args.items():
        logger.info(f"{key}: {value}")
    logger.info("------------------------------------------------------------")

    trainer = TransformerBaselineTrainer(model_args=args, transformer_name=args['bert_model'], loss_func=loss_fn, lr=lr)
    trainer.train(num_epochs=num_epochs, save_name=save_name, train_file=train_file, args=args, eval_file=eval_file)
    return trainer
128
+
129
+ if __name__ == "__main__":
130
+ main()
stanza/stanza/models/lemma_classifier/transformer_model.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import os
4
+ import sys
5
+ import logging
6
+
7
+ from transformers import AutoTokenizer, AutoModel
8
+ from typing import Mapping, List, Tuple, Any
9
+ from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pad_sequence
10
+ from stanza.models.common.bert_embedding import extract_bert_embeddings
11
+ from stanza.models.lemma_classifier.base_model import LemmaClassifier
12
+ from stanza.models.lemma_classifier.constants import ModelType
13
+
14
+ logger = logging.getLogger('stanza.lemmaclassifier')
15
+
16
class LemmaClassifierWithTransformer(LemmaClassifier):
    def __init__(self, model_args: dict, output_dim: int, transformer_name: str, label_decoder: Mapping, target_words: set, target_upos: set):
        """
        Model architecture:

        Use a transformer (BERT or RoBERTa) to extract contextual embedding over a sentence.
        Get the embedding for the word that is to be classified on, and feed the embedding
        as input to an MLP classifier that has 2 linear layers, and a prediction head.

        Args:
            model_args (dict): args for the model
            output_dim (int): Dimension of the output from the MLP
            transformer_name (str): name of the HF transformer to use
            label_decoder (dict): a map of the labels available to the model
            target_words (set(str)): a set of the words which might need lemmatization
            target_upos (set(str)): upos tags associated with the target words (stored by the base class)
        """
        super(LemmaClassifierWithTransformer, self).__init__(label_decoder, target_words, target_upos)
        self.model_args = model_args

        # Choose transformer
        self.transformer_name = transformer_name
        self.tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True, add_prefix_space=True)
        # registered as an *unsaved* module: get_save_dict below strips its weights
        # from the checkpoint, so they are re-downloaded from HF on load instead
        self.add_unsaved_module("transformer", AutoModel.from_pretrained(transformer_name))
        config = self.transformer.config

        # MLP input size matches the transformer's hidden size
        embedding_size = config.hidden_size

        # define an MLP layer
        self.mlp = nn.Sequential(
            nn.Linear(embedding_size, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )

    def get_save_dict(self):
        """
        Build a checkpoint dict; parameters belonging to unsaved modules
        (the pretrained transformer) are removed before returning.
        """
        save_dict = {
            "params": self.state_dict(),
            "label_decoder": self.label_decoder,
            "target_words": list(self.target_words),
            "target_upos": list(self.target_upos),
            "model_type": self.model_type().name,
            "args": self.model_args,
        }
        # drop the transformer weights - they are restored from HF at load time
        skipped = [k for k in save_dict["params"].keys() if self.is_unsaved_module(k)]
        for k in skipped:
            del save_dict["params"][k]
        return save_dict

    def convert_tags(self, upos_tags: List[List[str]]):
        # this model does not use upos tags; returning None keeps the
        # interface shared with models that do
        return None

    def forward(self, idx_positions: List[int], sentences: List[List[str]], upos_tags: List[List[int]]):
        """
        Computes the forward pass of the transformer baselines

        Args:
            idx_positions (List[int]): A list of the position index of the target token for lemmatization classification in each sentence.
            sentences (List[List[str]]): A list of the token-split sentences of the input data.
            upos_tags (List[List[int]]): A list of the upos tags for each token in every sentence - not used in this model, here for compatibility

        Returns:
            torch.tensor: Output logits of the neural network, where the shape is (n, output_size) where n is the number of sentences.
        """
        device = next(self.transformer.parameters()).device
        bert_embeddings = extract_bert_embeddings(self.transformer_name, self.tokenizer, self.transformer, sentences, device,
                                                  keep_endpoints=False, num_layers=1, detach=True)
        # pick out the embedding of the target token in each sentence
        embeddings = [emb[idx] for idx, emb in zip(idx_positions, bert_embeddings)]
        # NOTE(review): the trailing [:, :, 0] looks like it squeezes out the
        # single layer requested via num_layers=1 (i.e. each embedding is
        # (hidden, num_layers)) - confirm against extract_bert_embeddings
        embeddings = torch.stack(embeddings, dim=0)[:, :, 0]
        # pass to the MLP
        output = self.mlp(embeddings)
        return output

    def model_type(self):
        # used by the saving/loading machinery to pick the right class
        return ModelType.TRANSFORMER
stanza/stanza/models/lemma_classifier/utils.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+ import json
3
+ import logging
4
+ import os
5
+ import random
6
+ from typing import List, Tuple, Any, Mapping
7
+
8
+ import stanza
9
+ import torch
10
+
11
+ from stanza.models.lemma_classifier.constants import DEFAULT_BATCH_SIZE
12
+
13
+ logger = logging.getLogger('stanza.lemmaclassifier')
14
+
15
class Dataset:
    def __init__(self, data_path: str, batch_size: int =DEFAULT_BATCH_SIZE, get_counts: bool = False, label_decoder: dict = None, shuffle: bool = True):
        """
        Loads a JSON data file and prepares it for batched iteration over
        tokenized sentences, target-token indices, UPOS IDs, and labels.

        Args:
            data_path (str): Path to data file, containing tokenized text sentences, token index and true label for token lemma on each line.
            batch_size (int): Size of each batch of examples
            get_counts (optional, bool): Whether there should be a map of the label index to counts
            label_decoder (optional, dict): an existing label-to-ID map to extend (a copy is made); new labels found in the file get fresh IDs
            shuffle (optional, bool): whether __iter__ shuffles examples each pass

        Attributes set on the instance:
            sentences (List[List[str]]): token-split sentences
            indices (List[int]): position of the target token in each sentence
            upos_ids (List[List[int]]): UPOS IDs per token (padded later, not a tensor)
            labels (List[int]): label ID for each target token's lemma
            counts (Counter): label ID -> count (populated only if get_counts)
            label_decoder (Mapping[str, int]): label -> ID
            upos_to_id (Mapping[str, int]): UPOS tag -> ID
            target_upos: the 'upos' entry of the input JSON
            known_words (List[str]): all lowercased words seen, sorted
            target_words (set(str)): lowercased words at the target positions
        """

        if data_path is None or not os.path.exists(data_path):
            raise FileNotFoundError(f"Data file {data_path} could not be found.")

        if label_decoder is None:
            label_decoder = {}
        else:
            # if labels in the test set aren't in the original model,
            # the model will never predict those labels,
            # but we can still use those labels in a confusion matrix
            label_decoder = dict(label_decoder)

        logger.debug("Final label decoder: %s Should be strings to ints", label_decoder)

        # words which we are analyzing
        target_words = set()

        # all known words in the dataset, not just target words
        known_words = set()

        with open(data_path, "r+", encoding="utf-8") as fin:
            sentences, indices, labels, upos_ids, counts, upos_to_id = [], [], [], [], Counter(), {}

            input_json = json.load(fin)
            sentences_data = input_json['sentences']
            self.target_upos = input_json['upos']

            for idx, sentence in enumerate(sentences_data):
                # TODO Could replace this with sentence.values(), but need to know if Stanza requires Python 3.7 or later for backward compatibility reasons
                words, target_idx, upos_tags, label = sentence.get("words"), sentence.get("index"), sentence.get("upos_tags"), sentence.get("lemma")
                if None in [words, target_idx, upos_tags, label]:
                    raise ValueError(f"Expected data to be complete but found a null value in sentence {idx}: {sentence}")

                # assign IDs in first-seen order so the mapping is reproducible
                label_id = label_decoder.get(label, None)
                if label_id is None:
                    label_decoder[label] = len(label_decoder) # create a new ID for the unknown label

                converted_upos_tags = [] # convert upos tags to upos IDs
                for upos_tag in upos_tags:
                    if upos_tag not in upos_to_id:
                        upos_to_id[upos_tag] = len(upos_to_id) # create a new ID for the unknown UPOS tag
                    converted_upos_tags.append(upos_to_id[upos_tag])

                sentences.append(words)
                indices.append(target_idx)
                upos_ids.append(converted_upos_tags)
                labels.append(label_decoder[label])

                if get_counts:
                    counts[label_decoder[label]] += 1

                target_words.add(words[target_idx])
                known_words.update(words)

        self.sentences = sentences
        self.indices = indices
        self.upos_ids = upos_ids
        self.labels = labels

        self.counts = counts
        self.label_decoder = label_decoder
        self.upos_to_id = upos_to_id

        self.batch_size = batch_size
        self.shuffle = shuffle

        # lowercase both word collections for case-insensitive lookup later
        self.known_words = [x.lower() for x in sorted(known_words)]
        self.target_words = set(x.lower() for x in target_words)

    def __len__(self):
        """
        Number of batches, rounded up to nearest batch
        """
        return len(self.sentences) // self.batch_size + (len(self.sentences) % self.batch_size > 0)

    def __iter__(self):
        """
        Yield batches of (sentences, index tensor, upos ID lists, label tensor).

        When self.shuffle is set, the example order is re-randomized on every
        fresh iteration; upos_ids stay as plain lists so they can be padded later.
        """
        num_sentences = len(self.sentences)
        indices = list(range(num_sentences))
        if self.shuffle:
            random.shuffle(indices)
        for i in range(self.__len__()):
            batch_start = self.batch_size * i
            # the final batch may be smaller than batch_size
            batch_end = min(batch_start + self.batch_size, num_sentences)

            batch_sentences = [self.sentences[x] for x in indices[batch_start:batch_end]]
            batch_indices = torch.tensor([self.indices[x] for x in indices[batch_start:batch_end]])
            batch_upos_ids = [self.upos_ids[x] for x in indices[batch_start:batch_end]]
            batch_labels = torch.tensor([self.labels[x] for x in indices[batch_start:batch_end]])
            yield batch_sentences, batch_indices, batch_upos_ids, batch_labels
123
+
124
def extract_unknown_token_indices(tokenized_indices: torch.tensor, unknown_token_idx: int) -> List[int]:
    """
    Find every position in `tokenized_indices` whose value is `unknown_token_idx`.

    Args:
        tokenized_indices (torch.tensor): A tensor filled with tokenized indices of words that have been mapped to vector indices.
        unknown_token_idx (int): The special index for which unknown tokens are marked in the word vectors.

    Returns:
        List[int]: positions in `tokenized_indices` holding `unknown_token_idx`
    """
    matches = []
    for position, token_index in enumerate(tokenized_indices):
        if token_index == unknown_token_idx:
            matches.append(position)
    return matches
136
+
137
+
138
def get_device():
    """
    Get the device to run computations on.

    Preference order: CUDA, then Apple MPS, then CPU.

    Bug fixes vs the previous version: `torch.cuda.is_available` was
    referenced without calling it (a function object is always truthy),
    and the following independent if/else unconditionally overwrote the
    CUDA choice with mps or cpu, so CUDA was never actually selected.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")
150
+
151
+
152
def round_up_to_multiple(number, multiple):
    """
    Round `number` up to the nearest multiple of `multiple`.

    Args:
        number: the value to round
        multiple: the step to round up to

    Returns:
        The smallest multiple of `multiple` that is >= `number`.

    Raises:
        ValueError: if `multiple` is zero.

    Bug fix: the previous version *returned an error string* when
    multiple == 0, which would silently poison arithmetic at the call
    site; raising makes the misuse explicit.
    """
    if multiple == 0:
        raise ValueError("The second number (multiple) cannot be zero.")

    # Calculate the remainder when dividing the number by the multiple
    remainder = number % multiple

    # If remainder is non-zero, round up to the next multiple
    if remainder != 0:
        return number + (multiple - remainder)
    return number  # already a multiple - no rounding needed
166
+
167
+
168
def main():
    """
    Smoke test: load the default dev set and report basic statistics.

    Bug fix: the previous version called `load_dataset`, which no longer
    exists in this module - loading is done by the Dataset class above.
    """
    default_test_path = os.path.join(os.path.dirname(__file__), "test_sets", "processed_ud_en", "combined_dev.txt") # get the GUM stuff
    dataset = Dataset(default_test_path, get_counts=True)
    logger.info("Loaded %d sentences with label counts %s", len(dataset.sentences), dataset.counts)
171
+
172
+ if __name__ == "__main__":
173
+ main()
stanza/stanza/models/mwt/character_classifier.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Classify characters based on an LSTM with learned character representations
3
+ """
4
+
5
+ import logging
6
+
7
+ import torch
8
+ from torch import nn
9
+
10
+ import stanza.models.common.seq2seq_constant as constant
11
+
12
+ logger = logging.getLogger('stanza')
13
+
14
class CharacterClassifier(nn.Module):
    """
    A bidirectional LSTM over learned character embeddings which emits,
    for every character position, 2 logits (used by the MWT trainer to
    decide whether to cut the token at that character).
    """
    def __init__(self, args):
        """
        Args:
            args (dict): must contain 'vocab_size', 'emb_dim', 'hidden_dim',
                'num_layers', and 'dropout'; 'emb_dropout' is optional
        """
        super().__init__()

        self.vocab_size = args['vocab_size']
        self.emb_dim = args['emb_dim']
        self.hidden_dim = args['hidden_dim']
        self.nlayers = args['num_layers'] # lstm encoder layers
        self.pad_token = constant.PAD_ID
        self.enc_hidden_dim = self.hidden_dim // 2 # since it is bidirectional

        # two logits per character: cut / don't cut
        self.num_outputs = 2

        self.args = args

        self.emb_dropout = args.get('emb_dropout', 0.0)
        self.emb_drop = nn.Dropout(self.emb_dropout)
        self.dropout = args['dropout']

        # padding_idx = PAD_ID so pad positions embed to zero
        self.embedding = nn.Embedding(self.vocab_size, self.emb_dim, self.pad_token)
        self.input_dim = self.emb_dim
        # LSTM dropout only applies between layers, hence the nlayers > 1 guard
        self.encoder = nn.LSTM(self.input_dim, self.enc_hidden_dim, self.nlayers, \
                bidirectional=True, batch_first=True, dropout=self.dropout if self.nlayers > 1 else 0)

        # per-position classification head over the concatenated directions
        self.output_layer = nn.Sequential(
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.num_outputs))

    def encode(self, enc_inputs, lens):
        """ Encode source sequence. """
        # pack so the LSTM skips pad positions; final hidden states are unused
        packed_inputs = nn.utils.rnn.pack_padded_sequence(enc_inputs, lens, batch_first=True)
        packed_h_in, (hn, cn) = self.encoder(packed_inputs)
        return packed_h_in

    def embed(self, src, src_mask):
        # the input data could have characters outside the known range
        # of characters in cases where the vocabulary was temporarily
        # expanded (note that this model does nothing with those chars)
        embed_src = src.clone()
        embed_src[embed_src >= self.vocab_size] = constant.UNK_ID
        enc_inputs = self.emb_drop(self.embedding(embed_src))
        batch_size = enc_inputs.size(0)
        # NOTE(review): lengths are computed as the count of positions equal
        # to PAD_ID in the mask, which presumes src_mask marks *valid*
        # positions with PAD_ID (same convention as the mwt Trainer) - confirm
        # against the data loader
        src_lens = list(src_mask.data.eq(self.pad_token).long().sum(1))
        return enc_inputs, batch_size, src_lens, src_mask

    def forward(self, src, src_mask):
        """
        Return per-character logits of shape (batch, seq_len, 2).
        """
        enc_inputs, batch_size, src_lens, src_mask = self.embed(src, src_mask)
        encoded = self.encode(enc_inputs, src_lens)
        # unpack back to a padded (batch, seq_len, hidden) tensor
        encoded, _ = nn.utils.rnn.pad_packed_sequence(encoded, batch_first=True)
        logits = self.output_layer(encoded)
        return logits
stanza/stanza/models/mwt/trainer.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A trainer class to handle training and testing of models.
3
+ """
4
+
5
+ import sys
6
+ import numpy as np
7
+ from collections import Counter
8
+ import logging
9
+ import torch
10
+ from torch import nn
11
+ import torch.nn.init as init
12
+
13
+ import stanza.models.common.seq2seq_constant as constant
14
+ from stanza.models.common.trainer import Trainer as BaseTrainer
15
+ from stanza.models.common.seq2seq_model import Seq2SeqModel
16
+ from stanza.models.common import utils, loss
17
+ from stanza.models.mwt.character_classifier import CharacterClassifier
18
+ from stanza.models.mwt.vocab import Vocab
19
+
20
+ logger = logging.getLogger('stanza')
21
+
22
def unpack_batch(batch, device):
    """ Unpack a batch from the data loader.

    Moves the first four entries (the tensors) to the given device,
    leaving None entries alone; the last two entries are the original
    text and the original ordering indices.
    """
    tensors = []
    for item in batch[:4]:
        tensors.append(item if item is None else item.to(device))
    return tensors, batch[4], batch[5]
28
+
29
class Trainer(BaseTrainer):
    """ A trainer for training models. """
    def __init__(self, args=None, vocab=None, emb_matrix=None, model_file=None, device=None):
        """
        Either load a saved trainer from model_file, or build a fresh one
        from args/vocab.  The model is a CharacterClassifier when
        force_exact_pieces is set, otherwise a Seq2SeqModel; with
        dict_only there is no neural model at all.
        """
        if model_file is not None:
            # load from file
            self.load(model_file)
        else:
            self.args = args
            if args['dict_only']:
                self.model = None
            elif args.get('force_exact_pieces', False):
                self.model = CharacterClassifier(args)
            else:
                self.model = Seq2SeqModel(args, emb_matrix=emb_matrix)
            self.vocab = vocab
            self.expansion_dict = dict()
        if not self.args['dict_only']:
            self.model = self.model.to(device)
            if self.args.get('force_exact_pieces', False):
                self.crit = nn.CrossEntropyLoss()
            else:
                self.crit = loss.SequenceLoss(self.vocab.size).to(device)
            self.optimizer = utils.get_optimizer(self.args['optim'], self.model, self.args['lr'])

    def update(self, batch, eval=False):
        """
        Run one forward (and, unless eval, backward + optimizer) step on a
        batch; returns the scalar loss value.
        """
        device = next(self.model.parameters()).device
        # ignore the original text when training
        # can try to learn the correct values, even if we eventually
        # copy directly from the original text
        inputs, _, orig_idx = unpack_batch(batch, device)
        src, src_mask, tgt_in, tgt_out = inputs

        if eval:
            self.model.eval()
        else:
            self.model.train()
            self.optimizer.zero_grad()
        if self.args.get('force_exact_pieces', False):
            # character classifier path: per-character cut/no-cut loss,
            # packed so pad positions do not contribute
            log_probs = self.model(src, src_mask)
            src_lens = list(src_mask.data.eq(constant.PAD_ID).long().sum(1))
            packed_output = nn.utils.rnn.pack_padded_sequence(log_probs, src_lens, batch_first=True)
            packed_tgt = nn.utils.rnn.pack_padded_sequence(tgt_in, src_lens, batch_first=True)
            # NOTE: the local name `loss` shadows the imported `loss` module
            # within this method
            loss = self.crit(packed_output.data, packed_tgt.data)
        else:
            # seq2seq path: token-level sequence loss over the full vocab
            log_probs, _ = self.model(src, src_mask, tgt_in)
            loss = self.crit(log_probs.view(-1, self.vocab.size), tgt_out.view(-1))
        loss_val = loss.data.item()
        if eval:
            return loss_val

        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args['max_grad_norm'])
        self.optimizer.step()
        return loss_val

    def predict(self, batch, unsort=True, never_decode_unk=False, vocab=None):
        """
        Predict the expansion of each MWT in the batch; returns a list of
        strings (tokens separated by spaces within each expansion).
        """
        if vocab is None:
            vocab = self.vocab

        device = next(self.model.parameters()).device
        inputs, orig_text, orig_idx = unpack_batch(batch, device)
        src, src_mask, tgt, tgt_mask = inputs

        self.model.eval()
        batch_size = src.size(0)  # NOTE(review): currently unused
        if self.args.get('force_exact_pieces', False):
            # classifier path: insert a space wherever logit 1 beats logit 0
            log_probs = self.model(src, src_mask)
            cuts = log_probs[:, :, 1] > log_probs[:, :, 0]
            src_lens = list(src_mask.data.eq(constant.PAD_ID).long().sum(1))
            pred_tokens = []
            for src_ids, cut, src_len in zip(src, cuts, src_lens):
                src_chars = vocab.unmap(src_ids)
                pred_seq = []
                # range skips the sequence start/end marker positions
                for char_idx in range(1, src_len-1):
                    if cut[char_idx]:
                        pred_seq.append(' ')
                    pred_seq.append(src_chars[char_idx])
                pred_seq = "".join(pred_seq).strip()
                pred_tokens.append(pred_seq)
        else:
            # seq2seq path: beam-decode character sequences
            preds, _ = self.model.predict(src, src_mask, self.args['beam_size'], never_decode_unk=never_decode_unk)
            pred_seqs = [vocab.unmap(ids) for ids in preds] # unmap to tokens
            pred_seqs = utils.prune_decoded_seqs(pred_seqs)

            pred_tokens = ["".join(seq) for seq in pred_seqs] # join chars to be tokens
            # if any tokens are predicted to expand to blank,
            # that is likely an error. use the original text
            # this originally came up with the Spanish model turning 's' into a blank
            # furthermore, if there are no spaces predicted by the seq2seq,
            # might as well use the original in case the seq2seq went crazy
            # this particular error came up training a Hebrew MWT
            pred_tokens = [x if x and ' ' in x else y for x, y in zip(pred_tokens, orig_text)]
        if unsort:
            # restore the caller's original example order
            pred_tokens = utils.unsort(pred_tokens, orig_idx)
        return pred_tokens

    def train_dict(self, pairs):
        """ Train a MWT expander given training word-expansion pairs. """
        # accumulate counter
        ctr = Counter()
        ctr.update([(p[0], p[1]) for p in pairs])
        seen = set()
        # find the most frequent mappings: most_common order means the first
        # expansion recorded for each word is its most frequent one
        for p, _ in ctr.most_common():
            w, l = p
            if w not in seen and w != l:
                self.expansion_dict[w] = l
                seen.add(w)
        return

    def dict_expansion(self, word):
        """
        Check the expansion dictionary for the word along with a couple common lowercasings of the word

        (Leadingcase and UPPERCASE)

        Returns None if no expansion is known.
        """
        expansion = self.expansion_dict.get(word)
        if expansion is not None:
            return expansion

        if word.isupper():
            expansion = self.expansion_dict.get(word.lower())
            if expansion is not None:
                return expansion.upper()

        if word[0].isupper() and word[1:].islower():
            expansion = self.expansion_dict.get(word.lower())
            if expansion is not None:
                return expansion[0].upper() + expansion[1:]

        # could build a truecasing model of some kind to handle cRaZyCaSe...
        # but that's probably too much effort
        return None

    def predict_dict(self, words):
        """ Predict a list of expansions given words.

        Words without a known expansion are passed through unchanged.
        """
        expansions = []
        for w in words:
            expansion = self.dict_expansion(w)
            if expansion is not None:
                expansions.append(expansion)
            else:
                expansions.append(w)
        return expansions

    def ensemble(self, cands, other_preds):
        """ Ensemble the dict with statistical model predictions.

        The dictionary expansion wins whenever one exists; otherwise the
        model's prediction is used.
        """
        expansions = []
        assert len(cands) == len(other_preds)
        for c, pred in zip(cands, other_preds):
            expansion = self.dict_expansion(c)
            if expansion is not None:
                expansions.append(expansion)
            else:
                expansions.append(pred)
        return expansions

    def save(self, filename):
        """
        Serialize model weights, the expansion dict, vocab, and config;
        failures are logged but deliberately not fatal.
        """
        params = {
            'model': self.model.state_dict() if self.model is not None else None,
            'dict': self.expansion_dict,
            'vocab': self.vocab.state_dict(),
            'config': self.args
        }
        try:
            torch.save(params, filename, _use_new_zipfile_serialization=False)
            logger.info("Model saved to {}".format(filename))
        except BaseException:
            logger.warning("Saving failed... continuing anyway.")

    def load(self, filename):
        """
        Restore a trainer previously written by save(); raises on failure.
        """
        try:
            checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
        except BaseException:
            logger.error("Cannot load model from {}".format(filename))
            raise
        self.args = checkpoint['config']
        self.expansion_dict = checkpoint['dict']
        if not self.args['dict_only']:
            if self.args.get('force_exact_pieces', False):
                self.model = CharacterClassifier(self.args)
            else:
                self.model = Seq2SeqModel(self.args)
            # could remove strict=False after rebuilding all models,
            # or could switch to 1.6.0 torch with the buffer in seq2seq persistent=False
            self.model.load_state_dict(checkpoint['model'], strict=False)
        else:
            self.model = None
        self.vocab = Vocab.load_state_dict(checkpoint['vocab'])
+
stanza/stanza/models/mwt/vocab.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+
3
+ from stanza.models.common.vocab import BaseVocab
4
+ import stanza.models.common.seq2seq_constant as constant
5
+
6
class Vocab(BaseVocab):
    """ Character vocab for the MWT expander, built from (src, tgt) pairs. """

    def build_vocab(self):
        """ Count every character in both sides of the training pairs and
        order units by descending frequency (after the special prefix). """
        char_counts = Counter("".join(src + tgt for src, tgt in self.data))
        # stable sort keeps first-seen order among equally frequent chars
        ordered_units = sorted(char_counts, key=char_counts.get, reverse=True)
        self._id2unit = constant.VOCAB_PREFIX + ordered_units
        self._unit2id = {unit: unit_id for unit_id, unit in enumerate(self._id2unit)}

    def add_unit(self, unit):
        """ Append a previously unseen unit; no-op if already present. """
        if unit in self._unit2id:
            return
        self._unit2id[unit] = len(self._id2unit)
        self._id2unit.append(unit)
stanza/stanza/models/ner/vocab.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter, OrderedDict
2
+
3
+ from stanza.models.common.vocab import BaseVocab, BaseMultiVocab, CharVocab, CompositeVocab
4
+ from stanza.models.common.vocab import VOCAB_PREFIX
5
+ from stanza.models.common.pretrain import PretrainedWordVocab
6
+ from stanza.models.pos.vocab import WordVocab
7
+
8
class TagVocab(BaseVocab):
    """ A vocab for the output tag sequence. """
    def build_vocab(self):
        # count tags at self.idx across all sentences, then order by
        # descending frequency (stable sort preserves first-seen tie order)
        tag_counts = Counter(w[self.idx] for sent in self.data for w in sent)
        self._id2unit = VOCAB_PREFIX + sorted(tag_counts, key=tag_counts.get, reverse=True)
        self._unit2id = {tag: tag_id for tag_id, tag in enumerate(self._id2unit)}
15
+
16
def convert_tag_vocab(state_dict):
    """
    Convert a saved TagVocab state dict into an equivalent CompositeVocab.

    Raises AssertionError if the state dict has 'lower' set, or if the
    rebuilt vocab does not reproduce the original unit list exactly.
    """
    if state_dict['lower']:
        raise AssertionError("Did not expect an NER vocab with 'lower' set to True")
    # drop the special prefix entries; CompositeVocab re-adds its own
    items = state_dict['_id2unit'][len(VOCAB_PREFIX):]
    # this looks silly, but the vocab builder treats this as words with multiple fields
    # (we set it to look for field 0 with idx=0)
    # and then the label field is expected to be a list or tuple of items
    items = [[[[x]]] for x in items]
    vocab = CompositeVocab(data=items, lang=state_dict['lang'], idx=0, sep=None)
    # sanity check: the conversion must be lossless and order-preserving
    if len(vocab._id2unit[0]) != len(state_dict['_id2unit']):
        raise AssertionError("Failed to construct a new vocab of the same length as the original")
    if vocab._id2unit[0] != state_dict['_id2unit']:
        raise AssertionError("Failed to construct a new vocab in the same order as the original")
    return vocab
30
+
31
class MultiVocab(BaseMultiVocab):
    def state_dict(self):
        """ Also save a vocab name to class name mapping in state dict. """
        state = OrderedDict()
        key2class = OrderedDict()
        for vocab_name, vocab in self._vocabs.items():
            state[vocab_name] = vocab.state_dict()
            key2class[vocab_name] = type(vocab).__name__
        state['_key2class'] = key2class
        return state

    @classmethod
    def load_state_dict(cls, state_dict):
        """ Rebuild each sub-vocab using the saved class-name mapping;
        legacy TagVocab entries are converted to CompositeVocab. """
        loaders = {
            'CharVocab': CharVocab.load_state_dict,
            'PretrainedWordVocab': PretrainedWordVocab.load_state_dict,
            'TagVocab': convert_tag_vocab,
            'CompositeVocab': CompositeVocab.load_state_dict,
            'WordVocab': WordVocab.load_state_dict,
        }
        new = cls()
        assert '_key2class' in state_dict, "Cannot find class name mapping in state dict!"
        key2class = state_dict.pop('_key2class')
        for vocab_name, sub_state in state_dict.items():
            new[vocab_name] = loaders[key2class[vocab_name]](sub_state)
        return new
56
+
stanza/stanza/models/pos/__init__.py ADDED
File without changes
stanza/stanza/models/pos/build_xpos_vocab_factory.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from collections import defaultdict
3
+ import logging
4
+ import os
5
+ import re
6
+ import sys
7
+ from zipfile import ZipFile
8
+
9
+ from stanza.models.common.constant import treebank_to_short_name
10
+ from stanza.models.pos.xpos_vocab_utils import DEFAULT_KEY, choose_simplest_factory, XPOSType
11
+ from stanza.models.common.doc import *
12
+ from stanza.utils.conll import CoNLL
13
+ from stanza.utils import default_paths
14
+
15
+ SHORTNAME_RE = re.compile("[a-z-]+_[a-z0-9]+")
16
+ DATA_DIR = default_paths.get_default_paths()['POS_DATA_DIR']
17
+
18
+ logger = logging.getLogger('stanza')
19
+
20
def get_xpos_factory(shorthand, fn):
    """Pick the simplest XPOS vocab description for one treebank.

    shorthand: the dataset shorthand (e.g. en_ewt) used to locate the
      prepared training file under DATA_DIR
    fn: the treebank's full name, used only in the error message

    Reads the training data either from a .conllu file or, failing that,
    from a zip of conllu files (using the first file which actually has
    xpos tags), then delegates to choose_simplest_factory.

    Raises FileNotFoundError if no prepared training data exists, and
    ValueError if a zip was found but none of its files had xpos tags.
    """
    logger.info('Resolving vocab option for {}...'.format(shorthand))
    doc = None
    train_file = os.path.join(DATA_DIR, '{}.train.in.conllu'.format(shorthand))
    if os.path.exists(train_file):
        doc = CoNLL.conll2doc(input_file=train_file)
    else:
        zip_file = os.path.join(DATA_DIR, '{}.train.in.zip'.format(shorthand))
        if os.path.exists(zip_file):
            with ZipFile(zip_file) as zin:
                for train_file in zin.namelist():
                    doc = CoNLL.conll2doc(input_file=train_file, zip_file=zip_file)
                    if any(word.xpos for sentence in doc.sentences for word in sentence.words):
                        break
                else:
                    raise ValueError('Found training data in {}, but none of the files contained had xpos'.format(zip_file))

    if doc is None:
        # without the training file, there's not much we can do
        raise FileNotFoundError('Training data for {} not found. To generate the XPOS vocabulary '
                                'for this treebank properly, please run the following command first:\n'
                                ' python3 stanza/utils/datasets/prepare_pos_treebank.py {}'.format(fn, fn))

    data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
    return choose_simplest_factory(data, shorthand)
47
+
48
def main():
    """Regenerate stanza/models/pos/xpos_vocab_factory.py from prepared treebanks.

    For each treebank we find the XPOS vocab configuration that minimizes the
    total number of classes the tagger must predict, then emit a python module
    mapping shorthand -> XPOSDescription along with the xpos_vocab_factory
    function itself.

    Fix: removed the unused local variable `first`, which was set but never read.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--treebanks', type=str, default=DATA_DIR, help="Treebanks to process - directory with processed datasets or a file with a list")
    parser.add_argument('--output_file', type=str, default="stanza/models/pos/xpos_vocab_factory.py", help="Where to write the results")
    args = parser.parse_args()

    output_file = args.output_file
    if os.path.isdir(args.treebanks):
        # if the path is a directory of datasets (which is the default if --treebanks is not set)
        # we use those datasets to prepare the xpos factories
        treebanks = os.listdir(args.treebanks)
        treebanks = [x.split(".", maxsplit=1)[0] for x in treebanks]
        treebanks = sorted(set(treebanks))
    elif os.path.exists(args.treebanks):
        # maybe it's a file with a list of names
        with open(args.treebanks) as fin:
            treebanks = sorted(set([x.strip() for x in fin.readlines() if x.strip()]))
    else:
        raise ValueError("Cannot figure out which treebanks to use. Please set the --treebanks parameter")

    logger.info("Processing the following treebanks: %s" % " ".join(treebanks))

    # keep both the full name (for error messages) and the shorthand (for keys)
    shorthands = []
    fullnames = []
    for treebank in treebanks:
        fullnames.append(treebank)
        if SHORTNAME_RE.match(treebank):
            shorthands.append(treebank)
        else:
            shorthands.append(treebank_to_short_name(treebank))

    # For each treebank, we would like to find the XPOS Vocab configuration that minimizes
    # the number of total classes needed to predict by all tagger classifiers. This is
    # achieved by enumerating different options of separators that different treebanks might
    # use, and comparing that to treating the XPOS tags as separate categories (using a
    # WordVocab).
    mapping = defaultdict(list)
    for sh, fn in zip(shorthands, fullnames):
        factory = get_xpos_factory(sh, fn)
        mapping[factory].append(sh)
        # a couple datasets are known under an alternate shorthand as well
        if sh == 'zh-hans_gsdsimp':
            mapping[factory].append('zh_gsdsimp')
        elif sh == 'no_bokmaal':
            mapping[factory].append('nb_bokmaal')

    # en_test is a tiny dataset used by the unittests
    mapping[DEFAULT_KEY].append('en_test')

    # Generate code. This takes the XPOS vocabulary classes selected above, and generates the
    # actual factory class as seen in models.pos.xpos_vocab_factory.
    with open(output_file, 'w') as f:
        max_len = max(max(len(x) for x in mapping[key]) for key in mapping)
        print('''# This is the XPOS factory method generated automatically from stanza.models.pos.build_xpos_vocab_factory.
# Please don't edit it!

import logging

from stanza.models.pos.vocab import WordVocab, XPOSVocab
from stanza.models.pos.xpos_vocab_utils import XPOSDescription, XPOSType, build_xpos_vocab, choose_simplest_factory

# using a sublogger makes it easier to test in the unittests
logger = logging.getLogger('stanza.models.pos.xpos_vocab_factory')

XPOS_DESCRIPTIONS = {''', file=f)

        for key_idx, key in enumerate(mapping):
            if key_idx > 0:
                print(file=f)
            for shorthand in sorted(mapping[key]):
                # +2 to max_len for the ''
                # this format string is left justified (either would be okay, probably)
                if key.sep is None:
                    sep = 'None'
                else:
                    sep = "'%s'" % key.sep
                print(("    {:%ds}: XPOSDescription({}, {})," % (max_len+2)).format("'%s'" % shorthand, key.xpos_type, sep), file=f)

        print('''}

def xpos_vocab_factory(data, shorthand):
    if shorthand not in XPOS_DESCRIPTIONS:
        logger.warning("%s is not a known dataset. Examining the data to choose which xpos vocab to use", shorthand)
    desc = choose_simplest_factory(data, shorthand)
    if shorthand in XPOS_DESCRIPTIONS:
        if XPOS_DESCRIPTIONS[shorthand] != desc:
            # log instead of throw
            # otherwise, updating datasets would be unpleasant
            logger.error("XPOS tagset in %s has apparently changed! Was %s, is now %s", shorthand, XPOS_DESCRIPTIONS[shorthand], desc)
    else:
        logger.warning("Chose %s for the xpos factory for %s", desc, shorthand)
    return build_xpos_vocab(desc, data, shorthand)
''', file=f)

    logger.info('Done!')
142
+
143
# standard script entry point: regenerate the xpos vocab factory module
if __name__ == "__main__":
    main()
stanza/stanza/models/pos/data.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import logging
3
+ import copy
4
+ import torch
5
+ from collections import namedtuple
6
+
7
+ from torch.utils.data import DataLoader as DL
8
+ from torch.utils.data.sampler import Sampler
9
+ from torch.nn.utils.rnn import pad_sequence
10
+
11
+ from stanza.models.common.bert_embedding import filter_data, needs_length_filter
12
+ from stanza.models.common.data import map_to_ids, get_long_tensor, get_float_tensor, sort_all
13
+ from stanza.models.common.vocab import PAD_ID, VOCAB_PREFIX, CharVocab
14
+ from stanza.models.pos.vocab import WordVocab, XPOSVocab, FeatureVocab, MultiVocab
15
+ from stanza.models.pos.xpos_vocab_factory import xpos_vocab_factory
16
+ from stanza.models.common.doc import *
17
+
18
+ logger = logging.getLogger('stanza')
19
+
20
# One preprocessed sentence: parallel lists of word ids, per-word char ids,
# tag/feature ids, pretrained-embedding ids, and the raw token text
DataSample = namedtuple("DataSample", "word char upos xpos feats pretrain text")
# One collated minibatch: padded tensors plus masks, sort/restore indices,
# lengths, raw text, and the original dataset indices (built in Dataset.__collate_fn)
DataBatch = namedtuple("DataBatch", "words words_mask wordchars wordchars_mask upos xpos ufeats pretrained orig_idx word_orig_idx lens word_lens text idx")
22
+
23
class Dataset:
    """Preprocessed POS tagging data for one document.

    Sentences are converted to DataSample tuples of vocab ids at construction
    time; padding/sorting into DataBatch tuples happens in __collate_fn via
    the DataLoader returned from to_loader().
    """
    def __init__(self, doc, args, pretrain, vocab=None, evaluation=False, sort_during_eval=False, bert_tokenizer=None, **kwargs):
        self.args = args
        self.eval = evaluation
        self.shuffled = not self.eval
        self.sort_during_eval = sort_during_eval
        self.doc = doc

        # build the vocab from this doc unless one was supplied (e.g. at eval time)
        if vocab is None:
            self.vocab = Dataset.init_vocab([doc], args)
        else:
            self.vocab = vocab

        # a column counts as present unless it is entirely empty ('_' or None)
        self.has_upos = not all(x is None or x == '_' for x in doc.get(UPOS, as_sentences=False))
        self.has_xpos = not all(x is None or x == '_' for x in doc.get(XPOS, as_sentences=False))
        self.has_feats = not all(x is None or x == '_' for x in doc.get(FEATS, as_sentences=False))

        data = self.load_doc(self.doc)
        # filter out the long sentences if bert is used
        if self.args.get('bert_model', None) and needs_length_filter(self.args['bert_model']):
            data = filter_data(self.args['bert_model'], data, bert_tokenizer)

        # handle pretrain; pretrain vocab is used when args['pretrain'] == True and pretrain is not None
        self.pretrain_vocab = None
        if pretrain is not None and args['pretrain']:
            self.pretrain_vocab = pretrain.vocab

        # filter and sample data
        if args.get('sample_train', 1.0) < 1.0 and not self.eval:
            keep = int(args['sample_train'] * len(data))
            data = random.sample(data, keep)
            logger.debug("Subsample training set with rate {:g}".format(args['sample_train']))

        data = self.preprocess(data, self.vocab, self.pretrain_vocab, args)

        self.data = data

        self.num_examples = len(data)
        # upos id(s) treated as punctuation by the no-punct augmentation in __mask
        self.__punct_tags = self.vocab["upos"].map(["PUNCT"])
        self.augment_nopunct = self.args.get("augment_nopunct", 0.0)

    @staticmethod
    def init_vocab(docs, args):
        """Build the MultiVocab (char/word/upos/xpos/feats) from one or more docs."""
        data = [x for doc in docs for x in Dataset.load_doc(doc)]
        charvocab = CharVocab(data, args['shorthand'])
        wordvocab = WordVocab(data, args['shorthand'], cutoff=args['word_cutoff'], lower=True)
        uposvocab = WordVocab(data, args['shorthand'], idx=1)
        xposvocab = xpos_vocab_factory(data, args['shorthand'])
        try:
            featsvocab = FeatureVocab(data, args['shorthand'], idx=3)
        except ValueError as e:
            raise ValueError("Unable to build features vocab. Please check the Features column of your data for an error which may match the following description.") from e
        vocab = MultiVocab({'char': charvocab,
                            'word': wordvocab,
                            'upos': uposvocab,
                            'xpos': xposvocab,
                            'feats': featsvocab})
        return vocab

    def preprocess(self, data, vocab, pretrain_vocab, args):
        """Map each sentence's (text, upos, xpos, feats) fields to vocab ids.

        Returns a list of DataSample tuples; each field is wrapped in an extra
        list (see TODO in __getitem__ about storing single lists per entry).
        """
        processed = []
        for sent in data:
            processed_sent = DataSample(
                word = [vocab['word'].map([w[0] for w in sent])],
                char = [[vocab['char'].map([x for x in w[0]]) for w in sent]],
                upos = [vocab['upos'].map([w[1] for w in sent])],
                xpos = [vocab['xpos'].map([w[2] for w in sent])],
                feats = [vocab['feats'].map([w[3] for w in sent])],
                pretrain = ([pretrain_vocab.map([w[0].lower() for w in sent])]
                            if pretrain_vocab is not None
                            else [[PAD_ID] * len(sent)]),
                text = [w[0] for w in sent]
            )
            processed.append(processed_sent)

        return processed

    def __len__(self):
        return len(self.data)

    def __mask(self, upos):
        """Returns a torch boolean about which elements should be masked out"""

        # creates all false mask
        mask = torch.zeros_like(upos, dtype=torch.bool)

        ### augmentation 1: punctuation augmentation ###
        # tags that needs to be checked, currently only PUNCT
        # with probability augment_nopunct, drop a sentence-final punctuation
        if random.uniform(0,1) < self.augment_nopunct:
            for i in self.__punct_tags:
                # generate a mask for the last element
                last_element = torch.zeros_like(upos, dtype=torch.bool)
                last_element[..., -1] = True
                # we or the bitmask against the existing mask
                # if it satisfies, we remove the word by masking it
                # to true
                #
                # if your input is just a lone punctuation, we perform
                # no masking
                if not torch.all(upos.eq(torch.tensor([[i]]))):
                    mask |= ((upos == i) & (last_element))

        return mask

    def __getitem__(self, key):
        """Retrieves a sample from the dataset.

        Retrieves a sample from the dataset. This function, for the
        most part, is spent performing ad-hoc data augmentation and
        restoration. It receives a DataSample object from the storage,
        and returns an almost-identical DataSample object that may
        have been augmented with /possibly/ (depending on augment_punct
        settings) PUNCT chopped.

        **Important Note**
        ------------------
        If you would like to load the data into a model, please convert
        this Dataset object into a DataLoader via self.to_loader(). Then,
        you can use the resulting object like any other PyTorch data
        loader. As masks are calculated ad-hoc given the batch, the samples
        returned from this object doesn't have the appropriate masking.

        Motivation
        ----------
        Why is this here? Every time you call next(iter(dataloader)), it calls
        this function. Therefore, if we augmented each sample on each iteration,
        the model will see dynamically generated augmentation.
        Furthermore, PyTorch dataloader handles shuffling natively.

        Parameters
        ----------
        key : int
            the integer ID to from which to retrieve the key.

        Returns
        -------
        DataSample
            The sample of data you requested, with augmentation.
        """
        # get a sample of the input data
        sample = self.data[key]

        # some data augmentation requires constructing a mask based on upos.
        # For instance, sometimes we'd like to mask out ending sentence punctuation.
        # We copy the other items here so that any edits made because
        # of the mask don't clobber the version owned by the Dataset
        # convert to tensors
        # TODO: only store single lists per data entry?
        words = torch.tensor(sample.word[0])
        # convert the rest to tensors
        upos = torch.tensor(sample.upos[0]) if self.has_upos else None
        xpos = torch.tensor(sample.xpos[0]) if self.has_xpos else None
        ufeats = torch.tensor(sample.feats[0]) if self.has_feats else None
        pretrained = torch.tensor(sample.pretrain[0])

        # and deal with char & raw_text
        char = sample.char[0]
        raw_text = sample.text

        # some data augmentation requires constructing a mask based on
        # which upos. For instance, sometimes we'd like to mask out ending
        # sentence punctuation. The mask is True if we want to remove the element
        if self.has_upos and upos is not None and not self.eval:
            # perform actual masking
            mask = self.__mask(upos)
        else:
            # dummy mask that's all false
            mask = None
        if mask is not None:
            mask_index = mask.nonzero()

            # mask out the elements that we need to mask out
            # NOTE(review): tensors are masked with PAD_ID in place, while the
            # char / raw_text lists have the element removed; in practice only
            # the final punctuation token is ever masked (see __mask)
            for mask in mask_index:
                mask = mask.item()
                words[mask] = PAD_ID
                if upos is not None:
                    upos[mask] = PAD_ID
                if xpos is not None:
                    # TODO: test the multi-dimension xpos
                    xpos[mask, ...] = PAD_ID
                if ufeats is not None:
                    ufeats[mask, ...] = PAD_ID
                pretrained[mask] = PAD_ID
                char = char[:mask] + char[mask+1:]
                raw_text = raw_text[:mask] + raw_text[mask+1:]

        # get each character from the input sentence
        # chars = [w for sent in char for w in sent]

        return DataSample(words, char, upos, xpos, ufeats, pretrained, raw_text), key

    def __iter__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)

    def to_loader(self, **kwargs):
        """Converts self to a DataLoader """

        return DL(self,
                  collate_fn=Dataset.__collate_fn,
                  **kwargs)

    def to_length_limited_loader(self, batch_size, maximum_tokens):
        """A DataLoader whose batches are capped by both sentence count and total tokens."""
        sampler = LengthLimitedBatchSampler(self, batch_size, maximum_tokens)
        return DL(self,
                  collate_fn=Dataset.__collate_fn,
                  batch_sampler = sampler)

    @staticmethod
    def __collate_fn(data):
        """Function used by DataLoader to pack data"""
        (data, idx) = zip(*data)
        (words, wordchars, upos, xpos, ufeats, pretrained, text) = zip(*data)

        # collate_fn is given a list of length batch size
        batch_size = len(data)

        # sort sentences by lens for easy RNN operations
        lens = [torch.sum(x != PAD_ID) for x in words]
        (words, wordchars, upos, xpos,
         ufeats, pretrained, text), orig_idx = sort_all((words, wordchars, upos, xpos,
                                                         ufeats, pretrained, text), lens)
        lens = [torch.sum(x != PAD_ID) for x in words] # we need to reinterpret lengths for the RNN

        # combine all words into one large list, and sort for easy charRNN ops
        wordchars = [w for sent in wordchars for w in sent]
        word_lens = [len(x) for x in wordchars]
        (wordchars,), word_orig_idx = sort_all([wordchars], word_lens)
        word_lens = [len(x) for x in wordchars] # we need to reinterpret lengths for the RNN

        # We now pad everything
        # upos/xpos/ufeats may be None per-sample when that column is absent
        words = pad_sequence(words, True, PAD_ID)
        if None not in upos:
            upos = pad_sequence(upos, True, PAD_ID)
        else:
            upos = None
        if None not in xpos:
            xpos = pad_sequence(xpos, True, PAD_ID)
        else:
            xpos = None
        if None not in ufeats:
            ufeats = pad_sequence(ufeats, True, PAD_ID)
        else:
            ufeats = None
        pretrained = pad_sequence(pretrained, True, PAD_ID)
        wordchars = get_long_tensor(wordchars, len(word_lens))

        # and finally create masks for the padding indices
        words_mask = torch.eq(words, PAD_ID)
        wordchars_mask = torch.eq(wordchars, PAD_ID)

        return DataBatch(words, words_mask, wordchars, wordchars_mask, upos, xpos, ufeats,
                         pretrained, orig_idx, word_orig_idx, lens, word_lens, text, idx)

    @staticmethod
    def load_doc(doc):
        """Extract (text, upos, xpos, feats) per sentence, with None replaced by '_'."""
        data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
        data = Dataset.resolve_none(data)
        return data

    @staticmethod
    def resolve_none(data):
        # replace None to '_'
        for sent_idx in range(len(data)):
            for tok_idx in range(len(data[sent_idx])):
                for feat_idx in range(len(data[sent_idx][tok_idx])):
                    if data[sent_idx][tok_idx][feat_idx] is None:
                        data[sent_idx][tok_idx][feat_idx] = '_'
        return data
292
+
293
class LengthLimitedBatchSampler(Sampler):
    """Yields batches of indices capped both by sentence count and total tokens.

    A batch closes once it holds batch_size sentences, or once adding the next
    sentence would push its token total past maximum_tokens.  The intent is to
    avoid GPU OOM when one sentence is significantly longer than expected,
    leaving a batch too large to fit in the GPU.

    A sentence which is by itself longer than maximum_tokens is emitted as a
    batch of its own.
    """
    def __init__(self, data, batch_size, maximum_tokens):
        """
        Precalculate the batches, making it so len and iter just read off the precalculated batches
        """
        self.data = data
        self.batch_size = batch_size
        self.maximum_tokens = maximum_tokens

        self.batches = []
        pending = []
        pending_tokens = 0

        for sample, sample_idx in data:
            num_tokens = len(sample.word)
            # oversized sentences get a dedicated batch
            if maximum_tokens and num_tokens > maximum_tokens:
                if pending:
                    self.batches.append(pending)
                    pending = []
                    pending_tokens = 0
                self.batches.append([sample_idx])
                continue
            # close the current batch if this sentence would not fit
            too_many = len(pending) + 1 > batch_size
            too_long = maximum_tokens and pending_tokens + num_tokens > maximum_tokens
            if too_many or too_long:
                self.batches.append(pending)
                pending = []
                pending_tokens = 0
            pending.append(sample_idx)
            pending_tokens += num_tokens

        if pending:
            self.batches.append(pending)

    def __len__(self):
        return len(self.batches)

    def __iter__(self):
        # yield a fresh copy of each precomputed batch
        yield from (list(batch) for batch in self.batches)
342
+
343
+
344
class ShuffledDataset:
    """A wrapper around one or more datasets which shuffles the data in batch_size chunks

    This means that if multiple datasets are passed in, the batches
    from each dataset are shuffled together, with one batch being
    entirely members of the same dataset.

    The main use case of this is that in the tagger, there are cases
    where batches from different datasets will have different
    properties, such as having or not having UPOS tags. We found that
    it is actually somewhat tricky to make the model's loss function
    (in model.py) properly represent batches with mixed w/ and w/o
    property, whereas keeping one entire batch together makes it a lot
    easier to process.

    The mechanism for the shuffling is that the iterator first makes a
    list long enough to represent each batch from each dataset,
    tracking the index of the dataset it is coming from, then shuffles
    that list. Another alternative would be to use a weighted
    randomization approach, but this is very simple and the memory
    requirements are not too onerous.

    Note that the batch indices are wasteful in the case of only one
    underlying dataset, which is actually the most common use case,
    but the overhead is small enough that it probably isn't worth
    special casing the one dataset version.
    """
    def __init__(self, datasets, batch_size):
        self.batch_size = batch_size
        self.datasets = datasets
        # one shuffling DataLoader per dataset; iteration interleaves their batches
        self.loaders = [x.to_loader(batch_size=self.batch_size, shuffle=True) for x in self.datasets]

    def __iter__(self):
        iterators = [iter(x) for x in self.loaders]
        lengths = [len(x) for x in self.loaders]
        indices = [[x] * y for x, y in enumerate(lengths)]
        indices = [idx for inner in indices for idx in inner]
        random.shuffle(indices)

        for idx in indices:
            yield(next(iterators[idx]))

    def __len__(self):
        # Number of *batches* produced by __iter__, not the number of samples.
        # (Previously this summed len(dataset), which disagreed with the number
        # of items actually yielded whenever batch_size > 1.)
        return sum(len(x) for x in self.loaders)
stanza/stanza/models/pos/model.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pack_sequence, pad_sequence, PackedSequence
9
+
10
+ from stanza.models.common.bert_embedding import extract_bert_embeddings
11
+ from stanza.models.common.biaffine import BiaffineScorer
12
+ from stanza.models.common.foundation_cache import load_bert, load_charlm
13
+ from stanza.models.common.hlstm import HighwayLSTM
14
+ from stanza.models.common.dropout import WordDropout
15
+ from stanza.models.common.utils import attach_bert_model
16
+ from stanza.models.common.vocab import CompositeVocab
17
+ from stanza.models.common.char_model import CharacterModel
18
+ from stanza.models.common import utils
19
+
20
+ logger = logging.getLogger('stanza')
21
+
22
+ class Tagger(nn.Module):
23
    def __init__(self, args, vocab, emb_matrix=None, share_hid=False, foundation_cache=None, bert_model=None, bert_tokenizer=None, force_bert_saved=False, peft_name=None):
        """Build the POS tagger network.

        args: dict of hyperparameters and feature switches (embedding dims,
            charlm/bert settings, dropout rates, etc.)
        vocab: MultiVocab providing 'word', 'upos', 'xpos', 'feats' (and 'char')
        emb_matrix: pretrained embedding weights, used when args['pretrain'] is set
        share_hid: if True, the xpos/ufeats classifiers reuse the upos hidden layer
        foundation_cache: cache for charlm models so repeated loads are avoided
        bert_model / bert_tokenizer: externally loaded transformer, attached via
            attach_bert_model
        force_bert_saved: passed to attach_bert_model — presumably forces the
            transformer weights to be saved with the model (TODO confirm)
        peft_name: adapter name used when extracting bert embeddings with PEFT
        """
        super().__init__()

        self.vocab = vocab
        self.args = args
        self.share_hid = share_hid
        # names listed here are excluded from the saved checkpoint (see add_unsaved_module)
        self.unsaved_modules = []

        # input layers
        # input_size accumulates the width of every enabled feature source
        input_size = 0
        if self.args['word_emb_dim'] > 0:
            # frequent word embeddings
            self.word_emb = nn.Embedding(len(vocab['word']), self.args['word_emb_dim'], padding_idx=0)
            input_size += self.args['word_emb_dim']

        if not share_hid:
            # upos embeddings
            self.upos_emb = nn.Embedding(len(vocab['upos']), self.args['tag_emb_dim'], padding_idx=0)

        if self.args['char'] and self.args['char_emb_dim'] > 0:
            # either a pretrained character language model ...
            if self.args.get('charlm', None):
                if args['charlm_forward_file'] is None or not os.path.exists(args['charlm_forward_file']):
                    raise FileNotFoundError('Could not find forward character model: {} Please specify with --charlm_forward_file'.format(args['charlm_forward_file']))
                if args['charlm_backward_file'] is None or not os.path.exists(args['charlm_backward_file']):
                    raise FileNotFoundError('Could not find backward character model: {} Please specify with --charlm_backward_file'.format(args['charlm_backward_file']))
                logger.debug("POS model loading charmodels: %s and %s", args['charlm_forward_file'], args['charlm_backward_file'])
                self.add_unsaved_module('charmodel_forward', load_charlm(args['charlm_forward_file'], foundation_cache=foundation_cache))
                self.add_unsaved_module('charmodel_backward', load_charlm(args['charlm_backward_file'], foundation_cache=foundation_cache))
                # optionally add a input transformation layer
                if self.args.get('charlm_transform_dim', 0):
                    self.charmodel_forward_transform = nn.Linear(self.charmodel_forward.hidden_dim(), self.args['charlm_transform_dim'], bias=False)
                    self.charmodel_backward_transform = nn.Linear(self.charmodel_backward.hidden_dim(), self.args['charlm_transform_dim'], bias=False)
                    input_size += self.args['charlm_transform_dim'] * 2
                else:
                    self.charmodel_forward_transform = None
                    self.charmodel_backward_transform = None
                    input_size += self.charmodel_forward.hidden_dim() + self.charmodel_backward.hidden_dim()
            else:
                # ... or a trainable character model built from the char vocab
                bidirectional = args.get('char_bidirectional', False)
                self.charmodel = CharacterModel(args, vocab, bidirectional=bidirectional)
                if bidirectional:
                    self.trans_char = nn.Linear(self.args['char_hidden_dim'] * 2, self.args['transformed_dim'], bias=False)
                else:
                    self.trans_char = nn.Linear(self.args['char_hidden_dim'], self.args['transformed_dim'], bias=False)
                input_size += self.args['transformed_dim']

        self.peft_name = peft_name
        attach_bert_model(self, bert_model, bert_tokenizer, self.args.get('use_peft', False), force_bert_saved)
        if self.args.get('bert_model', None):
            # TODO: refactor bert_hidden_layers between the different models
            if args.get('bert_hidden_layers', False):
                # The average will be offset by 1/N so that the default zeros
                # represents an average of the N layers
                self.bert_layer_mix = nn.Linear(args['bert_hidden_layers'], 1, bias=False)
                nn.init.zeros_(self.bert_layer_mix.weight)
            else:
                # an average of layers 2, 3, 4 will be used
                # (for historic reasons)
                self.bert_layer_mix = None
            input_size += self.bert_model.config.hidden_size

        if self.args['pretrain']:
            # pretrained embeddings, by default this won't be saved into model file
            self.add_unsaved_module('pretrained_emb', nn.Embedding.from_pretrained(emb_matrix, freeze=True))
            self.trans_pretrained = nn.Linear(emb_matrix.shape[1], self.args['transformed_dim'], bias=False)
            input_size += self.args['transformed_dim']

        # recurrent layers
        self.taggerlstm = HighwayLSTM(input_size, self.args['hidden_dim'], self.args['num_layers'], batch_first=True, bidirectional=True, dropout=self.args['dropout'], rec_dropout=self.args['rec_dropout'], highway_func=torch.tanh)
        self.drop_replacement = nn.Parameter(torch.randn(input_size) / np.sqrt(input_size))
        # learned initial hidden/cell states for the bidirectional LSTM
        self.taggerlstm_h_init = nn.Parameter(torch.zeros(2 * self.args['num_layers'], 1, self.args['hidden_dim']))
        self.taggerlstm_c_init = nn.Parameter(torch.zeros(2 * self.args['num_layers'], 1, self.args['hidden_dim']))

        # classifiers
        self.upos_hid = nn.Linear(self.args['hidden_dim'] * 2, self.args['deep_biaff_hidden_dim'])
        self.upos_clf = nn.Linear(self.args['deep_biaff_hidden_dim'], len(vocab['upos']))
        self.upos_clf.weight.data.zero_()
        self.upos_clf.bias.data.zero_()

        if share_hid:
            clf_constructor = lambda insize, outsize: nn.Linear(insize, outsize)
        else:
            # without shared hidden layers, xpos/ufeats are scored biaffinely
            # against the predicted upos embedding
            self.xpos_hid = nn.Linear(self.args['hidden_dim'] * 2, self.args['deep_biaff_hidden_dim'] if not isinstance(vocab['xpos'], CompositeVocab) else self.args['composite_deep_biaff_hidden_dim'])
            self.ufeats_hid = nn.Linear(self.args['hidden_dim'] * 2, self.args['composite_deep_biaff_hidden_dim'])
            clf_constructor = lambda insize, outsize: BiaffineScorer(insize, self.args['tag_emb_dim'], outsize)

        if isinstance(vocab['xpos'], CompositeVocab):
            # one classifier per xpos component
            self.xpos_clf = nn.ModuleList()
            for l in vocab['xpos'].lens():
                self.xpos_clf.append(clf_constructor(self.args['composite_deep_biaff_hidden_dim'], l))
        else:
            self.xpos_clf = clf_constructor(self.args['deep_biaff_hidden_dim'], len(vocab['xpos']))
            if share_hid:
                self.xpos_clf.weight.data.zero_()
                self.xpos_clf.bias.data.zero_()

        # one classifier per ufeats component
        self.ufeats_clf = nn.ModuleList()
        for l in vocab['feats'].lens():
            if share_hid:
                self.ufeats_clf.append(clf_constructor(self.args['deep_biaff_hidden_dim'], l))
                self.ufeats_clf[-1].weight.data.zero_()
                self.ufeats_clf[-1].bias.data.zero_()
            else:
                self.ufeats_clf.append(clf_constructor(self.args['composite_deep_biaff_hidden_dim'], l))

        # criterion
        self.crit = nn.CrossEntropyLoss(ignore_index=0) # ignore padding

        self.drop = nn.Dropout(args['dropout'])
        self.worddrop = WordDropout(args['word_dropout'])
134
+ def add_unsaved_module(self, name, module):
135
+ self.unsaved_modules += [name]
136
+ setattr(self, name, module)
137
+
138
    def log_norms(self):
        # delegate to the shared utility which logs parameter norms,
        # useful when debugging training instabilities
        utils.log_norms(self)
140
+
141
+ def forward(self, word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, word_orig_idx, sentlens, wordlens, text):
142
+
143
+ def pack(x):
144
+ return pack_padded_sequence(x, sentlens, batch_first=True)
145
+
146
+ inputs = []
147
+ if self.args['word_emb_dim'] > 0:
148
+ word_emb = self.word_emb(word)
149
+ word_emb = pack(word_emb)
150
+ inputs += [word_emb]
151
+
152
+ if self.args['pretrain']:
153
+ pretrained_emb = self.pretrained_emb(pretrained)
154
+ pretrained_emb = self.trans_pretrained(pretrained_emb)
155
+ pretrained_emb = pack(pretrained_emb)
156
+ inputs += [pretrained_emb]
157
+
158
+ def pad(x):
159
+ return pad_packed_sequence(PackedSequence(x, inputs[0].batch_sizes), batch_first=True)[0]
160
+
161
+ if self.args['char'] and self.args['char_emb_dim'] > 0:
162
+ if self.args.get('charlm', None):
163
+ all_forward_chars = self.charmodel_forward.build_char_representation(text)
164
+ assert isinstance(all_forward_chars, list)
165
+ if self.charmodel_forward_transform is not None:
166
+ all_forward_chars = [self.charmodel_forward_transform(x) for x in all_forward_chars]
167
+ all_forward_chars = pack(pad_sequence(all_forward_chars, batch_first=True))
168
+
169
+ all_backward_chars = self.charmodel_backward.build_char_representation(text)
170
+ if self.charmodel_backward_transform is not None:
171
+ all_backward_chars = [self.charmodel_backward_transform(x) for x in all_backward_chars]
172
+ all_backward_chars = pack(pad_sequence(all_backward_chars, batch_first=True))
173
+
174
+ inputs += [all_forward_chars, all_backward_chars]
175
+ else:
176
+ char_reps = self.charmodel(wordchars, wordchars_mask, word_orig_idx, sentlens, wordlens)
177
+ char_reps = PackedSequence(self.trans_char(self.drop(char_reps.data)), char_reps.batch_sizes)
178
+ inputs += [char_reps]
179
+
180
+ if self.bert_model is not None:
181
+ device = next(self.parameters()).device
182
+ processed_bert = extract_bert_embeddings(self.args['bert_model'], self.bert_tokenizer, self.bert_model, text, device, keep_endpoints=False,
183
+ num_layers=self.bert_layer_mix.in_features if self.bert_layer_mix is not None else None,
184
+ detach=not self.args.get('bert_finetune', False) or not self.training,
185
+ peft_name=self.peft_name)
186
+
187
+ if self.bert_layer_mix is not None:
188
+ # add the average so that the default behavior is to
189
+ # take an average of the N layers, and anything else
190
+ # other than that needs to be learned
191
+ # TODO: refactor this
192
+ processed_bert = [self.bert_layer_mix(feature).squeeze(2) + feature.sum(axis=2) / self.bert_layer_mix.in_features for feature in processed_bert]
193
+
194
+ processed_bert = pad_sequence(processed_bert, batch_first=True)
195
+ inputs += [pack(processed_bert)]
196
+
197
+ lstm_inputs = torch.cat([x.data for x in inputs], 1)
198
+ lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
199
+ lstm_inputs = self.drop(lstm_inputs)
200
+ lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)
201
+
202
+ lstm_outputs, _ = self.taggerlstm(lstm_inputs, sentlens, hx=(self.taggerlstm_h_init.expand(2 * self.args['num_layers'], word.size(0), self.args['hidden_dim']).contiguous(), self.taggerlstm_c_init.expand(2 * self.args['num_layers'], word.size(0), self.args['hidden_dim']).contiguous()))
203
+ lstm_outputs = lstm_outputs.data
204
+
205
+ upos_hid = F.relu(self.upos_hid(self.drop(lstm_outputs)))
206
+ upos_pred = self.upos_clf(self.drop(upos_hid))
207
+
208
+ preds = [pad(upos_pred).max(2)[1]]
209
+
210
+ if upos is not None:
211
+ upos = pack(upos).data
212
+ loss = self.crit(upos_pred.view(-1, upos_pred.size(-1)), upos.view(-1))
213
+ else:
214
+ loss = 0.0
215
+
216
+ if self.share_hid:
217
+ xpos_hid = upos_hid
218
+ ufeats_hid = upos_hid
219
+
220
+ clffunc = lambda clf, hid: clf(self.drop(hid))
221
+ else:
222
+ xpos_hid = F.relu(self.xpos_hid(self.drop(lstm_outputs)))
223
+ ufeats_hid = F.relu(self.ufeats_hid(self.drop(lstm_outputs)))
224
+
225
+ if self.training and upos is not None:
226
+ upos_emb = self.upos_emb(upos)
227
+ else:
228
+ upos_emb = self.upos_emb(upos_pred.max(1)[1])
229
+
230
+ clffunc = lambda clf, hid: clf(self.drop(hid), self.drop(upos_emb))
231
+
232
+ if xpos is not None: xpos = pack(xpos).data
233
+ if isinstance(self.vocab['xpos'], CompositeVocab):
234
+ xpos_preds = []
235
+ for i in range(len(self.vocab['xpos'])):
236
+ xpos_pred = clffunc(self.xpos_clf[i], xpos_hid)
237
+ if xpos is not None:
238
+ loss += self.crit(xpos_pred.view(-1, xpos_pred.size(-1)), xpos[:, i].view(-1))
239
+ xpos_preds.append(pad(xpos_pred).max(2, keepdim=True)[1])
240
+ preds.append(torch.cat(xpos_preds, 2))
241
+ else:
242
+ xpos_pred = clffunc(self.xpos_clf, xpos_hid)
243
+ if xpos is not None:
244
+ loss += self.crit(xpos_pred.view(-1, xpos_pred.size(-1)), xpos.view(-1))
245
+ preds.append(pad(xpos_pred).max(2)[1])
246
+
247
+ ufeats_preds = []
248
+ if ufeats is not None: ufeats = pack(ufeats).data
249
+ for i in range(len(self.vocab['feats'])):
250
+ ufeats_pred = clffunc(self.ufeats_clf[i], ufeats_hid)
251
+ if ufeats is not None:
252
+ loss += self.crit(ufeats_pred.view(-1, ufeats_pred.size(-1)), ufeats[:, i].view(-1))
253
+ ufeats_preds.append(pad(ufeats_pred).max(2, keepdim=True)[1])
254
+ preds.append(torch.cat(ufeats_preds, 2))
255
+
256
+ return loss, preds
stanza/stanza/models/pos/trainer.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A trainer class to handle training and testing of models.
3
+ """
4
+
5
+ import sys
6
+ import logging
7
+ import torch
8
+ from torch import nn
9
+
10
+ from stanza.models.common.trainer import Trainer as BaseTrainer
11
+ from stanza.models.common import utils, loss
12
+ from stanza.models.common.foundation_cache import load_bert, load_bert_with_peft, NoTransformerFoundationCache
13
+ from stanza.models.common.peft_config import build_peft_wrapper, load_peft_wrapper
14
+ from stanza.models.pos.model import Tagger
15
+ from stanza.models.pos.vocab import MultiVocab
16
+
17
+ logger = logging.getLogger('stanza')
18
+
19
+ def unpack_batch(batch, device):
20
+ """ Unpack a batch from the data loader. """
21
+ inputs = [b.to(device) if b is not None else None for b in batch[:8]]
22
+ orig_idx = batch[8]
23
+ word_orig_idx = batch[9]
24
+ sentlens = batch[10]
25
+ wordlens = batch[11]
26
+ text = batch[12]
27
+ return inputs, orig_idx, word_orig_idx, sentlens, wordlens, text
28
+
29
+ class Trainer(BaseTrainer):
30
+ """ A trainer for training models. """
31
+ def __init__(self, args=None, vocab=None, pretrain=None, model_file=None, device=None, foundation_cache=None):
32
+ if model_file is not None:
33
+ # load everything from file
34
+ self.load(model_file, pretrain, args=args, foundation_cache=foundation_cache)
35
+ else:
36
+ # build model from scratch
37
+ self.args = args
38
+ self.vocab = vocab
39
+
40
+ bert_model, bert_tokenizer = load_bert(self.args['bert_model'])
41
+ peft_name = None
42
+ if self.args['use_peft']:
43
+ # fine tune the bert if we're using peft
44
+ self.args['bert_finetune'] = True
45
+ peft_name = "pos"
46
+ bert_model = build_peft_wrapper(bert_model, self.args, logger, adapter_name=peft_name)
47
+
48
+ self.model = Tagger(args, vocab, emb_matrix=pretrain.emb if pretrain is not None else None, share_hid=args['share_hid'], foundation_cache=foundation_cache, bert_model=bert_model, bert_tokenizer=bert_tokenizer, force_bert_saved=self.args['bert_finetune'], peft_name=peft_name)
49
+
50
+ self.model = self.model.to(device)
51
+ self.optimizers = utils.get_split_optimizer(self.args['optim'], self.model, self.args['lr'], betas=(0.9, self.args['beta2']), eps=1e-6, weight_decay=self.args.get('initial_weight_decay', None), bert_learning_rate=self.args.get('bert_learning_rate', 0.0), is_peft=self.args.get("peft", False))
52
+
53
+ self.schedulers = {}
54
+
55
+ if self.args.get('bert_finetune', None):
56
+ import transformers
57
+ warmup_scheduler = transformers.get_linear_schedule_with_warmup(
58
+ self.optimizers["bert_optimizer"],
59
+ # todo late starting?
60
+ 0, self.args["max_steps"])
61
+ self.schedulers["bert_scheduler"] = warmup_scheduler
62
+
63
+ def update(self, batch, eval=False):
64
+ device = next(self.model.parameters()).device
65
+ inputs, orig_idx, word_orig_idx, sentlens, wordlens, text = unpack_batch(batch, device)
66
+ word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained = inputs
67
+
68
+ if eval:
69
+ self.model.eval()
70
+ else:
71
+ self.model.train()
72
+ for optimizer in self.optimizers.values():
73
+ optimizer.zero_grad()
74
+ loss, _ = self.model(word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, word_orig_idx, sentlens, wordlens, text)
75
+ if loss == 0.0:
76
+ return loss
77
+
78
+ loss_val = loss.data.item()
79
+ if eval:
80
+ return loss_val
81
+
82
+ loss.backward()
83
+ torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args['max_grad_norm'])
84
+
85
+ for optimizer in self.optimizers.values():
86
+ optimizer.step()
87
+ for scheduler in self.schedulers.values():
88
+ scheduler.step()
89
+ return loss_val
90
+
91
+ def predict(self, batch, unsort=True):
92
+ device = next(self.model.parameters()).device
93
+ inputs, orig_idx, word_orig_idx, sentlens, wordlens, text = unpack_batch(batch, device)
94
+ word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained = inputs
95
+
96
+ self.model.eval()
97
+ batch_size = word.size(0)
98
+ _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, word_orig_idx, sentlens, wordlens, text)
99
+ upos_seqs = [self.vocab['upos'].unmap(sent) for sent in preds[0].tolist()]
100
+ xpos_seqs = [self.vocab['xpos'].unmap(sent) for sent in preds[1].tolist()]
101
+ feats_seqs = [self.vocab['feats'].unmap(sent) for sent in preds[2].tolist()]
102
+
103
+ pred_tokens = [[[upos_seqs[i][j], xpos_seqs[i][j], feats_seqs[i][j]] for j in range(sentlens[i])] for i in range(batch_size)]
104
+ if unsort:
105
+ pred_tokens = utils.unsort(pred_tokens, orig_idx)
106
+ return pred_tokens
107
+
108
+ def save(self, filename, skip_modules=True):
109
+ model_state = self.model.state_dict()
110
+ # skip saving modules like pretrained embeddings, because they are large and will be saved in a separate file
111
+ if skip_modules:
112
+ skipped = [k for k in model_state.keys() if k.split('.')[0] in self.model.unsaved_modules]
113
+ for k in skipped:
114
+ del model_state[k]
115
+ params = {
116
+ 'model': model_state,
117
+ 'vocab': self.vocab.state_dict(),
118
+ 'config': self.args
119
+ }
120
+ if self.args.get('use_peft', False):
121
+ # Hide import so that peft dependency is optional
122
+ from peft import get_peft_model_state_dict
123
+ params["bert_lora"] = get_peft_model_state_dict(self.model.bert_model, adapter_name=self.model.peft_name)
124
+
125
+ try:
126
+ torch.save(params, filename, _use_new_zipfile_serialization=False)
127
+ logger.info("Model saved to {}".format(filename))
128
+ except (KeyboardInterrupt, SystemExit):
129
+ raise
130
+ except Exception as e:
131
+ logger.warning(f"Saving failed... {e} continuing anyway.")
132
+
133
+ def load(self, filename, pretrain, args=None, foundation_cache=None):
134
+ """
135
+ Load a model from file, with preloaded pretrain embeddings. Here we allow the pretrain to be None or a dummy input,
136
+ and the actual use of pretrain embeddings will depend on the boolean config "pretrain" in the loaded args.
137
+ """
138
+ try:
139
+ checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
140
+ except BaseException:
141
+ logger.error("Cannot load model from {}".format(filename))
142
+ raise
143
+ self.args = checkpoint['config']
144
+ if args is not None: self.args.update(args)
145
+
146
+ # preserve old models which were created before transformers were added
147
+ if 'bert_model' not in self.args:
148
+ self.args['bert_model'] = None
149
+
150
+ lora_weights = checkpoint.get('bert_lora')
151
+ if lora_weights:
152
+ logger.debug("Found peft weights for POS; loading a peft adapter")
153
+ self.args["use_peft"] = True
154
+
155
+ # TODO: refactor this common block of code with NER
156
+ force_bert_saved = False
157
+ peft_name = None
158
+ if self.args.get('use_peft', False):
159
+ force_bert_saved = True
160
+ bert_model, bert_tokenizer, peft_name = load_bert_with_peft(self.args['bert_model'], "pos", foundation_cache)
161
+ bert_model = load_peft_wrapper(bert_model, lora_weights, self.args, logger, peft_name)
162
+ logger.debug("Loaded peft with name %s", peft_name)
163
+ else:
164
+ if any(x.startswith("bert_model.") for x in checkpoint['model'].keys()):
165
+ logger.debug("Model %s has a finetuned transformer. Not using transformer cache to make sure the finetuned version of the transformer isn't accidentally used elsewhere", filename)
166
+ foundation_cache = NoTransformerFoundationCache(foundation_cache)
167
+ force_bert_saved = True
168
+ bert_model, bert_tokenizer = load_bert(self.args.get('bert_model'), foundation_cache)
169
+
170
+ self.vocab = MultiVocab.load_state_dict(checkpoint['vocab'])
171
+ # load model
172
+ emb_matrix = None
173
+ if self.args['pretrain'] and pretrain is not None: # we use pretrain only if args['pretrain'] == True and pretrain is not None
174
+ emb_matrix = pretrain.emb
175
+ if any(x.startswith("bert_model.") for x in checkpoint['model'].keys()):
176
+ logger.debug("Model %s has a finetuned transformer. Not using transformer cache to make sure the finetuned version of the transformer isn't accidentally used elsewhere", filename)
177
+ foundation_cache = NoTransformerFoundationCache(foundation_cache)
178
+ self.model = Tagger(self.args, self.vocab, emb_matrix=emb_matrix, share_hid=self.args['share_hid'], foundation_cache=foundation_cache, bert_model=bert_model, bert_tokenizer=bert_tokenizer, force_bert_saved=force_bert_saved, peft_name=peft_name)
179
+ self.model.load_state_dict(checkpoint['model'], strict=False)
stanza/stanza/models/pos/xpos_vocab_factory.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is the XPOS factory method generated automatically from stanza.models.pos.build_xpos_vocab_factory.
2
+ # Please don't edit it!
3
+
4
+ import logging
5
+
6
+ from stanza.models.pos.vocab import WordVocab, XPOSVocab
7
+ from stanza.models.pos.xpos_vocab_utils import XPOSDescription, XPOSType, build_xpos_vocab, choose_simplest_factory
8
+
9
+ # using a sublogger makes it easier to test in the unittests
10
+ logger = logging.getLogger('stanza.models.pos.xpos_vocab_factory')
11
+
12
+ XPOS_DESCRIPTIONS = {
13
+ 'af_afribooms' : XPOSDescription(XPOSType.XPOS, ''),
14
+ 'ar_padt' : XPOSDescription(XPOSType.XPOS, ''),
15
+ 'bg_btb' : XPOSDescription(XPOSType.XPOS, ''),
16
+ 'ca_ancora' : XPOSDescription(XPOSType.XPOS, ''),
17
+ 'cs_cac' : XPOSDescription(XPOSType.XPOS, ''),
18
+ 'cs_cltt' : XPOSDescription(XPOSType.XPOS, ''),
19
+ 'cs_fictree' : XPOSDescription(XPOSType.XPOS, ''),
20
+ 'cs_pdt' : XPOSDescription(XPOSType.XPOS, ''),
21
+ 'en_partut' : XPOSDescription(XPOSType.XPOS, ''),
22
+ 'es_ancora' : XPOSDescription(XPOSType.XPOS, ''),
23
+ 'es_combined' : XPOSDescription(XPOSType.XPOS, ''),
24
+ 'fr_partut' : XPOSDescription(XPOSType.XPOS, ''),
25
+ 'gd_arcosg' : XPOSDescription(XPOSType.XPOS, ''),
26
+ 'gl_ctg' : XPOSDescription(XPOSType.XPOS, ''),
27
+ 'gl_treegal' : XPOSDescription(XPOSType.XPOS, ''),
28
+ 'grc_perseus' : XPOSDescription(XPOSType.XPOS, ''),
29
+ 'hr_set' : XPOSDescription(XPOSType.XPOS, ''),
30
+ 'is_gc' : XPOSDescription(XPOSType.XPOS, ''),
31
+ 'is_icepahc' : XPOSDescription(XPOSType.XPOS, ''),
32
+ 'is_modern' : XPOSDescription(XPOSType.XPOS, ''),
33
+ 'it_combined' : XPOSDescription(XPOSType.XPOS, ''),
34
+ 'it_isdt' : XPOSDescription(XPOSType.XPOS, ''),
35
+ 'it_markit' : XPOSDescription(XPOSType.XPOS, ''),
36
+ 'it_parlamint' : XPOSDescription(XPOSType.XPOS, ''),
37
+ 'it_partut' : XPOSDescription(XPOSType.XPOS, ''),
38
+ 'it_postwita' : XPOSDescription(XPOSType.XPOS, ''),
39
+ 'it_twittiro' : XPOSDescription(XPOSType.XPOS, ''),
40
+ 'it_vit' : XPOSDescription(XPOSType.XPOS, ''),
41
+ 'la_perseus' : XPOSDescription(XPOSType.XPOS, ''),
42
+ 'la_udante' : XPOSDescription(XPOSType.XPOS, ''),
43
+ 'lt_alksnis' : XPOSDescription(XPOSType.XPOS, ''),
44
+ 'lv_lvtb' : XPOSDescription(XPOSType.XPOS, ''),
45
+ 'ro_nonstandard' : XPOSDescription(XPOSType.XPOS, ''),
46
+ 'ro_rrt' : XPOSDescription(XPOSType.XPOS, ''),
47
+ 'ro_simonero' : XPOSDescription(XPOSType.XPOS, ''),
48
+ 'sk_snk' : XPOSDescription(XPOSType.XPOS, ''),
49
+ 'sl_ssj' : XPOSDescription(XPOSType.XPOS, ''),
50
+ 'sl_sst' : XPOSDescription(XPOSType.XPOS, ''),
51
+ 'sr_set' : XPOSDescription(XPOSType.XPOS, ''),
52
+ 'ta_ttb' : XPOSDescription(XPOSType.XPOS, ''),
53
+ 'uk_iu' : XPOSDescription(XPOSType.XPOS, ''),
54
+
55
+ 'be_hse' : XPOSDescription(XPOSType.WORD, None),
56
+ 'bxr_bdt' : XPOSDescription(XPOSType.WORD, None),
57
+ 'cop_scriptorium': XPOSDescription(XPOSType.WORD, None),
58
+ 'cu_proiel' : XPOSDescription(XPOSType.WORD, None),
59
+ 'cy_ccg' : XPOSDescription(XPOSType.WORD, None),
60
+ 'da_ddt' : XPOSDescription(XPOSType.WORD, None),
61
+ 'de_gsd' : XPOSDescription(XPOSType.WORD, None),
62
+ 'de_hdt' : XPOSDescription(XPOSType.WORD, None),
63
+ 'el_gdt' : XPOSDescription(XPOSType.WORD, None),
64
+ 'el_gud' : XPOSDescription(XPOSType.WORD, None),
65
+ 'en_atis' : XPOSDescription(XPOSType.WORD, None),
66
+ 'en_combined' : XPOSDescription(XPOSType.WORD, None),
67
+ 'en_craft' : XPOSDescription(XPOSType.WORD, None),
68
+ 'en_eslspok' : XPOSDescription(XPOSType.WORD, None),
69
+ 'en_ewt' : XPOSDescription(XPOSType.WORD, None),
70
+ 'en_genia' : XPOSDescription(XPOSType.WORD, None),
71
+ 'en_gum' : XPOSDescription(XPOSType.WORD, None),
72
+ 'en_gumreddit' : XPOSDescription(XPOSType.WORD, None),
73
+ 'en_mimic' : XPOSDescription(XPOSType.WORD, None),
74
+ 'en_test' : XPOSDescription(XPOSType.WORD, None),
75
+ 'es_gsd' : XPOSDescription(XPOSType.WORD, None),
76
+ 'et_edt' : XPOSDescription(XPOSType.WORD, None),
77
+ 'et_ewt' : XPOSDescription(XPOSType.WORD, None),
78
+ 'eu_bdt' : XPOSDescription(XPOSType.WORD, None),
79
+ 'fa_perdt' : XPOSDescription(XPOSType.WORD, None),
80
+ 'fa_seraji' : XPOSDescription(XPOSType.WORD, None),
81
+ 'fi_tdt' : XPOSDescription(XPOSType.WORD, None),
82
+ 'fr_combined' : XPOSDescription(XPOSType.WORD, None),
83
+ 'fr_gsd' : XPOSDescription(XPOSType.WORD, None),
84
+ 'fr_parisstories': XPOSDescription(XPOSType.WORD, None),
85
+ 'fr_rhapsodie' : XPOSDescription(XPOSType.WORD, None),
86
+ 'fr_sequoia' : XPOSDescription(XPOSType.WORD, None),
87
+ 'fro_profiterole': XPOSDescription(XPOSType.WORD, None),
88
+ 'ga_idt' : XPOSDescription(XPOSType.WORD, None),
89
+ 'ga_twittirish' : XPOSDescription(XPOSType.WORD, None),
90
+ 'got_proiel' : XPOSDescription(XPOSType.WORD, None),
91
+ 'grc_proiel' : XPOSDescription(XPOSType.WORD, None),
92
+ 'grc_ptnk' : XPOSDescription(XPOSType.WORD, None),
93
+ 'gv_cadhan' : XPOSDescription(XPOSType.WORD, None),
94
+ 'hbo_ptnk' : XPOSDescription(XPOSType.WORD, None),
95
+ 'he_combined' : XPOSDescription(XPOSType.WORD, None),
96
+ 'he_htb' : XPOSDescription(XPOSType.WORD, None),
97
+ 'he_iahltknesset': XPOSDescription(XPOSType.WORD, None),
98
+ 'he_iahltwiki' : XPOSDescription(XPOSType.WORD, None),
99
+ 'hi_hdtb' : XPOSDescription(XPOSType.WORD, None),
100
+ 'hsb_ufal' : XPOSDescription(XPOSType.WORD, None),
101
+ 'hu_szeged' : XPOSDescription(XPOSType.WORD, None),
102
+ 'hy_armtdp' : XPOSDescription(XPOSType.WORD, None),
103
+ 'hy_bsut' : XPOSDescription(XPOSType.WORD, None),
104
+ 'hyw_armtdp' : XPOSDescription(XPOSType.WORD, None),
105
+ 'id_csui' : XPOSDescription(XPOSType.WORD, None),
106
+ 'it_old' : XPOSDescription(XPOSType.WORD, None),
107
+ 'ka_glc' : XPOSDescription(XPOSType.WORD, None),
108
+ 'kk_ktb' : XPOSDescription(XPOSType.WORD, None),
109
+ 'kmr_mg' : XPOSDescription(XPOSType.WORD, None),
110
+ 'kpv_lattice' : XPOSDescription(XPOSType.WORD, None),
111
+ 'ky_ktmu' : XPOSDescription(XPOSType.WORD, None),
112
+ 'la_proiel' : XPOSDescription(XPOSType.WORD, None),
113
+ 'lij_glt' : XPOSDescription(XPOSType.WORD, None),
114
+ 'lt_hse' : XPOSDescription(XPOSType.WORD, None),
115
+ 'lzh_kyoto' : XPOSDescription(XPOSType.WORD, None),
116
+ 'mr_ufal' : XPOSDescription(XPOSType.WORD, None),
117
+ 'mt_mudt' : XPOSDescription(XPOSType.WORD, None),
118
+ 'myv_jr' : XPOSDescription(XPOSType.WORD, None),
119
+ 'nb_bokmaal' : XPOSDescription(XPOSType.WORD, None),
120
+ 'nds_lsdc' : XPOSDescription(XPOSType.WORD, None),
121
+ 'nn_nynorsk' : XPOSDescription(XPOSType.WORD, None),
122
+ 'nn_nynorsklia' : XPOSDescription(XPOSType.WORD, None),
123
+ 'no_bokmaal' : XPOSDescription(XPOSType.WORD, None),
124
+ 'orv_birchbark' : XPOSDescription(XPOSType.WORD, None),
125
+ 'orv_rnc' : XPOSDescription(XPOSType.WORD, None),
126
+ 'orv_torot' : XPOSDescription(XPOSType.WORD, None),
127
+ 'ota_boun' : XPOSDescription(XPOSType.WORD, None),
128
+ 'pcm_nsc' : XPOSDescription(XPOSType.WORD, None),
129
+ 'pt_bosque' : XPOSDescription(XPOSType.WORD, None),
130
+ 'pt_cintil' : XPOSDescription(XPOSType.WORD, None),
131
+ 'pt_dantestocks' : XPOSDescription(XPOSType.WORD, None),
132
+ 'pt_gsd' : XPOSDescription(XPOSType.WORD, None),
133
+ 'pt_petrogold' : XPOSDescription(XPOSType.WORD, None),
134
+ 'pt_porttinari' : XPOSDescription(XPOSType.WORD, None),
135
+ 'qpm_philotis' : XPOSDescription(XPOSType.WORD, None),
136
+ 'qtd_sagt' : XPOSDescription(XPOSType.WORD, None),
137
+ 'ru_gsd' : XPOSDescription(XPOSType.WORD, None),
138
+ 'ru_poetry' : XPOSDescription(XPOSType.WORD, None),
139
+ 'ru_syntagrus' : XPOSDescription(XPOSType.WORD, None),
140
+ 'ru_taiga' : XPOSDescription(XPOSType.WORD, None),
141
+ 'sa_vedic' : XPOSDescription(XPOSType.WORD, None),
142
+ 'sme_giella' : XPOSDescription(XPOSType.WORD, None),
143
+ 'swl_sslc' : XPOSDescription(XPOSType.WORD, None),
144
+ 'sq_staf' : XPOSDescription(XPOSType.WORD, None),
145
+ 'te_mtg' : XPOSDescription(XPOSType.WORD, None),
146
+ 'tr_atis' : XPOSDescription(XPOSType.WORD, None),
147
+ 'tr_boun' : XPOSDescription(XPOSType.WORD, None),
148
+ 'tr_framenet' : XPOSDescription(XPOSType.WORD, None),
149
+ 'tr_imst' : XPOSDescription(XPOSType.WORD, None),
150
+ 'tr_kenet' : XPOSDescription(XPOSType.WORD, None),
151
+ 'tr_penn' : XPOSDescription(XPOSType.WORD, None),
152
+ 'tr_tourism' : XPOSDescription(XPOSType.WORD, None),
153
+ 'ug_udt' : XPOSDescription(XPOSType.WORD, None),
154
+ 'uk_parlamint' : XPOSDescription(XPOSType.WORD, None),
155
+ 'vi_vtb' : XPOSDescription(XPOSType.WORD, None),
156
+ 'wo_wtb' : XPOSDescription(XPOSType.WORD, None),
157
+ 'xcl_caval' : XPOSDescription(XPOSType.WORD, None),
158
+ 'zh-hans_gsdsimp': XPOSDescription(XPOSType.WORD, None),
159
+ 'zh-hant_gsd' : XPOSDescription(XPOSType.WORD, None),
160
+ 'zh_gsdsimp' : XPOSDescription(XPOSType.WORD, None),
161
+
162
+ 'en_lines' : XPOSDescription(XPOSType.XPOS, '-'),
163
+ 'fo_farpahc' : XPOSDescription(XPOSType.XPOS, '-'),
164
+ 'ja_gsd' : XPOSDescription(XPOSType.XPOS, '-'),
165
+ 'ja_gsdluw' : XPOSDescription(XPOSType.XPOS, '-'),
166
+ 'sv_lines' : XPOSDescription(XPOSType.XPOS, '-'),
167
+ 'ur_udtb' : XPOSDescription(XPOSType.XPOS, '-'),
168
+
169
+ 'fi_ftb' : XPOSDescription(XPOSType.XPOS, ','),
170
+ 'orv_ruthenian' : XPOSDescription(XPOSType.XPOS, ','),
171
+
172
+ 'id_gsd' : XPOSDescription(XPOSType.XPOS, '+'),
173
+ 'ko_gsd' : XPOSDescription(XPOSType.XPOS, '+'),
174
+ 'ko_kaist' : XPOSDescription(XPOSType.XPOS, '+'),
175
+ 'ko_ksl' : XPOSDescription(XPOSType.XPOS, '+'),
176
+ 'qaf_arabizi' : XPOSDescription(XPOSType.XPOS, '+'),
177
+
178
+ 'la_ittb' : XPOSDescription(XPOSType.XPOS, '|'),
179
+ 'la_llct' : XPOSDescription(XPOSType.XPOS, '|'),
180
+ 'nl_alpino' : XPOSDescription(XPOSType.XPOS, '|'),
181
+ 'nl_lassysmall' : XPOSDescription(XPOSType.XPOS, '|'),
182
+ 'sv_talbanken' : XPOSDescription(XPOSType.XPOS, '|'),
183
+
184
+ 'pl_lfg' : XPOSDescription(XPOSType.XPOS, ':'),
185
+ 'pl_pdb' : XPOSDescription(XPOSType.XPOS, ':'),
186
+ }
187
+
188
+ def xpos_vocab_factory(data, shorthand):
189
+ if shorthand not in XPOS_DESCRIPTIONS:
190
+ logger.warning("%s is not a known dataset. Examining the data to choose which xpos vocab to use", shorthand)
191
+ desc = choose_simplest_factory(data, shorthand)
192
+ if shorthand in XPOS_DESCRIPTIONS:
193
+ if XPOS_DESCRIPTIONS[shorthand] != desc:
194
+ # log instead of throw
195
+ # otherwise, updating datasets would be unpleasant
196
+ logger.error("XPOS tagset in %s has apparently changed! Was %s, is now %s", shorthand, XPOS_DESCRIPTIONS[shorthand], desc)
197
+ else:
198
+ logger.warning("Chose %s for the xpos factory for %s", desc, shorthand)
199
+ return build_xpos_vocab(desc, data, shorthand)
200
+
stanza/stanza/models/pos/xpos_vocab_utils.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import namedtuple
2
+ from enum import Enum
3
+ import logging
4
+ import os
5
+
6
+ from stanza.models.common.vocab import VOCAB_PREFIX
7
+ from stanza.models.pos.vocab import XPOSVocab, WordVocab
8
+
9
+ class XPOSType(Enum):
10
+ XPOS = 1
11
+ WORD = 2
12
+
13
+ XPOSDescription = namedtuple('XPOSDescription', ['xpos_type', 'sep'])
14
+ DEFAULT_KEY = XPOSDescription(XPOSType.WORD, None)
15
+
16
+ logger = logging.getLogger('stanza')
17
+
18
+ def filter_data(data, idx):
19
+ data_filtered = []
20
+ for sentence in data:
21
+ flag = True
22
+ for token in sentence:
23
+ if token[idx] is None:
24
+ flag = False
25
+ if flag: data_filtered.append(sentence)
26
+ return data_filtered
27
+
28
+ def choose_simplest_factory(data, shorthand):
29
+ logger.info(f'Original length = {len(data)}')
30
+ data = filter_data(data, idx=2)
31
+ logger.info(f'Filtered length = {len(data)}')
32
+ vocab = WordVocab(data, shorthand, idx=2, ignore=["_"])
33
+ key = DEFAULT_KEY
34
+ best_size = len(vocab) - len(VOCAB_PREFIX)
35
+ if best_size > 20:
36
+ for sep in ['', '-', '+', '|', ',', ':']: # separators
37
+ vocab = XPOSVocab(data, shorthand, idx=2, sep=sep)
38
+ length = sum(len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
39
+ if length < best_size:
40
+ key = XPOSDescription(XPOSType.XPOS, sep)
41
+ best_size = length
42
+ return key
43
+
44
+ def build_xpos_vocab(description, data, shorthand):
45
+ if description.xpos_type is XPOSType.WORD:
46
+ return WordVocab(data, shorthand, idx=2, ignore=["_"])
47
+
48
+ return XPOSVocab(data, shorthand, idx=2, sep=description.sep)
stanza/stanza/models/tokenization/__init__.py ADDED
File without changes
stanza/stanza/models/tokenization/data.py ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bisect import bisect_right
2
+ from copy import copy
3
+ import numpy as np
4
+ import random
5
+ import logging
6
+ import re
7
+ import torch
8
+ from torch.utils.data import Dataset
9
+ from .vocab import Vocab
10
+
11
+ from stanza.models.common.utils import sort_with_indices, unsort
12
+
13
+ logger = logging.getLogger('stanza')
14
+
15
+ def filter_consecutive_whitespaces(para):
16
+ filtered = []
17
+ for i, (char, label) in enumerate(para):
18
+ if i > 0:
19
+ if char == ' ' and para[i-1][0] == ' ':
20
+ continue
21
+
22
+ filtered.append((char, label))
23
+
24
+ return filtered
25
+
26
+ NEWLINE_WHITESPACE_RE = re.compile(r'\n\s*\n')
27
+ # this was (r'^([\d]+[,\.]*)+$')
28
+ # but the runtime on that can explode exponentially
29
+ # for example, on 111111111111111111111111a
30
+ NUMERIC_RE = re.compile(r'^[\d]+([,\.]+[\d]+)*[,\.]*$')
31
+ WHITESPACE_RE = re.compile(r'\s')
32
+
33
+ class TokenizationDataset:
34
+ def __init__(self, tokenizer_args, input_files={'txt': None, 'label': None}, input_text=None, vocab=None, evaluation=False, dictionary=None, *args, **kwargs):
35
+ super().__init__(*args, **kwargs) # forwards all unused arguments
36
+ self.args = tokenizer_args
37
+ self.eval = evaluation
38
+ self.dictionary = dictionary
39
+ self.vocab = vocab
40
+
41
+ # get input files
42
+ txt_file = input_files['txt']
43
+ label_file = input_files['label']
44
+
45
+ # Load data and process it
46
+ # set up text from file or input string
47
+ assert txt_file is not None or input_text is not None
48
+ if input_text is None:
49
+ with open(txt_file) as f:
50
+ text = ''.join(f.readlines()).rstrip()
51
+ else:
52
+ text = input_text
53
+
54
+ text_chunks = NEWLINE_WHITESPACE_RE.split(text)
55
+ text_chunks = [pt.rstrip() for pt in text_chunks]
56
+ text_chunks = [pt for pt in text_chunks if pt]
57
+ if label_file is not None:
58
+ with open(label_file) as f:
59
+ labels = ''.join(f.readlines()).rstrip()
60
+ labels = NEWLINE_WHITESPACE_RE.split(labels)
61
+ labels = [pt.rstrip() for pt in labels]
62
+ labels = [map(int, pt) for pt in labels if pt]
63
+ else:
64
+ labels = [[0 for _ in pt] for pt in text_chunks]
65
+
66
+ skip_newline = self.args.get('skip_newline', False)
67
+ self.data = [[(WHITESPACE_RE.sub(' ', char), label) # substitute special whitespaces
68
+ for char, label in zip(pt, pc) if not (skip_newline and char == '\n')] # check if newline needs to be eaten
69
+ for pt, pc in zip(text_chunks, labels)]
70
+
71
+ # remove consecutive whitespaces
72
+ self.data = [filter_consecutive_whitespaces(x) for x in self.data]
73
+
74
+ def labels(self):
75
+ """
76
+ Returns a list of the labels for all of the sentences in this DataLoader
77
+
78
+ Used at eval time to compare to the results, for example
79
+ """
80
+ return [np.array(list(x[1] for x in sent)) for sent in self.data]
81
+
82
+ def extract_dict_feat(self, para, idx):
83
+ """
84
+ This function is to extract dictionary features for each character
85
+ """
86
+ length = len(para)
87
+
88
+ dict_forward_feats = [0 for i in range(self.args['num_dict_feat'])]
89
+ dict_backward_feats = [0 for i in range(self.args['num_dict_feat'])]
90
+ forward_word = para[idx][0]
91
+ backward_word = para[idx][0]
92
+ prefix = True
93
+ suffix = True
94
+ for window in range(1,self.args['num_dict_feat']+1):
95
+ # concatenate each character and check if words found in dict not, stop if prefix not found
96
+ #check if idx+t is out of bound and if the prefix is already not found
97
+ if (idx + window) <= length-1 and prefix:
98
+ forward_word += para[idx+window][0].lower()
99
+ #check in json file if the word is present as prefix or word or None.
100
+ feat = 1 if forward_word in self.dictionary["words"] else 0
101
+ #if the return value is not 2 or 3 then the checking word is not a valid word in dict.
102
+ dict_forward_feats[window-1] = feat
103
+ #if the dict return 0 means no prefixes found, thus, stop looking for forward.
104
+ if forward_word not in self.dictionary["prefixes"]:
105
+ prefix = False
106
+ #backward check: similar to forward
107
+ if (idx - window) >= 0 and suffix:
108
+ backward_word = para[idx-window][0].lower() + backward_word
109
+ feat = 1 if backward_word in self.dictionary["words"] else 0
110
+ dict_backward_feats[window-1] = feat
111
+ if backward_word not in self.dictionary["suffixes"]:
112
+ suffix = False
113
+ #if cannot find both prefix and suffix, then exit the loop
114
+ if not prefix and not suffix:
115
+ break
116
+
117
+ return dict_forward_feats + dict_backward_feats
118
+
119
    def para_to_sentences(self, para):
        """Convert a paragraph to a list of processed sentences.

        `para` is a sequence of (unit, label) pairs.  Returns a list of
        tuples, one per sentence:
            (np.array of unit ids, np.array of labels,
             np.array of feature vectors, list of raw unit strings)

        At eval time (self.eval True) sentence-end labels are ignored and
        the whole paragraph is flushed as a single "sentence" at the end.
        """
        res = []
        funcs = []
        # build one featurizer per requested per-unit feature function;
        # position-dependent features are handled separately below
        for feat_func in self.args['feat_funcs']:
            if feat_func == 'end_of_para' or feat_func == 'start_of_para':
                # skip for position-dependent features
                continue
            if feat_func == 'space_before':
                func = lambda x: 1 if x.startswith(' ') else 0
            elif feat_func == 'capitalized':
                func = lambda x: 1 if x[0].isupper() else 0
            elif feat_func == 'numeric':
                func = lambda x: 1 if (NUMERIC_RE.match(x) is not None) else 0
            else:
                raise ValueError('Feature function "{}" is undefined.'.format(feat_func))

            funcs.append(func)

        # stacking all featurize functions
        composite_func = lambda x: [f(x) for f in funcs]

        def process_sentence(sent_units, sent_labels, sent_feats):
            # convert one finished sentence into (ids, labels, feats, raw units)
            return (np.array([self.vocab.unit2id(y) for y in sent_units]),
                    np.array(sent_labels),
                    np.array(sent_feats),
                    list(sent_units))

        use_end_of_para = 'end_of_para' in self.args['feat_funcs']
        use_start_of_para = 'start_of_para' in self.args['feat_funcs']
        use_dictionary = self.args['use_dictionary']
        current_units = []
        current_labels = []
        current_feats = []
        for i, (unit, label) in enumerate(para):
            feats = composite_func(unit)
            # position-dependent features
            if use_end_of_para:
                f = 1 if i == len(para)-1 else 0
                feats.append(f)
            if use_start_of_para:
                f = 1 if i == 0 else 0
                feats.append(f)

            # if dictionary feature is selected, append prefix/suffix lookup features
            if use_dictionary:
                dict_feats = self.extract_dict_feat(para, i)
                feats = feats + dict_feats

            current_units.append(unit)
            current_labels.append(label)
            current_feats.append(feats)
            # labels 2 and 4 mark sentence-final units (4 also ends an MWT)
            if not self.eval and (label == 2 or label == 4): # end of sentence
                if len(current_units) <= self.args['max_seqlen']:
                    # get rid of sentences that are too long during training of the tokenizer
                    res.append(process_sentence(current_units, current_labels, current_feats))
                current_units.clear()
                current_labels.clear()
                current_feats.clear()

        # flush the trailing (possibly unterminated) sentence
        if len(current_units) > 0:
            if self.eval or len(current_units) <= self.args['max_seqlen']:
                res.append(process_sentence(current_units, current_labels, current_feats))

        return res
184
+
185
+ def advance_old_batch(self, eval_offsets, old_batch):
186
+ """
187
+ Advance to a new position in a batch where we have partially processed the batch
188
+
189
+ If we have previously built a batch of data and made predictions on them, then when we are trying to make
190
+ prediction on later characters in those paragraphs, we can avoid rebuilding the converted data from scratch
191
+ and just (essentially) advance the indices/offsets from where we read converted data in this old batch.
192
+ In this case, eval_offsets index within the old_batch to advance the strings to process.
193
+ """
194
+ unkid = self.vocab.unit2id('<UNK>')
195
+ padid = self.vocab.unit2id('<PAD>')
196
+
197
+ ounits, olabels, ofeatures, oraw = old_batch
198
+ feat_size = ofeatures.shape[-1]
199
+ lens = (ounits != padid).sum(1).tolist()
200
+ pad_len = max(l-i for i, l in zip(eval_offsets, lens))
201
+
202
+ units = torch.full((len(ounits), pad_len), padid, dtype=torch.int64)
203
+ labels = torch.full((len(ounits), pad_len), -1, dtype=torch.int32)
204
+ features = torch.zeros((len(ounits), pad_len, feat_size), dtype=torch.float32)
205
+ raw_units = []
206
+
207
+ for i in range(len(ounits)):
208
+ eval_offsets[i] = min(eval_offsets[i], lens[i])
209
+ units[i, :(lens[i] - eval_offsets[i])] = ounits[i, eval_offsets[i]:lens[i]]
210
+ labels[i, :(lens[i] - eval_offsets[i])] = olabels[i, eval_offsets[i]:lens[i]]
211
+ features[i, :(lens[i] - eval_offsets[i])] = ofeatures[i, eval_offsets[i]:lens[i]]
212
+ raw_units.append(oraw[i][eval_offsets[i]:lens[i]] + ['<PAD>'] * (pad_len - lens[i] + eval_offsets[i]))
213
+
214
+ return units, labels, features, raw_units
215
+
216
class DataLoader(TokenizationDataset):
    """
    This is the training version of the dataset.

    On top of the base class, it builds (or reuses) a Vocab, pre-converts
    every paragraph to processed sentences, and provides shuffling plus
    batching via next().
    """
    # NOTE(review): the mutable default dict for input_files is shared across
    # calls — safe only if never mutated; confirm in TokenizationDataset
    def __init__(self, args, input_files={'txt': None, 'label': None}, input_text=None, vocab=None, evaluation=False, dictionary=None):
        super().__init__(args, input_files, input_text, vocab, evaluation, dictionary)

        self.vocab = vocab if vocab is not None else self.init_vocab()

        # data comes in a list of paragraphs, where each paragraph is a list of units with unit-level labels.
        # At evaluation time, each paragraph is treated as single "sentence" as we don't know a priori where
        # sentence breaks occur. We make prediction from left to right for each paragraph and move forward to
        # the last predicted sentence break to start afresh.
        self.sentences = [self.para_to_sentences(para) for para in self.data]

        self.init_sent_ids()
        logger.debug(f"{len(self.sentence_ids)} sentences loaded.")

    def __len__(self):
        # number of (paragraph, sentence) pairs across the whole dataset
        return len(self.sentence_ids)

    def init_vocab(self):
        """Build a fresh unit vocabulary from the raw data."""
        vocab = Vocab(self.data, self.args['lang'])
        return vocab

    def init_sent_ids(self):
        """Rebuild the flat (paragraph, sentence) index and cumulative length table."""
        self.sentence_ids = []
        # cumlen[k] is the total number of units before the k-th indexed sentence
        self.cumlen = [0]
        for i, para in enumerate(self.sentences):
            for j in range(len(para)):
                self.sentence_ids += [(i, j)]
                self.cumlen += [self.cumlen[-1] + len(self.sentences[i][j][0])]

    def has_mwt(self):
        """Return True if any unit label in the data marks an MWT (label > 2)."""
        # presumably this only needs to be called either 0 or 1 times,
        # 1 when training and 0 any other time, so no effort is put
        # into caching the result
        for sentence in self.data:
            for word in sentence:
                if word[1] > 2:
                    return True
        return False

    def shuffle(self):
        """Shuffle the sentences within each paragraph, then rebuild the index."""
        for para in self.sentences:
            random.shuffle(para)
        self.init_sent_ids()

    def next(self, eval_offsets=None, unit_dropout=0.0, feat_unit_dropout=0.0):
        ''' Get a batch of converted and padded PyTorch data from preprocessed raw text for training/prediction.

        eval_offsets: at eval time, per-row character offsets into the
            concatenated data from which to start; None means sample random
            training sentences instead.
        unit_dropout: probability of replacing a unit with <UNK> (training only)
        feat_unit_dropout: probability of zeroing a unit's whole dictionary
            feature vector (training only)

        Returns (units, labels, features, raw_units) with the first three as
        padded torch tensors and raw_units as a list of lists of strings.
        '''
        feat_size = len(self.sentences[0][0][2][0])
        unkid = self.vocab.unit2id('<UNK>')
        padid = self.vocab.unit2id('<PAD>')

        def strings_starting(id_pair, offset=0, pad_len=self.args['max_seqlen']):
            # At eval time, this combines sentences in paragraph (indexed by id_pair[0]) starting sentence (indexed
            # by id_pair[1]) into a long string for evaluation. At training time, we just select random sentences
            # from the entire dataset until we reach max_seqlen.
            pid, sid = id_pair if self.eval else random.choice(self.sentence_ids)
            sentences = [copy([x[offset:] for x in self.sentences[pid][sid]])]

            drop_sents = False if self.eval or (self.args.get('sent_drop_prob', 0) == 0) else (random.random() < self.args.get('sent_drop_prob', 0))
            drop_last_char = False if self.eval or (self.args.get('last_char_drop_prob', 0) == 0) else (random.random() < self.args.get('last_char_drop_prob', 0))
            total_len = len(sentences[0][0])

            assert self.eval or total_len <= self.args['max_seqlen'], 'The maximum sequence length {} is less than that of the longest sentence length ({}) in the data, consider increasing it! {}'.format(self.args['max_seqlen'], total_len, ' '.join(["{}/{}".format(*x) for x in zip(self.sentences[pid][sid])]))
            if self.eval:
                # append subsequent sentences from the same paragraph
                for sid1 in range(sid+1, len(self.sentences[pid])):
                    total_len += len(self.sentences[pid][sid1][0])
                    sentences.append(self.sentences[pid][sid1])

                    if total_len >= self.args['max_seqlen']:
                        break
            else:
                # append random sentences from anywhere in the dataset
                while True:
                    pid1, sid1 = random.choice(self.sentence_ids)
                    total_len += len(self.sentences[pid1][sid1][0])
                    sentences.append(self.sentences[pid1][sid1])

                    if total_len >= self.args['max_seqlen']:
                        break

            if drop_sents and len(sentences) > 1:
                if total_len > self.args['max_seqlen']:
                    sentences = sentences[:-1]
                if len(sentences) > 1:
                    p = [.5 ** i for i in range(1, len(sentences) + 1)] # drop a large number of sentences with smaller probability
                    cutoff = random.choices(list(range(len(sentences))), weights=list(reversed(p)))[0]
                    sentences = sentences[:cutoff+1]

            units = np.concatenate([s[0] for s in sentences])
            labels = np.concatenate([s[1] for s in sentences])
            feats = np.concatenate([s[2] for s in sentences])
            raw_units = [x for s in sentences for x in s[3]]

            if not self.eval:
                # truncate to max_seqlen during training
                cutoff = self.args['max_seqlen']
                units, labels, feats, raw_units = units[:cutoff], labels[:cutoff], feats[:cutoff], raw_units[:cutoff]

            if drop_last_char: # can only happen in non-eval mode
                if len(labels) > 1 and labels[-1] == 2 and labels[-2] in (1, 3):
                    # training text ended with a sentence end position
                    # and that word was a single character
                    # and the previous character ended the word
                    units, labels, feats, raw_units = units[:-1], labels[:-1], feats[:-1], raw_units[:-1]
                    # word end -> sentence end, mwt end -> sentence mwt end
                    labels[-1] = labels[-1] + 1

            return units, labels, feats, raw_units

        if eval_offsets is not None:
            # find max padding length
            pad_len = 0
            for eval_offset in eval_offsets:
                if eval_offset < self.cumlen[-1]:
                    pair_id = bisect_right(self.cumlen, eval_offset) - 1
                    pair = self.sentence_ids[pair_id]
                    pad_len = max(pad_len, len(strings_starting(pair, offset=eval_offset-self.cumlen[pair_id])[0]))

            pad_len += 1
            # map each flat offset back to its (paragraph, sentence) pair and in-sentence offset
            id_pairs = [bisect_right(self.cumlen, eval_offset) - 1 for eval_offset in eval_offsets]
            pairs = [self.sentence_ids[pair_id] for pair_id in id_pairs]
            offsets = [eval_offset - self.cumlen[pair_id] for eval_offset, pair_id in zip(eval_offsets, id_pairs)]

            offsets_pairs = list(zip(offsets, pairs))
        else:
            id_pairs = random.sample(self.sentence_ids, min(len(self.sentence_ids), self.args['batch_size']))
            offsets_pairs = [(0, x) for x in id_pairs]
            pad_len = self.args['max_seqlen']

        # put everything into padded and nicely shaped NumPy arrays and eventually convert to PyTorch tensors
        units = np.full((len(id_pairs), pad_len), padid, dtype=np.int64)
        labels = np.full((len(id_pairs), pad_len), -1, dtype=np.int64)
        features = np.zeros((len(id_pairs), pad_len, feat_size), dtype=np.float32)
        raw_units = []
        for i, (offset, pair) in enumerate(offsets_pairs):
            u_, l_, f_, r_ = strings_starting(pair, offset=offset, pad_len=pad_len)
            units[i, :len(u_)] = u_
            labels[i, :len(l_)] = l_
            features[i, :len(f_), :] = f_
            raw_units.append(r_ + ['<PAD>'] * (pad_len - len(r_)))

        if unit_dropout > 0 and not self.eval:
            # dropout characters/units at training time and replace them with UNKs
            mask = np.random.random_sample(units.shape) < unit_dropout
            mask[units == padid] = 0
            units[mask] = unkid
            for i in range(len(raw_units)):
                for j in range(len(raw_units[i])):
                    if mask[i, j]:
                        raw_units[i][j] = '<UNK>'

        # dropout unit feature vector in addition to only torch.dropout in the model.
        # experiments showed that only torch.dropout hurts the model
        # we believe it is because the dict feature vector is mostly scarse so it makes
        # more sense to drop out the whole vector instead of only single element.
        if self.args['use_dictionary'] and feat_unit_dropout > 0 and not self.eval:
            mask_feat = np.random.random_sample(units.shape) < feat_unit_dropout
            mask_feat[units == padid] = 0
            for i in range(len(raw_units)):
                for j in range(len(raw_units[i])):
                    if mask_feat[i,j]:
                        features[i,j,:] = 0

        units = torch.from_numpy(units)
        labels = torch.from_numpy(labels)
        features = torch.from_numpy(features)

        return units, labels, features, raw_units
385
+
386
class SortedDataset(Dataset):
    """
    Holds a TokenizationDataset for use in a torch DataLoader

    The torch DataLoader is different from the DataLoader defined here
    and allows for cpu & gpu parallelism. Updating output_predictions
    to use this class as a wrapper to a TokenizationDataset means the
    calculation of features can happen in parallel, saving quite a
    bit of time.
    """
    def __init__(self, dataset):
        super().__init__()

        self.dataset = dataset
        # sort paragraphs by length so batches have similar lengths;
        # self.indices remembers the permutation for unsort()
        self.data, self.indices = sort_with_indices(self.dataset.data, key=len)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # feature extraction happens here, so torch worker processes can parallelize it
        return self.dataset.para_to_sentences(self.data[index])

    def unsort(self, arr):
        """Restore arr (parallel to the sorted data) to the original paragraph order."""
        return unsort(arr, self.indices)

    def collate(self, samples):
        """Pad a list of single-sentence samples into one (units, labels, features, raw) batch."""
        if any(len(x) > 1 for x in samples):
            raise ValueError("Expected all paragraphs to have no preset sentence splits!")
        feat_size = samples[0][0][2].shape[-1]
        padid = self.dataset.vocab.unit2id('<PAD>')

        # +1 so that all samples end with at least one pad
        pad_len = max(len(x[0][3]) for x in samples) + 1

        units = torch.full((len(samples), pad_len), padid, dtype=torch.int64)
        labels = torch.full((len(samples), pad_len), -1, dtype=torch.int32)
        features = torch.zeros((len(samples), pad_len, feat_size), dtype=torch.float32)
        raw_units = []
        for i, sample in enumerate(samples):
            u_, l_, f_, r_ = sample[0]
            units[i, :len(u_)] = torch.from_numpy(u_)
            labels[i, :len(l_)] = torch.from_numpy(l_)
            features[i, :len(f_), :] = torch.from_numpy(f_)
            raw_units.append(r_ + ['<PAD>'] * (pad_len - len(r_)))

        return units, labels, features, raw_units
432
+
stanza/stanza/models/tokenization/model.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+
5
class Tokenizer(nn.Module):
    """Character-level BiLSTM tokenizer model.

    Embeds input units, concatenates per-unit features, runs a BiLSTM
    (optionally with convolutional residual connections and a second,
    hierarchical BiLSTM), and emits per-position log-probabilities over
    token / sentence (/ MWT) boundary classes.
    """
    def __init__(self, args, nchars, emb_dim, hidden_dim, dropout, feat_dropout):
        """
        args: configuration dict (feat_dim, rnn_layers, conv_res, hierarchical,
            hier_invtemp, use_mwt, tok_noise, ...)
        nchars: vocabulary size for the unit embedding
        emb_dim: unit embedding dimension
        hidden_dim: per-direction LSTM hidden size
        dropout: dropout rate for embeddings / RNN outputs
        feat_dropout: dropout rate applied to the feature vectors
        """
        super().__init__()

        self.args = args
        feat_dim = args['feat_dim']

        self.embeddings = nn.Embedding(nchars, emb_dim, padding_idx=0)

        self.rnn = nn.LSTM(emb_dim + feat_dim, hidden_dim, num_layers=self.args['rnn_layers'], bidirectional=True, batch_first=True, dropout=dropout if self.args['rnn_layers'] > 1 else 0)

        if self.args['conv_res'] is not None:
            # residual 1-d convolutions over the embedded input, one per
            # comma-separated kernel size in args['conv_res']
            self.conv_res = nn.ModuleList()
            self.conv_sizes = [int(x) for x in self.args['conv_res'].split(',')]

            for si, size in enumerate(self.conv_sizes):
                l = nn.Conv1d(emb_dim + feat_dim, hidden_dim * 2, size, padding=size//2, bias=self.args.get('hier_conv_res', False) or (si == 0))
                self.conv_res.append(l)

            if self.args.get('hier_conv_res', False):
                # 1x1 conv combining all conv outputs before the residual add
                self.conv_res2 = nn.Conv1d(hidden_dim * 2 * len(self.conv_sizes), hidden_dim * 2, 1)

        self.tok_clf = nn.Linear(hidden_dim * 2, 1)
        self.sent_clf = nn.Linear(hidden_dim * 2, 1)
        if self.args['use_mwt']:
            self.mwt_clf = nn.Linear(hidden_dim * 2, 1)

        if args['hierarchical']:
            # second BiLSTM whose classifiers additively refine the first-stage scores
            in_dim = hidden_dim * 2
            self.rnn2 = nn.LSTM(in_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
            self.tok_clf2 = nn.Linear(hidden_dim * 2, 1, bias=False)
            self.sent_clf2 = nn.Linear(hidden_dim * 2, 1, bias=False)
            if self.args['use_mwt']:
                self.mwt_clf2 = nn.Linear(hidden_dim * 2, 1, bias=False)

        self.dropout = nn.Dropout(dropout)
        self.dropout_feat = nn.Dropout(feat_dropout)

        self.toknoise = nn.Dropout(self.args['tok_noise'])

    def forward(self, x, feats):
        """Return per-position class log-probabilities.

        x: unit id tensor; feats: per-unit feature tensor, concatenated to the
        embeddings along the last dimension.  Output classes (last dim) are
        [non-token, token-only, token+sent] or, with use_mwt,
        [non-token, token, token+sent, token+mwt, token+sent+mwt].
        """
        emb = self.embeddings(x)
        emb = self.dropout(emb)
        feats = self.dropout_feat(feats)


        emb = torch.cat([emb, feats], 2)

        inp, _ = self.rnn(emb)

        if self.args['conv_res'] is not None:
            conv_input = emb.transpose(1, 2).contiguous()
            if not self.args.get('hier_conv_res', False):
                # add each conv output directly onto the RNN output (residual)
                for l in self.conv_res:
                    inp = inp + l(conv_input).transpose(1, 2).contiguous()
            else:
                # combine conv outputs with a 1x1 conv first, then add residually
                hid = []
                for l in self.conv_res:
                    hid += [l(conv_input)]
                hid = torch.cat(hid, 1)
                hid = F.relu(hid)
                hid = self.dropout(hid)
                inp = inp + self.conv_res2(hid).transpose(1, 2).contiguous()

        inp = self.dropout(inp)

        # first-stage boundary scores (logits)
        tok0 = self.tok_clf(inp)
        sent0 = self.sent_clf(inp)
        if self.args['use_mwt']:
            mwt0 = self.mwt_clf(inp)

        if self.args['hierarchical']:
            if self.args['hier_invtemp'] > 0:
                # gate the second RNN's input by the (noised) predicted
                # non-token probability, sharpened by hier_invtemp
                inp2, _ = self.rnn2(inp * (1 - self.toknoise(torch.sigmoid(-tok0 * self.args['hier_invtemp']))))
            else:
                inp2, _ = self.rnn2(inp)

            inp2 = self.dropout(inp2)

            # second stage refines the first-stage logits additively
            tok0 = tok0 + self.tok_clf2(inp2)
            sent0 = sent0 + self.sent_clf2(inp2)
            if self.args['use_mwt']:
                mwt0 = mwt0 + self.mwt_clf2(inp2)

        # convert logits to log-probabilities of each binary decision
        nontok = F.logsigmoid(-tok0)
        tok = F.logsigmoid(tok0)
        nonsent = F.logsigmoid(-sent0)
        sent = F.logsigmoid(sent0)
        if self.args['use_mwt']:
            nonmwt = F.logsigmoid(-mwt0)
            mwt = F.logsigmoid(mwt0)

        # joint log-probabilities for each output class (sum = independence assumption)
        if self.args['use_mwt']:
            pred = torch.cat([nontok, tok+nonsent+nonmwt, tok+sent+nonmwt, tok+nonsent+mwt, tok+sent+mwt], 2)
        else:
            pred = torch.cat([nontok, tok+nonsent, tok+sent], 2)

        return pred
stanza/stanza/models/tokenization/tokenize_files.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Use a Stanza tokenizer to turn a text file into one tokenized paragraph per line
2
+
3
+ For example, the output of this script is suitable for Glove
4
+
5
+ Currently this *only* supports tokenization, no MWT splitting.
6
+ It also would be beneficial to have an option to convert spaces into
7
+ NBSP, underscore, or some other marker to make it easier to process
8
+ languages such as VI which have spaces in them
9
+ """
10
+
11
+
12
+ import argparse
13
+ import io
14
+ import os
15
+ import time
16
+ import re
17
+ import zipfile
18
+
19
+ import torch
20
+
21
+ import stanza
22
+ from stanza.models.common.utils import open_read_text, default_device
23
+ from stanza.models.tokenization.data import TokenizationDataset
24
+ from stanza.models.tokenization.utils import output_predictions
25
+ from stanza.pipeline.tokenize_processor import TokenizeProcessor
26
+ from stanza.utils.get_tqdm import get_tqdm
27
+
28
+ tqdm = get_tqdm()
29
+
30
+ NEWLINE_SPLIT_RE = re.compile(r"\n\s*\n")
31
+
32
def tokenize_to_file(tokenizer, fin, fout, chunk_size=500):
    """Tokenize all text read from fin, writing one paragraph per line to fout.

    Paragraphs (blank-line separated pieces of the input) are processed in
    batches of chunk_size documents to bound memory use; sentences within a
    paragraph are joined by single spaces.
    """
    documents = NEWLINE_SPLIT_RE.split(fin.read())
    for start in tqdm(range(0, len(documents), chunk_size), leave=False):
        batch = documents[start:start + chunk_size]
        in_docs = [stanza.Document([], text=text) for text in batch]
        for document in tokenizer.bulk_process(in_docs):
            sentence_texts = [" ".join(token.text for token in sentence.tokens)
                              for sentence in document.sentences]
            fout.write(" ".join(sentence_texts))
            fout.write("\n")
46
+
47
def main(args=None):
    """Tokenize one or more input files (plain text or .zip archives of text)
    into a single output file, one paragraph per line.

    args: optional list of command line arguments (None uses sys.argv).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", type=str, default="sd", help="Which language to use for tokenization")
    parser.add_argument("--tokenize_model_path", type=str, default=None, help="Specific tokenizer model to use")
    parser.add_argument("input_files", type=str, nargs="+", help="Which input files to tokenize")
    parser.add_argument("--output_file", type=str, default="glove.txt", help="Where to write the tokenized output")
    parser.add_argument("--model_dir", type=str, default=None, help="Where to get models for a Pipeline (None => default models dir)")
    parser.add_argument("--chunk_size", type=int, default=500, help="How many 'documents' to use in a chunk when tokenizing. This is separate from the tokenizer batching - this limits how much memory gets used at once, since we don't need to store an entire file in memory at once")
    args = parser.parse_args(args=args)

    if os.path.exists(args.output_file):
        print("Cowardly refusing to overwrite existing output file %s" % args.output_file)
        return

    if args.tokenize_model_path:
        # build a bare TokenizeProcessor from the given model file
        config = { "model_path": args.tokenize_model_path,
                   "check_requirements": False }
        tokenizer = TokenizeProcessor(config, pipeline=None, device=default_device())
    else:
        # otherwise, use the downloaded tokenizer for the requested language
        pipe = stanza.Pipeline(lang=args.lang, processors="tokenize", model_dir=args.model_dir)
        tokenizer = pipe.processors["tokenize"]

    with open(args.output_file, "w", encoding="utf-8") as fout:
        for filename in tqdm(args.input_files):
            if filename.endswith(".zip"):
                with zipfile.ZipFile(filename) as zin:
                    input_names = zin.namelist()
                    for input_name in tqdm(input_names, leave=False):
                        # bugfix: previously opened input_names[0] for every
                        # entry, tokenizing the first archive member repeatedly
                        with zin.open(input_name) as fin:
                            fin = io.TextIOWrapper(fin, encoding='utf-8')
                            tokenize_to_file(tokenizer, fin, fout, args.chunk_size)
            else:
                with open_read_text(filename, encoding="utf-8") as fin:
                    # bugfix: honor --chunk_size instead of silently ignoring it
                    tokenize_to_file(tokenizer, fin, fout, args.chunk_size)
81
+
82
+ if __name__ == '__main__':
83
+ main()
stanza/stanza/models/tokenization/trainer.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import logging
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.optim as optim
6
+
7
+ from stanza.models.common import utils
8
+ from stanza.models.common.trainer import Trainer as BaseTrainer
9
+ from stanza.models.tokenization.utils import create_dictionary
10
+
11
+ from .model import Tokenizer
12
+ from .vocab import Vocab
13
+
14
+ logger = logging.getLogger('stanza')
15
+
16
class Trainer(BaseTrainer):
    """Trainer wrapper for the tokenizer model: owns the model, vocab,
    optional dictionary/lexicon, optimizer and loss, and handles
    save/load of checkpoints.
    """
    def __init__(self, args=None, vocab=None, lexicon=None, dictionary=None, model_file=None, device=None):
        if model_file is not None:
            # load everything from file
            self.load(model_file)
        else:
            # build model from scratch
            self.args = args
            self.vocab = vocab
            self.lexicon = list(lexicon) if lexicon is not None else None
            self.dictionary = dictionary
            self.model = Tokenizer(self.args, self.args['vocab_size'], self.args['emb_dim'], self.args['hidden_dim'], dropout=self.args['dropout'], feat_dropout=self.args['feat_dropout'])
        self.model = self.model.to(device)
        # labels padded with -1 are ignored by the loss
        self.criterion = nn.CrossEntropyLoss(ignore_index=-1).to(device)
        self.optimizer = utils.get_optimizer("adam", self.model, lr=self.args['lr0'], betas=(.9, .9), weight_decay=self.args['weight_decay'])
        self.feat_funcs = self.args.get('feat_funcs', None)
        self.lang = self.args['lang'] # language determines how token normalization is done

    def update(self, inputs):
        """Run one training step on a batch; returns the scalar loss value."""
        self.model.train()
        units, labels, features, _ = inputs

        device = next(self.model.parameters()).device
        units = units.to(device)
        labels = labels.to(device)
        features = features.to(device)

        pred = self.model(units, features)

        self.optimizer.zero_grad()
        classes = pred.size(2)
        loss = self.criterion(pred.view(-1, classes), labels.view(-1))

        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(), self.args['max_grad_norm'])
        self.optimizer.step()

        return loss.item()

    def predict(self, inputs):
        """Run the model on a batch; returns predictions as a numpy array."""
        self.model.eval()
        units, _, features, _ = inputs

        device = next(self.model.parameters()).device
        units = units.to(device)
        features = features.to(device)

        pred = self.model(units, features)

        return pred.data.cpu().numpy()

    def save(self, filename):
        """Save model weights, vocab, lexicon and config to filename."""
        params = {
            'model': self.model.state_dict() if self.model is not None else None,
            'vocab': self.vocab.state_dict(),
            # save and load lexicon as list instead of set so
            # we can use weights_only=True
            'lexicon': list(self.lexicon) if self.lexicon is not None else None,
            'config': self.args
        }
        try:
            torch.save(params, filename, _use_new_zipfile_serialization=False)
            logger.info("Model saved to {}".format(filename))
        except BaseException:
            # saving is best-effort; training continues even if the disk write fails
            logger.warning("Saving failed... continuing anyway.")

    def load(self, filename):
        """Load model weights, vocab, lexicon and config from filename.

        Rebuilds the dictionary from the lexicon if one was saved.
        """
        try:
            checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
        except BaseException:
            logger.error("Cannot load model from {}".format(filename))
            raise
        self.args = checkpoint['config']
        if self.args.get('use_mwt', None) is None:
            # Default to True as many currently saved models
            # were built with mwt layers
            self.args['use_mwt'] = True
        self.model = Tokenizer(self.args, self.args['vocab_size'], self.args['emb_dim'], self.args['hidden_dim'], dropout=self.args['dropout'], feat_dropout=self.args['feat_dropout'])
        self.model.load_state_dict(checkpoint['model'])
        self.vocab = Vocab.load_state_dict(checkpoint['vocab'])
        self.lexicon = checkpoint['lexicon']

        if self.lexicon is not None:
            self.lexicon = set(self.lexicon)
            self.dictionary = create_dictionary(self.lexicon)
        else:
            self.dictionary = None
stanza/stanza/utils/datasets/constituency/convert_ctb.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ import glob
3
+ import os
4
+ import re
5
+
6
+ import xml.etree.ElementTree as ET
7
+
8
+ from stanza.models.constituency import tree_reader
9
+ from stanza.utils.datasets.constituency.utils import write_dataset
10
+ from stanza.utils.get_tqdm import get_tqdm
11
+
12
+ tqdm = get_tqdm()
13
+
14
class Version(Enum):
    """Which release (and split scheme) of the Chinese Treebank is being converted."""
    V51 = 1
    V51b = 2
    V90 = 3
18
+
19
def filenum_to_shard_51(filenum):
    """Map a CTB 5.1 file number to its dataset shard (0=train, 1=dev, 2=test).

    Raises ValueError for file numbers outside all known ranges.
    """
    # inclusive (low, high) ranges per shard; the ranges are disjoint
    shard_ranges = (
        (1, 815, 0),
        (1001, 1136, 0),
        (886, 931, 1),
        (1148, 1151, 1),
        (816, 885, 2),
        (1137, 1147, 2),
    )
    for low, high, shard in shard_ranges:
        if low <= filenum <= high:
            return shard
    raise ValueError("Unhandled filenum %d" % filenum)
36
+
37
def filenum_to_shard_51_basic(filenum):
    """Map a CTB 5.1 file number to its shard under the "basic" split.

    Returns 0/1/2 for train/dev/test, None for files that are skipped
    entirely (400-439), and raises ValueError for unknown file numbers.
    """
    # inclusive (low, high) ranges; disjoint, so check order does not matter
    shard_ranges = (
        (1, 270, 0),
        (440, 1151, 0),
        (301, 325, 1),
        (271, 300, 2),
        (400, 439, None),
    )
    for low, high, shard in shard_ranges:
        if low <= filenum <= high:
            return shard
    raise ValueError("Unhandled filenum %d" % filenum)
53
+
54
def filenum_to_shard_90(filenum):
    """Map a CTB 9.0 file number to its dataset shard.

    Returns 2 for test, 1 for dev, 0 for train, and None for any file
    number outside all known ranges (the caller skips those files).

    Note: the test check for 900-931 intentionally precedes the train
    check for 81-900, so filenum 900 lands in the test shard.
    """
    # test shard
    if filenum >= 1 and filenum <= 40:
        return 2
    if filenum >= 900 and filenum <= 931:
        return 2
    if filenum in (1018, 1020, 1036, 1044, 1060, 1061, 1072, 1118, 1119, 1132, 1141, 1142, 1148):
        return 2
    if filenum >= 2165 and filenum <= 2180:
        return 2
    if filenum >= 2295 and filenum <= 2310:
        return 2
    if filenum >= 2570 and filenum <= 2602:
        return 2
    if filenum >= 2800 and filenum <= 2819:
        return 2
    if filenum >= 3110 and filenum <= 3145:
        return 2

    # dev shard
    if filenum >= 41 and filenum <= 80:
        return 1
    if filenum >= 1120 and filenum <= 1129:
        return 1
    if filenum >= 2140 and filenum <= 2159:
        return 1
    if filenum >= 2280 and filenum <= 2294:
        return 1
    if filenum >= 2550 and filenum <= 2569:
        return 1
    if filenum >= 2775 and filenum <= 2799:
        return 1
    if filenum >= 3080 and filenum <= 3109:
        return 1

    # train shard
    if filenum >= 81 and filenum <= 900:
        return 0
    if filenum >= 1001 and filenum <= 1017:
        return 0
    if filenum in (1019, 1130, 1131):
        return 0
    if filenum >= 1021 and filenum <= 1035:
        return 0
    if filenum >= 1037 and filenum <= 1043:
        return 0
    if filenum >= 1045 and filenum <= 1059:
        return 0
    if filenum >= 1062 and filenum <= 1071:
        return 0
    if filenum >= 1073 and filenum <= 1117:
        return 0
    if filenum >= 1133 and filenum <= 1140:
        return 0
    if filenum >= 1143 and filenum <= 1147:
        return 0
    if filenum >= 1149 and filenum <= 2139:
        return 0
    if filenum >= 2160 and filenum <= 2164:
        return 0
    if filenum >= 2181 and filenum <= 2279:
        return 0
    if filenum >= 2311 and filenum <= 2549:
        return 0
    if filenum >= 2603 and filenum <= 2774:
        return 0
    if filenum >= 2820 and filenum <= 3079:
        return 0
    if filenum >= 4000 and filenum <= 7017:
        return 0

    # previously this function fell off the end; make the "skip this file"
    # result explicit (convert_ctb treats a None shard as "skip")
    return None
123
+
124
def collect_trees_s(root):
    """Yield (text, ID) for every <S> element in the XML tree, depth-first."""
    if root.tag == 'S':
        yield root.text, root.attrib['ID']

    for child in root:
        yield from collect_trees_s(child)
131
+
132
def collect_trees_text(root):
    """Yield (text, None) for each non-empty <TEXT> or <TURN> element, depth-first.

    A tag can match at most one of TEXT/TURN, so a single membership test
    is equivalent to the two separate checks.
    """
    if root.tag in ('TEXT', 'TURN') and len(root.text.strip()) > 0:
        yield root.text, None

    for child in root:
        yield from collect_trees_text(child)
142
+
143
+
144
# matches CTB <S ID=...> tags with an unquoted ID value; convert_ctb rewrites
# them with quoted attributes so the file parses as valid XML
id_re = re.compile("<S ID=([0-9a-z]+)>")
# matches <su>/<msg> opening tags (found in some CTB 9.0 files); convert_ctb
# strips these from the text before XML parsing
su_re = re.compile("<(su|msg) id=([0-9a-zA-Z_=]+)>")
146
+
147
def convert_ctb(input_dir, output_dir, dataset_name, version):
    """Convert a directory of CTB files into train/dev/test constituency datasets.

    input_dir: directory containing the raw CTB files (one per file number)
    output_dir: where write_dataset puts the converted splits
    dataset_name: name passed through to write_dataset
    version: a Version enum value selecting encoding, cleanup hacks, and the
        file-number-to-shard mapping

    Each file is cleaned up into parseable XML (the exact fixes depend on the
    CTB version and file number), the trees are extracted, pruned of -NONE-
    nodes, label-simplified, and appended to the shard chosen by the
    appropriate filenum_to_shard_* function (a None shard means skip).
    """
    input_files = glob.glob(os.path.join(input_dir, "*"))

    # train, dev, test
    datasets = [[], [], []]

    # sort by the numeric portion of the filename so processing is deterministic
    sorted_filenames = []
    for input_filename in input_files:
        base_filename = os.path.split(input_filename)[1]
        filenum = int(os.path.splitext(base_filename)[0].split("_")[1])
        sorted_filenames.append((filenum, input_filename))
    sorted_filenames.sort()

    for filenum, filename in tqdm(sorted_filenames):
        if version in (Version.V51, Version.V51b):
            # CTB 5.1 is GB2312 encoded, with some broken bytes ignored
            with open(filename, errors='ignore', encoding="gb2312") as fin:
                text = fin.read()
        elif version is Version.V90:
            with open(filename, encoding="utf-8") as fin:
                text = fin.read()
            # per-file XML cleanup hacks for the 9.0 data
            if text.find("<TURN>") >= 0 and text.find("</TURN>") < 0:
                text = text.replace("<TURN>", "")
            if filenum in (4205, 4208, 4289):
                text = text.replace("<)", "&lt;)").replace(">)", "&gt;)")
            if filenum >= 4000 and filenum <= 4411:
                if text.find("<segment") >= 0:
                    text = text.replace("<segment id=", "<S ID=").replace("</segment>", "</S>")
                elif text.find("<seg") < 0:
                    text = "<TEXT>\n%s</TEXT>\n" % text
                else:
                    text = text.replace("<seg id=", "<S ID=").replace("</seg>", "</S>")
                text = "<foo>\n%s</foo>\n" % text
            if filenum >= 5000 and filenum <= 5558 or filenum >= 6000 and filenum <= 6700 or filenum >= 7000 and filenum <= 7017:
                text = su_re.sub("", text)
            if filenum in (6066, 6453):
                text = text.replace("<", "&lt;").replace(">", "&gt;")
                text = "<foo><TEXT>\n%s</TEXT></foo>\n" % text
        else:
            raise ValueError("Unknown CTB version %s" % version)
        # quote the unquoted ID attributes and escape bare ampersands
        text = id_re.sub(r'<S ID="\1">', text)
        text = text.replace("&", "&amp;")

        try:
            xml_root = ET.fromstring(text)
        except Exception as e:
            print(text[:1000])
            raise RuntimeError("Cannot xml process %s" % filename) from e
        trees = [x for x in collect_trees_s(xml_root)]
        if version is Version.V90 and len(trees) == 0:
            # some 9.0 files keep trees in TEXT/TURN blocks instead of S tags
            trees = [x for x in collect_trees_text(xml_root)]

        if version in (Version.V51, Version.V51b):
            # tree 4366 of file 414 is known bad and skipped
            trees = [x[0] for x in trees if filenum != 414 or x[1] != "4366"]
        else:
            trees = [x[0] for x in trees]

        trees = "\n".join(trees)
        try:
            trees = tree_reader.read_trees(trees, use_tqdm=False)
        except ValueError as e:
            print(text[:300])
            # chain the original parse error for easier debugging
            # (matches the XML error handling above)
            raise RuntimeError("Could not process the tree text in %s" % filename) from e
        trees = [t.prune_none().simplify_labels() for t in trees]

        assert len(trees) > 0, "No trees in %s" % filename

        if version is Version.V51:
            shard = filenum_to_shard_51(filenum)
        elif version is Version.V51b:
            shard = filenum_to_shard_51_basic(filenum)
        else:
            shard = filenum_to_shard_90(filenum)
        if shard is None:
            continue
        datasets[shard].extend(trees)


    write_dataset(datasets, output_dir, dataset_name)
stanza/stanza/utils/datasets/constituency/extract_silver_dataset.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ After running build_silver_dataset.py, this extracts the trees of a certain match level
3
+
4
+ For example
5
+
6
+ python3 stanza/utils/datasets/constituency/extract_silver_dataset.py --parsed_trees /u/nlp/data/constituency-parser/italian/2024_it_vit_electra/a*.trees --keep_score 0 --output_file /u/nlp/data/constituency-parser/italian/2024_it_vit_electra/it_silver_0.mrg
7
+
8
+ for i in `echo 0 1 2 3 4 5 6 7 8 9 10`; do python3 stanza/utils/datasets/constituency/extract_silver_dataset.py --parsed_trees /u/nlp/data/constituency-parser/italian/2024_it_vit_electra/a*.trees --keep_score $i --output_file /u/nlp/data/constituency-parser/italian/2024_it_vit_electra/it_silver_$i.mrg; done
9
+ """
10
+
11
+ import argparse
12
+ import json
13
+
14
def parse_args():
    """Parse the command line options for extracting silver trees.

    Returns:
        argparse.Namespace with parsed_trees (list of input filenames),
        keep_score (int agreement level, or None to keep everything),
        and output_file (destination path, or None for stdout).
    """
    parser = argparse.ArgumentParser(description="After finding common trees using build_silver_dataset, this extracts them all or just the ones from a particular level of accuracy")
    parser.add_argument('--parsed_trees', type=str, nargs='+', help='Input file(s) of trees parsed into the build_silver_dataset json format.')
    parser.add_argument('--keep_score', type=int, default=None, help='Which agreement level to keep. None keeps all')
    parser.add_argument('--output_file', type=str, default=None, help='Where to put the output file')
    args = parser.parse_args()

    return args


def main():
    """Filter build_silver_dataset json output down to the kept trees.

    Each input line is a json object with at least a 'tree' field (the
    bracketed tree text) and a 'count' field (the agreement score from
    build_silver_dataset).  A tree is kept when --keep_score is unset or
    equals its 'count'.  Kept trees go to --output_file, one per line,
    or to stdout when no output file is given.
    """
    args = parse_args()

    trees = []
    for filename in args.parsed_trees:
        with open(filename, encoding='utf-8') as fin:
            # iterate the file lazily instead of materializing readlines()
            for line in fin:
                record = json.loads(line)
                if args.keep_score is None or record['count'] == args.keep_score:
                    trees.append(record['tree'])

    if args.output_file is None:
        for tree in trees:
            print(tree)
    else:
        with open(args.output_file, 'w', encoding='utf-8') as fout:
            for tree in trees:
                fout.write(tree)
                fout.write('\n')

if __name__ == '__main__':
    main()
stanza/stanza/utils/datasets/coref/balance_languages.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ balance_concat.py
3
+ create a test set from a dev set which is language balanced
4
+ """
5
+
6
+ import json
7
+ from collections import defaultdict
8
+
9
+ from random import Random
10
+
11
+ # fix random seed for reproducability
12
+ R = Random(42)
13
+
14
+ with open("./corefud_concat_v1_0_langid.train.json", 'r') as df:
15
+ raw = json.load(df)
16
+
17
+ # calculate type of each class; then, we will select the one
18
+ # which has the LOWEST counts as the sample rate
19
+ lang_counts = defaultdict(int)
20
+ for i in raw:
21
+ lang_counts[i["lang"]] += 1
22
+
23
+ min_lang_count = min(lang_counts.values())
24
+
25
+ # sample 20% of the smallest amount for test set
26
+ # this will look like an absurdly small number, but
27
+ # remember this is DOCUMENTS not TOKENS or UTTERANCES
28
+ # so its actually decent
29
+ # also its per language
30
+ test_set_size = int(0.1*min_lang_count)
31
+
32
+ # sampling input by language
33
+ raw_by_language = defaultdict(list)
34
+ for i in raw:
35
+ raw_by_language[i["lang"]].append(i)
36
+ languages = list(set(raw_by_language.keys()))
37
+
38
+ train_set = []
39
+ test_set = []
40
+ for i in languages:
41
+ length = list(range(len(raw_by_language[i])))
42
+ choices = R.sample(length, test_set_size)
43
+
44
+ for indx,i in enumerate(raw_by_language[i]):
45
+ if indx in choices:
46
+ test_set.append(i)
47
+ else:
48
+ train_set.append(i)
49
+
50
+ with open("./corefud_concat_v1_0_langid-bal.train.json", 'w') as df:
51
+ json.dump(train_set, df, indent=2)
52
+
53
+ with open("./corefud_concat_v1_0_langid-bal.test.json", 'w') as df:
54
+ json.dump(test_set, df, indent=2)
55
+
56
+
57
+
58
+ # raw_by_language["en"]
59
+
60
+