bowphs committed
Commit 6cd9428 · verified · 1 Parent(s): 9cbeb98

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. stanza/stanza/models/constituency_parser.py +881 -0
  2. stanza/stanza/models/lemmatizer.py +313 -0
  3. stanza/stanza/pipeline/_constants.py +13 -0
  4. stanza/stanza/pipeline/external/spacy.py +74 -0
  5. stanza/stanza/pipeline/ner_processor.py +143 -0
  6. stanza/stanza/resources/print_charlm_depparse.py +22 -0
  7. stanza/stanza/server/dependency_converter.py +101 -0
  8. stanza/stanza/tests/classifiers/test_constituency_classifier.py +128 -0
  9. stanza/stanza/tests/common/__init__.py +0 -0
  10. stanza/stanza/tests/common/test_chuliu_edmonds.py +36 -0
  11. stanza/stanza/tests/common/test_confusion.py +81 -0
  12. stanza/stanza/tests/common/test_constant.py +67 -0
  13. stanza/stanza/tests/common/test_data_conversion.py +520 -0
  14. stanza/stanza/tests/common/test_foundation_cache.py +36 -0
  15. stanza/stanza/tests/common/test_pretrain.py +139 -0
  16. stanza/stanza/tests/common/test_utils.py +194 -0
  17. stanza/stanza/tests/constituency/__init__.py +0 -0
  18. stanza/stanza/tests/constituency/test_convert_arboretum.py +235 -0
  19. stanza/stanza/tests/constituency/test_ensemble.py +110 -0
  20. stanza/stanza/tests/constituency/test_in_order_compound_oracle.py +93 -0
  21. stanza/stanza/tests/constituency/test_parse_transitions.py +486 -0
  22. stanza/stanza/tests/constituency/test_parse_tree.py +369 -0
  23. stanza/stanza/tests/constituency/test_positional_encoding.py +45 -0
  24. stanza/stanza/tests/constituency/test_selftrain_vi_quad.py +23 -0
  25. stanza/stanza/tests/constituency/test_utils.py +68 -0
  26. stanza/stanza/tests/data/example_french.json +22 -0
  27. stanza/stanza/tests/data/test.dat +0 -0
  28. stanza/stanza/tests/data/tiny_emb.csv +4 -0
  29. stanza/stanza/tests/datasets/__init__.py +0 -0
  30. stanza/stanza/tests/datasets/ner/__init__.py +0 -0
  31. stanza/stanza/tests/datasets/ner/test_prepare_ner_file.py +77 -0
  32. stanza/stanza/tests/datasets/ner/test_utils.py +34 -0
  33. stanza/stanza/tests/lemma/test_data.py +106 -0
  34. stanza/stanza/tests/lemma/test_lemma_trainer.py +154 -0
  35. stanza/stanza/tests/lemma_classifier/test_data_preparation.py +256 -0
  36. stanza/stanza/tests/mwt/test_character_classifier.py +92 -0
  37. stanza/stanza/tests/mwt/test_english_corner_cases.py +88 -0
  38. stanza/stanza/tests/ner/test_bsf_2_iob.py +93 -0
  39. stanza/stanza/tests/ner/test_convert_amt.py +104 -0
  40. stanza/stanza/tests/ner/test_convert_starlang_ner.py +23 -0
  41. stanza/stanza/tests/ner/test_from_conllu.py +30 -0
  42. stanza/stanza/tests/ner/test_ner_utils.py +129 -0
  43. stanza/stanza/tests/pipeline/__init__.py +0 -0
  44. stanza/stanza/tests/pipeline/test_arabic_pipeline.py +27 -0
  45. stanza/stanza/tests/pipeline/test_core.py +248 -0
  46. stanza/stanza/tests/pipeline/test_depparse.py +87 -0
  47. stanza/stanza/tests/pipeline/test_english_pipeline.py +279 -0
  48. stanza/stanza/tests/pipeline/test_french_pipeline.py +353 -0
  49. stanza/stanza/tests/pipeline/test_lemmatizer.py +135 -0
  50. stanza/stanza/tests/pipeline/test_pipeline_constituency_processor.py +61 -0
stanza/stanza/models/constituency_parser.py ADDED
@@ -0,0 +1,881 @@
+ """A command line interface to a shift-reduce constituency parser.
+
+ This follows the work of
+ Recurrent neural network grammars by Dyer et al.
+ In-Order Transition-based Constituent Parsing by Liu & Zhang
+
+ The general outline is:
+
+ Train a model by taking a list of trees, converting them to
+ transition sequences, and learning a model which can predict the
+ next transition given a current state.
+ Then, at inference time, repeatedly predict the next transition until parsing is complete.
+
+ The "transitions" are variations on shift/reduce as per an
+ intro-to-compilers class. The idea is that you can treat all of the
+ words in a sentence as a buffer of tokens, then either "shift" them to
+ represent a new constituent, or "reduce" one or more constituents to
+ form a new constituent.
+
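The shift and reduce operations described above can be sketched in a few lines of Python. This is a toy illustration with made-up helper names, not the parser's actual transition or State classes:

```python
# Toy shift/reduce illustration (hypothetical helpers, not Stanza's classes).
# "shift" moves a word from the buffer onto the stack as a leaf;
# "reduce" pops the top k items and wraps them in a new labeled constituent.

def shift(stack, buffer):
    stack.append(buffer.pop(0))

def reduce(stack, label, num_children):
    children = stack[-num_children:]
    del stack[-num_children:]
    stack.append((label, *children))

stack, buffer = [], ["the", "cat", "sleeps"]
shift(stack, buffer)    # stack: ['the']
shift(stack, buffer)    # stack: ['the', 'cat']
reduce(stack, "NP", 2)  # stack: [('NP', 'the', 'cat')]
shift(stack, buffer)
reduce(stack, "VP", 1)
reduce(stack, "S", 2)
assert stack == [("S", ("NP", "the", "cat"), ("VP", "sleeps"))]
```

A trained classifier replaces the hard-coded sequence above: at each step it looks at the stack and buffer and predicts which transition to apply next.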
+ In order to make the runtime a more competitive speed, effort is taken
+ to batch the transitions and apply multiple transitions at once. At
+ train time, batches are grouped together by length, and at inference
+ time, new trees are added to the batch as previous trees in the batch
+ finish their inference.
+
+ There are a few minor differences in the model:
+ - The word input is a bi-lstm, not a uni-lstm.
+   This gave a small increase in accuracy.
+ - The combination of several constituents into one constituent is done
+   via a single bi-lstm rather than two separate lstms. This increases
+   speed without a noticeable effect on accuracy.
+ - In fact, an even better (in terms of final model accuracy) method
+   is to combine the constituents with torch.max, believe it or not.
+   See lstm_model.py for more details.
+ - Initializing the embeddings with smaller values than the pytorch default.
+   For example, on a ja_alt dataset, scores went from 0.8980 to 0.8985
+   at 200 iterations averaged over 5 trials.
+ - Training with AdaDelta first, then AdamW or madgrad later improves
+   results quite a bit. See --multistage.
+
+ A couple experiments which have been tried with little noticeable impact:
+ - Combining constituents using the method in the paper (only a trained
+   vector at the start instead of both ends) did not affect results
+   and is a little slower.
+ - Using multiple layers of LSTM hidden state for the input to the final
+   classification layers didn't help.
+ - Initializing Linear layers with He initialization and a positive bias
+   (to avoid dead connections) had no noticeable effect on accuracy:
+     0.8396 on it_turin with the original initialization
+     0.8401 and 0.8427 on two runs with updated initialization
+   (so maybe a small improvement...)
+ - Initializing LSTM layers with different gates was slightly worse:
+     forget gates of 1.0
+     forget gates of 1.0, input gates of -1.0
+ - Replacing the LSTMs that make up the Transition and Constituent
+   LSTMs with Dynamic Skip LSTMs made no difference, but was slower.
+ - Highway LSTMs also made no difference.
+ - Putting labels on the shift transitions (the word or the tag shifted)
+   or putting labels on the close transitions didn't help.
+ - Building larger constituents from the output of the constituent LSTM
+   instead of the children constituents hurts scores.
+   For example, an experiment on ja_alt went from 0.8985 to 0.8964
+   when built that way.
+ - The initial transition scheme implemented was TOP_DOWN. We tried
+   a compound unary option, since this worked so well in the CoreNLP
+   constituency parser. Unfortunately, this is far less effective
+   than IN_ORDER. Both specialized unary matrices and reusing the
+   n-ary constituency combination fell short. On the ja_alt dataset:
+     IN_ORDER, max combination method: 0.8985
+     TOP_DOWN_UNARY, specialized matrices: 0.8501
+     TOP_DOWN_UNARY, max combination method: 0.8508
+ - Adding multiple layers of MLP to combine inputs for words made
+   no difference in the scores.
+   Tried both before the LSTM and after.
+   A simple single layer tensor multiply after the LSTM works well.
+   Replacing that with a two layer MLP on the English PTB
+   with roberta-base causes a notable drop in scores.
+   The first experiment didn't use the fancy Linear weight init,
+   but adding that barely made a difference.
+   260 training iterations on en_wsj dev, roberta-base
+   model as of bb983fd5e912f6706ad484bf819486971742c3d1:
+     two layer MLP: 0.9409
+     two layer MLP, init weights: 0.9413
+     single layer: 0.9467
+ - There is code to rebuild models with a new structure in lstm_model.py.
+   As part of this, we tried to randomly reinitialize the transitions
+   if the transition embedding had gone to 0, which often happens.
+   This didn't help at all.
+ - We tried something akin to attention with just the query vector
+   over the bert embeddings as a way to mix them, but that did not
+   improve scores.
+   Example, with a self.bert_layer_mix of size bert_dim x 1:
+     mixed_bert_embeddings = []
+     for feature in bert_embeddings:
+         weighted_feature = self.bert_layer_mix(feature.transpose(1, 2))
+         weighted_feature = torch.softmax(weighted_feature, dim=1)
+         weighted_feature = torch.matmul(feature, weighted_feature).squeeze(2)
+         mixed_bert_embeddings.append(weighted_feature)
+     bert_embeddings = mixed_bert_embeddings
+   It seems just finetuning the transformer is already enough
+   (in general, no need to mix layers at all when finetuning bert embeddings).
+
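The layer-mixing experiment above amounts to a learned softmax over per-layer scores, followed by a weighted sum of the layers. A dependency-free sketch of the same arithmetic, using plain Python lists and toy dimensions in place of torch tensors (all names here are illustrative):

```python
import math

def softmax(xs):
    # Numerically stable softmax over a list of floats.
    m = max(xs)
    exps = [math.exp(x - m) for x in xs]
    total = sum(exps)
    return [e / total for e in exps]

def mix_layers(layers, mix_weights):
    # layers: one feature vector per transformer layer, for a single token.
    # Score each layer with a learned vector, softmax the scores, then
    # return the softmax-weighted combination of the layer vectors.
    scores = [sum(w * f for w, f in zip(mix_weights, layer)) for layer in layers]
    attn = softmax(scores)
    dim = len(layers[0])
    return [sum(a * layer[i] for a, layer in zip(attn, layers)) for i in range(dim)]

layers = [[1.0, 0.0], [0.0, 1.0], [0.5, 0.5]]   # three layers, dim 2
mixed = mix_layers(layers, mix_weights=[1.0, -1.0])
assert len(mixed) == 2
```

Because the weights sum to 1, the result is a convex combination of the layer vectors, which is the behavior the torch snippet achieves with `softmax` followed by `matmul`.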
+
+ The code breakdown is as follows:
+
+ this file: main interface for training or evaluating models
+ constituency/trainer.py: contains the training & evaluation code
+ constituency/ensemble.py: evaluation code specifically for letting multiple models
+   vote on the correct next transition. a modest improvement.
+ constituency/evaluate_treebanks.py: specifically to evaluate multiple parsed treebanks
+   against a gold. in particular, reports whether the theoretical best from those
+   parsed treebanks is an improvement (e.g. the k-best score as reported by CoreNLP)
+
+ constituency/parse_tree.py: a data structure for representing a parse tree and utility methods
+ constituency/tree_reader.py: a module which can read trees from a string or input file
+
+ constituency/tree_stack.py: a linked list which can branch in
+   different directions, which will be useful when implementing beam
+   search or a dynamic oracle
+ constituency/lstm_tree_stack.py: an LSTM over the elements of a TreeStack
+ constituency/transformer_tree_stack.py: attempts to run attention over the nodes
+   of a tree_stack. not as effective as the lstm_tree_stack in the initial experiments.
+   perhaps it could be refined to work better, though
+
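The branching behavior described for tree_stack.py can be sketched as a persistent linked list, where pushing returns a new node and leaves the old stack untouched. This is an illustrative reimplementation, not the actual TreeStack code:

```python
# Minimal persistent stack: push() returns a new node, so several
# branches (e.g. beam search hypotheses) can share a common tail.

class Node:
    def __init__(self, value, parent=None):
        self.value = value
        self.parent = parent

    def push(self, value):
        # Does not mutate self; both the old and new stacks remain valid.
        return Node(value, self)

    def to_list(self):
        out, node = [], self
        while node is not None:
            out.append(node.value)
            node = node.parent
        return out[::-1]

base = Node("S").push("NP")
branch_a = base.push("VP")   # two branches share the ["S", "NP"] tail
branch_b = base.push("PP")
assert branch_a.to_list() == ["S", "NP", "VP"]
assert branch_b.to_list() == ["S", "NP", "PP"]
```

Sharing the tail keeps memory linear in the number of distinct pushes rather than in the number of hypotheses, which is what makes beam search or oracle exploration cheap.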
+ constituency/parse_transitions.py: transitions and a State data structure to store them
+ constituency/transition_sequence.py: turns ParseTree objects into
+   the transition sequences needed to make them
+
+ constituency/base_model.py: operates on the transitions to turn them into constituents,
+   eventually forming one final parse tree composed of all of the constituents
+ constituency/lstm_model.py: adds LSTM features to the constituents to predict what the
+   correct transition to make is, allowing for predictions on previously unseen text
+
+ constituency/retagging.py: a couple utility methods specifically for retagging
+ constituency/utils.py: a couple utility methods
+
+ constituency/dynamic_oracle.py: a dynamic oracle which currently
+   only operates for the inorder transition sequence.
+   uses deterministic rules to redo the correct action sequence when
+   the parser makes an error.
+
+ constituency/partitioned_transformer.py: implementation of a transformer for self-attention.
+   presumably this should help, but we have yet to find a model structure where
+   this makes the scores go up.
+ constituency/label_attention.py: an even fancier form of transformer based on labeled attention:
+   https://arxiv.org/abs/1911.03875
+ constituency/positional_encoding.py: so far, just the sinusoidal is here.
+   a trained encoding is in partitioned_transformer.py.
+   this should probably be refactored to common, especially if used elsewhere.
+
+ stanza/pipeline/constituency_processor.py: interface between this model and the Pipeline
+
+ stanza/utils/datasets/constituency: various scripts and tools for processing constituency datasets
+
+ Some alternate optimizer methods:
+   adabelief: https://github.com/juntang-zhuang/Adabelief-Optimizer
+   madgrad: https://github.com/facebookresearch/madgrad
+
+ """
+
+ import argparse
+ import logging
+ import os
+ import re
+
+ import torch
+
+ import stanza
+ from stanza.models.common import constant
+ from stanza.models.common import utils
+ from stanza.models.common.peft_config import add_peft_args, resolve_peft_args
+ from stanza.models.constituency import parser_training
+ from stanza.models.constituency import retagging
+ from stanza.models.constituency.lstm_model import ConstituencyComposition, SentenceBoundary, StackHistory
+ from stanza.models.constituency.parse_transitions import TransitionScheme
+ from stanza.models.constituency.text_processing import load_model_parse_text
+ from stanza.models.constituency.utils import DEFAULT_LEARNING_EPS, DEFAULT_LEARNING_RATES, DEFAULT_MOMENTUM, DEFAULT_LEARNING_RHO, DEFAULT_WEIGHT_DECAY, NONLINEARITY, add_predict_output_args, postprocess_predict_output_args
+ from stanza.resources.common import DEFAULT_MODEL_DIR
+
+ logger = logging.getLogger('stanza')
+ tlogger = logging.getLogger('stanza.constituency.trainer')
+
+ def build_argparse():
+     """
+     Adds the arguments for building the con parser
+
+     For the most part, defaults are set to cross-validated values, at least for WSJ
+     """
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument('--data_dir', type=str, default='data/constituency', help='Directory of constituency data.')
+
+     parser.add_argument('--wordvec_dir', type=str, default='extern_data/wordvec', help='Directory of word vectors')
+     parser.add_argument('--wordvec_file', type=str, default='', help='File that contains word vectors')
+     parser.add_argument('--wordvec_pretrain_file', type=str, default=None, help='Exact name of the pretrain file to read')
+     parser.add_argument('--pretrain_max_vocab', type=int, default=250000)
+
+     parser.add_argument('--charlm_forward_file', type=str, default=None, help="Exact path to use for forward charlm")
+     parser.add_argument('--charlm_backward_file', type=str, default=None, help="Exact path to use for backward charlm")
+
+     # BERT helps a lot and actually doesn't slow things down too much
+     # for VI, for example, use vinai/phobert-base
+     parser.add_argument('--bert_model', type=str, default=None, help="Use an external bert model (requires the transformers package)")
+     parser.add_argument('--no_bert_model', dest='bert_model', action="store_const", const=None, help="Don't use bert")
+     parser.add_argument('--bert_hidden_layers', type=int, default=4, help="How many layers of hidden state to use from the transformer")
+     parser.add_argument('--bert_hidden_layers_original', action='store_const', const=None, dest='bert_hidden_layers', help='Use layers 2,3,4 of the Bert embedding')
+
+     # BERT finetuning (or any transformer finetuning)
+     # also helps quite a lot.
+     # Experimentally, finetuning all of the layers is the most effective
+     # On the id_icon dataset with the indolem transformer
+     # In this experiment, we trained for 150 iterations with AdaDelta,
+     # with the learning rate 0.01,
+     # then trained for another 150 with madgrad and no finetuning
+     #  1 layer    0.880753 (152)
+     #  2 layers   0.880453 (174)
+     #  3 layers   0.881774 (163)
+     #  4 layers   0.886915 (194)
+     #  5 layers   0.892064 (299)
+     #  6 layers   0.891825 (224)
+     #  7 layers   0.894373 (173)
+     #  8 layers   0.894505 (233)
+     #  9 layers   0.896676 (269)
+     # 10 layers   0.897525 (269)
+     # 11 layers   0.897348 (211)
+     # 12 layers   0.898729 (270)
+     # everything  0.898855 (252)
+     # so the trend is clear that more finetuning is better
+     #
+     # We found that finetuning works very well on the AdaDelta portion
+     # of a multistage training, but less well on a madgrad second
+     # stage. The issue was that we literally could not set the
+     # learning rate low enough because madgrad used epsilon in the LR:
+     # https://github.com/facebookresearch/madgrad/issues/16
+     #
+     # Possible values of the AdaDelta learning rate on the id_icon dataset
+     # In this experiment, we finetuned the entire transformer 150
+     # iterations on AdaDelta, then trained with madgrad for another
+     # 150 with no finetuning
+     # 0.0005: 0.89122 (155)
+     # 0.001:  0.889807 (241)
+     # 0.002:  0.894874 (202)
+     # 0.005:  0.896327 (270)
+     # 0.006:  0.898989 (246)
+     # 0.007:  0.896712 (167)
+     # 0.008:  0.900136 (237)
+     # 0.009:  0.898597 (169)
+     # 0.01:   0.898665 (251)
+     # 0.012:  0.89661 (274)
+     # 0.014:  0.899149 (283)
+     # 0.016:  0.896314 (230)
+     # 0.018:  0.897753 (257)
+     # 0.02:   0.893665 (256)
+     # 0.05:   0.849274 (159)
+     # 0.1:    0.850633 (183)
+     # 0.2:    0.847332 (176)
+     #
+     # The peak is somewhere around 0.008 to 0.014, with the further
+     # observation that at the 150 iteration mark, 0.009 was winning:
+     # 0.007: 0.894589 (33)
+     # 0.008: 0.894777 (53)
+     # 0.009: 0.896466 (56)
+     # 0.01:  0.895557 (71)
+     # 0.012: 0.893479 (45)
+     # 0.014: 0.89468 (116)
+     # 0.016: 0.893053 (128)
+     # 0.018: 0.893086 (48)
+     #
+     # Another option is to train for a few iterations with no
+     # finetuning, then begin finetuning. However, that was not
+     # beneficial at all.
+     # Start iteration on id_icon, same setup as above:
+     #  1: 0.898855 (252)
+     #  5: 0.897885 (217)
+     # 10: 0.895367 (215)
+     # 25: 0.896781 (193)
+     # 50: 0.895216 (193)
+     # Using adamw instead of madgrad:
+     #  1: 0.900594 (226)
+     #  5: 0.898153 (267)
+     # 10: 0.898756 (271)
+     # 25: 0.896867 (256)
+     # 50: 0.895025 (220)
+     #
+     # With the observation that very low learning rate is currently
+     # not working for madgrad, we tried to parameter sweep LR for
+     # AdamW, and got the following, using a first stage LR of 0.009:
+     # 0.0:     0.899706 (290)
+     # 0.00005: 0.899631 (176)
+     # 0.0001:  0.899851 (233)
+     # 0.0002:  0.898601 (207)
+     # 0.0003:  0.899258 (252)
+     # 0.0004:  0.90033 (187)
+     # 0.0005:  0.899091 (183)
+     # 0.001:   0.899791 (268)
+     # 0.002:   0.899453 (196)
+     # 0.003:   0.897029 (173)
+     # 0.004:   0.899566 (290)
+     # 0.005:   0.899285 (289)
+     # 0.01:    0.898938 (233)
+     # 0.02:    0.898983 (248)
+     # 0.03:    0.898571 (247)
+     # 0.04:    0.898466 (180)
+     # 0.05:    0.897448 (214)
+     # It should be noted that in the 0.0001 range, the epoch to epoch
+     # change of the Bert weights was almost negligible. Weights would
+     # change in the 5th or 6th decimal place, if at all.
+     #
+     # The conclusion of all these experiments is that, if we are using
+     # bert_finetuning, the best approach is probably a stage1 learning
+     # rate of 0.009 or so and a second stage optimizer of adamw with
+     # no LR or a very low LR. This behavior is what happens with the
+     # --stage1_bert_finetune flag
+     parser.add_argument('--bert_finetune', default=False, action='store_true', help='Finetune the bert (or other transformer)')
+     parser.add_argument('--no_bert_finetune', dest='bert_finetune', action='store_false', help="Don't finetune the bert (or other transformer)")
+     parser.add_argument('--bert_finetune_layers', default=None, type=int, help='Only finetune this many layers from the transformer')
+     parser.add_argument('--bert_finetune_begin_epoch', default=None, type=int, help='Which epoch to start finetuning the transformer')
+     parser.add_argument('--bert_finetune_end_epoch', default=None, type=int, help='Which epoch to stop finetuning the transformer')
+     parser.add_argument('--bert_learning_rate', default=0.009, type=float, help='Scale the learning rate for transformer finetuning by this much')
+     parser.add_argument('--stage1_bert_learning_rate', default=None, type=float, help="Scale the learning rate for transformer finetuning by this much only during an AdaDelta warmup")
+     parser.add_argument('--bert_weight_decay', default=0.0001, type=float, help='Scale the weight decay for transformer finetuning by this much')
+     parser.add_argument('--stage1_bert_finetune', default=None, action='store_true', help="Finetune the bert (or other transformer) during an AdaDelta warmup, even if the second half doesn't use bert_finetune")
+     parser.add_argument('--no_stage1_bert_finetune', dest='stage1_bert_finetune', action='store_false', help="Don't finetune the bert (or other transformer) during an AdaDelta warmup, even if the second half doesn't use bert_finetune")
+
+     add_peft_args(parser)
+
+     parser.add_argument('--tag_embedding_dim', type=int, default=20, help="Embedding size for a tag. 0 turns off the feature")
+     # Smaller values also seem to work
+     # For example, after 700 iterations:
+     #  32: 0.9174
+     #  50: 0.9183
+     #  72: 0.9176
+     # 100: 0.9185
+     # not a huge difference regardless
+     # (these numbers were without retagging)
+     parser.add_argument('--delta_embedding_dim', type=int, default=100, help="Embedding size for a delta embedding")
+
+     parser.add_argument('--train_file', type=str, default=None, help='Input file for data loader.')
+     parser.add_argument('--no_train_remove_duplicates', default=True, action='store_false', dest="train_remove_duplicates", help="Do/don't remove duplicates from the training file. Could be useful for intentionally reweighting some trees")
+     parser.add_argument('--silver_file', type=str, default=None, help='Secondary training file.')
+     parser.add_argument('--silver_remove_duplicates', default=False, action='store_true', help="Do/don't remove duplicates from the silver training file. Could be useful for intentionally reweighting some trees")
+     parser.add_argument('--eval_file', type=str, default=None, help='Input file for data loader.')
+     # TODO: possibly refactor --tokenized_file / --tokenized_dir from here & ensemble
+     parser.add_argument('--xml_tree_file', type=str, default=None, help='Input file of VLSP formatted trees for parsing with parse_text.')
+     parser.add_argument('--tokenized_file', type=str, default=None, help='Input file of tokenized text for parsing with parse_text.')
+     parser.add_argument('--tokenized_dir', type=str, default=None, help='Input directory of tokenized text for parsing with parse_text.')
+     parser.add_argument('--mode', default='train', choices=['train', 'parse_text', 'predict', 'remove_optimizer'])
+     parser.add_argument('--num_generate', type=int, default=0, help='When running a dev set, how many sentences to generate beyond the greedy one')
+     add_predict_output_args(parser)
+
+     parser.add_argument('--lang', type=str, help='Language')
+     parser.add_argument('--shorthand', type=str, help="Treebank shorthand")
+
+     parser.add_argument('--transition_embedding_dim', type=int, default=20, help="Embedding size for a transition")
+     parser.add_argument('--transition_hidden_size', type=int, default=20, help="Embedding size for transition stack")
+     parser.add_argument('--transition_stack', default=StackHistory.LSTM, type=lambda x: StackHistory[x.upper()],
+                         help='How to track transitions over a parse. {}'.format(", ".join(x.name for x in StackHistory)))
+     parser.add_argument('--transition_heads', default=4, type=int, help="How many heads to use in MHA *if* the transition_stack is Attention")
+
+     parser.add_argument('--constituent_stack', default=StackHistory.LSTM, type=lambda x: StackHistory[x.upper()],
+                         help='How to track constituents over a parse. {}'.format(", ".join(x.name for x in StackHistory)))
+     parser.add_argument('--constituent_heads', default=8, type=int, help="How many heads to use in MHA *if* the constituent_stack is Attention")
+
+     # larger was more effective, up to a point
+     # substantially smaller, such as 128,
+     # is fine if bert & charlm are not available
+     parser.add_argument('--hidden_size', type=int, default=512, help="Size of the output layers for constituency stack and word queue")
+
+     parser.add_argument('--epochs', type=int, default=400)
+     parser.add_argument('--epoch_size', type=int, default=5000, help="Runs this many trees in an 'epoch' instead of going through the training dataset exactly once. Set to 0 to do the whole training set")
+     parser.add_argument('--silver_epoch_size', type=int, default=None, help="Runs this many trees in a silver 'epoch'. If not set, will match --epoch_size")
+
+     # AdaDelta warmup for the conparser. Motivation: AdaDelta results in
+     # higher scores overall, but learns 0s for the weights of the pattn and
+     # lattn layers. AdamW learns weights for pattn, and the models are more
+     # accurate than models trained without pattn using AdamW, but the
+     # models score lower overall than the AdaDelta models.
+     #
+     # This improves that by first running AdaDelta, then switching.
+     #
+     # Now, if --multistage is set, run AdaDelta for half the epochs with no
+     # pattn or lattn. Then start the specified optimizer for the rest of
+     # the time with the full model. If pattn and lattn are both present,
+     # the model is 1/2 no attn, 1/4 pattn, 1/4 pattn and lattn
+     #
+     # Improvement on the WSJ dev set can be seen from 94.8 to 95.3
+     # when 4 layers of pattn are trained this way.
+     # More experiments to follow.
+     parser.add_argument('--multistage', default=True, action='store_true', help='1/2 epochs with adadelta no pattn or lattn, 1/4 with chosen optim and no lattn, 1/4 full model')
+     parser.add_argument('--no_multistage', dest='multistage', action='store_false', help="don't do the multistage learning")
+
+     # 1 seems to be the most effective, but we should cross-validate
+     parser.add_argument('--oracle_initial_epoch', type=int, default=1, help="Epoch where we start using the dynamic oracle to let the parser keep going with wrong decisions")
+     parser.add_argument('--oracle_frequency', type=float, default=0.8, help="How often to use the oracle vs how often to force the correct transition")
+     parser.add_argument('--oracle_forced_errors', type=float, default=0.001, help="Occasionally have the model randomly walk through the state space to try to learn how to recover")
+     parser.add_argument('--oracle_level', type=int, default=None, help='Restrict oracle transitions to this level or lower. 0 means off. None means use all oracle transitions.')
+     parser.add_argument('--additional_oracle_levels', type=str, default=None, help='Add some additional experimental oracle transitions. Basically for A/B testing transitions we expect to be bad.')
+     parser.add_argument('--deactivated_oracle_levels', type=str, default=None, help='Temporarily turn off a default oracle level. Basically for A/B testing transitions we expect to be bad.')
+
+     # 30 is slightly slower than 50, for example, but seems to train a bit better on WSJ
+     # earlier version of the model (less accurate overall) had the following results with adadelta:
+     #  30: 0.9085
+     #  50: 0.9070
+     #  75: 0.9010
+     # 150: 0.8985
+     # as another data point, running a newer version with better constituency lstm behavior had:
+     # 30: 0.9111
+     # 50: 0.9094
+     # checking smaller batch sizes to see how this works, at 135 epochs, the values are
+     # 10: 0.8919
+     # 20: 0.9072
+     # 30: 0.9121
+     # obviously these experiments aren't the complete story, but it
+     # looks like 30 trees per batch is the best value for WSJ
+     # note that these numbers are for adadelta and might not apply
+     # to other optimizers
+     # eval batch should generally be faster the bigger the batch,
+     # up to a point, as it allows for more batching of the LSTM
+     # operations and the prediction step
+     parser.add_argument('--train_batch_size', type=int, default=30, help='How many trees to train before taking an optimizer step')
+     parser.add_argument('--eval_batch_size', type=int, default=50, help='How many trees to batch when running eval')
+
+     parser.add_argument('--save_dir', type=str, default='saved_models/constituency', help='Root dir for saving models.')
+     parser.add_argument('--save_name', type=str, default="{shorthand}_{embedding}_{finetune}_constituency.pt", help="File name to save the model")
+     parser.add_argument('--save_each_name', type=str, default=None, help="Save each model in sequence to this pattern. Mostly for testing")
+     parser.add_argument('--save_each_start', type=int, default=None, help="When to start saving each model")
+     parser.add_argument('--save_each_frequency', type=int, default=1, help="How frequently to save each model")
+     parser.add_argument('--no_save_each_optimizer', dest='save_each_optimizer', default=True, action='store_false', help="Don't save the optimizer when saving 'each' model")
+
+     parser.add_argument('--seed', type=int, default=1234)
+
+     parser.add_argument('--no_check_valid_states', default=True, action='store_false', dest='check_valid_states', help="Don't check the constituents or transitions in the dev set when starting a new parser. Warning: the parser will never guess unknown constituents")
+     parser.add_argument('--no_strict_check_constituents', default=True, action='store_false', dest='strict_check_constituents', help="Don't check the constituents between the train & dev set. May result in untrainable transitions")
+     utils.add_device_args(parser)
+
+     # Numbers are on a VLSP dataset, before adding attn or other improvements
+     # baseline is an 80.6 model that occurs when trained using adadelta, lr 1.0
+     #
+     # adabelief 0.1:  fails horribly
+     # 0.02:     converges very low scores
+     # 0.01:     very slow learning
+     # 0.002:    almost decent
+     # 0.001:    close, but about 1 f1 low on IT
+     # 0.0005:   79.71
+     # 0.0002:   80.11
+     # 0.0001:   79.85
+     # 0.00005:  80.40
+     # 0.00002:  80.02
+     # 0.00001:  78.95
+     #
+     # madgrad 0.005: fails horribly
+     # 0.001:    low scores
+     # 0.0005:   still somewhat low
+     # 0.0002:   close, but about 1 f1 low on IT
+     # 0.0001:   80.04
+     # 0.00005:  79.91
+     # 0.00002:  80.15
+     # 0.00001:  80.44
+     # 0.000005: 80.34
+     # 0.000002: 80.39
+     #
+     # adamw experiment on a TR dataset (not necessarily the best test case)
+     # note that at that time, the expected best for adadelta was 0.816
+     #
+     # 0.00005 - 0.7925
+     # 0.0001  - 0.7889
+     # 0.0002  - 0.8110
+     # 0.00025 - 0.8108
+     # 0.0003  - 0.8050
+     # 0.0005  - 0.8076
+     # 0.001   - 0.8069
+
474
+ # Numbers on the VLSP Dataset, with --multistage and default learning rates and adabelief optimizer
475
+ # Gelu: 82.32
476
+ # Mish: 81.95
477
+ # ELU: 81.73
478
+ # Hardshrink: 0.3
479
+ # Hardsigmoid: 79.03
480
+ # Hardtanh: 81.44
481
+ # Hardswish: 81.67
482
+ # Logsigmoid: 80.91
483
+ # Prelu: 80.95 (terminated early)
484
+ # Relu6: 81.91
485
+ # RReLU: 77.00
486
+ # Selu: 81.17
487
+ # Celu: 81.43
488
+ # Silu: 81.90
489
+ # Softplus: 80.94
490
+ # Softshrink: 0.3
491
+ # Softsign: 81.63
492
+ # Softshrink: 13.74
493
+ #
494
+ # Tests with no_charlm, --multistage
495
+ # Gelu
496
+ # 0.00002 0.819746
497
+ # 0.00005 0.818
498
+ # 0.0001 0.818566
499
+ # 0.0002 0.819111
500
+ # 0.001 0.815609
501
+ #
502
+ # Mish
503
+ # 0.00002 0.816898
504
+ # 0.00005 0.821085
505
+ # 0.0001 0.817821
506
+ # 0.0002 0.818806
507
+ # 0.001 0.816494
508
+ #
509
+ # Relu
510
+ # 0.00002 0.818402
511
+ # 0.00005 0.819019
512
+ # 0.0001 0.821625
513
+ # 0.0002 0.820633
514
+ # 0.001 0.814315
515
+ #
516
+ # Relu6
517
+ # 0.00002 0.819719
518
+ # 0.00005 0.819871
519
+ # 0.0001 0.819018
520
+ # 0.0002 0.819506
521
+ # 0.001 0.819018
522
+
523
+ parser.add_argument('--learning_rate', default=None, type=float, help='Learning rate for the optimizer. Reasonable values are 1.0 for adadelta or 0.001 for SGD. None uses a default for the given optimizer: {}'.format(DEFAULT_LEARNING_RATES))
524
+ parser.add_argument('--learning_eps', default=None, type=float, help='eps value to use in the optimizer. None uses a default for the given optimizer: {}'.format(DEFAULT_LEARNING_EPS))
525
+ parser.add_argument('--learning_momentum', default=None, type=float, help='Momentum. None uses a default for the given optimizer: {}'.format(DEFAULT_MOMENTUM))
526
+ # weight decay values other than adadelta have not been thoroughly tested.
527
+ # When using adadelta, weight_decay of 0.01 to 0.001 had the best results.
528
+ # 0.1 was very clearly too high. 0.0001 might have been okay.
529
+ # Running a series of 5x experiments on a VI dataset:
530
+ # 0.030: 0.8167018
531
+ # 0.025: 0.81659
532
+ # 0.020: 0.81722
533
+ # 0.015: 0.81721
534
+ # 0.010: 0.81474348
535
+ # 0.005: 0.81503
536
+ parser.add_argument('--learning_weight_decay', default=None, type=float, help='Weight decay (eg, l2 reg) to use in the optimizer')
537
+ parser.add_argument('--learning_rho', default=DEFAULT_LEARNING_RHO, type=float, help='Rho parameter in Adadelta')
538
+ # A few experiments on beta2 didn't show much benefit from changing it
539
+ # On an experiment with training WSJ with default parameters
540
+ # AdaDelta for 200 iterations, then training AdamW for 200 more,
541
+ # 0.999, 0.997, 0.995 all wound up with 0.9588
542
+ # values lower than 0.995 all had a slight dropoff
543
+ parser.add_argument('--learning_beta2', default=0.999, type=float, help='Beta2 argument for AdamW')
544
+ parser.add_argument('--optim', default=None, help='Optimizer type: SGD, AdamW, Adadelta, AdaBelief, Madgrad')
545
+
546
+ parser.add_argument('--stage1_learning_rate', default=None, type=float, help='Learning rate to use in the first stage of --multistage. None means use default: {}'.format(DEFAULT_LEARNING_RATES['adadelta']))
547
+
548
+ parser.add_argument('--learning_rate_warmup', default=0, type=int, help="Number of epochs to ramp up learning rate from 0 to full. Set to 0 to always use the chosen learning rate. Currently not functional, as it didn't do anything")
549
+
550
+ parser.add_argument('--learning_rate_factor', default=0.6, type=float, help='Factor by which to decrease the learning rate when plateaued')
551
+ parser.add_argument('--learning_rate_patience', default=5, type=int, help='Plateau learning rate patience')
552
+ parser.add_argument('--learning_rate_cooldown', default=10, type=int, help='Plateau learning rate cooldown')
553
+ parser.add_argument('--learning_rate_min_lr', default=None, type=float, help='Plateau learning rate minimum')
554
+ parser.add_argument('--stage1_learning_rate_min_lr', default=None, type=float, help='Plateau learning rate minimum (stage 1)')
555
+
556
+ parser.add_argument('--grad_clipping', default=None, type=float, help='Clip abs(grad) to this amount. Use --no_grad_clipping to turn off grad clipping')
557
+ parser.add_argument('--no_grad_clipping', action='store_const', const=None, dest='grad_clipping', help='Use --no_grad_clipping to turn off grad clipping')
558
+
559
+ # Large Margin is from Large Margin In Softmax Cross-Entropy Loss
560
+ # it did not help on an Italian VIT test
561
+ # scores went from 0.8252 to 0.8248
562
+ parser.add_argument('--loss', default='cross', help='cross, large_margin, or focal. Focal requires `pip install focal_loss_torch`')
563
+ parser.add_argument('--loss_focal_gamma', default=2, type=float, help='gamma value for a focal loss')
564
+
565
+ # turn off dropout for word_dropout, predict_dropout, and lstm_input_dropout
566
+ # this mechanism doesn't actually turn off lstm_layer_dropout (yet)
567
+ # but that is set to a default of 0 anyway
568
+ # this is reusing the idea presented in
569
+ # https://arxiv.org/pdf/2303.01500v2
570
+ # "Dropout Reduces Underfitting"
571
+ # Zhuang Liu, Zhiqiu Xu, Joseph Jin, Zhiqiang Shen, Trevor Darrell
572
+ # Unfortunately, this does not consistently help results
573
+ # Average of 5 models w/ transformer, dev / test
574
+ # id_icon - improves a little
575
+ # baseline 0.8823 0.8904
576
+ # early_dropout 40 0.8835 0.8919
577
+ # ja_alt - worsens a little
578
+ # baseline 0.9308 0.9355
579
+ # early_dropout 40 0.9287 0.9345
580
+ # vi_vlsp23 - worsens a little
581
+ # baseline 0.8262 0.8290
582
+ # early_dropout 40 0.8255 0.8286
583
+ # We keep this as an available option for further experiments, if needed
584
+ parser.add_argument('--early_dropout', default=-1, type=int, help='When to turn off dropout')
585
+ # When using word_dropout and predict_dropout in conjunction with relu, one particular experiment produced the following dev scores after 300 iterations:
586
+ # 0.0: 0.9085
587
+ # 0.2: 0.9165
588
+ # 0.4: 0.9162
589
+ # 0.5: 0.9123
590
+ # Letting 0.2 and 0.4 run for longer, along with 0.3 as another
591
+ # trial, continued to give extremely similar results over time.
592
+ # No attempt has been made to test the different dropouts separately...
593
+ parser.add_argument('--word_dropout', default=0.2, type=float, help='Dropout on the word embedding')
594
+ parser.add_argument('--predict_dropout', default=0.2, type=float, help='Dropout on the final prediction layer')
595
+ # lstm_dropout has not been fully tested yet
596
+ # one experiment after 200 iterations (after retagging, so scores are lower than some other experiments):
597
+ # 0.0: 0.9093
598
+ # 0.1: 0.9094
599
+ # 0.2: 0.9094
600
+ # 0.3: 0.9076
601
+ # 0.4: 0.9077
602
+ parser.add_argument('--lstm_layer_dropout', default=0.0, type=float, help='Dropout in the LSTM layers')
603
+ # one not very conclusive experiment (not long enough) came up with these numbers after ~200 iterations
604
+ # 0.0 0.9091
605
+ # 0.1 0.9095
606
+ # 0.2 0.9118
607
+ # 0.3 0.9123
608
+ # 0.4 0.9080
609
+ parser.add_argument('--lstm_input_dropout', default=0.2, type=float, help='Dropout on the input to an LSTM')
610
+
611
+ parser.add_argument('--transition_scheme', default=TransitionScheme.IN_ORDER, type=lambda x: TransitionScheme[x.upper()],
612
+ help='Transition scheme to use. {}'.format(", ".join(x.name for x in TransitionScheme)))
613
+
614
+ parser.add_argument('--reversed', default=False, action='store_true', help='Do the transition sequence reversed')
615
+
616
+ # combining dummy and open node embeddings might be a slight improvement
617
+ # for example, after 550 iterations, one experiment had
618
+ # True: 0.9154
619
+ # False: 0.9150
620
+ # another (with a different structure) had 850 iterations
621
+ # True: 0.9155
622
+ # False: 0.9149
623
+ parser.add_argument('--combined_dummy_embedding', default=True, action='store_true', help="Use the same embedding for dummy nodes and the vectors used when combining constituents")
624
+ parser.add_argument('--no_combined_dummy_embedding', dest='combined_dummy_embedding', action='store_false', help="Don't use the same embedding for dummy nodes and the vectors used when combining constituents")
625
+
626
+ # relu gave at least 1 F1 improvement over tanh in various experiments
627
+ # relu & gelu seem roughly the same, but relu is clearly faster.
628
+ # relu, 496 iterations: 0.9176
629
+ # gelu, 467 iterations: 0.9181
630
+ # after the same clock time on the same hardware. The two had been
631
+ # trading places in terms of accuracy over those ~500 iterations.
632
+ # leaky_relu was not an improvement - a full run on WSJ led to 0.9181 f1 instead of 0.919
633
+ # See constituency/utils.py for more extensive comments on nonlinearity options
634
+ parser.add_argument('--nonlinearity', default='relu', choices=NONLINEARITY.keys(), help='Nonlinearity to use in the model. relu is a noticeable improvement over tanh')
635
+ # In one experiment on an Italian dataset, VIT, we got the following:
636
+ # 0.8254 with relu as the nonlinearity (10 trials)
637
+ # 0.8265 with maxout, k = 2 (15)
638
+ # 0.8253 with maxout, k = 3 (5)
639
+ # The speed in terms of trees/second might be slightly slower with maxout.
640
+ # 51.4 it/s on a Titan Xp with maxout 2 and 51.9 it/s with relu
641
+ # It might also be worth running some experiments with bigger
642
+ # output layers to see if that makes up for the difference in score.
643
+ parser.add_argument('--maxout_k', default=None, type=int, help="Use maxout layers instead of a nonlinearity for the output layers")
644
+
645
+ parser.add_argument('--use_silver_words', default=True, dest='use_silver_words', action='store_true', help="Train word vectors for words which only appear in the silver dataset")
646
+ parser.add_argument('--no_use_silver_words', default=True, dest='use_silver_words', action='store_false', help="Don't train word vectors for words which only appear in the silver dataset")
647
+ parser.add_argument('--rare_word_unknown_frequency', default=0.02, type=float, help='How often to replace a rare word with UNK when training')
648
+ parser.add_argument('--rare_word_threshold', default=0.02, type=float, help='How many words to consider as rare words as a fraction of the dataset')
649
+ parser.add_argument('--tag_unknown_frequency', default=0.001, type=float, help='How often to replace a tag with UNK when training')
650
+
651
+ parser.add_argument('--num_lstm_layers', default=2, type=int, help='How many layers to use in the LSTMs')
652
+ parser.add_argument('--num_tree_lstm_layers', default=None, type=int, help='How many layers to use in the TREE_LSTMs, if used. This also increases the width of the word outputs to match the tree lstm inputs. Default 2 if TREE_LSTM or TREE_LSTM_CX, 1 otherwise')
653
+ parser.add_argument('--num_output_layers', default=3, type=int, help='How many layers to use at the prediction level')
654
+
655
+ parser.add_argument('--sentence_boundary_vectors', default=SentenceBoundary.EVERYTHING, type=lambda x: SentenceBoundary[x.upper()],
656
+ help='Vectors to learn at the start & end of sentences. {}'.format(", ".join(x.name for x in SentenceBoundary)))
657
+ parser.add_argument('--constituency_composition', default=ConstituencyComposition.MAX, type=lambda x: ConstituencyComposition[x.upper()],
658
+ help='How to build a new composition from its children. {}'.format(", ".join(x.name for x in ConstituencyComposition)))
659
+ parser.add_argument('--reduce_heads', default=8, type=int, help='Number of attn heads to use when reducing children into a parent tree (constituency_composition == attn)')
660
+ parser.add_argument('--reduce_position', default=None, type=int, help="Dimension of position vector to use when reducing children. None means 1/4 hidden_size, 0 means don't use (constituency_composition == key | untied_key)")
661
+
662
+ parser.add_argument('--relearn_structure', action='store_true', help='Starting from an existing checkpoint, add or remove pattn / lattn. One thing that works well is to train an initial model using adadelta with no pattn, then add pattn with adamw')
663
+ parser.add_argument('--finetune', action='store_true', help='Load existing model during `train` mode from `load_name` path')
664
+ parser.add_argument('--checkpoint_save_name', type=str, default=None, help="File name to save the most recent checkpoint")
665
+ parser.add_argument('--no_checkpoint', dest='checkpoint', action='store_false', help="Don't save checkpoints")
666
+ parser.add_argument('--load_name', type=str, default=None, help='Model to load when finetuning, evaluating, or manipulating an existing file')
667
+ parser.add_argument('--load_package', type=str, default=None, help='Download an existing stanza package & use this for tests, finetuning, etc')
668
+
669
+ retagging.add_retag_args(parser)
670
+
671
+ # Partitioned Attention
672
+ parser.add_argument('--pattn_d_model', default=1024, type=int, help='Partitioned attention model dimensionality')
673
+ parser.add_argument('--pattn_morpho_emb_dropout', default=0.2, type=float, help='Dropout rate for morphological features obtained from pretrained model')
674
+ parser.add_argument('--pattn_encoder_max_len', default=512, type=int, help='Max length that can be put into the transformer attention layer')
675
+ parser.add_argument('--pattn_num_heads', default=8, type=int, help='Partitioned attention model number of attention heads')
676
+ parser.add_argument('--pattn_d_kv', default=64, type=int, help='Size of the query and key vector')
677
+ parser.add_argument('--pattn_d_ff', default=2048, type=int, help='Size of the intermediate vectors in the feed-forward sublayer')
678
+ parser.add_argument('--pattn_relu_dropout', default=0.1, type=float, help='ReLU dropout probability in feed-forward sublayer')
679
+ parser.add_argument('--pattn_residual_dropout', default=0.2, type=float, help='Residual dropout probability for all residual connections')
680
+ parser.add_argument('--pattn_attention_dropout', default=0.2, type=float, help='Attention dropout probability')
681
+ parser.add_argument('--pattn_num_layers', default=0, type=int, help='Number of layers for the Partitioned Attention. Currently turned off')
682
+ parser.add_argument('--pattn_bias', default=False, action='store_true', help='Whether or not to learn an additive bias')
683
+ # Results seem relatively similar with learned position embeddings or sin/cos position embeddings
684
+ parser.add_argument('--pattn_timing', default='sin', choices=['learned', 'sin'], help='Use a learned embedding or a sin embedding')
685
+
686
+ # Label Attention
687
+ parser.add_argument('--lattn_d_input_proj', default=None, type=int, help='If set, project the non-positional inputs down to this size before proceeding.')
688
+ parser.add_argument('--lattn_d_kv', default=64, type=int, help='Dimension of the key/query vector')
689
+ parser.add_argument('--lattn_d_proj', default=64, type=int, help='Dimension of the output vector from each label attention head')
690
+ parser.add_argument('--lattn_resdrop', default=True, action='store_true', help='Whether or not to use Residual Dropout')
691
+ parser.add_argument('--lattn_pwff', default=True, action='store_true', help='Whether or not to use a Position-wise Feed-forward Layer')
692
+ parser.add_argument('--lattn_q_as_matrix', default=False, action='store_true', help='Use a matrix for the queries in Label Attention instead of learned query vectors. Default False, i.e. use learned query vectors')
693
+ parser.add_argument('--lattn_partitioned', default=True, action='store_true', help='Whether or not it is partitioned')
694
+ parser.add_argument('--no_lattn_partitioned', default=True, action='store_false', dest='lattn_partitioned', help='Whether or not it is partitioned')
695
+ parser.add_argument('--lattn_combine_as_self', default=False, action='store_true', help='Combine the label attention heads as in self-attention instead of concatenating them. Default False, i.e. concatenate')
696
+ # currently unused - always assume 1/2 of pattn
697
+ #parser.add_argument('--lattn_d_positional', default=512, type=int, help='Dimension for the positional embedding')
698
+ parser.add_argument('--lattn_d_l', default=32, type=int, help='Number of labels')
699
+ parser.add_argument('--lattn_attention_dropout', default=0.2, type=float, help='Dropout for attention layer')
700
+ parser.add_argument('--lattn_d_ff', default=2048, type=int, help='Dimension of the Feed-forward layer')
701
+ parser.add_argument('--lattn_relu_dropout', default=0.2, type=float, help='Relu dropout for the label attention')
702
+ parser.add_argument('--lattn_residual_dropout', default=0.2, type=float, help='Residual dropout for the label attention')
703
+ parser.add_argument('--lattn_combined_input', default=True, action='store_true', help='Combine all inputs for the lattn, not just the pattn')
704
+ parser.add_argument('--use_lattn', default=False, action='store_true', help='Use the lattn layers - currently turned off')
705
+ parser.add_argument('--no_use_lattn', dest='use_lattn', action='store_false', help='Use the lattn layers - currently turned off')
706
+ parser.add_argument('--no_lattn_combined_input', dest='lattn_combined_input', action='store_false', help="Use only the pattn output as input to the lattn, instead of combining all inputs")
707
+
708
+ parser.add_argument('--log_norms', default=False, action='store_true', help='Log the parameters norms while training. A very noisy option')
709
+ parser.add_argument('--log_shapes', default=False, action='store_true', help='Log the parameters shapes at the beginning')
710
+ parser.add_argument('--watch_regex', default=None, help='regex to describe which weights and biases to output, if any')
711
+
712
+ parser.add_argument('--wandb', action='store_true', help='Start a wandb session and write the results of training. Only applies to training. Use --wandb_name instead to specify a name')
713
+ parser.add_argument('--wandb_name', default=None, help='Name of a wandb session to start when training. Will default to the dataset short name')
714
+ parser.add_argument('--wandb_norm_regex', default=None, help='Log on wandb the norm of any tensor whose name matches this regex. Might get cluttered?')
715
+
716
+ return parser
717
+
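Many of the boolean options above come in `--foo` / `--no_foo` pairs that write to a single `dest`. A minimal, self-contained sketch of that argparse idiom (the flag name mirrors `--combined_dummy_embedding`; nothing else from the parser above is assumed):

```python
import argparse

parser = argparse.ArgumentParser()
# Both flags share one dest; the default lives on the positive flag,
# and whichever flag appears last on the command line wins.
parser.add_argument('--combined_dummy_embedding', dest='combined_dummy_embedding',
                    default=True, action='store_true',
                    help="Use the same embedding for dummy nodes and combined constituents")
parser.add_argument('--no_combined_dummy_embedding', dest='combined_dummy_embedding',
                    action='store_false',
                    help="Use separate embeddings for dummy nodes and combined constituents")

print(parser.parse_args([]).combined_dummy_embedding)                                 # True
print(parser.parse_args(['--no_combined_dummy_embedding']).combined_dummy_embedding)  # False
```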
718
+ def build_model_filename(args):
719
+ embedding = utils.embedding_name(args)
720
+ maybe_finetune = "finetuned" if args['bert_finetune'] or args['stage1_bert_finetune'] else ""
721
+ transformer_finetune_begin = "%d" % args['bert_finetune_begin_epoch'] if args['bert_finetune_begin_epoch'] is not None else ""
722
+ model_save_file = args['save_name'].format(shorthand=args['shorthand'],
723
+ oracle_level=args['oracle_level'],
724
+ embedding=embedding,
725
+ finetune=maybe_finetune,
726
+ transformer_finetune_begin=transformer_finetune_begin,
727
+ transition_scheme=args['transition_scheme'].name.lower().replace("_", ""),
728
+ tscheme=args['transition_scheme'].short_name,
729
+ trans_layers=args['bert_hidden_layers'],
730
+ seed=args['seed'])
731
+ model_save_file = re.sub("_+", "_", model_save_file)
732
+ logger.info("Expanded save_name: %s", model_save_file)
733
+
734
+ model_dir = os.path.split(model_save_file)[0]
735
+ if model_dir != args['save_dir']:
736
+ model_save_file = os.path.join(args['save_dir'], model_save_file)
737
+ return model_save_file
738
+
739
+ def parse_args(args=None):
740
+ parser = build_argparse()
741
+
742
+ args = parser.parse_args(args=args)
743
+ resolve_peft_args(args, logger, check_bert_finetune=False)
744
+ if not args.lang and args.shorthand and len(args.shorthand.split("_", maxsplit=1)) == 2:
745
+ args.lang = args.shorthand.split("_")[0]
746
+
747
+ if args.stage1_bert_learning_rate is None:
748
+ args.stage1_bert_learning_rate = args.bert_learning_rate
749
+
750
+ if args.optim is None and args.mode == 'train':
751
+ if not args.multistage:
752
+ # this seemed to work the best when not doing multistage
753
+ args.optim = "adadelta"
754
+ if args.use_peft and not args.bert_finetune:
755
+ logger.info("--use_peft set. setting --bert_finetune as well")
756
+ args.bert_finetune = True
757
+ elif args.bert_finetune or args.stage1_bert_finetune:
758
+ logger.info("Multistage training is set, optimizer is not chosen, and bert finetuning is active. Will use AdamW as the second stage optimizer.")
759
+ args.optim = "adamw"
760
+ else:
761
+ # if MADGRAD exists, use it
762
+ # otherwise, adamw
763
+ try:
764
+ import madgrad
765
+ args.optim = "madgrad"
766
+ logger.info("Multistage training is set, optimizer is not chosen, and MADGRAD is available. Will use MADGRAD as the second stage optimizer.")
767
+ except ModuleNotFoundError as e:
768
+ logger.warning("Multistage training is set. Best models are with MADGRAD, but it is not installed. Will use AdamW for the second stage optimizer. Consider installing MADGRAD")
769
+ args.optim = "adamw"
770
+
771
+ if args.mode == 'train':
772
+ if args.learning_rate is None:
773
+ args.learning_rate = DEFAULT_LEARNING_RATES.get(args.optim.lower(), None)
774
+ if args.learning_eps is None:
775
+ args.learning_eps = DEFAULT_LEARNING_EPS.get(args.optim.lower(), None)
776
+ if args.learning_momentum is None:
777
+ args.learning_momentum = DEFAULT_MOMENTUM.get(args.optim.lower(), None)
778
+ if args.learning_weight_decay is None:
779
+ args.learning_weight_decay = DEFAULT_WEIGHT_DECAY.get(args.optim.lower(), None)
780
+
781
+ if args.stage1_learning_rate is None:
782
+ args.stage1_learning_rate = DEFAULT_LEARNING_RATES["adadelta"]
783
+ if args.stage1_bert_finetune is None:
784
+ args.stage1_bert_finetune = args.bert_finetune
785
+
786
+ if args.learning_rate_min_lr is None:
787
+ args.learning_rate_min_lr = args.learning_rate * 0.02
788
+ if args.stage1_learning_rate_min_lr is None:
789
+ args.stage1_learning_rate_min_lr = args.stage1_learning_rate * 0.02
790
+
791
+ if args.reduce_position is None:
792
+ args.reduce_position = args.hidden_size // 4
793
+
794
+ if args.num_tree_lstm_layers is None:
795
+ if args.constituency_composition in (ConstituencyComposition.TREE_LSTM, ConstituencyComposition.TREE_LSTM_CX):
796
+ args.num_tree_lstm_layers = 2
797
+ else:
798
+ args.num_tree_lstm_layers = 1
799
+
800
+ if args.wandb_name or args.wandb_norm_regex:
801
+ args.wandb = True
802
+
803
+ args = vars(args)
804
+
805
+ retagging.postprocess_args(args)
806
+ postprocess_predict_output_args(args)
807
+
808
+ model_save_file = build_model_filename(args)
809
+ args['save_name'] = model_save_file
810
+
811
+ if args['save_each_name']:
812
+ model_save_each_file = os.path.join(args['save_dir'], args['save_each_name'])
813
+ model_save_each_file = utils.build_save_each_filename(model_save_each_file)
814
+ args['save_each_name'] = model_save_each_file
815
+ else:
816
+ # in the event that there is a start epoch setting,
817
+ # this will make a reasonable default for the path
818
+ pieces = os.path.splitext(args['save_name'])
819
+ model_save_each_file = pieces[0] + "_%04d" + pieces[1]
820
+ args['save_each_name'] = model_save_each_file
821
+
822
+ if args['checkpoint']:
823
+ args['checkpoint_save_name'] = utils.checkpoint_name(args['save_dir'], model_save_file, args['checkpoint_save_name'])
824
+
825
+ return args
826
+
827
+ def main(args=None):
828
+ """
829
+ Main function for the constituency parser
830
+
831
+ Processes args, calls the appropriate function for the chosen --mode
832
+ """
833
+ args = parse_args(args=args)
834
+
835
+ utils.set_random_seed(args['seed'])
836
+
837
+ logger.info("Running constituency parser in %s mode", args['mode'])
838
+ logger.debug("Using device: %s", args['device'])
839
+
840
+ model_load_file = args['save_name']
841
+ if args['load_name']:
842
+ if os.path.exists(args['load_name']):
843
+ model_load_file = args['load_name']
844
+ else:
845
+ model_load_file = os.path.join(args['save_dir'], args['load_name'])
846
+ elif args['load_package']:
847
+ if args['lang'] is None:
848
+ lang_pieces = args['load_package'].split("_", maxsplit=1)
849
+ try:
850
+ lang = constant.lang_to_langcode(lang_pieces[0])
851
+ except ValueError as e:
852
+ raise ValueError("--lang not specified, and the start of the --load_package name, %s, is not a known language. Please check the values of those parameters" % args['load_package']) from e
853
+ args['lang'] = lang
854
+ args['load_package'] = lang_pieces[1]
855
+ stanza.download(args['lang'], processors="constituency", package={"constituency": args['load_package']})
856
+ model_load_file = os.path.join(DEFAULT_MODEL_DIR, args['lang'], 'constituency', args['load_package'] + ".pt")
857
+ if not os.path.exists(model_load_file):
858
+ raise FileNotFoundError("Expected the downloaded model file for language %s package %s to be in %s, but there is nothing there. Perhaps the package name doesn't exist?" % (args['lang'], args['load_package'], model_load_file))
859
+ else:
860
+ logger.info("Model for language %s package %s is in %s", args['lang'], args['load_package'], model_load_file)
861
+
862
+ # TODO: when loading a saved model, we should default to whatever
863
+ # is in the model file for --retag_method, not the default for the language
864
+ if args['mode'] == 'train':
865
+ if tlogger.level == logging.NOTSET:
866
+ tlogger.setLevel(logging.DEBUG)
867
+ tlogger.debug("Set trainer logging level to DEBUG")
868
+
869
+ retag_pipeline = retagging.build_retag_pipeline(args)
870
+
871
+ if args['mode'] == 'train':
872
+ parser_training.train(args, model_load_file, retag_pipeline)
873
+ elif args['mode'] == 'predict':
874
+ parser_training.evaluate(args, model_load_file, retag_pipeline)
875
+ elif args['mode'] == 'parse_text':
876
+ load_model_parse_text(args, model_load_file, retag_pipeline)
877
+ elif args['mode'] == 'remove_optimizer':
878
+ parser_training.remove_optimizer(args, args['save_name'], model_load_file)
879
+
880
+ if __name__ == '__main__':
881
+ main()
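The enum-valued options in the parser above (`--transition_scheme`, `--sentence_boundary_vectors`, `--constituency_composition`) all rely on the same case-insensitive lookup idiom. A self-contained sketch with a stand-in enum (the real `TransitionScheme` lives in the constituency package and is not reproduced here):

```python
import argparse
from enum import Enum

# Stand-in for TransitionScheme; only the argparse lookup idiom is illustrated
class Scheme(Enum):
    TOP_DOWN = 1
    IN_ORDER = 2

parser = argparse.ArgumentParser()
# type= converts the command-line string, so "top_down" -> Scheme.TOP_DOWN;
# a non-string default bypasses the converter entirely
parser.add_argument('--transition_scheme', default=Scheme.IN_ORDER,
                    type=lambda x: Scheme[x.upper()],
                    help='Transition scheme to use. {}'.format(", ".join(x.name for x in Scheme)))

args = parser.parse_args(['--transition_scheme', 'top_down'])
print(args.transition_scheme.name)  # TOP_DOWN
```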
stanza/stanza/models/lemmatizer.py ADDED
@@ -0,0 +1,313 @@
1
+ """
2
+ Entry point for training and evaluating a lemmatizer.
3
+
4
+ This lemmatizer combines a neural sequence-to-sequence architecture with an `edit` classifier
5
+ and two dictionaries to produce robust lemmas from word forms.
6
+ For details, please refer to the paper: https://nlp.stanford.edu/pubs/qi2018universal.pdf.
7
+ """
8
+
9
+ import logging
10
+ import sys
11
+ import os
12
+ import shutil
13
+ import time
14
+ from datetime import datetime
15
+ import argparse
16
+ import numpy as np
17
+ import random
18
+ import torch
19
+ from torch import nn, optim
20
+
21
+ from stanza.models.lemma.data import DataLoader
22
+ from stanza.models.lemma.vocab import Vocab
23
+ from stanza.models.lemma.trainer import Trainer
24
+ from stanza.models.lemma import scorer, edit
25
+ from stanza.models.common import utils
26
+ import stanza.models.common.seq2seq_constant as constant
27
+ from stanza.models.common.doc import *
28
+ from stanza.utils.conll import CoNLL
29
+ from stanza.models import _training_logging
30
+
31
+ logger = logging.getLogger('stanza')
32
+
33
+ def build_argparse():
34
+ parser = argparse.ArgumentParser()
35
+ parser.add_argument('--data_dir', type=str, default='data/lemma', help='Directory for all lemma data.')
36
+ parser.add_argument('--train_file', type=str, default=None, help='Input file for data loader.')
37
+ parser.add_argument('--eval_file', type=str, default=None, help='Input file for data loader.')
38
+ parser.add_argument('--output_file', type=str, default=None, help='Output CoNLL-U file.')
39
+ parser.add_argument('--gold_file', type=str, default=None, help='Gold CoNLL-U file for evaluation.')
40
+
41
+ parser.add_argument('--mode', default='train', choices=['train', 'predict'])
42
+ parser.add_argument('--shorthand', type=str, help='Shorthand for the dataset to use. lang_dataset')
43
+
44
+ parser.add_argument('--no_dict', dest='ensemble_dict', action='store_false', help='Do not ensemble dictionary with seq2seq. By default use ensemble.')
45
+ parser.add_argument('--dict_only', action='store_true', help='Only train a dictionary-based lemmatizer.')
46
+
47
+ parser.add_argument('--hidden_dim', type=int, default=200)
48
+ parser.add_argument('--emb_dim', type=int, default=50)
49
+ parser.add_argument('--num_layers', type=int, default=1)
50
+ parser.add_argument('--emb_dropout', type=float, default=0.5)
51
+ parser.add_argument('--dropout', type=float, default=0.5)
52
+ parser.add_argument('--max_dec_len', type=int, default=50)
53
+ parser.add_argument('--beam_size', type=int, default=1)
54
+
55
+ parser.add_argument('--attn_type', default='soft', choices=['soft', 'mlp', 'linear', 'deep'], help='Attention type')
56
+ parser.add_argument('--pos_dim', type=int, default=50)
57
+ parser.add_argument('--pos_dropout', type=float, default=0.5)
58
+    parser.add_argument('--no_edit', dest='edit', action='store_false', help='Do not use edit classifier in lemmatization. By default use an edit classifier.')
+    parser.add_argument('--num_edit', type=int, default=len(edit.EDIT_TO_ID))
+    parser.add_argument('--alpha', type=float, default=1.0)
+    parser.add_argument('--no_pos', dest='pos', action='store_false', help='Do not use UPOS in lemmatization. By default UPOS is used.')
+    parser.add_argument('--no_copy', dest='copy', action='store_false', help='Do not use copy mechanism in lemmatization. By default copy mechanism is used to improve generalization.')
+
+    parser.add_argument('--charlm', action='store_true', help="Turn on contextualized char embedding using pretrained character-level language model.")
+    parser.add_argument('--charlm_shorthand', type=str, default=None, help="Shorthand for character-level language model training corpus.")
+    parser.add_argument('--charlm_forward_file', type=str, default=None, help="Exact path to use for forward charlm")
+    parser.add_argument('--charlm_backward_file', type=str, default=None, help="Exact path to use for backward charlm")
+
+    parser.add_argument('--sample_train', type=float, default=1.0, help='Subsample training data.')
+    parser.add_argument('--optim', type=str, default='adam', help='sgd, adagrad, adam or adamax.')
+    parser.add_argument('--lr', type=float, default=1e-3, help='Learning rate')
+    parser.add_argument('--lr_decay', type=float, default=0.9)
+    parser.add_argument('--decay_epoch', type=int, default=30, help="Decay the lr starting from this epoch.")
+    parser.add_argument('--num_epoch', type=int, default=60)
+    parser.add_argument('--batch_size', type=int, default=50)
+    parser.add_argument('--max_grad_norm', type=float, default=5.0, help='Gradient clipping.')
+    parser.add_argument('--log_step', type=int, default=20, help='Print log every k steps.')
+    parser.add_argument('--save_dir', type=str, default='saved_models/lemma', help='Root dir for saving models.')
+    parser.add_argument('--save_name', type=str, default="{shorthand}_{embedding}_lemmatizer.pt", help="File name to save the model")
+
+    parser.add_argument('--caseless', default=False, action='store_true', help='Lowercase everything first before processing. This will happen automatically if 100%% of the data is caseless')
+
+    parser.add_argument('--seed', type=int, default=1234)
+    utils.add_device_args(parser)
+
+    parser.add_argument('--wandb', action='store_true', help='Start a wandb session and write the results of training. Only applies to training. Use --wandb_name instead to specify a name')
+    parser.add_argument('--wandb_name', default=None, help='Name of a wandb session to start when training. Will default to the dataset short name')
+    return parser
+
+def parse_args(args=None):
+    parser = build_argparse()
+    args = parser.parse_args(args=args)
+
+    if args.wandb_name:
+        args.wandb = True
+
+    args = vars(args)
+    # when building the vocab, we keep track of the original language name
+    lang = args['shorthand'].split("_")[0] if args['shorthand'] else ""
+    args['lang'] = lang
+    return args
+
+def main(args=None):
+    args = parse_args(args=args)
+
+    utils.set_random_seed(args['seed'])
+
+    logger.info("Running lemmatizer in {} mode".format(args['mode']))
+
+    if args['mode'] == 'train':
+        train(args)
+    else:
+        evaluate(args)
+
+def all_lowercase(doc):
+    for sentence in doc.sentences:
+        for word in sentence.words:
+            if word.text.lower() != word.text:
+                return False
+    return True
+
+def build_model_filename(args):
+    embedding = "nocharlm"
+    if args['charlm'] and args['charlm_forward_file']:
+        embedding = "charlm"
+    model_file = args['save_name'].format(shorthand=args['shorthand'],
+                                          embedding=embedding)
+    model_dir = os.path.split(model_file)[0]
+    if not model_dir.startswith(args['save_dir']):
+        model_file = os.path.join(args['save_dir'], model_file)
+    return model_file
+
+def train(args):
+    # load data
+    logger.info("[Loading data with batch size {}...]".format(args['batch_size']))
+    train_doc = CoNLL.conll2doc(input_file=args['train_file'])
+    train_batch = DataLoader(train_doc, args['batch_size'], args, evaluation=False)
+    vocab = train_batch.vocab
+    args['vocab_size'] = vocab['char'].size
+    args['pos_vocab_size'] = vocab['pos'].size
+    dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
+    dev_batch = DataLoader(dev_doc, args['batch_size'], args, vocab=vocab, evaluation=True)
+
+    utils.ensure_dir(args['save_dir'])
+    model_file = build_model_filename(args)
+    logger.info("Using full savename: %s", model_file)
+
+    # pred and gold path
+    system_pred_file = args['output_file']
+    gold_file = args['gold_file']
+
+    utils.print_config(args)
+
+    # skip training if the language does not have training or dev data
+    if len(train_batch) == 0 or len(dev_batch) == 0:
+        logger.warning("[Skip training because no training data available...]")
+        return
+
+    if not args['caseless'] and all_lowercase(train_doc):
+        logger.info("Building a caseless model, as all of the training data is caseless")
+        args['caseless'] = True
+
+    # start training
+    # train a dictionary-based lemmatizer
+    logger.info("Building lemmatizer in %s", model_file)
+    trainer = Trainer(args=args, vocab=vocab, device=args['device'])
+    logger.info("[Training dictionary-based lemmatizer...]")
+    trainer.train_dict(train_batch.raw_data())
+    logger.info("Evaluating on dev set...")
+    dev_preds = trainer.predict_dict(dev_batch.doc.get([TEXT, UPOS]))
+    dev_batch.doc.set([LEMMA], dev_preds)
+    CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
+    _, _, dev_f = scorer.score(system_pred_file, gold_file)
+    logger.info("Dev F1 = {:.2f}".format(dev_f * 100))
+
+    if args.get('dict_only', False):
+        # save dictionaries
+        trainer.save(model_file)
+    else:
+        if args['wandb']:
+            import wandb
+            wandb_name = args['wandb_name'] if args['wandb_name'] else "%s_lemmatizer" % args['shorthand']
+            wandb.init(name=wandb_name, config=args)
+            wandb.run.define_metric('train_loss', summary='min')
+            wandb.run.define_metric('dev_score', summary='max')
+
+        # train a seq2seq model
+        logger.info("[Training seq2seq-based lemmatizer...]")
+        global_step = 0
+        max_steps = len(train_batch) * args['num_epoch']
+        dev_score_history = []
+        best_dev_preds = []
+        current_lr = args['lr']
+        global_start_time = time.time()
+        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
+
+        # start training
+        for epoch in range(1, args['num_epoch']+1):
+            train_loss = 0
+            for i, batch in enumerate(train_batch):
+                start_time = time.time()
+                global_step += 1
+                loss = trainer.update(batch, eval=False) # update step
+                train_loss += loss
+                if global_step % args['log_step'] == 0:
+                    duration = time.time() - start_time
+                    logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,
+                                                  max_steps, epoch, args['num_epoch'], loss, duration, current_lr))
+
+            # eval on dev
+            logger.info("Evaluating on dev set...")
+            dev_preds = []
+            dev_edits = []
+            for i, batch in enumerate(dev_batch):
+                preds, edits = trainer.predict(batch, args['beam_size'])
+                dev_preds += preds
+                if edits is not None:
+                    dev_edits += edits
+            dev_preds = trainer.postprocess(dev_batch.doc.get([TEXT]), dev_preds, edits=dev_edits)
+
+            # try ensembling with dict if necessary
+            if args.get('ensemble_dict', False):
+                logger.info("[Ensembling dict with seq2seq model...]")
+                dev_preds = trainer.ensemble(dev_batch.doc.get([TEXT, UPOS]), dev_preds)
+            dev_batch.doc.set([LEMMA], dev_preds)
+            CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
+            _, _, dev_score = scorer.score(system_pred_file, gold_file)
+
+            train_loss = train_loss / train_batch.num_examples * args['batch_size'] # avg loss per batch
+            logger.info("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(epoch, train_loss, dev_score))
+
+            if args['wandb']:
+                wandb.log({'train_loss': train_loss, 'dev_score': dev_score})
+
+            # save best model
+            if epoch == 1 or dev_score > max(dev_score_history):
+                trainer.save(model_file)
+                logger.info("new best model saved.")
+                best_dev_preds = dev_preds
+
+            # lr schedule
+            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[-1] and \
+                    args['optim'] in ['sgd', 'adagrad']:
+                current_lr *= args['lr_decay']
+                trainer.update_lr(current_lr)
+
+            dev_score_history += [dev_score]
+            logger.info("")
+
+        logger.info("Training ended with {} epochs.".format(epoch))
+
+        if args['wandb']:
+            wandb.finish()
+
+        best_f, best_epoch = max(dev_score_history)*100, np.argmax(dev_score_history)+1
+        logger.info("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))
+
+def evaluate(args):
+    # file paths
+    system_pred_file = args['output_file']
+    gold_file = args['gold_file']
+    model_file = build_model_filename(args)
+
+    # load model
+    trainer = Trainer(model_file=model_file, device=args['device'], args=args)
+    loaded_args, vocab = trainer.args, trainer.vocab
+
+    for k in args:
+        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
+            loaded_args[k] = args[k]
+
+    # load data
+    logger.info("Loading data with batch size {}...".format(args['batch_size']))
+    doc = CoNLL.conll2doc(input_file=args['eval_file'])
+    batch = DataLoader(doc, args['batch_size'], loaded_args, vocab=vocab, evaluation=True)
+
+    # skip eval if dev data does not exist
+    if len(batch) == 0:
+        logger.warning("Skip evaluation because no dev data is available...\nLemma score:\n{} ".format(args['shorthand']))
+        return
+
+    dict_preds = trainer.predict_dict(batch.doc.get([TEXT, UPOS]))
+
+    if loaded_args.get('dict_only', False):
+        preds = dict_preds
+    else:
+        logger.info("Running the seq2seq model...")
+        preds = []
+        edits = []
+        for i, b in enumerate(batch):
+            ps, es = trainer.predict(b, args['beam_size'])
+            preds += ps
+            if es is not None:
+                edits += es
+        preds = trainer.postprocess(batch.doc.get([TEXT]), preds, edits=edits)
+
+        if loaded_args.get('ensemble_dict', False):
+            logger.info("[Ensembling dict with seq2seq lemmatizer...]")
+            preds = trainer.ensemble(batch.doc.get([TEXT, UPOS]), preds)
+
+    if trainer.has_contextual_lemmatizers():
+        preds = trainer.update_contextual_preds(batch.doc, preds)
+
+    # write to file and score
+    batch.doc.set([LEMMA], preds)
+    CoNLL.write_doc2conll(batch.doc, system_pred_file)
+    if gold_file is not None:
+        _, _, score = scorer.score(system_pred_file, gold_file)
+
+        logger.info("Finished evaluation\nLemma score:\n{} {:.2f}".format(args['shorthand'], score*100))
+
+if __name__ == '__main__':
+    main()
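The learning-rate schedule in `train` above only kicks in for `sgd` and `adagrad`: once the epoch passes `--decay_epoch` and the dev score fails to beat the previous epoch's, the lr is multiplied by `--lr_decay`. A standalone replay of that arithmetic (the function name and the sample scores are illustrative, not from the source):

```python
def decayed_lr(lr, lr_decay, decay_epoch, optim, dev_scores):
    """Replay the lemmatizer's lr schedule over a list of per-epoch dev scores."""
    current_lr = lr
    for epoch in range(2, len(dev_scores) + 1):
        # compare this epoch's score to the previous epoch's, as train() does
        if epoch > decay_epoch and dev_scores[epoch - 1] <= dev_scores[epoch - 2] \
                and optim in ('sgd', 'adagrad'):
            current_lr *= lr_decay
    return current_lr

# two non-improving epochs after decay_epoch=2 -> two decays: 1e-3 * 0.9 * 0.9
print(decayed_lr(1e-3, 0.9, 2, 'sgd', [0.5, 0.6, 0.6, 0.55]))
```

With `adam` (the default `--optim`) the same call returns the initial lr unchanged, since the decay branch is never taken.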
stanza/stanza/pipeline/_constants.py ADDED
@@ -0,0 +1,13 @@
+""" Module defining constants """
+
+# string constants for processor names
+LANGID = 'langid'
+TOKENIZE = 'tokenize'
+MWT = 'mwt'
+POS = 'pos'
+LEMMA = 'lemma'
+DEPPARSE = 'depparse'
+NER = 'ner'
+SENTIMENT = 'sentiment'
+CONSTITUENCY = 'constituency'
+COREF = 'coref'
stanza/stanza/pipeline/external/spacy.py ADDED
@@ -0,0 +1,74 @@
+"""
+Processors related to spaCy in the pipeline.
+"""
+
+from stanza.models.common import doc
+from stanza.pipeline._constants import TOKENIZE
+from stanza.pipeline.processor import ProcessorVariant, register_processor_variant
+
+def check_spacy():
+    """
+    Import necessary components from spaCy to perform tokenization.
+    """
+    try:
+        import spacy
+    except ImportError:
+        raise ImportError(
+            "spaCy is used but not installed on your machine. Go to https://spacy.io/usage for installation instructions."
+        )
+    return True
+
+@register_processor_variant(TOKENIZE, 'spacy')
+class SpacyTokenizer(ProcessorVariant):
+    def __init__(self, config):
+        """ Construct a spaCy-based tokenizer by loading the spaCy pipeline.
+        """
+        if config['lang'] != 'en':
+            raise Exception("spaCy tokenizer is currently only allowed in English pipeline.")
+
+        try:
+            import spacy
+            from spacy.lang.en import English
+        except ImportError:
+            raise ImportError(
+                "spaCy 2.0+ is used but not installed on your machine. Go to https://spacy.io/usage for installation instructions."
+            )
+
+        # Create a Tokenizer with the default settings for English
+        # including punctuation rules and exceptions
+        self.nlp = English()
+        # by default spacy uses dependency parser to do ssplit
+        # we need to add a sentencizer for fast rule-based ssplit
+        if spacy.__version__.startswith("2."):
+            self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"))
+        else:
+            self.nlp.add_pipe("sentencizer")
+        self.no_ssplit = config.get('no_ssplit', False)
+
+    def process(self, document):
+        """ Tokenize a document with the spaCy tokenizer and wrap the results into a Doc object.
+        """
+        if isinstance(document, doc.Document):
+            text = document.text
+        else:
+            text = document
+        if not isinstance(text, str):
+            raise Exception("Must supply a string or Stanza Document object to the spaCy tokenizer.")
+        spacy_doc = self.nlp(text)
+
+        sentences = []
+        for sent in spacy_doc.sents:
+            tokens = []
+            for tok in sent:
+                token_entry = {
+                    doc.TEXT: tok.text,
+                    doc.MISC: f"{doc.START_CHAR}={tok.idx}|{doc.END_CHAR}={tok.idx+len(tok.text)}"
+                }
+                tokens.append(token_entry)
+            sentences.append(tokens)
+
+        # if no_ssplit is set, flatten all the sentences into one sentence
+        if self.no_ssplit:
+            sentences = [[t for s in sentences for t in s]]
+
+        return doc.Document(sentences, text)
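The MISC string built in `process` records each token's character offsets in the `key=value|key=value` form stanza uses, with `doc.START_CHAR` and `doc.END_CHAR` being the strings `start_char` and `end_char`. A minimal standalone sketch of that bookkeeping, independent of spaCy and stanza (the helper name is illustrative):

```python
# Standalone sketch of the token-offset bookkeeping done in SpacyTokenizer.process:
# each token records its text plus start/end character offsets in a MISC string.
START_CHAR = 'start_char'   # mirrors stanza.models.common.doc.START_CHAR
END_CHAR = 'end_char'       # mirrors stanza.models.common.doc.END_CHAR

def token_entry(text, idx):
    """Build the same {text, misc} dict as the spaCy variant, given a token and its offset."""
    return {'text': text,
            'misc': f"{START_CHAR}={idx}|{END_CHAR}={idx + len(text)}"}

print(token_entry("antennae", 12)['misc'])  # -> start_char=12|end_char=20
```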
stanza/stanza/pipeline/ner_processor.py ADDED
@@ -0,0 +1,143 @@
+"""
+Processor for performing named entity tagging.
+"""
+
+import torch
+
+import logging
+
+from stanza.models.common import doc
+from stanza.models.common.exceptions import ForwardCharlmNotFoundError, BackwardCharlmNotFoundError
+from stanza.models.common.utils import unsort
+from stanza.models.ner.data import DataLoader
+from stanza.models.ner.trainer import Trainer
+from stanza.models.ner.utils import merge_tags
+from stanza.pipeline._constants import *
+from stanza.pipeline.processor import UDProcessor, register_processor
+
+logger = logging.getLogger('stanza')
+
+@register_processor(name=NER)
+class NERProcessor(UDProcessor):
+
+    # set of processor requirements this processor fulfills
+    PROVIDES_DEFAULT = set([NER])
+    # set of processor requirements for this processor
+    REQUIRES_DEFAULT = set([TOKENIZE])
+
+    def _get_dependencies(self, config, dep_name):
+        dependencies = config.get(dep_name, None)
+        if dependencies is not None:
+            dependencies = dependencies.split(";")
+            dependencies = [x if x else None for x in dependencies]
+        else:
+            dependencies = [x.get(dep_name) for x in config.get('dependencies', [])]
+        return dependencies
+
+    def _set_up_model(self, config, pipeline, device):
+        # set up trainer
+        model_paths = config.get('model_path')
+        if isinstance(model_paths, str):
+            model_paths = model_paths.split(";")
+
+        charlm_forward_files = self._get_dependencies(config, 'forward_charlm_path')
+        charlm_backward_files = self._get_dependencies(config, 'backward_charlm_path')
+        pretrain_files = self._get_dependencies(config, 'pretrain_path')
+
+        # allow predict_tagset to be specified as an int
+        # (which only applies to the first model)
+        # or as a string ";" separated list of ints
+        self._predict_tagset = {}
+        predict_tagset = config.get('predict_tagset', None)
+        if predict_tagset:
+            if isinstance(predict_tagset, int):
+                self._predict_tagset[0] = predict_tagset
+            else:
+                predict_tagset = predict_tagset.split(";")
+                for piece_idx, piece in enumerate(predict_tagset):
+                    if piece:
+                        self._predict_tagset[piece_idx] = int(piece)
+
+        self.trainers = []
+        for (model_path, pretrain_path, charlm_forward, charlm_backward) in zip(model_paths, pretrain_files, charlm_forward_files, charlm_backward_files):
+            logger.debug("Loading %s with pretrain %s, forward charlm %s, backward charlm %s", model_path, pretrain_path, charlm_forward, charlm_backward)
+            pretrain = pipeline.foundation_cache.load_pretrain(pretrain_path) if pretrain_path else None
+            args = {'charlm_forward_file': charlm_forward,
+                    'charlm_backward_file': charlm_backward}
+
+            predict_tagset = self._predict_tagset.get(len(self.trainers), None)
+            if predict_tagset is not None:
+                args['predict_tagset'] = predict_tagset
+
+            try:
+                trainer = Trainer(args=args, model_file=model_path, pretrain=pretrain, device=device, foundation_cache=pipeline.foundation_cache)
+            except ForwardCharlmNotFoundError as e:
+                raise ForwardCharlmNotFoundError("Could not find the forward charlm %s. Please specify the correct path with ner_forward_charlm_path" % e.filename, e.filename) from None
+            except BackwardCharlmNotFoundError as e:
+                raise BackwardCharlmNotFoundError("Could not find the backward charlm %s. Please specify the correct path with ner_backward_charlm_path" % e.filename, e.filename) from None
+            self.trainers.append(trainer)
+
+        self._trainer = self.trainers[0]
+        self.model_paths = model_paths
+
+    def _set_up_final_config(self, config):
+        """ Finalize the configurations for this processor, based off of values from a UD model. """
+        # set configurations from loaded model
+        if len(self.trainers) == 0:
+            raise RuntimeError("Somehow there are no models loaded!")
+        self._vocab = self.trainers[0].vocab
+        self.configs = []
+        for trainer in self.trainers:
+            loaded_args = trainer.args
+            # filter out unneeded args from model
+            loaded_args = {k: v for k, v in loaded_args.items() if not UDProcessor.filter_out_option(k)}
+            loaded_args.update(config)
+            self.configs.append(loaded_args)
+        self._config = self.configs[0]
+
+    def __str__(self):
+        return "NERProcessor(%s)" % ";".join(self.model_paths)
+
+    def mark_inactive(self):
+        """ Drop memory intensive resources if keeping this processor around for reasons other than running it. """
+        super().mark_inactive()
+        self.trainers = None
+
+    def process(self, document):
+        with torch.no_grad():
+            all_preds = []
+            for trainer, config in zip(self.trainers, self.configs):
+                # set up an eval-only data loader and skip tag preprocessing
+                batch = DataLoader(document, config['batch_size'], config, vocab=trainer.vocab, evaluation=True, preprocess_tags=False, bert_tokenizer=trainer.model.bert_tokenizer)
+                preds = []
+                for i, b in enumerate(batch):
+                    preds += trainer.predict(b)
+                all_preds.append(preds)
+            # for each sentence, gather a list of predictions
+            # merge those predictions into a single list
+            # earlier models will have precedence
+            preds = [merge_tags(*x) for x in zip(*all_preds)]
+            batch.doc.set([doc.NER], [y for x in preds for y in x], to_token=True)
+            batch.doc.set([doc.MULTI_NER], [tuple(y) for x in zip(*all_preds) for y in zip(*x)], to_token=True)
+            # collect entities into document attribute
+            total = len(batch.doc.build_ents())
+            logger.debug(f'{total} entities found in document.')
+            return batch.doc
+
+    def bulk_process(self, docs):
+        """
+        NER processor has a collation step after running inference
+        """
+        docs = super().bulk_process(docs)
+        for doc in docs:
+            doc.build_ents()
+        return docs
+
+    def get_known_tags(self, model_idx=0):
+        """
+        Return the tags known by this model
+
+        Removes the S-, B-, etc, and does not include O
+        Specify model_idx if the processor has more than one model
+        """
+        return self.trainers[model_idx].get_known_tags()
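The comment in `process` notes that when several NER models run, "earlier models will have precedence" in the merge. The real `merge_tags` is span-aware, but the precedence idea can be illustrated with a simple token-wise sketch (this helper is hypothetical, not stanza's implementation):

```python
# Illustrative only -- not stanza's merge_tags, which merges at the entity-span level.
# Shows the precedence idea: for each token, keep the first model's
# prediction unless it is 'O' and a later model found an entity.
def merge_token_tags(*tag_lists):
    merged = []
    for token_tags in zip(*tag_lists):
        tag = 'O'
        for candidate in token_tags:
            if candidate != 'O':
                tag = candidate
                break
        merged.append(tag)
    return merged

print(merge_token_tags(['B-PER', 'O', 'O'], ['O', 'O', 'B-ORG']))
```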
stanza/stanza/resources/print_charlm_depparse.py ADDED
@@ -0,0 +1,22 @@
+"""
+A small utility script to output which depparse models use charlm
+
+(It should skip en_genia, en_craft, but currently doesn't)
+
+Not frequently useful, but seems like the kind of thing that might get used a couple times
+"""
+
+from stanza.resources.common import load_resources_json
+from stanza.resources.default_packages import default_charlms, depparse_charlms
+
+def list_depparse():
+    charlm_langs = list(default_charlms.keys())
+    resources = load_resources_json()
+
+    models = ["%s_%s" % (lang, model) for lang in charlm_langs for model in resources[lang].get("depparse", {})
+              if lang not in depparse_charlms or model not in depparse_charlms[lang] or depparse_charlms[lang][model] is not None]
+    return models
+
+if __name__ == "__main__":
+    models = list_depparse()
+    print(" ".join(models))
stanza/stanza/server/dependency_converter.py ADDED
@@ -0,0 +1,101 @@
+"""
+A converter from constituency trees to dependency trees using CoreNLP's UniversalEnglish converter.
+
+ONLY works on English.
+"""
+
+import stanza
+from stanza.protobuf import DependencyConverterRequest, DependencyConverterResponse
+from stanza.server.java_protobuf_requests import send_request, build_tree, JavaProtobufContext
+
+CONVERTER_JAVA = "edu.stanford.nlp.trees.ProcessDependencyConverterRequest"
+
+def send_converter_request(request, classpath=None):
+    return send_request(request, DependencyConverterResponse, CONVERTER_JAVA, classpath=classpath)
+
+def build_request(doc):
+    """
+    Request format is simple: one tree per sentence in the document
+    """
+    request = DependencyConverterRequest()
+    for sentence in doc.sentences:
+        request.trees.append(build_tree(sentence.constituency, None))
+    return request
+
+def process_doc(doc, classpath=None):
+    """
+    Convert the constituency trees in the document,
+    then attach the resulting dependencies to the sentences
+    """
+    request = build_request(doc)
+    response = send_converter_request(request, classpath=classpath)
+    attach_dependencies(doc, response)
+
+def attach_dependencies(doc, response):
+    if len(doc.sentences) != len(response.conversions):
+        raise ValueError("Sent %d sentences but got back %d conversions" % (len(doc.sentences), len(response.conversions)))
+    for sent_idx, (sentence, conversion) in enumerate(zip(doc.sentences, response.conversions)):
+        graph = conversion.graph
+
+        # The deterministic conversion should have an equal number of words and one fewer edge
+        # ... the root is represented by a word with no parent
+        if len(sentence.words) != len(graph.node):
+            raise ValueError("Sentence %d of the conversion should have %d words but got back %d nodes in the graph" % (sent_idx, len(sentence.words), len(graph.node)))
+        if len(sentence.words) != len(graph.edge) + 1:
+            raise ValueError("Sentence %d of the conversion should have %d edges (one per word, plus the root) but got back %d edges in the graph" % (sent_idx, len(sentence.words) - 1, len(graph.edge)))
+
+        expected_nodes = set(range(1, len(sentence.words) + 1))
+        targets = set()
+        for edge in graph.edge:
+            if edge.target in targets:
+                raise ValueError("Found two parents of %d in sentence %d" % (edge.target, sent_idx))
+            targets.add(edge.target)
+            # -1 since the words are 0 indexed in the sentence,
+            # but we count dependencies from 1
+            sentence.words[edge.target-1].head = edge.source
+            sentence.words[edge.target-1].deprel = edge.dep
+        roots = expected_nodes - targets
+        assert len(roots) == 1
+        for root in roots:
+            sentence.words[root-1].head = 0
+            sentence.words[root-1].deprel = "root"
+        sentence.build_dependencies()
+
+
+class DependencyConverter(JavaProtobufContext):
+    """
+    Context window for the dependency converter
+
+    This is a context window which keeps a process open. Should allow
+    for multiple requests without launching new java processes each time.
+    """
+    def __init__(self, classpath=None):
+        super(DependencyConverter, self).__init__(classpath, DependencyConverterResponse, CONVERTER_JAVA)
+
+    def process(self, doc):
+        """
+        Converts a constituency tree to dependency trees for each of the sentences in the document
+        """
+        request = build_request(doc)
+        response = self.process_request(request)
+        attach_dependencies(doc, response)
+        return doc
+
+def main():
+    nlp = stanza.Pipeline('en',
+                          processors='tokenize,pos,constituency')
+
+    doc = nlp('I like blue antennae.')
+    print("{:C}".format(doc))
+    process_doc(doc, classpath="$CLASSPATH")
+    print("{:C}".format(doc))
+
+    doc = nlp('And I cannot lie.')
+    print("{:C}".format(doc))
+    with DependencyConverter(classpath="$CLASSPATH") as converter:
+        converter.process(doc)
+    print("{:C}".format(doc))
+
+
+if __name__ == '__main__':
+    main()
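The root-recovery step in `attach_dependencies` relies on a simple invariant: with one edge per non-root word, exactly one 1-indexed word id never appears as an edge target, and that word becomes the root. A self-contained sketch of that logic (the function name and the example edges are illustrative):

```python
# Sketch of the root-recovery step in attach_dependencies: every word index that
# never appears as an edge target has no parent, and the converter expects
# exactly one such index, which becomes the root (head 0).
def find_root(num_words, edges):
    """edges: list of (source, target) pairs with 1-indexed word ids."""
    expected_nodes = set(range(1, num_words + 1))
    targets = {target for _, target in edges}
    roots = expected_nodes - targets
    assert len(roots) == 1, "conversion must produce a single root"
    return roots.pop()

# "I like blue antennae": 'like' (2) heads 'I' and 'antennae'; 'antennae' (4) heads 'blue'
print(find_root(4, [(2, 1), (2, 4), (4, 3)]))  # -> 2
```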
stanza/stanza/tests/classifiers/test_constituency_classifier.py ADDED
@@ -0,0 +1,128 @@
+import os
+
+import pytest
+
+import stanza
+import stanza.models.classifier as classifier
+import stanza.models.classifiers.data as data
+from stanza.models.classifiers.trainer import Trainer
+from stanza.tests import TEST_MODELS_DIR
+from stanza.tests.classifiers.test_classifier import fake_embeddings
+from stanza.tests.classifiers.test_data import train_file_with_trees, dev_file_with_trees
+from stanza.models.common import utils
+from stanza.tests.constituency.test_trainer import build_trainer, TREEBANK
+
+pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+class TestConstituencyClassifier:
+    @pytest.fixture(scope="class")
+    def constituency_model(self, fake_embeddings, tmp_path_factory):
+        args = ['--pattn_num_layers', '0', '--lattn_d_proj', '0', '--hidden_size', '20', '--delta_embedding_dim', '10']
+        trainer = build_trainer(str(fake_embeddings), *args, treebank=TREEBANK)
+
+        trainer_pt = str(tmp_path_factory.mktemp("constituency") / "constituency.pt")
+        trainer.save(trainer_pt, save_optimizer=False)
+        return trainer_pt
+
+    def build_model(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, extra_args=None):
+        """
+        Build a Constituency Classifier model to be used by one of the later tests
+        """
+        save_dir = str(tmp_path / "classifier")
+        save_name = "model.pt"
+        args = ["--save_dir", save_dir,
+                "--save_name", save_name,
+                "--model_type", "constituency",
+                "--constituency_model", constituency_model,
+                "--wordvec_pretrain_file", str(fake_embeddings),
+                "--fc_shapes", "20,10",
+                "--train_file", str(train_file_with_trees),
+                "--dev_file", str(dev_file_with_trees),
+                "--max_epochs", "2",
+                "--batch_size", "60"]
+        if extra_args is not None:
+            args = args + extra_args
+        args = classifier.parse_args(args)
+        train_set = data.read_dataset(args.train_file, args.wordvec_type, args.min_train_len)
+        trainer = Trainer.build_new_model(args, train_set)
+        return trainer, train_set, args
+
+    def run_training(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, extra_args=None):
+        """
+        Iterate a couple times over a model
+        """
+        trainer, train_set, args = self.build_model(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, extra_args)
+        dev_set = data.read_dataset(args.dev_file, args.wordvec_type, args.min_train_len)
+        labels = data.dataset_labels(train_set)
+
+        save_filename = os.path.join(args.save_dir, args.save_name)
+        checkpoint_file = utils.checkpoint_name(args.save_dir, save_filename, args.checkpoint_save_name)
+        classifier.train_model(trainer, save_filename, checkpoint_file, args, train_set, dev_set, labels)
+        return trainer, train_set, args
+
+    def test_build_model(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
+        """
+        Test that building a basic constituency-based model works
+        """
+        self.build_model(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees)
+
+    def test_save_load(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
+        """
+        Test that a constituency model can save & load
+        """
+        trainer, _, args = self.build_model(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees)
+
+        save_filename = os.path.join(args.save_dir, args.save_name)
+        trainer.save(save_filename)
+
+        args.load_name = args.save_name
+        trainer = Trainer.load(args.load_name, args)
+
+    def test_train_basic(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
+        self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees)
+
+    def test_train_pipeline(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
+        """
+        Test that writing out a temp model, then loading it in the pipeline is a thing that works
+        """
+        trainer, _, args = self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees)
+        save_filename = os.path.join(args.save_dir, args.save_name)
+        assert os.path.exists(save_filename)
+        assert os.path.exists(args.constituency_model)
+
+        pipeline_args = {"lang": "en",
+                         "download_method": None,
+                         "model_dir": TEST_MODELS_DIR,
+                         "processors": "tokenize,pos,constituency,sentiment",
+                         "tokenize_pretokenized": True,
+                         "constituency_model_path": args.constituency_model,
+                         "constituency_pretrain_path": args.wordvec_pretrain_file,
+                         "constituency_backward_charlm_path": None,
+                         "constituency_forward_charlm_path": None,
+                         "sentiment_model_path": save_filename,
+                         "sentiment_pretrain_path": args.wordvec_pretrain_file,
+                         "sentiment_backward_charlm_path": None,
+                         "sentiment_forward_charlm_path": None}
+        pipeline = stanza.Pipeline(**pipeline_args)
+        doc = pipeline("This is a test")
+        # since the model is random, we have no expectations for what the result actually is
+        assert doc.sentences[0].sentiment is not None
+
+
+    def test_train_all_words(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
+        self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--constituency_all_words'])
+
+        self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--no_constituency_all_words'])
+
+    def test_train_top_layer(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
+        self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--constituency_top_layer'])
+
+        self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--no_constituency_top_layer'])
+
+    def test_train_attn(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
+        self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--constituency_node_attn', '--no_constituency_all_words'])
+
+        self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--constituency_node_attn', '--constituency_all_words'])
+
+        self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--no_constituency_node_attn'])
stanza/stanza/tests/common/__init__.py ADDED
File without changes
stanza/stanza/tests/common/test_chuliu_edmonds.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ """
+ Test some use cases of the chuliu_edmonds algorithm
+
+ (currently just the tarjan implementation)
+ """
+
+ import numpy as np
+ import pytest
+
+ from stanza.models.common.chuliu_edmonds import tarjan
+
+ pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+ def test_tarjan_basic():
+     simple = np.array([0, 4, 4, 4, 0])
+     result = tarjan(simple)
+     assert result == []
+
+     simple = np.array([0, 2, 0, 4, 2, 2])
+     result = tarjan(simple)
+     assert result == []
+
+ def test_tarjan_cycle():
+     cycle_graph = np.array([0, 3, 1, 2])
+     result = tarjan(cycle_graph)
+     expected = np.array([False, True, True, True])
+     assert len(result) == 1
+     np.testing.assert_array_equal(result[0], expected)
+
+     cycle_graph = np.array([0, 3, 1, 2, 5, 6, 4])
+     result = tarjan(cycle_graph)
+     assert len(result) == 2
+     expected = [np.array([False, True, True, True, False, False, False]),
+                 np.array([False, False, False, False, True, True, True])]
+     for r, e in zip(result, expected):
+         np.testing.assert_array_equal(r, e)
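The tests above pin down `tarjan`'s contract on a head array: an empty list when the graph is already a tree, and one node mask per cycle otherwise. As a hedged illustration only (this is not stanza's implementation), a minimal pure-Python cycle finder over the same head-array encoding behaves the same way on these inputs:

```python
def find_cycles(heads):
    # heads[i] is the parent of node i; node 0 is the artificial root
    # (heads[0] == 0). Returns one set of node indices per cycle.
    cycles = []
    seen = set()
    for start in range(1, len(heads)):
        if start in seen:
            continue
        path = []
        on_path = {}
        node = start
        # walk toward the root until we hit the root, an already-visited
        # node, or a node on the current path (which closes a cycle)
        while node != 0 and node not in seen and node not in on_path:
            on_path[node] = len(path)
            path.append(node)
            node = heads[node]
        if node in on_path:
            cycles.append(set(path[on_path[node]:]))
        seen.update(path)
    return cycles

print(find_cycles([0, 4, 4, 4, 0]))      # -> []
print(find_cycles([0, 3, 1, 2]))         # -> [{1, 2, 3}]
print(find_cycles([0, 3, 1, 2, 5, 6, 4]))  # -> [{1, 2, 3}, {4, 5, 6}]
```

The boolean masks returned by `tarjan` are just another encoding of these node sets, one mask per cycle.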
stanza/stanza/tests/common/test_confusion.py ADDED
@@ -0,0 +1,81 @@
+ """
+ Test a couple of simple confusion matrices and output formats
+ """
+
+ from collections import defaultdict
+ import pytest
+
+ from stanza.utils.confusion import format_confusion, confusion_to_f1, confusion_to_macro_f1, confusion_to_weighted_f1
+
+ pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+ @pytest.fixture
+ def simple_confusion():
+     confusion = defaultdict(lambda: defaultdict(int))
+     confusion["B-ORG"]["B-ORG"] = 1
+     confusion["B-ORG"]["B-PER"] = 1
+     confusion["E-ORG"]["E-ORG"] = 1
+     confusion["E-ORG"]["E-PER"] = 1
+     confusion["O"]["O"] = 4
+     return confusion
+
+ @pytest.fixture
+ def short_confusion():
+     """
+     Same thing, but with a short name. This should not be sorted by entity type
+     """
+     confusion = defaultdict(lambda: defaultdict(int))
+     confusion["A"]["B-ORG"] = 1
+     confusion["B-ORG"]["B-PER"] = 1
+     confusion["E-ORG"]["E-ORG"] = 1
+     confusion["E-ORG"]["E-PER"] = 1
+     confusion["O"]["O"] = 4
+     return confusion
+
+ EXPECTED_SIMPLE_OUTPUT = """
+ t\\p O B-ORG E-ORG B-PER E-PER
+ O 4 0 0 0 0
+ B-ORG 0 1 0 1 0
+ E-ORG 0 0 1 0 1
+ B-PER 0 0 0 0 0
+ E-PER 0 0 0 0 0
+ """[1:-1] # don't want to strip
+
+ EXPECTED_SHORT_OUTPUT = """
+ t\\p O A B-ORG B-PER E-ORG E-PER
+ O 4 0 0 0 0 0
+ A 0 0 1 0 0 0
+ B-ORG 0 0 0 1 0 0
+ B-PER 0 0 0 0 0 0
+ E-ORG 0 0 0 0 1 1
+ E-PER 0 0 0 0 0 0
+ """[1:-1]
+
+ EXPECTED_HIDE_BLANK_SHORT_OUTPUT = """
+ t\\p O B-ORG E-ORG B-PER E-PER
+ O 4 0 0 0 0
+ A 0 1 0 0 0
+ B-ORG 0 0 0 1 0
+ E-ORG 0 0 1 0 1
+ """[1:-1]
+
+ def test_simple_output(simple_confusion):
+     assert EXPECTED_SIMPLE_OUTPUT == format_confusion(simple_confusion)
+
+ def test_short_output(short_confusion):
+     assert EXPECTED_SHORT_OUTPUT == format_confusion(short_confusion)
+
+ def test_hide_blank_short_output(short_confusion):
+     assert EXPECTED_HIDE_BLANK_SHORT_OUTPUT == format_confusion(short_confusion, hide_blank=True)
+
+ def test_macro_f1(simple_confusion, short_confusion):
+     assert confusion_to_macro_f1(simple_confusion) == pytest.approx(0.466666666666)
+     assert confusion_to_macro_f1(short_confusion) == pytest.approx(0.277777777777)
+
+ def test_weighted_f1(simple_confusion, short_confusion):
+     assert confusion_to_weighted_f1(simple_confusion) == pytest.approx(0.83333333)
+     assert confusion_to_weighted_f1(short_confusion) == pytest.approx(0.66666666)
+
+     assert confusion_to_weighted_f1(simple_confusion, exclude=["O"]) == pytest.approx(0.66666666)
+     assert confusion_to_weighted_f1(short_confusion, exclude=["O"]) == pytest.approx(0.33333333)
+
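Where the `0.466666666666` expectation for `simple_confusion` comes from can be checked by hand: B-ORG and E-ORG each score F1 = 2/3 (one hit, one miss, no false positives), O scores 1.0, and the two PER labels appear only as predictions, so they score 0; the macro average over five labels is 7/15. A hedged sketch (not stanza's implementation) of macro F1 over the same gold-by-predicted matrix:

```python
def macro_f1(confusion):
    # confusion[gold][pred] = count; macro F1 is the unweighted mean
    # of per-label F1 over every label seen as gold or as prediction
    labels = set(confusion) | {p for row in confusion.values() for p in row}
    scores = []
    for label in labels:
        tp = confusion.get(label, {}).get(label, 0)
        fn = sum(v for p, v in confusion.get(label, {}).items() if p != label)
        fp = sum(row.get(label, 0) for g, row in confusion.items() if g != label)
        scores.append(2 * tp / (2 * tp + fp + fn) if tp else 0.0)
    return sum(scores) / len(scores)

confusion = {"B-ORG": {"B-ORG": 1, "B-PER": 1},
             "E-ORG": {"E-ORG": 1, "E-PER": 1},
             "O": {"O": 4}}
print(round(macro_f1(confusion), 4))  # -> 0.4667
```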
stanza/stanza/tests/common/test_constant.py ADDED
@@ -0,0 +1,67 @@
+ """
+ Test the conversion to lcodes and splitting of dataset names
+ """
+
+ import tempfile
+
+ import pytest
+
+ import stanza
+ from stanza.models.common.constant import treebank_to_short_name, lang_to_langcode, is_right_to_left, two_to_three_letters, langlower2lcode
+ from stanza.tests import *
+
+ pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+ def test_treebank():
+     """
+     Test the entire treebank name conversion
+     """
+     # conversion of a UD_ name
+     assert "hi_hdtb" == treebank_to_short_name("UD_Hindi-HDTB")
+     # conversion of names without UD
+     assert "hi_fire2013" == treebank_to_short_name("Hindi-fire2013")
+     assert "hi_fire2013" == treebank_to_short_name("Hindi-Fire2013")
+     assert "hi_fire2013" == treebank_to_short_name("Hindi-FIRE2013")
+     # already short names are generally preserved
+     assert "hi_fire2013" == treebank_to_short_name("hi-fire2013")
+     assert "hi_fire2013" == treebank_to_short_name("hi_fire2013")
+     # a special case
+     assert "zh-hant_pud" == treebank_to_short_name("UD_Chinese-PUD")
+     # a special case already converted once
+     assert "zh-hant_pud" == treebank_to_short_name("zh-hant_pud")
+     assert "zh-hant_pud" == treebank_to_short_name("zh-hant-pud")
+     assert "zh-hans_gsdsimp" == treebank_to_short_name("zh-hans_gsdsimp")
+
+     assert "wo_masakhane" == treebank_to_short_name("wo_masakhane")
+     assert "wo_masakhane" == treebank_to_short_name("wol_masakhane")
+     assert "wo_masakhane" == treebank_to_short_name("Wol_masakhane")
+     assert "wo_masakhane" == treebank_to_short_name("wolof_masakhane")
+     assert "wo_masakhane" == treebank_to_short_name("Wolof_masakhane")
+
+ def test_lang_to_langcode():
+     assert "hi" == lang_to_langcode("Hindi")
+     assert "hi" == lang_to_langcode("HINDI")
+     assert "hi" == lang_to_langcode("hindi")
+     assert "hi" == lang_to_langcode("HI")
+     assert "hi" == lang_to_langcode("hi")
+
+ def test_right_to_left():
+     assert is_right_to_left("ar")
+     assert is_right_to_left("Arabic")
+
+     assert not is_right_to_left("en")
+     assert not is_right_to_left("English")
+
+ def test_two_to_three():
+     assert lang_to_langcode("Wolof") == "wo"
+     assert lang_to_langcode("wol") == "wo"
+
+     assert "wo" in two_to_three_letters
+     assert two_to_three_letters["wo"] == "wol"
+
+ def test_langlower():
+     assert lang_to_langcode("WOLOF") == "wo"
+     assert lang_to_langcode("nOrWeGiAn") == "nb"
+
+     assert "soj" == langlower2lcode["soi"]
+     assert "soj" == langlower2lcode["sohi"]
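The core of the conversion these tests exercise is: strip an optional `UD_` prefix, split the `Language-Dataset` pair, and map the language name to a short code. A hedged toy sketch of that split (hypothetical helper and language map, not stanza's `treebank_to_short_name`, which also handles underscore-form names and special cases like Chinese):

```python
def short_name(treebank, lang_map):
    # strip an optional "UD_" prefix, split on the first hyphen,
    # then lowercase and map the language name to its short code
    if treebank.startswith("UD_"):
        treebank = treebank[3:]
    lang, _, dataset = treebank.partition("-")
    lcode = lang_map.get(lang.lower(), lang.lower())
    return "{}_{}".format(lcode, dataset.lower())

LANG_MAP = {"hindi": "hi", "wolof": "wo", "wol": "wo"}
print(short_name("UD_Hindi-HDTB", LANG_MAP))   # -> hi_hdtb
print(short_name("Hindi-FIRE2013", LANG_MAP))  # -> hi_fire2013
```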
stanza/stanza/tests/common/test_data_conversion.py ADDED
@@ -0,0 +1,520 @@
+ """
+ Basic tests of the data conversion
+ """
+
+ import io
+ import pytest
+ import tempfile
+ from zipfile import ZipFile
+
+ import stanza
+ from stanza.utils.conll import CoNLL
+ from stanza.models.common.doc import Document
+ from stanza.tests import *
+
+ pytestmark = pytest.mark.pipeline
+
+ # data for testing
+ CONLL = [[['1', 'Nous', 'il', 'PRON', '_', 'Number=Plur|Person=1|PronType=Prs', '3', 'nsubj', '_', 'start_char=0|end_char=4'],
+           ['2', 'avons', 'avoir', 'AUX', '_', 'Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', '3', 'aux:tense', '_', 'start_char=5|end_char=10'],
+           ['3', 'atteint', 'atteindre', 'VERB', '_', 'Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', '0', 'root', '_', 'start_char=11|end_char=18'],
+           ['4', 'la', 'le', 'DET', '_', 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', '5', 'det', '_', 'start_char=19|end_char=21'],
+           ['5', 'fin', 'fin', 'NOUN', '_', 'Gender=Fem|Number=Sing', '3', 'obj', '_', 'start_char=22|end_char=25'],
+           ['6-7', 'du', '_', '_', '_', '_', '_', '_', '_', 'start_char=26|end_char=28'],
+           ['6', 'de', 'de', 'ADP', '_', '_', '8', 'case', '_', '_'],
+           ['7', 'le', 'le', 'DET', '_', 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', '8', 'det', '_', '_'],
+           ['8', 'sentier', 'sentier', 'NOUN', '_', 'Gender=Masc|Number=Sing', '5', 'nmod', '_', 'start_char=29|end_char=36'],
+           ['9', '.', '.', 'PUNCT', '_', '_', '3', 'punct', '_', 'start_char=36|end_char=37']]]
+
+
+ DICT = [[{'id': (1,), 'text': 'Nous', 'lemma': 'il', 'upos': 'PRON', 'feats': 'Number=Plur|Person=1|PronType=Prs', 'head': 3, 'deprel': 'nsubj', 'misc': 'start_char=0|end_char=4'},
+          {'id': (2,), 'text': 'avons', 'lemma': 'avoir', 'upos': 'AUX', 'feats': 'Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', 'head': 3, 'deprel': 'aux:tense', 'misc': 'start_char=5|end_char=10'},
+          {'id': (3,), 'text': 'atteint', 'lemma': 'atteindre', 'upos': 'VERB', 'feats': 'Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', 'head': 0, 'deprel': 'root', 'misc': 'start_char=11|end_char=18'},
+          {'id': (4,), 'text': 'la', 'lemma': 'le', 'upos': 'DET', 'feats': 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', 'head': 5, 'deprel': 'det', 'misc': 'start_char=19|end_char=21'},
+          {'id': (5,), 'text': 'fin', 'lemma': 'fin', 'upos': 'NOUN', 'feats': 'Gender=Fem|Number=Sing', 'head': 3, 'deprel': 'obj', 'misc': 'start_char=22|end_char=25'},
+          {'id': (6, 7), 'text': 'du', 'misc': 'start_char=26|end_char=28'},
+          {'id': (6,), 'text': 'de', 'lemma': 'de', 'upos': 'ADP', 'head': 8, 'deprel': 'case'},
+          {'id': (7,), 'text': 'le', 'lemma': 'le', 'upos': 'DET', 'feats': 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', 'head': 8, 'deprel': 'det'},
+          {'id': (8,), 'text': 'sentier', 'lemma': 'sentier', 'upos': 'NOUN', 'feats': 'Gender=Masc|Number=Sing', 'head': 5, 'deprel': 'nmod', 'misc': 'start_char=29|end_char=36'},
+          {'id': (9,), 'text': '.', 'lemma': '.', 'upos': 'PUNCT', 'head': 3, 'deprel': 'punct', 'misc': 'start_char=36|end_char=37'}]]
+
+ def test_conll_to_dict():
+     dicts, empty = CoNLL.convert_conll(CONLL)
+     assert dicts == DICT
+     assert len(dicts) == len(empty)
+     assert all(len(x) == 0 for x in empty)
+
+ def test_dict_to_conll():
+     document = Document(DICT)
+     # :c = no comments
+     conll = [[sentence.split("\t") for sentence in doc.split("\n")] for doc in "{:c}".format(document).split("\n\n")]
+     assert conll == CONLL
+
+ def test_dict_to_doc_and_doc_to_dict():
+     """
+     Test the conversion from raw dict to Document and back
+
+     This code path will first turn the start_char|end_char misc values into start_char & end_char fields on the Document
+     Converting that Document to a dict will then have separate fields for each of those
+     Finally, the conversion from that dict to a list of conll entries should convert that back to misc
+     """
+     document = Document(DICT)
+     dicts = document.to_dict()
+     document = Document(dicts)
+     conll = [[sentence.split("\t") for sentence in doc.split("\n")] for doc in "{:c}".format(document).split("\n\n")]
+     assert conll == CONLL
+
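The DICT form above encodes every token id as a tuple: `(3,)` for an ordinary word, `(6, 7)` for a multi-word token range, and `(5, 1)` for an empty node such as `5.1`. A hedged sketch (hypothetical helper, not stanza's actual parser) of how a raw CoNLL-U ID column maps onto that convention:

```python
def parse_id(field):
    # "6-7" -> (6, 7): a multi-word token covering words 6 through 7
    # "5.1" -> (5, 1): an empty (enhanced-dependency) node after word 5
    # "3"   -> (3,):   an ordinary word
    if "-" in field:
        start, end = field.split("-")
        return (int(start), int(end))
    if "." in field:
        word, sub = field.split(".")
        return (int(word), int(sub))
    return (int(field),)

print(parse_id("6-7"))  # -> (6, 7)
print(parse_id("3"))    # -> (3,)
```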
+ # sample is two sentences long so that the tests check multiple sentences
+ RUSSIAN_SAMPLE = """
+ # sent_id = yandex.reviews-f-8xh5zqnmwak3t6p68y4rhwd4e0-1969-9253
+ # genre = review
+ # text = Как- то слишком мало цветов получают актёры после спектакля.
+ 1 Как как-то ADV _ Degree=Pos|PronType=Ind 7 advmod _ SpaceAfter=No
+ 2 - - PUNCT _ _ 3 punct _ _
+ 3 то то PART _ _ 1 list _ deprel=list:goeswith
+ 4 слишком слишком ADV _ Degree=Pos 5 advmod _ _
+ 5 мало мало ADV _ Degree=Pos 6 advmod _ _
+ 6 цветов цветок NOUN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Plur 7 obj _ _
+ 7 получают получать VERB _ Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ _
+ 8 актёры актер NOUN _ Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur 7 nsubj _ _
+ 9 после после ADP _ _ 10 case _ _
+ 10 спектакля спектакль NOUN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 7 obl _ SpaceAfter=No
+ 11 . . PUNCT _ _ 7 punct _ _
+
+ # sent_id = 4
+ # genre = social
+ # text = В женщине важна верность, а не красота.
+ 1 В в ADP _ _ 2 case _ _
+ 2 женщине женщина NOUN _ Animacy=Anim|Case=Loc|Gender=Fem|Number=Sing 3 obl _ _
+ 3 важна важный ADJ _ Degree=Pos|Gender=Fem|Number=Sing|Variant=Short 0 root _ _
+ 4 верность верность NOUN _ Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing 3 nsubj _ SpaceAfter=No
+ 5 , , PUNCT _ _ 8 punct _ _
+ 6 а а CCONJ _ _ 8 cc _ _
+ 7 не не PART _ Polarity=Neg 8 advmod _ _
+ 8 красота красота NOUN _ Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing 4 conj _ SpaceAfter=No
+ 9 . . PUNCT _ _ 3 punct _ _
+ """.strip()
+
+ RUSSIAN_TEXT = ["Как- то слишком мало цветов получают актёры после спектакля.", "В женщине важна верность, а не красота."]
+ RUSSIAN_IDS = ["yandex.reviews-f-8xh5zqnmwak3t6p68y4rhwd4e0-1969-9253", "4"]
+
+ def check_russian_doc(doc):
+     """
+     Refactored the test for the Russian doc so we can use it to test various file methods
+     """
+     lines = RUSSIAN_SAMPLE.split("\n")
+     assert len(doc.sentences) == 2
+     assert lines[0] == doc.sentences[0].comments[0]
+     assert lines[1] == doc.sentences[0].comments[1]
+     assert lines[2] == doc.sentences[0].comments[2]
+     for sent_idx, (expected_text, expected_id, sentence) in enumerate(zip(RUSSIAN_TEXT, RUSSIAN_IDS, doc.sentences)):
+         assert expected_text == sentence.text
+         assert expected_id == sentence.sent_id
+         assert sent_idx == sentence.index
+         assert len(sentence.comments) == 3
+         assert not sentence.has_enhanced_dependencies()
+
+     sentences = "{:C}".format(doc)
+     sentences = sentences.split("\n\n")
+     assert len(sentences) == 2
+
+     sentence = sentences[0].split("\n")
+     assert len(sentence) == 14
+     assert lines[0] == sentence[0]
+     assert lines[1] == sentence[1]
+     assert lines[2] == sentence[2]
+
+     # assert that the weird deprel=list:goeswith was properly handled
+     assert doc.sentences[0].words[2].head == 1
+     assert doc.sentences[0].words[2].deprel == "list:goeswith"
+
+ def test_write_russian_doc(tmp_path):
+     """
+     Specifically test the write_doc2conll method
+     """
+     filename = tmp_path / "russian.conll"
+     doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
+     check_russian_doc(doc)
+     CoNLL.write_doc2conll(doc, filename)
+
+     with open(filename, encoding="utf-8") as fin:
+         text = fin.read()
+
+     # the conll docs have to end with \n\n
+     assert text.endswith("\n\n")
+
+     # but to compare against the original, strip off the whitespace
+     text = text.strip()
+
+     # we skip the first sentence because the "deprel=list:goeswith" is weird
+     # note that the deprel itself is checked in check_russian_doc
+     text = text[text.find("# sent_id = 4"):]
+     sample = RUSSIAN_SAMPLE[RUSSIAN_SAMPLE.find("# sent_id = 4"):]
+     assert text == sample
+
+     doc2 = CoNLL.conll2doc(filename)
+     check_russian_doc(doc2)
+
+ # random sentence from EN_Pronouns
+ ENGLISH_SAMPLE = """
+ # newdoc
+ # sent_id = 1
+ # text = It is hers.
+ # previous = Which person owns this?
+ # comment = copular subject
+ 1 It it PRON PRP Number=Sing|Person=3|PronType=Prs 3 nsubj _ _
+ 2 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 cop _ _
+ 3 hers hers PRON PRP Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs 0 root _ SpaceAfter=No
+ 4 . . PUNCT . _ 3 punct _ _
+ """.strip()
+
+ def test_write_to_io():
+     doc = CoNLL.conll2doc(input_str=ENGLISH_SAMPLE)
+     output = io.StringIO()
+     CoNLL.write_doc2conll(doc, output)
+     output_value = output.getvalue()
+     assert output_value.endswith("\n\n")
+     assert output_value.strip() == ENGLISH_SAMPLE
+
+ def test_write_doc2conll_append(tmp_path):
+     doc = CoNLL.conll2doc(input_str=ENGLISH_SAMPLE)
+     filename = tmp_path / "english.conll"
+     CoNLL.write_doc2conll(doc, filename)
+     CoNLL.write_doc2conll(doc, filename, mode="a")
+
+     with open(filename) as fin:
+         text = fin.read()
+     expected = ENGLISH_SAMPLE + "\n\n" + ENGLISH_SAMPLE + "\n\n"
+     assert text == expected
+
+ def test_doc_with_comments():
+     """
+     Test that a doc with comments gets converted back with comments
+     """
+     doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
+     check_russian_doc(doc)
+
+ def test_unusual_misc():
+     """
+     The above RUSSIAN_SAMPLE resulted in a blank misc field in one particular implementation of the conll code
+     (the below test would fail)
+     """
+     doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
+     sentences = "{:C}".format(doc).split("\n\n")
+     assert len(sentences) == 2
+     sentence = sentences[0].split("\n")
+     assert len(sentence) == 14
+
+     for word in sentence:
+         pieces = word.split("\t")
+         assert len(pieces) == 1 or len(pieces) == 10
+         if len(pieces) == 10:
+             assert all(piece for piece in pieces)
+
+ def test_file():
+     """
+     Test loading a doc from a file
+     """
+     with tempfile.TemporaryDirectory() as tempdir:
+         filename = os.path.join(tempdir, "russian.conll")
+         with open(filename, "w", encoding="utf-8") as fout:
+             fout.write(RUSSIAN_SAMPLE)
+         doc = CoNLL.conll2doc(input_file=filename)
+         check_russian_doc(doc)
+
+ def test_zip_file():
+     """
+     Test loading a doc from a zip file
+     """
+     with tempfile.TemporaryDirectory() as tempdir:
+         zip_file = os.path.join(tempdir, "russian.zip")
+         filename = "russian.conll"
+         with ZipFile(zip_file, "w") as zout:
+             with zout.open(filename, "w") as fout:
+                 fout.write(RUSSIAN_SAMPLE.encode())
+
+         doc = CoNLL.conll2doc(input_file=filename, zip_file=zip_file)
+         check_russian_doc(doc)
+
+ SIMPLE_NER = """
+ # text = Teferi's best friend is Karn
+ # sent_id = 0
+ 1 Teferi _ _ _ _ 0 _ _ start_char=0|end_char=6|ner=S-PERSON
+ 2 's _ _ _ _ 1 _ _ start_char=6|end_char=8|ner=O
+ 3 best _ _ _ _ 2 _ _ start_char=9|end_char=13|ner=O
+ 4 friend _ _ _ _ 3 _ _ start_char=14|end_char=20|ner=O
+ 5 is _ _ _ _ 4 _ _ start_char=21|end_char=23|ner=O
+ 6 Karn _ _ _ _ 5 _ _ start_char=24|end_char=28|ner=S-PERSON
+ """.strip()
+
+ def test_simple_ner_conversion():
+     """
+     Test that tokens get properly created with NER tags
+     """
+     doc = CoNLL.conll2doc(input_str=SIMPLE_NER)
+     assert len(doc.sentences) == 1
+     sentence = doc.sentences[0]
+     assert len(sentence.tokens) == 6
+     EXPECTED_NER = ["S-PERSON", "O", "O", "O", "O", "S-PERSON"]
+     for token, ner in zip(sentence.tokens, EXPECTED_NER):
+         assert token.ner == ner
+         # check that the ner, start_char, end_char fields were not put on the token's misc
+         # those should all be set as specific fields on the token
+         assert not token.misc
+         assert len(token.words) == 1
+         # they should also not reach the word's misc field
+         assert not token.words[0].misc
+
+     conll = "{:C}".format(doc)
+     assert conll == SIMPLE_NER
+
+ MWT_NER = """
+ # text = This makes John's headache worse
+ # sent_id = 0
+ 1 This _ _ _ _ 0 _ _ start_char=0|end_char=4|ner=O
+ 2 makes _ _ _ _ 1 _ _ start_char=5|end_char=10|ner=O
+ 3-4 John's _ _ _ _ _ _ _ start_char=11|end_char=17|ner=S-PERSON
+ 3 John _ _ _ _ 2 _ _ _
+ 4 's _ _ _ _ 3 _ _ _
+ 5 headache _ _ _ _ 4 _ _ start_char=18|end_char=26|ner=O
+ 6 worse _ _ _ _ 5 _ _ start_char=27|end_char=32|ner=O
+ """.strip()
+
+ def test_mwt_ner_conversion():
+     """
+     Test that tokens including MWT get properly created with NER tags
+
+     Note that this kind of thing happens with the EWT tokenizer for English, for example
+     """
+     doc = CoNLL.conll2doc(input_str=MWT_NER)
+     assert len(doc.sentences) == 1
+     sentence = doc.sentences[0]
+     assert len(sentence.tokens) == 5
+     assert not sentence.has_enhanced_dependencies()
+     EXPECTED_NER = ["O", "O", "S-PERSON", "O", "O"]
+     EXPECTED_WORDS = [1, 1, 2, 1, 1]
+     for token, ner, expected_words in zip(sentence.tokens, EXPECTED_NER, EXPECTED_WORDS):
+         assert token.ner == ner
+         # check that the ner, start_char, end_char fields were not put on the token's misc
+         # those should all be set as specific fields on the token
+         assert not token.misc
+         assert len(token.words) == expected_words
+         # they should also not reach the word's misc field
+         assert not token.words[0].misc
+
+     conll = "{:C}".format(doc)
+     assert conll == MWT_NER
+
+
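The NER tests assert BIOES-style tags (`S-PERSON`, `O`, ...) on each token. As a hedged aside, not stanza's code, decoding such a tag sequence into entity spans is a small mechanical step:

```python
def bioes_to_spans(tags):
    # Collapse a BIOES tag sequence into (type, start, end) spans,
    # end exclusive: S-X is a single-token span, B-X..E-X a longer one.
    spans = []
    start, etype = None, None
    for i, tag in enumerate(tags):
        if tag == "O":
            start, etype = None, None
            continue
        marker, _, label = tag.partition("-")
        if marker == "S":
            spans.append((label, i, i + 1))
            start, etype = None, None
        elif marker == "B":
            start, etype = i, label
        elif marker == "E" and etype == label:
            spans.append((label, start, i + 1))
            start, etype = None, None
    return spans

print(bioes_to_spans(["S-PERSON", "O", "O", "O", "O", "S-PERSON"]))
# -> [('PERSON', 0, 1), ('PERSON', 5, 6)]
```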
+ # A random sentence from et_ewt-ud-train.conllu
+ # which we use to test the deps conversion for multiple deps
+ ESTONIAN_DEPS = """
+ # newpar
+ # sent_id = aia_foorum_37
+ # text = Sestpeale ei mõistagi neid, kes koduaias sortidega tegelevad.
+ 1 Sestpeale sest_peale ADV D _ 3 advmod 3:advmod _
+ 2 ei ei AUX V Polarity=Neg 3 aux 3:aux _
+ 3 mõistagi mõistma VERB V Connegative=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act 0 root 0:root _
+ 4 neid tema PRON P Case=Par|Number=Plur|Person=3|PronType=Prs 3 obj 3:obj|9:nsubj SpaceAfter=No
+ 5 , , PUNCT Z _ 9 punct 9:punct _
+ 6 kes kes PRON P Case=Nom|Number=Plur|PronType=Int,Rel 9 nsubj 4:ref _
+ 7 koduaias kodu_aed NOUN S Case=Ine|Number=Sing 9 obl 9:obl _
+ 8 sortidega sort NOUN S Case=Com|Number=Plur 9 obl 9:obl _
+ 9 tegelevad tegelema VERB V Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 4 acl:relcl 4:acl SpaceAfter=No
+ 10 . . PUNCT Z _ 3 punct 3:punct _
+ """.strip()
+
+ def test_deps_conversion():
+     doc = CoNLL.conll2doc(input_str=ESTONIAN_DEPS)
+     assert len(doc.sentences) == 1
+     sentence = doc.sentences[0]
+     assert len(sentence.tokens) == 10
+     assert sentence.has_enhanced_dependencies()
+
+     word = doc.sentences[0].words[3]
+     assert word.deps == "3:obj|9:nsubj"
+
+     conll = "{:C}".format(doc)
+     assert conll == ESTONIAN_DEPS
+
+ ESTONIAN_EMPTY_DEPS = """
+ # sent_id = ewtb2_000035_15
+ # text = Ja paari aasta pärast rôômalt maasikatele ...
+ 1 Ja ja CCONJ J _ 3 cc 5.1:cc _
+ 2 paari paar NUM N Case=Gen|Number=Sing|NumForm=Word|NumType=Card 3 nummod 3:nummod _
+ 3 aasta aasta NOUN S Case=Gen|Number=Sing 0 root 5.1:obl _
+ 4 pärast pärast ADP K AdpType=Post 3 case 3:case _
+ 5 rôômalt rõõmsalt ADV D Typo=Yes 3 advmod 5.1:advmod Orphan=Yes|CorrectForm=rõõmsalt
+ 5.1 panna panema VERB V VerbForm=Inf _ _ 0:root Empty=5.1
+ 6 maasikatele maasikas NOUN S Case=All|Number=Plur 3 obl 5.1:obl Orphan=Yes
+ 7 ... ... PUNCT Z _ 3 punct 5.1:punct _
+ """.strip()
+
+ ESTONIAN_EMPTY_END_DEPS = """
+ # sent_id = ewtb2_000035_15
+ # text = Ja paari aasta pärast rôômalt maasikatele ...
+ 1 Ja ja CCONJ J _ 3 cc 5.1:cc _
+ 2 paari paar NUM N Case=Gen|Number=Sing|NumForm=Word|NumType=Card 3 nummod 3:nummod _
+ 3 aasta aasta NOUN S Case=Gen|Number=Sing 0 root 5.1:obl _
+ 4 pärast pärast ADP K AdpType=Post 3 case 3:case _
+ 5 rôômalt rõõmsalt ADV D Typo=Yes 3 advmod 5.1:advmod Orphan=Yes|CorrectForm=rõõmsalt
+ 5.1 panna panema VERB V VerbForm=Inf _ _ 0:root Empty=5.1
+ """.strip()
+
+ def test_empty_deps_conversion():
+     """
+     Check that we can read and then output a sentence with empty dependencies
+     """
+     check_empty_deps_conversion(ESTONIAN_EMPTY_DEPS, 7)
+
+ def test_empty_deps_at_end_conversion():
+     """
+     The empty deps conversion should also work if the empty dep is at the end
+     """
+     check_empty_deps_conversion(ESTONIAN_EMPTY_END_DEPS, 5)
+
+ def check_empty_deps_conversion(input_str, expected_words):
+     doc = CoNLL.conll2doc(input_str=input_str, ignore_gapping=False)
+     assert len(doc.sentences) == 1
+     assert len(doc.sentences[0].tokens) == expected_words
+     assert len(doc.sentences[0].words) == expected_words
+     assert len(doc.sentences[0].empty_words) == 1
+
+     sentence = doc.sentences[0]
+     conll = "{:C}".format(doc)
+     assert conll == input_str
+
+     sentence_dict = doc.sentences[0].to_dict()
+     assert len(sentence_dict) == expected_words + 1
+     # currently this is true for both of the examples we run
+     assert sentence_dict[5]['id'] == (5, 1)
+
+     # redo the above checks to make sure
+     # there are no weird bugs in the accessors
+     assert len(doc.sentences) == 1
+     assert len(doc.sentences[0].tokens) == expected_words
+     assert len(doc.sentences[0].words) == expected_words
+     assert len(doc.sentences[0].empty_words) == 1
+
+
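The enhanced-dependency tests check raw `deps` strings like `"3:obj|9:nsubj"`. As a hedged sketch (hypothetical helper, not stanza's parser), the DEPS column splits into (head, deprel) pairs; heads are kept as strings because they may be decimal ids of empty nodes like `5.1`, and deprels themselves may contain colons (`acl:relcl`), so only the first colon separates head from relation:

```python
def parse_deps(field):
    # "3:obj|9:nsubj" -> [("3", "obj"), ("9", "nsubj")]
    # "_" means no enhanced dependencies at all
    if field == "_":
        return []
    return [tuple(item.split(":", 1)) for item in field.split("|")]

print(parse_deps("3:obj|9:nsubj"))  # -> [('3', 'obj'), ('9', 'nsubj')]
print(parse_deps("5.1:advmod"))     # -> [('5.1', 'advmod')]
```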
+ ESTONIAN_DOC_ID = """
401
+ # doc_id = this_is_a_doc
402
+ # sent_id = ewtb2_000035_15
403
+ # text = Ja paari aasta pärast rôômalt maasikatele ...
404
+ 1 Ja ja CCONJ J _ 3 cc 5.1:cc _
405
+ 2 paari paar NUM N Case=Gen|Number=Sing|NumForm=Word|NumType=Card 3 nummod 3:nummod _
406
+ 3 aasta aasta NOUN S Case=Gen|Number=Sing 0 root 5.1:obl _
407
+ 4 pärast pärast ADP K AdpType=Post 3 case 3:case _
408
+ 5 rôômalt rõõmsalt ADV D Typo=Yes 3 advmod 5.1:advmod Orphan=Yes|CorrectForm=rõõmsalt
409
+ 5.1 panna panema VERB V VerbForm=Inf _ _ 0:root Empty=5.1
410
+ 6 maasikatele maasikas NOUN S Case=All|Number=Plur 3 obl 5.1:obl Orphan=Yes
411
+ 7 ... ... PUNCT Z _ 3 punct 5.1:punct _
412
+ """.strip()
413
+
414
+ def test_read_doc_id():
415
+ doc = CoNLL.conll2doc(input_str=ESTONIAN_DOC_ID, ignore_gapping=False)
416
+ assert "{:C}".format(doc) == ESTONIAN_DOC_ID
417
+ assert doc.sentences[0].doc_id == 'this_is_a_doc'
418
+
419
+ SIMPLE_DEPENDENCY_INDEX_ERROR = """
420
+ # text = Teferi's best friend is Karn
421
+ # sent_id = 0
422
+ # notes = this sentence has a dependency index outside the sentence. it should throw an IndexError
423
+ 1 Teferi _ _ _ _ 0 root _ start_char=0|end_char=6|ner=S-PERSON
424
+ 2 's _ _ _ _ 1 dep _ start_char=6|end_char=8|ner=O
425
+ 3 best _ _ _ _ 2 dep _ start_char=9|end_char=13|ner=O
426
+ 4 friend _ _ _ _ 3 dep _ start_char=14|end_char=20|ner=O
427
+ 5 is _ _ _ _ 4 dep _ start_char=21|end_char=23|ner=O
428
+ 6 Karn _ _ _ _ 8 dep _ start_char=24|end_char=28|ner=S-PERSON
429
+ """.strip()
430
+
431
+ def test_read_dependency_errors():
432
+ with pytest.raises(IndexError):
433
+ doc = CoNLL.conll2doc(input_str=SIMPLE_DEPENDENCY_INDEX_ERROR)
434
+
435
+ MULTIPLE_DOC_IDS = """
436
+ # doc_id = doc_1
437
+ # sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0020
438
+ # text = His mother was also killed in the attack.
439
+ 1 His his PRON PRP$ Case=Gen|Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 2 nmod:poss 2:nmod:poss _
440
+ 2 mother mother NOUN NN Number=Sing 5 nsubj:pass 5:nsubj:pass _
441
+ 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 5 aux:pass 5:aux:pass _
442
+ 4 also also ADV RB _ 5 advmod 5:advmod _
443
+ 5 killed kill VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _
444
+ 6 in in ADP IN _ 8 case 8:case _
445
+ 7 the the DET DT Definite=Def|PronType=Art 8 det 8:det _
446
+ 8 attack attack NOUN NN Number=Sing 5 obl 5:obl:in SpaceAfter=No
447
+ 9 . . PUNCT . _ 5 punct 5:punct _
448
+
449
+ # doc_id = doc_1
450
+ # sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0028
451
+ # text = This item is a small one and easily missed.
452
+ 1 This this DET DT Number=Sing|PronType=Dem 2 det 2:det _
453
+ 2 item item NOUN NN Number=Sing 6 nsubj 6:nsubj|9:nsubj:pass _
454
+ 3 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 cop 6:cop _
455
+ 4 a a DET DT Definite=Ind|PronType=Art 6 det 6:det _
456
+ 5 small small ADJ JJ Degree=Pos 6 amod 6:amod _
457
+ 6 one one NOUN NN Number=Sing 0 root 0:root _
458
+ 7 and and CCONJ CC _ 9 cc 9:cc _
459
+ 8 easily easily ADV RB _ 9 advmod 9:advmod _
460
+ 9 missed miss VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 6 conj 6:conj:and SpaceAfter=No
461
+ 10 . . PUNCT . _ 6 punct 6:punct _
462
+
463
+ # doc_id = doc_2
464
+ # sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0029
465
+ # text = But in my view it is highly significant.
466
+ 1 But but CCONJ CC _ 8 cc 8:cc _
467
+ 2 in in ADP IN _ 4 case 4:case _
468
+ 3 my my PRON PRP$ Case=Gen|Number=Sing|Person=1|Poss=Yes|PronType=Prs 4 nmod:poss 4:nmod:poss _
469
+ 4 view view NOUN NN Number=Sing 8 obl 8:obl:in _
470
+ 5 it it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 8 nsubj 8:nsubj _
471
+ 6 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 8 cop 8:cop _
472
+ 7 highly highly ADV RB _ 8 advmod 8:advmod _
473
+ 8 significant significant ADJ JJ Degree=Pos 0 root 0:root SpaceAfter=No
474
+ 9 . . PUNCT . _ 8 punct 8:punct _
475
+
476
+ # sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0040
+ # text = The trial begins again Nov.28.
+ 1 The the DET DT Definite=Def|PronType=Art 2 det 2:det _
+ 2 trial trial NOUN NN Number=Sing 3 nsubj 3:nsubj _
+ 3 begins begin VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _
+ 4 again again ADV RB _ 3 advmod 3:advmod _
+ 5 Nov. November PROPN NNP Abbr=Yes|Number=Sing 3 obl:tmod 3:obl:tmod SpaceAfter=No
+ 6 28 28 NUM CD NumForm=Digit|NumType=Card 5 nummod 5:nummod SpaceAfter=No
+ 7 . . PUNCT . _ 3 punct 3:punct _
+
+ """.lstrip()
+
+ def test_read_multiple_doc_ids():
+     docs = CoNLL.conll2multi_docs(input_str=MULTIPLE_DOC_IDS)
+     assert len(docs) == 2
+     assert len(docs[0].sentences) == 2
+     assert len(docs[1].sentences) == 2
+
+     # remove the first doc_id comment
+     text = "\n".join(MULTIPLE_DOC_IDS.split("\n")[1:])
+     docs = CoNLL.conll2multi_docs(input_str=text)
+     assert len(docs) == 3
+     assert len(docs[0].sentences) == 1
+     assert len(docs[1].sentences) == 1
+     assert len(docs[2].sentences) == 2
+
+ ENGLISH_TEST_SENTENCE = """
+ # text = This is a test
+ # sent_id = 0
+ 1 This this PRON DT Number=Sing|PronType=Dem 4 nsubj _ start_char=0|end_char=4
+ 2 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 4 cop _ start_char=5|end_char=7
+ 3 a a DET DT Definite=Ind|PronType=Art 4 det _ start_char=8|end_char=9
+ 4 test test NOUN NN Number=Sing 0 root _ start_char=10|end_char=14|SpaceAfter=No
+ """.lstrip()
+
+ def test_convert_dict():
+     doc = CoNLL.conll2doc(input_str=ENGLISH_TEST_SENTENCE)
+     converted = CoNLL.convert_dict(doc.to_dict())
+
+     expected = [[['1', 'This', 'this', 'PRON', 'DT', 'Number=Sing|PronType=Dem', '4', 'nsubj', '_', 'start_char=0|end_char=4'],
+                  ['2', 'is', 'be', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'cop', '_', 'start_char=5|end_char=7'],
+                  ['3', 'a', 'a', 'DET', 'DT', 'Definite=Ind|PronType=Art', '4', 'det', '_', 'start_char=8|end_char=9'],
+                  ['4', 'test', 'test', 'NOUN', 'NN', 'Number=Sing', '0', 'root', '_', 'SpaceAfter=No|start_char=10|end_char=14']]]
+
+     assert converted == expected
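The tests above round-trip CoNLL-U text through stanza's `CoNLL` reader. As a minimal sketch of the format they exercise (sentences separated by blank lines, `#` comment lines, one token per row), the parsing can be illustrated with the standard library alone; `parse_conllu` is a hypothetical helper for illustration, not stanza's CoNLL reader:

```python
# Minimal sketch of the CoNLL-U shape the tests above exercise.
# Illustration only -- not stanza's CoNLL.conll2doc.
CONLLU = """# text = This is a test
1\tThis\tthis\tPRON\tDT\t_\t4\tnsubj\t_\t_
2\tis\tbe\tAUX\tVBZ\t_\t4\tcop\t_\t_
3\ta\ta\tDET\tDT\t_\t4\tdet\t_\t_
4\ttest\ttest\tNOUN\tNN\t_\t0\troot\t_\t_
"""

def parse_conllu(text):
    # blank lines separate sentences; "#" lines are comments; token rows
    # are tab-separated columns (ID, FORM, LEMMA, UPOS, XPOS, ...)
    sentences = []
    for block in text.strip().split("\n\n"):
        rows = [line.split("\t") for line in block.split("\n")
                if line and not line.startswith("#")]
        sentences.append(rows)
    return sentences

sentences = parse_conllu(CONLLU)
```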
stanza/stanza/tests/common/test_foundation_cache.py ADDED
@@ -0,0 +1,36 @@
+ import glob
+ import os
+ import shutil
+ import tempfile
+
+ import pytest
+
+ import stanza
+ from stanza.models.common.foundation_cache import FoundationCache, load_charlm
+ from stanza.tests import TEST_MODELS_DIR
+
+ pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+ def test_charlm_cache():
+     models_path = os.path.join(TEST_MODELS_DIR, "en", "backward_charlm", "*")
+     models = glob.glob(models_path)
+     # we expect at least one English model downloaded for the tests
+     assert len(models) >= 1
+     model_file = models[0]
+
+     cache = FoundationCache()
+     with tempfile.TemporaryDirectory(dir=".") as test_dir:
+         temp_file = os.path.join(test_dir, "charlm.pt")
+         shutil.copy2(model_file, temp_file)
+         # this will work
+         model = load_charlm(temp_file)
+
+         # this will save the model
+         model = cache.load_charlm(temp_file)
+
+     # this should no longer work
+     with pytest.raises(FileNotFoundError):
+         model = load_charlm(temp_file)
+
+     # it should remember the cached version
+     model = cache.load_charlm(temp_file)
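The test above depends on the cache keeping a loaded model in memory keyed by filename, so a second lookup succeeds even after the underlying file is gone. That caching pattern can be sketched with the standard library; `FileCache` here is a hypothetical stand-in, not stanza's `FoundationCache`:

```python
import os
import tempfile

# Hypothetical stand-in for the caching pattern test_charlm_cache relies on:
# load once from disk, then always return the in-memory copy for that path.
class FileCache:
    def __init__(self):
        self._models = {}

    def load(self, filename):
        if filename not in self._models:
            with open(filename) as fin:
                self._models[filename] = fin.read()
        return self._models[filename]

cache = FileCache()
with tempfile.TemporaryDirectory() as tempdir:
    path = os.path.join(tempdir, "model.pt")
    with open(path, "w") as fout:
        fout.write("weights")
    first = cache.load(path)

# the file is deleted with the temp dir, but the cached copy survives
assert not os.path.exists(path)
assert cache.load(path) == first == "weights"
```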
stanza/stanza/tests/common/test_pretrain.py ADDED
@@ -0,0 +1,139 @@
+ import os
+ import tempfile
+
+ import pytest
+ import numpy as np
+ import torch
+
+ from stanza.models.common import pretrain
+ from stanza.models.common.vocab import UNK_ID
+ from stanza.tests import *
+
+ pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+ def check_vocab(vocab):
+     # 4 base vectors, plus the 3 vectors actually present in the file
+     assert len(vocab) == 7
+     assert 'unban' in vocab
+     assert 'mox' in vocab
+     assert 'opal' in vocab
+
+ def check_embedding(emb, unk=False):
+     expected = np.array([[ 0.,  0.,  0.,  0.,],
+                          [ 0.,  0.,  0.,  0.,],
+                          [ 0.,  0.,  0.,  0.,],
+                          [ 0.,  0.,  0.,  0.,],
+                          [ 1.,  2.,  3.,  4.,],
+                          [ 5.,  6.,  7.,  8.,],
+                          [ 9., 10., 11., 12.,]])
+     if unk:
+         expected[UNK_ID] = -1
+     np.testing.assert_allclose(emb, expected)
+
+ def check_pretrain(pt):
+     check_vocab(pt.vocab)
+     check_embedding(pt.emb)
+
+ def test_text_pretrain():
+     pt = pretrain.Pretrain(vec_filename=f'{TEST_WORKING_DIR}/in/tiny_emb.txt', save_to_file=False)
+     check_pretrain(pt)
+
+ def test_xz_pretrain():
+     pt = pretrain.Pretrain(vec_filename=f'{TEST_WORKING_DIR}/in/tiny_emb.xz', save_to_file=False)
+     check_pretrain(pt)
+
+ def test_gz_pretrain():
+     pt = pretrain.Pretrain(vec_filename=f'{TEST_WORKING_DIR}/in/tiny_emb.gz', save_to_file=False)
+     check_pretrain(pt)
+
+ def test_zip_pretrain():
+     pt = pretrain.Pretrain(vec_filename=f'{TEST_WORKING_DIR}/in/tiny_emb.zip', save_to_file=False)
+     check_pretrain(pt)
+
+ def test_csv_pretrain():
+     pt = pretrain.Pretrain(csv_filename=f'{TEST_WORKING_DIR}/in/tiny_emb.csv', save_to_file=False)
+     check_pretrain(pt)
+
+ def test_resave_pretrain():
+     """
+     Test saving a pretrain and then loading from the existing file
+     """
+     test_pt_file = tempfile.NamedTemporaryFile(dir=f'{TEST_WORKING_DIR}/out', suffix=".pt", delete=False)
+     try:
+         test_pt_file.close()
+         # note that this tests the ability to save a pretrain and the
+         # ability to fall back when the existing pretrain isn't working
+         pt = pretrain.Pretrain(filename=test_pt_file.name,
+                                vec_filename=f'{TEST_WORKING_DIR}/in/tiny_emb.xz')
+         check_pretrain(pt)
+
+         pt2 = pretrain.Pretrain(filename=test_pt_file.name,
+                                 vec_filename=f'unban_mox_opal')
+         check_pretrain(pt2)
+
+         pt3 = torch.load(test_pt_file.name, weights_only=True)
+         check_embedding(pt3['emb'])
+     finally:
+         os.unlink(test_pt_file.name)
+
+ SPACE_PRETRAIN="""
+ 3 4
+ unban mox 1 2 3 4
+ opal 5 6 7 8
+ foo 9 10 11 12
+ """.strip()
+
+ def test_whitespace():
+     """
+     Test reading a pretrain with an ascii space in it
+
+     The vocab word with a space in it should have the correct number
+     of dimensions read, with the space converted to nbsp
+     """
+     test_txt_file = tempfile.NamedTemporaryFile(dir=f'{TEST_WORKING_DIR}/out', suffix=".txt", delete=False)
+     try:
+         test_txt_file.write(SPACE_PRETRAIN.encode())
+         test_txt_file.close()
+
+         pt = pretrain.Pretrain(vec_filename=test_txt_file.name, save_to_file=False)
+         check_embedding(pt.emb)
+         assert "unban\xa0mox" in pt.vocab
+         # this one also works because of the normalize_unit in vocab.py
+         assert "unban mox" in pt.vocab
+     finally:
+         os.unlink(test_txt_file.name)
+
+ NO_HEADER_PRETRAIN="""
+ unban 1 2 3 4
+ mox 5 6 7 8
+ opal 9 10 11 12
+ """.strip()
+
+ def test_no_header():
+     """
+     Check loading a pretrain with no rows,cols header
+     """
+     with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as tmpdir:
+         filename = os.path.join(tmpdir, "tiny.txt")
+         with open(filename, "w", encoding="utf-8") as fout:
+             fout.write(NO_HEADER_PRETRAIN)
+         pt = pretrain.Pretrain(vec_filename=filename, save_to_file=False)
+         check_embedding(pt.emb)
+
+ UNK_PRETRAIN="""
+ unban 1 2 3 4
+ mox 5 6 7 8
+ opal 9 10 11 12
+ <unk> -1 -1 -1 -1
+ """.strip()
+
+ def test_unk_pretrain():
+     """
+     Check loading a pretrain with <unk> at the end, like GloVe does
+     """
+     with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as tmpdir:
+         filename = os.path.join(tmpdir, "tiny.txt")
+         with open(filename, "w", encoding="utf-8") as fout:
+             fout.write(UNK_PRETRAIN)
+         pt = pretrain.Pretrain(vec_filename=filename, save_to_file=False)
+         check_embedding(pt.emb, unk=True)
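`test_whitespace` above checks that a vector line with more whitespace-separated fields than dimensions is read as a single multiword key, with the internal space converted to a non-breaking space. Assuming the dimension is known, that rule can be sketched as follows; `parse_vector_line` is my own illustration, not stanza's pretrain reader:

```python
# Sketch of the rule test_whitespace exercises: with `dim` known, the last
# `dim` fields are the vector and everything before them is the word, with
# ascii spaces in the word replaced by non-breaking spaces (\xa0).
# Illustration only -- not stanza's pretrain loader.
def parse_vector_line(line, dim):
    fields = line.split()
    word = "\xa0".join(fields[:-dim])
    values = [float(v) for v in fields[-dim:]]
    return word, values

word, values = parse_vector_line("unban mox 1 2 3 4", 4)
```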
stanza/stanza/tests/common/test_utils.py ADDED
@@ -0,0 +1,194 @@
+ import lzma
+ import os
+ import tempfile
+
+ import pytest
+
+ import stanza
+ import stanza.models.common.utils as utils
+ from stanza.tests import *
+
+ pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+ def test_wordvec_not_found():
+     """
+     get_wordvec_file should fail if neither word2vec nor fasttext exists
+     """
+     with tempfile.TemporaryDirectory(dir=f'{TEST_WORKING_DIR}/out') as temp_dir:
+         with pytest.raises(FileNotFoundError):
+             utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo')
+
+
+ def test_word2vec_xz():
+     """
+     Test searching for word2vec and xz files
+     """
+     with tempfile.TemporaryDirectory(dir=f'{TEST_WORKING_DIR}/out') as temp_dir:
+         # make a fake directory for English word vectors
+         word2vec_dir = os.path.join(temp_dir, 'word2vec', 'English')
+         os.makedirs(word2vec_dir)
+
+         # make a fake English word vector file
+         fake_file = os.path.join(word2vec_dir, 'en.vectors.xz')
+         fout = open(fake_file, 'w')
+         fout.close()
+
+         # get_wordvec_file should now find this fake file
+         filename = utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo')
+         assert filename == fake_file
+
+ def test_fasttext_txt():
+     """
+     Test searching for fasttext and txt files
+     """
+     with tempfile.TemporaryDirectory(dir=f'{TEST_WORKING_DIR}/out') as temp_dir:
+         # make a fake directory for English word vectors
+         fasttext_dir = os.path.join(temp_dir, 'fasttext', 'English')
+         os.makedirs(fasttext_dir)
+
+         # make a fake English word vector file
+         fake_file = os.path.join(fasttext_dir, 'en.vectors.txt')
+         fout = open(fake_file, 'w')
+         fout.close()
+
+         # get_wordvec_file should now find this fake file
+         filename = utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo')
+         assert filename == fake_file
+
+ def test_wordvec_type():
+     """
+     If we supply our own wordvec type, get_wordvec_file should find that
+     """
+     with tempfile.TemporaryDirectory(dir=f'{TEST_WORKING_DIR}/out') as temp_dir:
+         # make a fake directory for English word vectors
+         google_dir = os.path.join(temp_dir, 'google', 'English')
+         os.makedirs(google_dir)
+
+         # make a fake English word vector file
+         fake_file = os.path.join(google_dir, 'en.vectors.txt')
+         fout = open(fake_file, 'w')
+         fout.close()
+
+         # get_wordvec_file should now find this fake file
+         filename = utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo', wordvec_type='google')
+         assert filename == fake_file
+
+         # this file won't be found using the normal defaults
+         with pytest.raises(FileNotFoundError):
+             utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo')
+
+ def test_sort_with_indices():
+     data = [[1, 2, 3], [4, 5], [6]]
+     ordered, orig_idx = utils.sort_with_indices(data, key=len)
+     assert ordered == ([6], [4, 5], [1, 2, 3])
+     assert orig_idx == (2, 1, 0)
+
+     unsorted = utils.unsort(ordered, orig_idx)
+     assert data == unsorted
+
+ def test_empty_sort_with_indices():
+     ordered, orig_idx = utils.sort_with_indices([])
+     assert len(ordered) == 0
+     assert len(orig_idx) == 0
+
+     unsorted = utils.unsort(ordered, orig_idx)
+     assert [] == unsorted
+
+
+ def test_split_into_batches():
+     data = []
+     for i in range(5):
+         data.append(["Unban", "mox", "opal", str(i)])
+
+     data.append(["Do", "n't", "ban", "Urza", "'s", "Saga", "that", "card", "is", "great"])
+     data.append(["Ban", "Ragavan"])
+
+     # small batches will put one element in each interval
+     batches = utils.split_into_batches(data, 5)
+     assert batches == [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)]
+
+     # this one has a batch interrupted in the middle by a large element
+     batches = utils.split_into_batches(data, 8)
+     assert batches == [(0, 2), (2, 4), (4, 5), (5, 6), (6, 7)]
+
+     # this one has the large element at the start of its own batch
+     batches = utils.split_into_batches(data[1:], 8)
+     assert batches == [(0, 2), (2, 4), (4, 5), (5, 6)]
+
+     # overloading the test!  assert that the key & reverse is working
+     ordered, orig_idx = utils.sort_with_indices(data, key=len, reverse=True)
+     assert [len(x) for x in ordered] == [10, 4, 4, 4, 4, 4, 2]
+
+     # this has the large element at the start
+     batches = utils.split_into_batches(ordered, 8)
+     assert batches == [(0, 1), (1, 3), (3, 5), (5, 7)]
+
+     # double check that unsort is working as expected
+     assert data == utils.unsort(ordered, orig_idx)
+
+
+ def test_find_missing_tags():
+     assert utils.find_missing_tags(["O", "PER", "LOC"], ["O", "PER", "LOC"]) == []
+     assert utils.find_missing_tags(["O", "PER", "LOC"], ["O", "PER", "LOC", "ORG"]) == ['ORG']
+     assert utils.find_missing_tags([["O", "PER"], ["O", "LOC"]], [["O", "PER"], ["LOC", "ORG"]]) == ['ORG']
+
+
+ def test_open_read_text():
+     """
+     test that we can read either .xz or regular txt
+     """
+     TEXT = "this is a test"
+     with tempfile.TemporaryDirectory() as tempdir:
+         # test text file
+         filename = os.path.join(tempdir, "foo.txt")
+         with open(filename, "w") as fout:
+             fout.write(TEXT)
+         with utils.open_read_text(filename) as fin:
+             in_text = fin.read()
+             assert TEXT == in_text
+
+         assert fin.closed
+
+         # the context should close the file when we throw an exception!
+         try:
+             with utils.open_read_text(filename) as finex:
+                 assert not finex.closed
+                 raise ValueError("unban mox opal!")
+         except ValueError:
+             pass
+         assert finex.closed
+
+         # test xz file
+         filename = os.path.join(tempdir, "foo.txt.xz")
+         with lzma.open(filename, "wt") as fout:
+             fout.write(TEXT)
+         with utils.open_read_text(filename) as finxz:
+             in_text = finxz.read()
+             assert TEXT == in_text
+
+         assert finxz.closed
+
+         # the context should close the file when we throw an exception!
+         try:
+             with utils.open_read_text(filename) as finexxz:
+                 assert not finexxz.closed
+                 raise ValueError("unban mox opal!")
+         except ValueError:
+             pass
+         assert finexxz.closed
+
+
+ def test_checkpoint_name():
+     """
+     Test some expected results for the checkpoint names
+     """
+     # use os.path.split so that the test is agnostic of file separator on Linux or Windows
+     checkpoint = utils.checkpoint_name("saved_models", "kk_oscar_forward_charlm.pt", None)
+     assert os.path.split(checkpoint) == ("saved_models", "kk_oscar_forward_charlm_checkpoint.pt")
+
+     checkpoint = utils.checkpoint_name("saved_models", "kk_oscar_forward_charlm", None)
+     assert os.path.split(checkpoint) == ("saved_models", "kk_oscar_forward_charlm_checkpoint")
+
+     checkpoint = utils.checkpoint_name("saved_models", "kk_oscar_forward_charlm", "othername.pt")
+     assert os.path.split(checkpoint) == ("saved_models", "othername.pt")
+
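The sort/unsort tests above rest on one invariant: sorting with remembered original indices, then unsorting, restores the input order exactly. A minimal sketch of that pair (my own versions, not stanza's `utils.sort_with_indices` / `utils.unsort`) looks like this:

```python
# Minimal versions of the sort/unsort pair the tests above exercise.
# Illustrations only -- not stanza's implementations.
def sort_with_indices(data, key=None, reverse=False):
    # returns (ordered items, original index of each ordered item)
    if not data:
        return (), ()
    sort_key = (lambda pair: key(pair[1])) if key else (lambda pair: pair[1])
    pairs = sorted(enumerate(data), key=sort_key, reverse=reverse)
    orig_idx, ordered = zip(*pairs)
    return ordered, orig_idx

def unsort(ordered, orig_idx):
    # place each item back at its remembered original position
    result = [None] * len(ordered)
    for item, idx in zip(ordered, orig_idx):
        result[idx] = item
    return result

data = [[1, 2, 3], [4, 5], [6]]
ordered, orig_idx = sort_with_indices(data, key=len)
```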
stanza/stanza/tests/constituency/__init__.py ADDED
File without changes
stanza/stanza/tests/constituency/test_convert_arboretum.py ADDED
@@ -0,0 +1,235 @@
+ """
+ Test a couple different classes of trees to check the output of the Arboretum conversion
+
+ Note that the text has been removed
+ """
+
+ import os
+ import tempfile
+
+ import pytest
+
+ from stanza.server import tsurgeon
+ from stanza.tests import TEST_WORKING_DIR
+ from stanza.utils.datasets.constituency import convert_arboretum
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+
+ PROJ_EXAMPLE="""
+ <s id="s2" ref="AACBPIGY" source="id=AACBPIGY" forest="1/1" text="A B C D E F G H.">
+   <graph root="s2_500">
+     <terminals>
+       <t id="s2_1" word="A" lemma="A" pos="prop" morph="NOM" extra="PROP:A compound brand"/>
+       <t id="s2_2" word="B" lemma="B" pos="v-fin" morph="PR AKT" extra="mv"/>
+       <t id="s2_3" word="C" lemma="C" pos="pron-pers" morph="2S ACC" extra="--"/>
+       <t id="s2_4" word="D" lemma="D" pos="adj" morph="UTR S IDF NOM" extra="F:u+afhængig"/>
+       <t id="s2_5" word="E" lemma="E" pos="prp" morph="--" extra="--"/>
+       <t id="s2_6" word="F" lemma="F" pos="art" morph="NEU S DEF" extra="--"/>
+       <t id="s2_7" word="G" lemma="G" pos="adj" morph="nG S DEF NOM" extra="--"/>
+       <t id="s2_8" word="H" lemma="H" pos="n" morph="NEU S IDF NOM" extra="N:lys+net"/>
+       <t id="s2_9" word="." lemma="--" pos="pu" morph="--" extra="--"/>
+     </terminals>
+
+     <nonterminals>
+       <nt id="s2_500" cat="s">
+         <edge label="STA" idref="s2_501"/>
+       </nt>
+       <nt id="s2_501" cat="fcl">
+         <edge label="S" idref="s2_1"/>
+         <edge label="P" idref="s2_2"/>
+         <edge label="Od" idref="s2_3"/>
+         <edge label="Co" idref="s2_502"/>
+         <edge label="PU" idref="s2_9"/>
+       </nt>
+       <nt id="s2_502" cat="adjp">
+         <edge label="H" idref="s2_4"/>
+         <edge label="DA" idref="s2_503"/>
+       </nt>
+       <nt id="s2_503" cat="pp">
+         <edge label="H" idref="s2_5"/>
+         <edge label="DP" idref="s2_504"/>
+       </nt>
+       <nt id="s2_504" cat="np">
+         <edge label="DN" idref="s2_6"/>
+         <edge label="DN" idref="s2_7"/>
+         <edge label="H" idref="s2_8"/>
+       </nt>
+     </nonterminals>
+   </graph>
+ </s>
+ """
+
+ NOT_FIX_NONPROJ_EXAMPLE="""
+ <s id="s322" ref="EDGBITSZ" source="id=EDGBITSZ" forest="1/2" text="A B C D E, F G H I J.">
+   <graph root="s322_500">
+     <terminals>
+       <t id="s322_1" word="A" lemma="A" pos="prop" morph="NOM" extra="hum fem"/>
+       <t id="s322_2" word="B" lemma="B" pos="v-fin" morph="PR AKT" extra="mv"/>
+       <t id="s322_3" word="C" lemma="C" pos="pron-dem" morph="UTR S" extra="dem"/>
+       <t id="s322_4" word="D" lemma="D" pos="n" morph="UTR S IDF NOM" extra="--"/>
+       <t id="s322_5" word="E" lemma="E" pos="adv" morph="--" extra="--"/>
+       <t id="s322_6" word="," lemma="--" pos="pu" morph="--" extra="--"/>
+       <t id="s322_7" word="F" lemma="F" pos="pron-rel" morph="--" extra="rel"/>
+       <t id="s322_8" word="G" lemma="G" pos="prop" morph="NOM" extra="hum"/>
+       <t id="s322_9" word="H" lemma="H" pos="v-fin" morph="IMPF AKT" extra="mv"/>
+       <t id="s322_10" word="I" lemma="I" pos="prp" morph="--" extra="--"/>
+       <t id="s322_11" word="J" lemma="J" pos="n" morph="UTR S DEF NOM" extra="F:ur+premiere"/>
+       <t id="s322_12" word="." lemma="--" pos="pu" morph="--" extra="--"/>
+     </terminals>
+
+     <nonterminals>
+       <nt id="s322_500" cat="s">
+         <edge label="STA" idref="s322_501"/>
+       </nt>
+       <nt id="s322_501" cat="fcl">
+         <edge label="S" idref="s322_1"/>
+         <edge label="P" idref="s322_2"/>
+         <edge label="Od" idref="s322_502"/>
+         <edge label="Vpart" idref="s322_5"/>
+         <edge label="PU" idref="s322_6"/>
+         <edge label="PU" idref="s322_12"/>
+       </nt>
+       <nt id="s322_502" cat="np">
+         <edge label="DN" idref="s322_3"/>
+         <edge label="H" idref="s322_4"/>
+         <edge label="DN" idref="s322_503"/>
+       </nt>
+       <nt id="s322_503" cat="fcl">
+         <edge label="Od" idref="s322_7"/>
+         <edge label="S" idref="s322_8"/>
+         <edge label="P" idref="s322_9"/>
+         <edge label="Ao" idref="s322_504"/>
+       </nt>
+       <nt id="s322_504" cat="pp">
+         <edge label="H" idref="s322_10"/>
+         <edge label="DP" idref="s322_11"/>
+       </nt>
+     </nonterminals>
+   </graph>
+ </s>
+ """
+
+
+ NONPROJ_EXAMPLE="""
+ <s id="s9" ref="AATCNKQZ" source="id=AATCNKQZ" forest="1/1" text="A B C D E F G H I.">
+   <graph root="s9_500">
+     <terminals>
+       <t id="s9_1" word="A" lemma="A" pos="adv" morph="--" extra="--"/>
+       <t id="s9_2" word="B" lemma="B" pos="adv" morph="--" extra="--"/>
+       <t id="s9_3" word="C" lemma="C" pos="v-fin" morph="IMPF AKT" extra="aux"/>
+       <t id="s9_4" word="D" lemma="D" pos="prop" morph="NOM" extra="hum"/>
+       <t id="s9_5" word="E" lemma="E" pos="adv" morph="--" extra="--"/>
+       <t id="s9_6" word="F" lemma="F" pos="v-pcp2" morph="PAS" extra="mv"/>
+       <t id="s9_7" word="G" lemma="G" pos="prp" morph="--" extra="--"/>
+       <t id="s9_8" word="H" lemma="H" pos="num" morph="--" extra="card"/>
+       <t id="s9_9" word="I" lemma="I" pos="n" morph="UTR P IDF NOM" extra="N:patrulje+vogn"/>
+       <t id="s9_10" word="." lemma="--" pos="pu" morph="--" extra="--"/>
+     </terminals>
+
+     <nonterminals>
+       <nt id="s9_500" cat="s">
+         <edge label="STA" idref="s9_501"/>
+       </nt>
+       <nt id="s9_501" cat="fcl">
+         <edge label="fA" idref="s9_502"/>
+         <edge label="P" idref="s9_503"/>
+         <edge label="S" idref="s9_4"/>
+         <edge label="fA" idref="s9_5"/>
+         <edge label="fA" idref="s9_504"/>
+         <edge label="PU" idref="s9_10"/>
+       </nt>
+       <nt id="s9_502" cat="advp">
+         <edge label="DA" idref="s9_1"/>
+         <edge label="H" idref="s9_2"/>
+       </nt>
+       <nt id="s9_503" cat="vp">
+         <edge label="Vaux" idref="s9_3"/>
+         <edge label="Vm" idref="s9_6"/>
+       </nt>
+       <nt id="s9_504" cat="pp">
+         <edge label="H" idref="s9_7"/>
+         <edge label="DP" idref="s9_505"/>
+       </nt>
+       <nt id="s9_505" cat="np">
+         <edge label="DN" idref="s9_8"/>
+         <edge label="H" idref="s9_9"/>
+       </nt>
+     </nonterminals>
+   </graph>
+ </s>
+ """
+
+ def test_projective_example():
+     """
+     Test reading a basic tree, along with some further manipulations from the conversion program
+     """
+     with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as tempdir:
+         test_name = os.path.join(tempdir, "proj.xml")
+         with open(test_name, "w", encoding="utf-8") as fout:
+             fout.write(PROJ_EXAMPLE)
+         sentences = convert_arboretum.read_xml_file(test_name)
+         assert len(sentences) == 1
+
+         tree, words = convert_arboretum.process_tree(sentences[0])
+         expected_tree = "(s (fcl (prop s2_1) (v-fin s2_2) (pron-pers s2_3) (adjp (adj s2_4) (pp (prp s2_5) (np (art s2_6) (adj s2_7) (n s2_8)))) (pu s2_9)))"
+         assert str(tree) == expected_tree
+         assert [w.word for w in words.values()] == ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', '.']
+         assert not convert_arboretum.word_sequence_missing_words(tree)
+         with tsurgeon.Tsurgeon() as tsurgeon_processor:
+             assert tree == convert_arboretum.check_words(tree, tsurgeon_processor)
+
+         # check that the words can be replaced as expected
+         replaced_tree = convert_arboretum.replace_words(tree, words)
+         expected_tree = "(s (fcl (prop A) (v-fin B) (pron-pers C) (adjp (adj D) (pp (prp E) (np (art F) (adj G) (n H)))) (pu .)))"
+         assert str(replaced_tree) == expected_tree
+         assert convert_arboretum.split_underscores(replaced_tree) == replaced_tree
+
+         # fake a word which should be split
+         words['s2_1'] = words['s2_1']._replace(word='foo_bar')
+         replaced_tree = convert_arboretum.replace_words(tree, words)
+         expected_tree = "(s (fcl (prop foo_bar) (v-fin B) (pron-pers C) (adjp (adj D) (pp (prp E) (np (art F) (adj G) (n H)))) (pu .)))"
+         assert str(replaced_tree) == expected_tree
+         expected_tree = "(s (fcl (np (prop foo) (prop bar)) (v-fin B) (pron-pers C) (adjp (adj D) (pp (prp E) (np (art F) (adj G) (n H)))) (pu .)))"
+         assert str(convert_arboretum.split_underscores(replaced_tree)) == expected_tree
+
+
+ def test_not_fix_example():
+     """
+     Test that a non-projective tree which we don't have a heuristic for quietly fails
+     """
+     with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as tempdir:
+         test_name = os.path.join(tempdir, "nofix.xml")
+         with open(test_name, "w", encoding="utf-8") as fout:
+             fout.write(NOT_FIX_NONPROJ_EXAMPLE)
+         sentences = convert_arboretum.read_xml_file(test_name)
+         assert len(sentences) == 1
+
+         tree, words = convert_arboretum.process_tree(sentences[0])
+         assert not convert_arboretum.word_sequence_missing_words(tree)
+         with tsurgeon.Tsurgeon() as tsurgeon_processor:
+             assert convert_arboretum.check_words(tree, tsurgeon_processor) is None
+
+
+ def test_fix_proj_example():
+     """
+     Test that a non-projective tree can be rearranged as expected
+
+     Note that there are several other classes of non-proj tree we could test as well...
+     """
+     with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as tempdir:
+         test_name = os.path.join(tempdir, "fix.xml")
+         with open(test_name, "w", encoding="utf-8") as fout:
+             fout.write(NONPROJ_EXAMPLE)
+         sentences = convert_arboretum.read_xml_file(test_name)
+         assert len(sentences) == 1
+
+         tree, words = convert_arboretum.process_tree(sentences[0])
+         assert not convert_arboretum.word_sequence_missing_words(tree)
+         # the 4 and 5 are moved inside the 3-6 node
+         expected_orig = "(s (fcl (advp (adv s9_1) (adv s9_2)) (vp (v-fin s9_3) (v-pcp2 s9_6)) (prop s9_4) (adv s9_5) (pp (prp s9_7) (np (num s9_8) (n s9_9))) (pu s9_10)))"
+         expected_proj = "(s (fcl (advp (adv s9_1) (adv s9_2)) (vp (v-fin s9_3) (prop s9_4) (adv s9_5) (v-pcp2 s9_6)) (pp (prp s9_7) (np (num s9_8) (n s9_9))) (pu s9_10)))"
+         assert str(tree) == expected_orig
+         with tsurgeon.Tsurgeon() as tsurgeon_processor:
+             assert str(convert_arboretum.check_words(tree, tsurgeon_processor)) == expected_proj
+
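The Arboretum examples above are TIGER-style XML: each `<s>` holds `<terminals>` with `<t>` word nodes and `<nonterminals>` with `<nt>`/`<edge>` nodes. A minimal sketch of pulling the terminals out of one sentence with the standard library follows; it is an illustration only, not `convert_arboretum.read_xml_file`:

```python
import xml.etree.ElementTree as ET

# Minimal sketch of reading the <t> terminals from one TIGER-style <s>
# element, like the fixtures above.  Illustration only.
SNIPPET = """
<s id="s1">
  <graph root="s1_500">
    <terminals>
      <t id="s1_1" word="A" lemma="A" pos="prop"/>
      <t id="s1_2" word="B" lemma="B" pos="v-fin"/>
      <t id="s1_3" word="." lemma="--" pos="pu"/>
    </terminals>
  </graph>
</s>
"""

sentence = ET.fromstring(SNIPPET.strip())
# map terminal id -> (word, pos), preserving the attribute values as-is
terminals = {t.get("id"): (t.get("word"), t.get("pos"))
             for t in sentence.iter("t")}
```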
stanza/stanza/tests/constituency/test_ensemble.py ADDED
@@ -0,0 +1,110 @@
+ """
+ Add a simple test of the Ensemble's inference path
+
+ This just reuses one model several times - that should still check the main loop, at least
+ """
+
+ import pytest
+
+ from stanza import Pipeline
+ from stanza.models.constituency import text_processing
+ from stanza.models.constituency import tree_reader
+ from stanza.models.constituency.ensemble import Ensemble, EnsembleTrainer
+ from stanza.models.constituency.text_processing import parse_tokenized_sentences
+
+ from stanza.tests import TEST_MODELS_DIR
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+
+ @pytest.fixture(scope="module")
+ def pipeline():
+     return Pipeline(dir=TEST_MODELS_DIR, lang="en", processors="tokenize, pos, constituency", tokenize_pretokenized=True)
+
+ @pytest.fixture(scope="module")
+ def saved_ensemble(tmp_path_factory, pipeline):
+     tmp_path = tmp_path_factory.mktemp("ensemble")
+
+     # test the ensemble by reusing the same parser multiple times
+     con_processor = pipeline.processors["constituency"]
+     model = con_processor._model
+     args = dict(model.args)
+     foundation_cache = pipeline.foundation_cache
+
+     model_path = con_processor._config['model_path']
+     # reuse the same model 3 times just to make sure the code paths are working
+     filenames = [model_path, model_path, model_path]
+
+     ensemble = EnsembleTrainer.from_files(args, filenames, foundation_cache=foundation_cache)
+     save_path = tmp_path / "ensemble.pt"
+
+     ensemble.save(save_path)
+     return ensemble, save_path, args, foundation_cache
+
+ def check_basic_predictions(trees):
+     predictions = [x.predictions for x in trees]
+     assert len(predictions) == 2
+     assert all(len(x) == 1 for x in predictions)
+     trees = [x[0].tree for x in predictions]
+     result = ["{}".format(tree) for tree in trees]
+     expected = ["(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))",
+                 "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT another) (NN test)))))"]
+     assert result == expected
+
+ def test_ensemble_inference(pipeline):
+     # test the ensemble by reusing the same parser multiple times
+     con_processor = pipeline.processors["constituency"]
+     model = con_processor._model
+     args = dict(model.args)
+     foundation_cache = pipeline.foundation_cache
+
+     model_path = con_processor._config['model_path']
+     # reuse the same model 3 times just to make sure the code paths are working
+     filenames = [model_path, model_path, model_path]
+
+     ensemble = EnsembleTrainer.from_files(args, filenames, foundation_cache=foundation_cache)
+     ensemble = ensemble.model
+     sentences = [["This", "is", "a", "test"], ["This", "is", "another", "test"]]
+     trees = parse_tokenized_sentences(args, ensemble, [pipeline], sentences)
+     check_basic_predictions(trees)
+
+ def test_ensemble_save(saved_ensemble):
+     """
+     Depending on the saved_ensemble fixture should be enough to ensure
+     that the ensemble was correctly saved
+
+     (loading is tested separately)
+     """
+
+ def test_ensemble_save_load(pipeline, saved_ensemble):
+     _, save_path, args, foundation_cache = saved_ensemble
+     ensemble = EnsembleTrainer.load(save_path, args, foundation_cache=foundation_cache)
+     sentences = [["This", "is", "a", "test"], ["This", "is", "another", "test"]]
+     trees = parse_tokenized_sentences(args, ensemble.model, [pipeline], sentences)
+     check_basic_predictions(trees)
+
+ def test_parse_text(tmp_path, pipeline, saved_ensemble):
+     _, model_path, args, foundation_cache = saved_ensemble
+
+     raw_file = str(tmp_path / "test_input.txt")
+     with open(raw_file, "w") as fout:
+         fout.write("This is a test\nThis is another test\n")
+     output_file = str(tmp_path / "test_output.txt")
+
+     args = dict(args)
+     args['tokenized_file'] = raw_file
+     args['predict_file'] = output_file
+
+     text_processing.load_model_parse_text(args, model_path, [pipeline])
+     trees = tree_reader.read_treebank(output_file)
+     trees = ["{}".format(x) for x in trees]
+     expected_trees = ["(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))",
+                       "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT another) (NN test)))))"]
+     assert trees == expected_trees
+
+ def test_pipeline(saved_ensemble):
+     _, model_path, _, foundation_cache = saved_ensemble
+     nlp = Pipeline("en", processors="tokenize,pos,constituency", constituency_model_path=str(model_path), foundation_cache=foundation_cache, download_method=None)
+     doc = nlp("This is a test")
+     tree = "{}".format(doc.sentences[0].constituency)
+     assert tree == "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))"
stanza/stanza/tests/constituency/test_in_order_compound_oracle.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import pytest
+
+ from stanza.models.constituency import in_order_compound_oracle
+ from stanza.models.constituency import tree_reader
+ from stanza.models.constituency.parse_transitions import CloseConstituent, OpenConstituent, Shift, TransitionScheme
+ from stanza.models.constituency.transition_sequence import build_treebank
+
+ from stanza.tests.constituency.test_transition_sequence import reconstruct_tree
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+ # A sample tree from PTB with a triple unary transition (at a location other than root)
+ # Here we test the incorrect closing of various brackets
+ TRIPLE_UNARY_START_TREE = """
+ ( (S
+     (PRN
+       (S
+         (NP-SBJ (-NONE- *) )
+         (VP (VB See) )))
+     (, ,)
+     (NP-SBJ
+       (NP (DT the) (JJ other) (NN rule) )
+       (PP (IN of)
+         (NP (NN thumb) ))
+       (PP (IN about)
+         (NP (NN ballooning) )))))
+ """
+
+ TREES = [TRIPLE_UNARY_START_TREE]
+ TREEBANK = "\n".join(TREES)
+
+ ROOT_LABELS = ["ROOT"]
+
+ @pytest.fixture(scope="module")
+ def trees():
+     trees = tree_reader.read_trees(TREEBANK)
+     trees = [t.prune_none().simplify_labels() for t in trees]
+     assert len(trees) == len(TREES)
+
+     return trees
+
+ @pytest.fixture(scope="module")
+ def gold_sequences(trees):
+     gold_sequences = build_treebank(trees, TransitionScheme.IN_ORDER_COMPOUND)
+     return gold_sequences
+
+ def get_repairs(gold_sequence, wrong_transition, repair_fn):
+     """
+     Use the repair function and the wrong transition to iterate over the gold sequence
+
+     Returns a list of possible repairs, one for each position in the sequence
+     Repairs are tuples, (idx, seq)
+     """
+     repairs = [(idx, repair_fn(gold_transition, wrong_transition, gold_sequence, idx, ROOT_LABELS, None, None))
+                for idx, gold_transition in enumerate(gold_sequence)]
+     repairs = [x for x in repairs if x[1] is not None]
+     return repairs
+
+ def test_fix_shift_close():
+     trees = tree_reader.read_trees(TRIPLE_UNARY_START_TREE)
+     trees = [t.prune_none().simplify_labels() for t in trees]
+     assert len(trees) == 1
+     tree = trees[0]
+
+     gold_sequences = build_treebank(trees, TransitionScheme.IN_ORDER_COMPOUND)
+
+     # there are three places in this tree where a long bracket (more than 2 subtrees)
+     # could theoretically be closed and then reopened
+     repairs = get_repairs(gold_sequences[0], CloseConstituent(), in_order_compound_oracle.fix_shift_close_error)
+     assert len(repairs) == 3
+
+     expected_trees = ["(ROOT (S (S (PRN (S (VP (VB See)))) (, ,)) (NP (NP (DT the) (JJ other) (NN rule)) (PP (IN of) (NP (NN thumb))) (PP (IN about) (NP (NN ballooning))))))",
+                       "(ROOT (S (PRN (S (VP (VB See)))) (, ,) (NP (NP (NP (DT the) (JJ other)) (NN rule)) (PP (IN of) (NP (NN thumb))) (PP (IN about) (NP (NN ballooning))))))",
+                       "(ROOT (S (PRN (S (VP (VB See)))) (, ,) (NP (NP (NP (DT the) (JJ other) (NN rule)) (PP (IN of) (NP (NN thumb)))) (PP (IN about) (NP (NN ballooning))))))"]
+
+     for repair, expected in zip(repairs, expected_trees):
+         repaired_tree = reconstruct_tree(tree, repair[1], transition_scheme=TransitionScheme.IN_ORDER_COMPOUND)
+         assert str(repaired_tree) == expected
+
+ def test_fix_open_close():
+     trees = tree_reader.read_trees(TRIPLE_UNARY_START_TREE)
+     trees = [t.prune_none().simplify_labels() for t in trees]
+     assert len(trees) == 1
+     tree = trees[0]
+
+     gold_sequences = build_treebank(trees, TransitionScheme.IN_ORDER_COMPOUND)
+
+     repairs = get_repairs(gold_sequences[0], CloseConstituent(), in_order_compound_oracle.fix_open_close_error)
+     print("------------------")
+     for repair in repairs:
+         print(repair)
+         repaired_tree = reconstruct_tree(tree, repair[1], transition_scheme=TransitionScheme.IN_ORDER_COMPOUND)
+         print("{:P}".format(repaired_tree))
stanza/stanza/tests/constituency/test_parse_transitions.py ADDED
@@ -0,0 +1,486 @@
+ import pytest
+
+ from stanza.models.constituency import parse_transitions
+ from stanza.models.constituency.base_model import SimpleModel, UNARY_LIMIT
+ from stanza.models.constituency.parse_transitions import TransitionScheme, Shift, CloseConstituent, OpenConstituent
+ from stanza.tests import *
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+
+ def build_initial_state(model, num_states=1):
+     words = ["Unban", "Mox", "Opal"]
+     tags = ["VB", "NNP", "NNP"]
+     sentences = [list(zip(words, tags)) for _ in range(num_states)]
+
+     states = model.initial_state_from_words(sentences)
+     assert len(states) == num_states
+     assert all(state.num_transitions == 0 for state in states)
+     return states
+
+ def test_initial_state(model=None):
+     if model is None:
+         model = SimpleModel()
+     states = build_initial_state(model)
+     assert len(states) == 1
+     state = states[0]
+
+     assert state.sentence_length == 3
+     assert state.num_opens == 0
+     # each stack has a sentinel value at the end
+     assert len(state.word_queue) == 5
+     assert len(state.constituents) == 1
+     assert len(state.transitions) == 1
+     assert state.word_position == 0
+
+ def test_shift(model=None):
+     if model is None:
+         model = SimpleModel()
+     state = build_initial_state(model)[0]
+
+     open_transition = parse_transitions.OpenConstituent("ROOT")
+     state = open_transition.apply(state, model)
+     open_transition = parse_transitions.OpenConstituent("S")
+     state = open_transition.apply(state, model)
+     shift = parse_transitions.Shift()
+     assert shift.is_legal(state, model)
+     assert len(state.word_queue) == 5
+     assert state.word_position == 0
+
+     state = shift.apply(state, model)
+     assert len(state.word_queue) == 5
+     # 4 because of the dummy created by the opens
+     assert len(state.constituents) == 4
+     assert len(state.transitions) == 4
+     assert shift.is_legal(state, model)
+     assert state.word_position == 1
+     assert not state.empty_word_queue()
+
+     state = shift.apply(state, model)
+     assert len(state.word_queue) == 5
+     assert len(state.constituents) == 5
+     assert len(state.transitions) == 5
+     assert shift.is_legal(state, model)
+     assert state.word_position == 2
+     assert not state.empty_word_queue()
+
+     state = shift.apply(state, model)
+     assert len(state.word_queue) == 5
+     assert len(state.constituents) == 6
+     assert len(state.transitions) == 6
+     assert not shift.is_legal(state, model)
+     assert state.word_position == 3
+     assert state.empty_word_queue()
+
+     constituents = state.constituents
+     assert model.get_top_constituent(constituents).children[0].label == 'Opal'
+     constituents = constituents.pop()
+     assert model.get_top_constituent(constituents).children[0].label == 'Mox'
+     constituents = constituents.pop()
+     assert model.get_top_constituent(constituents).children[0].label == 'Unban'
+
+ def test_initial_unary(model=None):
+     # it doesn't make sense to start with a CompoundUnary
+     if model is None:
+         model = SimpleModel()
+
+     state = build_initial_state(model)[0]
+     unary = parse_transitions.CompoundUnary('ROOT', 'VP')
+     assert unary.label == ('ROOT', 'VP',)
+     assert not unary.is_legal(state, model)
+     unary = parse_transitions.CompoundUnary('VP')
+     assert unary.label == ('VP',)
+     assert not unary.is_legal(state, model)
+
+
+ def test_unary(model=None):
+     if model is None:
+         model = SimpleModel()
+     state = build_initial_state(model)[0]
+
+     shift = parse_transitions.Shift()
+     state = shift.apply(state, model)
+
+     # this is technically the wrong parse but we're being lazy
+     unary = parse_transitions.CompoundUnary('S', 'VP')
+     assert unary.is_legal(state, model)
+     state = unary.apply(state, model)
+     assert not unary.is_legal(state, model)
+
+     tree = model.get_top_constituent(state.constituents)
+     assert tree.label == 'S'
+     assert len(tree.children) == 1
+     tree = tree.children[0]
+     assert tree.label == 'VP'
+     assert len(tree.children) == 1
+     tree = tree.children[0]
+     assert tree.label == 'VB'
+     assert tree.is_preterminal()
+
+ def test_unary_requires_root(model=None):
+     if model is None:
+         model = SimpleModel(transition_scheme=TransitionScheme.TOP_DOWN_UNARY)
+     state = build_initial_state(model)[0]
+
+     open_transition = parse_transitions.OpenConstituent("S")
+     assert open_transition.is_legal(state, model)
+     state = open_transition.apply(state, model)
+
+     shift = parse_transitions.Shift()
+     assert shift.is_legal(state, model)
+     state = shift.apply(state, model)
+     assert shift.is_legal(state, model)
+     state = shift.apply(state, model)
+     assert shift.is_legal(state, model)
+     state = shift.apply(state, model)
+     assert not shift.is_legal(state, model)
+
+     close_transition = parse_transitions.CloseConstituent()
+     assert close_transition.is_legal(state, model)
+     state = close_transition.apply(state, model)
+     assert not open_transition.is_legal(state, model)
+     assert not close_transition.is_legal(state, model)
+
+     np_unary = parse_transitions.CompoundUnary("NP")
+     assert not np_unary.is_legal(state, model)
+     root_unary = parse_transitions.CompoundUnary("ROOT")
+     assert root_unary.is_legal(state, model)
+     assert not state.finished(model)
+     state = root_unary.apply(state, model)
+     assert not root_unary.is_legal(state, model)
+
+     assert state.finished(model)
+
+ def test_open(model=None):
+     if model is None:
+         model = SimpleModel()
+     state = build_initial_state(model)[0]
+
+     shift = parse_transitions.Shift()
+     state = shift.apply(state, model)
+     state = shift.apply(state, model)
+     assert state.num_opens == 0
+
+     open_transition = parse_transitions.OpenConstituent("VP")
+     assert open_transition.is_legal(state, model)
+     state = open_transition.apply(state, model)
+     assert open_transition.is_legal(state, model)
+     assert state.num_opens == 1
+
+     # check that it is illegal if there are too many opens already
+     for i in range(20):
+         state = open_transition.apply(state, model)
+     assert not open_transition.is_legal(state, model)
+     assert state.num_opens == 21
+
+     # check that it is illegal if the state is out of words
+     state = build_initial_state(model)[0]
+     state = shift.apply(state, model)
+     state = shift.apply(state, model)
+     state = shift.apply(state, model)
+     assert not open_transition.is_legal(state, model)
+
+ def test_compound_open(model=None):
+     if model is None:
+         model = SimpleModel()
+     state = build_initial_state(model)[0]
+
+     open_transition = parse_transitions.OpenConstituent("ROOT", "S")
+     assert open_transition.is_legal(state, model)
+     shift = parse_transitions.Shift()
+     close_transition = parse_transitions.CloseConstituent()
+
+     state = open_transition.apply(state, model)
+     state = shift.apply(state, model)
+     state = shift.apply(state, model)
+     state = shift.apply(state, model)
+     state = close_transition.apply(state, model)
+
+     tree = model.get_top_constituent(state.constituents)
+     assert tree.label == 'ROOT'
+     assert len(tree.children) == 1
+     tree = tree.children[0]
+     assert tree.label == 'S'
+     assert len(tree.children) == 3
+     assert tree.children[0].children[0].label == 'Unban'
+     assert tree.children[1].children[0].label == 'Mox'
+     assert tree.children[2].children[0].label == 'Opal'
+
+ def test_in_order_open(model=None):
+     if model is None:
+         model = SimpleModel(TransitionScheme.IN_ORDER)
+     state = build_initial_state(model)[0]
+
+     shift = parse_transitions.Shift()
+     assert shift.is_legal(state, model)
+     state = shift.apply(state, model)
+     assert not shift.is_legal(state, model)
+
+     open_vp = parse_transitions.OpenConstituent("VP")
+     assert open_vp.is_legal(state, model)
+     state = open_vp.apply(state, model)
+     assert not open_vp.is_legal(state, model)
+
+     close_trans = parse_transitions.CloseConstituent()
+     assert close_trans.is_legal(state, model)
+     state = close_trans.apply(state, model)
+
+     open_s = parse_transitions.OpenConstituent("S")
+     assert open_s.is_legal(state, model)
+     state = open_s.apply(state, model)
+     assert not open_vp.is_legal(state, model)
+
+     # check that root transitions won't happen in the middle of a parse
+     open_root = parse_transitions.OpenConstituent("ROOT")
+     assert not open_root.is_legal(state, model)
+
+     # build (NP (NNP Mox) (NNP Opal))
+     open_np = parse_transitions.OpenConstituent("NP")
+     assert shift.is_legal(state, model)
+     state = shift.apply(state, model)
+     assert open_np.is_legal(state, model)
+     # make sure root can't happen in places where an arbitrary open is legal
+     assert not open_root.is_legal(state, model)
+     state = open_np.apply(state, model)
+     assert shift.is_legal(state, model)
+     state = shift.apply(state, model)
+     assert close_trans.is_legal(state, model)
+     state = close_trans.apply(state, model)
+
+     assert close_trans.is_legal(state, model)
+     state = close_trans.apply(state, model)
+
+     assert open_root.is_legal(state, model)
+     state = open_root.apply(state, model)
+
+ def test_too_many_unaries_close():
+     """
+     This tests rejecting Close at the start of a sequence after too many unary transitions
+
+     The model should reject doing multiple "unaries" - eg, Open then Close - in an IN_ORDER sequence
+     """
+     model = SimpleModel(TransitionScheme.IN_ORDER)
+     state = build_initial_state(model)[0]
+
+     shift = parse_transitions.Shift()
+     assert shift.is_legal(state, model)
+     state = shift.apply(state, model)
+
+     open_np = parse_transitions.OpenConstituent("NP")
+     close_trans = parse_transitions.CloseConstituent()
+     for _ in range(UNARY_LIMIT):
+         assert open_np.is_legal(state, model)
+         state = open_np.apply(state, model)
+
+         assert close_trans.is_legal(state, model)
+         state = close_trans.apply(state, model)
+
+     assert open_np.is_legal(state, model)
+     state = open_np.apply(state, model)
+     assert not close_trans.is_legal(state, model)
+
+ def test_too_many_unaries_open():
+     """
+     This tests rejecting Open in the middle of a sequence after too many unary transitions
+
+     The model should reject doing multiple "unaries" - eg, Open then Close - in an IN_ORDER sequence
+     """
+     model = SimpleModel(TransitionScheme.IN_ORDER)
+     state = build_initial_state(model)[0]
+
+     shift = parse_transitions.Shift()
+     assert shift.is_legal(state, model)
+     state = shift.apply(state, model)
+
+     open_np = parse_transitions.OpenConstituent("NP")
+     close_trans = parse_transitions.CloseConstituent()
+
+     assert open_np.is_legal(state, model)
+     state = open_np.apply(state, model)
+     assert not open_np.is_legal(state, model)
+     assert shift.is_legal(state, model)
+     state = shift.apply(state, model)
+
+     for _ in range(UNARY_LIMIT):
+         assert open_np.is_legal(state, model)
+         state = open_np.apply(state, model)
+
+         assert close_trans.is_legal(state, model)
+         state = close_trans.apply(state, model)
+
+     assert not open_np.is_legal(state, model)
+
+ def test_close(model=None):
+     if model is None:
+         model = SimpleModel()
+
+     # this one actually tests an entire subtree building
+     state = build_initial_state(model)[0]
+
+     open_transition_vp = parse_transitions.OpenConstituent("VP")
+     assert open_transition_vp.is_legal(state, model)
+     state = open_transition_vp.apply(state, model)
+     assert state.num_opens == 1
+
+     shift = parse_transitions.Shift()
+     assert shift.is_legal(state, model)
+     state = shift.apply(state, model)
+
+     open_transition_np = parse_transitions.OpenConstituent("NP")
+     assert open_transition_np.is_legal(state, model)
+     state = open_transition_np.apply(state, model)
+     assert state.num_opens == 2
+
+     assert shift.is_legal(state, model)
+     state = shift.apply(state, model)
+     assert shift.is_legal(state, model)
+     state = shift.apply(state, model)
+     assert not shift.is_legal(state, model)
+     assert state.num_opens == 2
+     # now should have "mox", "opal" on the constituents
+
+     close_transition = parse_transitions.CloseConstituent()
+     assert close_transition.is_legal(state, model)
+     state = close_transition.apply(state, model)
+     assert state.num_opens == 1
+     assert close_transition.is_legal(state, model)
+     state = close_transition.apply(state, model)
+     assert state.num_opens == 0
+     assert not close_transition.is_legal(state, model)
+
+     tree = model.get_top_constituent(state.constituents)
+     assert tree.label == 'VP'
+     assert len(tree.children) == 2
+     tree = tree.children[1]
+     assert tree.label == 'NP'
+     assert len(tree.children) == 2
+     assert tree.children[0].is_preterminal()
+     assert tree.children[1].is_preterminal()
+     assert tree.children[0].children[0].label == 'Mox'
+     assert tree.children[1].children[0].label == 'Opal'
+
+     # extra one for None at the start of the TreeStack
+     assert len(state.constituents) == 2
+
+     assert state.all_transitions(model) == [open_transition_vp, shift, open_transition_np, shift, shift, close_transition, close_transition]
+
+ def test_in_order_compound_finalize(model=None):
+     """
+     Test the Finalize transition is only legal at the end of a sequence
+     """
+     if model is None:
+         model = SimpleModel(transition_scheme=TransitionScheme.IN_ORDER_COMPOUND)
+
+     state = build_initial_state(model)[0]
+
+     finalize = parse_transitions.Finalize("ROOT")
+
+     shift = parse_transitions.Shift()
+     assert shift.is_legal(state, model)
+     assert not finalize.is_legal(state, model)
+     state = shift.apply(state, model)
+
+     open_transition = parse_transitions.OpenConstituent("NP")
+     assert open_transition.is_legal(state, model)
+     assert not finalize.is_legal(state, model)
+     state = open_transition.apply(state, model)
+     assert state.num_opens == 1
+
+     assert shift.is_legal(state, model)
+     assert not finalize.is_legal(state, model)
+     state = shift.apply(state, model)
+     assert shift.is_legal(state, model)
+     assert not finalize.is_legal(state, model)
+     state = shift.apply(state, model)
+
+     close_transition = parse_transitions.CloseConstituent()
+     assert close_transition.is_legal(state, model)
+     state = close_transition.apply(state, model)
+     assert state.num_opens == 0
+     assert not close_transition.is_legal(state, model)
+     assert finalize.is_legal(state, model)
+
+     state = finalize.apply(state, model)
+     assert not finalize.is_legal(state, model)
+     tree = model.get_top_constituent(state.constituents)
+     assert tree.label == 'ROOT'
+
+ def test_hashes():
+     transitions = set()
+
+     shift = parse_transitions.Shift()
+     assert shift not in transitions
+     transitions.add(shift)
+     assert shift in transitions
+     shift = parse_transitions.Shift()
+     assert shift in transitions
+
+     for i in range(5):
+         transitions.add(shift)
+     assert len(transitions) == 1
+
+     unary = parse_transitions.CompoundUnary("asdf")
+     assert unary not in transitions
+     transitions.add(unary)
+     assert unary in transitions
+
+     unary = parse_transitions.CompoundUnary("asdf", "zzzz")
+     assert unary not in transitions
+     transitions.add(unary)
+     transitions.add(unary)
+     transitions.add(unary)
+     unary = parse_transitions.CompoundUnary("asdf", "zzzz")
+     assert unary in transitions
+
+     oc = parse_transitions.OpenConstituent("asdf")
+     assert oc not in transitions
+     transitions.add(oc)
+     assert oc in transitions
+     transitions.add(oc)
+     transitions.add(oc)
+     assert len(transitions) == 4
+     assert parse_transitions.OpenConstituent("asdf") in transitions
+
+     cc = parse_transitions.CloseConstituent()
+     assert cc not in transitions
+     transitions.add(cc)
+     transitions.add(cc)
+     transitions.add(cc)
+     assert cc in transitions
+     cc = parse_transitions.CloseConstituent()
+     assert cc in transitions
+     assert len(transitions) == 5
+
+
+ def test_sort():
+     expected = []
+
+     expected.append(parse_transitions.Shift())
+     expected.append(parse_transitions.CloseConstituent())
+     expected.append(parse_transitions.CompoundUnary("NP"))
+     expected.append(parse_transitions.CompoundUnary("NP", "VP"))
+     expected.append(parse_transitions.OpenConstituent("mox"))
+     expected.append(parse_transitions.OpenConstituent("opal"))
+     expected.append(parse_transitions.OpenConstituent("unban"))
+
+     transitions = set(expected)
+     transitions = sorted(transitions)
+     assert transitions == expected
+
+ def test_check_transitions():
+     """
+     Test that check_transitions passes or fails a couple simple, small test cases
+     """
+     transitions = {Shift(), CloseConstituent(), OpenConstituent("NP"), OpenConstituent("VP")}
+
+     other = {Shift(), CloseConstituent(), OpenConstituent("NP"), OpenConstituent("VP")}
+     parse_transitions.check_transitions(transitions, other, "test")
+
+     # This will get a pass because it is a unary made out of existing unaries
+     other = {Shift(), CloseConstituent(), OpenConstituent("NP", "VP")}
+     parse_transitions.check_transitions(transitions, other, "test")
+
+     # This should fail
+     with pytest.raises(RuntimeError):
+         other = {Shift(), CloseConstituent(), OpenConstituent("NP", "ZP")}
+         parse_transitions.check_transitions(transitions, other, "test")
stanza/stanza/tests/constituency/test_parse_tree.py ADDED
@@ -0,0 +1,369 @@
1
+ import pytest
2
+
3
+ from stanza.models.constituency.parse_tree import Tree
4
+ from stanza.models.constituency import tree_reader
5
+
6
+ from stanza.tests import *
7
+
8
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
9
+
10
+ def test_leaf_preterminal():
11
+ foo = Tree(label="foo")
12
+ assert foo.is_leaf()
13
+ assert not foo.is_preterminal()
14
+ assert len(foo.children) == 0
15
+ assert str(foo) == 'foo'
16
+
17
+ bar = Tree(label="bar", children=foo)
18
+ assert not bar.is_leaf()
19
+ assert bar.is_preterminal()
20
+ assert len(bar.children) == 1
21
+ assert str(bar) == "(bar foo)"
22
+
23
+ baz = Tree(label="baz", children=[bar])
24
+ assert not baz.is_leaf()
25
+ assert not baz.is_preterminal()
26
+ assert len(baz.children) == 1
27
+ assert str(baz) == "(baz (bar foo))"
28
+
29
+
30
+ def test_yield_preterminals():
31
+ text = "((S (VP (VB Unban)) (NP (NNP Mox) (NNP Opal))))"
32
+ trees = tree_reader.read_trees(text)
33
+
34
+ preterminals = list(trees[0].yield_preterminals())
35
+ assert len(preterminals) == 3
36
+ assert str(preterminals) == "[(VB Unban), (NNP Mox), (NNP Opal)]"
37
+
38
+ def test_depth():
39
+ text = "(foo) ((S (VP (VB Unban)) (NP (NNP Mox) (NNP Opal))))"
40
+ trees = tree_reader.read_trees(text)
41
+ assert trees[0].depth() == 0
42
+ assert trees[1].depth() == 4
43
+
44
+ def test_unique_labels():
45
+ """
46
+ Test getting the unique labels from a tree
47
+
48
+ Assumes tree_reader works, which should be fine since it is tested elsewhere
49
+ """
50
+ text="((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?))) ((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
51
+
52
+ trees = tree_reader.read_trees(text)
53
+
54
+ labels = Tree.get_unique_constituent_labels(trees)
55
+ expected = ['NP', 'PP', 'ROOT', 'SBARQ', 'SQ', 'VP', 'WHNP']
56
+ assert labels == expected
57
+
58
+ def test_unique_tags():
59
+ """
60
+ Test getting the unique tags from a tree
61
+ """
62
+ text="((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
63
+
64
+ trees = tree_reader.read_trees(text)
65
+
66
+ tags = Tree.get_unique_tags(trees)
67
+ expected = ['.', 'DT', 'IN', 'NN', 'VBZ', 'WP']
68
+ assert tags == expected
69
+
70
+
71
+ def test_unique_words():
72
+ """
73
+ Test getting the unique words from a tree
74
+ """
75
+ text="((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
76
+
77
+ trees = tree_reader.read_trees(text)
78
+
79
+ words = Tree.get_unique_words(trees)
80
+ expected = ['?', 'Who', 'in', 'seat', 'sits', 'this']
81
+ assert words == expected
82
+
83
+ def test_rare_words():
84
+ """
85
+ Test getting the unique words from a tree
86
+ """
87
+ text="((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?))) ((SBARQ (NP (DT this) (NN seat)) (. ?)))"
88
+
89
+ trees = tree_reader.read_trees(text)
90
+
91
+ words = Tree.get_rare_words(trees, 0.5)
92
+ expected = ['Who', 'in', 'sits']
93
+ assert words == expected
94
+
95
+ def test_common_words():
96
+ """
97
+ Test getting the unique words from a tree
98
+ """
99
+ text="((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?))) ((SBARQ (NP (DT this) (NN seat)) (. ?)))"
100
+
101
+ trees = tree_reader.read_trees(text)
102
+
103
+ words = Tree.get_common_words(trees, 3)
104
+ expected = ['?', 'seat', 'this']
105
+ assert words == expected
106
+
107
+ def test_root_labels():
108
+ text="( (SBARQ-FOO (WHNP-BAR (WP Who)) (SQ#ASDF (VP=1 (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
109
+ trees = tree_reader.read_trees(text)
110
+ assert ["ROOT"] == Tree.get_root_labels(trees)
111
+
112
+ text=("( (SBARQ-FOO (WHNP-BAR (WP Who)) (SQ#ASDF (VP=1 (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))" +
113
+ "( (SBARQ-FOO (WHNP-BAR (WP Who)) (SQ#ASDF (VP=1 (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))" +
114
+ "( (SBARQ-FOO (WHNP-BAR (WP Who)) (SQ#ASDF (VP=1 (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))")
115
+ trees = tree_reader.read_trees(text)
116
+ assert ["ROOT"] == Tree.get_root_labels(trees)
117
+
118
+ text="(FOO) (BAR)"
119
+ trees = tree_reader.read_trees(text)
120
+ assert ["BAR", "FOO"] == Tree.get_root_labels(trees)
121
+
122
+ def test_prune_none():
123
+ text=["((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (-NONE- in) (NP (DT this) (NN seat))))) (. ?)))", # test one dead node
124
+ "((SBARQ (WHNP (-NONE- Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))", # test recursive dead nodes
125
+ "((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (-NONE- this) (-NONE- seat))))) (. ?)))"] # test all children dead
126
+ expected=["(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (NP (DT this) (NN seat))))) (. ?)))",
127
+ "(ROOT (SBARQ (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))",
128
+ "(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"]
129
+
130
+ for t, e in zip(text, expected):
131
+ trees = tree_reader.read_trees(t)
132
+ assert len(trees) == 1
133
+ tree = trees[0].prune_none()
134
+ assert e == str(tree)
135
+
136
+ def test_simplify_labels():
137
+ text="( (SBARQ-FOO (WHNP-BAR (WP Who)) (SQ#ASDF (VP=1 (VBZ sits) (PP (IN in) (NP (DT this) (- -))))) (. ?)))"
138
+ expected = "(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (- -))))) (. ?)))"
139
+ trees = tree_reader.read_trees(text)
140
+ trees = [t.simplify_labels() for t in trees]
141
+ assert len(trees) == 1
142
+ assert expected == str(trees[0])
143
+
144
+ def test_remap_constituent_labels():
145
+ text="(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"
146
+ expected="(ROOT (FOO (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"
147
+
148
+ label_map = { "SBARQ": "FOO" }
149
+ trees = tree_reader.read_trees(text)
150
+ trees = [t.remap_constituent_labels(label_map) for t in trees]
151
+ assert len(trees) == 1
152
+ assert expected == str(trees[0])
153
+
154
+ def test_remap_constituent_words():
155
+ text="(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"
156
+ expected="(ROOT (SBARQ (WHNP (WP unban)) (SQ (VP (VBZ mox) (PP (IN opal)))) (. ?)))"
157
+
158
+ word_map = { "Who": "unban", "sits": "mox", "in": "opal" }
159
+ trees = tree_reader.read_trees(text)
160
+ trees = [t.remap_words(word_map) for t in trees]
161
+ assert len(trees) == 1
162
+ assert expected == str(trees[0])
163
+
164
+ def test_replace_words():
165
+ text="(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"
166
+ expected="(ROOT (SBARQ (WHNP (WP unban)) (SQ (VP (VBZ mox) (PP (IN opal)))) (. ?)))"
167
+ new_words = ["unban", "mox", "opal", "?"]
168
+
169
+ trees = tree_reader.read_trees(text)
170
+ assert len(trees) == 1
171
+ tree = trees[0]
172
+ new_tree = tree.replace_words(new_words)
173
+ assert expected == str(new_tree)
174
+
175
+
176
+ def test_compound_constituents():
177
+ # TODO: add skinny trees like this to the various transition tests
178
+ text="((VP (VB Unban)))"
179
+ trees = tree_reader.read_trees(text)
180
+ assert Tree.get_compound_constituents(trees) == [('ROOT', 'VP')]
181
+
182
+ text="(ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"
183
+ trees = tree_reader.read_trees(text)
184
+ assert Tree.get_compound_constituents(trees) == [('PP',), ('ROOT', 'SBARQ'), ('SQ', 'VP'), ('WHNP',)]
185
+
186
+ text="((VP (VB Unban))) (ROOT (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in)))) (. ?)))"
187
+ trees = tree_reader.read_trees(text)
188
+ assert Tree.get_compound_constituents(trees) == [('PP',), ('ROOT', 'SBARQ'), ('ROOT', 'VP'), ('SQ', 'VP'), ('WHNP',)]
+
+ def test_equals():
+ """
+ Check one tree from the actual dataset for ==
+
+ when built with compound Open, this didn't work because of a silly bug
+ """
+ text = "(ROOT (S (NP (DT The) (NNP Arizona) (NNPS Corporations) (NNP Commission)) (VP (VBD authorized) (NP (NP (DT an) (ADJP (CD 11.5)) (NN %) (NN rate) (NN increase)) (PP (IN at) (NP (NNP Tucson) (NNP Electric) (NNP Power) (NNP Co.))) (, ,) (UCP (ADJP (ADJP (RB substantially) (JJR lower)) (SBAR (IN than) (S (VP (VBN recommended) (NP (JJ last) (NN month)) (PP (IN by) (NP (DT a) (NN commission) (NN hearing) (NN officer))))))) (CC and) (NP (NP (QP (RB barely) (PDT half)) (DT the) (NN rise)) (VP (VBN sought) (PP (IN by) (NP (DT the) (NN utility)))))))) (. .)))"
+
+ trees = tree_reader.read_trees(text)
+ assert len(trees) == 1
+ tree = trees[0]
+
+ assert tree == tree
+
+ trees2 = tree_reader.read_trees(text)
+ tree2 = trees2[0]
+
+ assert tree is not tree2
+ assert tree == tree2
+
+
+ # This tree was causing the model to barf on CTB7,
+ # although it turns out the problem was just the
+ # depth of the unary, not the list
+ CHINESE_LONG_LIST_TREE = """
+ (ROOT
+ (IP
+ (NP (NNP 证券法))
+ (VP
+ (PP
+ (IN 对)
+ (NP
+ (DNP
+ (NP
+ (NP (NNP 中国))
+ (NP
+ (NN 证券)
+ (NN 市场)))
+ (DEC 的))
+ (NP (NN 运作))))
+ (, ,)
+ (PP
+ (PP
+ (IN 从)
+ (NP
+ (NP (NN 股票))
+ (NP (VV 发行) (EC 、) (VV 交易))))
+ (, ,)
+ (PP
+ (VV 到)
+ (NP
+ (NP (NN 上市) (NN 公司) (NN 收购))
+ (EC 、)
+ (NP (NN 证券) (NN 交易所))
+ (EC 、)
+ (NP (NN 证券) (NN 公司))
+ (EC 、)
+ (NP (NN 登记) (NN 结算) (NN 机构))
+ (EC 、)
+ (NP (NN 交易) (NN 服务) (NN 机构))
+ (EC 、)
+ (NP (NN 证券业) (NN 协会))
+ (EC 、)
+ (NP (NN 证券) (NN 监督) (NN 管理) (NN 机构))
+ (CC 和)
+ (NP
+ (DNP
+ (NP (CP (CP (IP (VP (VV 违法))))))
+ (DEC 的))
+ (NP (NN 法律) (NN 责任))))))
+ (ADVP (RB 都))
+ (VP
+ (VV 作)
+ (AS 了)
+ (NP
+ (ADJP (JJ 详细))
+ (NP (NN 规定)))))
+ (. 。)))
+ """
+
+ WEIRD_UNARY = """
+ (DNP
+ (NP (CP (CP (IP (VP (ASDF
+ (NP (NN 上市) (NN 公司) (NN 收购))
+ (EC 、)
+ (NP (NN 证券) (NN 交易所))
+ (EC 、)
+ (NP (NN 证券) (NN 公司))
+ (EC 、)
+ (NP (NN 登记) (NN 结算) (NN 机构))
+ (EC 、)
+ (NP (NN 交易) (NN 服务) (NN 机构))
+ (EC 、)
+ (NP (NN 证券业) (NN 协会))
+ (EC 、)
+ (NP (NN 证券) (NN 监督) (NN 管理) (NN 机构))))))))
+ (DEC 的))
+ """
+
+
+ def test_count_unaries():
+ trees = tree_reader.read_trees(CHINESE_LONG_LIST_TREE)
+ assert len(trees) == 1
+ assert trees[0].count_unary_depth() == 5
+
+ trees = tree_reader.read_trees(WEIRD_UNARY)
+ assert len(trees) == 1
+ assert trees[0].count_unary_depth() == 5
+
+ def test_str_bracket_labels():
+ text = "((S (VP (VB Unban)) (NP (NNP Mox) (NNP Opal))))"
+ expected = "(_ROOT (_S (_VP (_VB Unban )_VB )_VP (_NP (_NNP Mox )_NNP (_NNP Opal )_NNP )_NP )_S )_ROOT"
+
+ trees = tree_reader.read_trees(text)
+ assert len(trees) == 1
+ assert "{:L}".format(trees[0]) == expected
+
+ def test_all_leaves_are_preterminals():
+ text = "((S (VP (VB Unban)) (NP (NNP Mox) (NNP Opal))))"
+ trees = tree_reader.read_trees(text)
+ assert len(trees) == 1
+ assert trees[0].all_leaves_are_preterminals()
+
+ text = "((S (VP (VB Unban)) (NP (Mox) (NNP Opal))))"
+ trees = tree_reader.read_trees(text)
+ assert len(trees) == 1
+ assert not trees[0].all_leaves_are_preterminals()
+
+ def test_latex():
+ """
+ Test the latex format for trees
+ """
+ expected = "\\Tree [.S [.NP Jennifer ] [.VP has [.NP nice antennae ] ] ]"
+ tree = "(ROOT (S (NP (NNP Jennifer)) (VP (VBZ has) (NP (JJ nice) (NNS antennae)))))"
+ tree = tree_reader.read_trees(tree)[0]
+ text = "{:T}".format(tree)
+ assert text == expected
+
+ def test_pretty_print():
+ """
+ Pretty print a couple trees - newlines & indentation
+ """
+ text = "(ROOT (S (VP (VB Unban)) (NP (NNP Mox) (NNP Opal)))) (ROOT (S (NP (DT The) (NNP Arizona) (NNPS Corporations) (NNP Commission)) (VP (VBD authorized) (NP (NP (DT an) (ADJP (CD 11.5)) (NN %) (NN rate) (NN increase)) (PP (IN at) (NP (NNP Tucson) (NNP Electric)))))))"
+ trees = tree_reader.read_trees(text)
+ assert len(trees) == 2
+
+ expected = """(ROOT
+ (S
+ (VP (VB Unban))
+ (NP (NNP Mox) (NNP Opal))))
+ """
+
+ assert "{:P}".format(trees[0]) == expected
+
+ expected = """(ROOT
+ (S
+ (NP (DT The) (NNP Arizona) (NNPS Corporations) (NNP Commission))
+ (VP
+ (VBD authorized)
+ (NP
+ (NP
+ (DT an)
+ (ADJP (CD 11.5))
+ (NN %)
+ (NN rate)
+ (NN increase))
+ (PP
+ (IN at)
+ (NP (NNP Tucson) (NNP Electric)))))))
+ """
+ assert "{:P}".format(trees[1]) == expected
+
+ assert text == "{:O} {:O}".format(*trees)
+
+ def test_reverse():
+ text = "(ROOT (S (NP (PRP I)) (VP (VBP want) (S (VP (TO to) (VP (VB lick) (NP (NP (NNP Jennifer) (POS 's)) (NNS antennae))))))))"
+ trees = tree_reader.read_trees(text)
+ assert len(trees) == 1
+ reversed_tree = trees[0].reverse()
+ assert str(reversed_tree) == "(ROOT (S (VP (S (VP (VP (NP (NNS antennae) (NP (POS 's) (NNP Jennifer))) (VB lick)) (TO to))) (VBP want)) (NP (PRP I))))"
stanza/stanza/tests/constituency/test_positional_encoding.py ADDED
@@ -0,0 +1,45 @@
+ import pytest
+
+ import torch
+
+ from stanza import Pipeline
+ from stanza.models.constituency.positional_encoding import SinusoidalEncoding, AddSinusoidalEncoding
+
+ from stanza.tests import *
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+
+ def test_positional_encoding():
+ encoding = SinusoidalEncoding(model_dim=10, max_len=6)
+ foo = encoding(torch.tensor([5]))
+ assert foo.shape == (1, 10)
+ # TODO: check the values
+
+ def test_resize():
+ encoding = SinusoidalEncoding(model_dim=10, max_len=3)
+ foo = encoding(torch.tensor([5]))
+ assert foo.shape == (1, 10)
+
+
+ def test_arange():
+ encoding = SinusoidalEncoding(model_dim=10, max_len=2)
+ foo = encoding(torch.arange(4))
+ assert foo.shape == (4, 10)
+ assert encoding.max_len() == 4
+
+ def test_add():
+ encoding = AddSinusoidalEncoding(d_model=10, max_len=4)
+ x = torch.zeros(1, 4, 10)
+ y = encoding(x)
+
+ r = torch.randn(1, 4, 10)
+ r2 = encoding(r)
+
+ assert torch.allclose(r2 - r, y, atol=1e-07)
+
+ r = torch.randn(2, 4, 10)
+ r2 = encoding(r)
+
+ assert torch.allclose(r2[0] - r[0], y, atol=1e-07)
+ assert torch.allclose(r2[1] - r[1], y, atol=1e-07)
stanza/stanza/tests/constituency/test_selftrain_vi_quad.py ADDED
@@ -0,0 +1,23 @@
+ """
+ Test some of the methods in the vi_quad dataset
+
+ Uses a small section of the dataset as a test
+ """
+
+ import pytest
+
+ from stanza.utils.datasets.constituency import selftrain_vi_quad
+
+ from stanza.tests import *
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+ SAMPLE_TEXT = """
+ {"version": "1.1", "data": [{"title": "Ph\u1ea1m V\u0103n \u0110\u1ed3ng", "paragraphs": [{"qas": [{"question": "T\u00ean g\u1ecdi n\u00e0o \u0111\u01b0\u1ee3c Ph\u1ea1m V\u0103n \u0110\u1ed3ng s\u1eed d\u1ee5ng khi l\u00e0m Ph\u00f3 ch\u1ee7 nhi\u1ec7m c\u01a1 quan Bi\u1ec7n s\u1ef1 x\u1ee9 t\u1ea1i Qu\u1ebf L\u00e2m?", "answers": [{"answer_start": 507, "text": "L\u00e2m B\u00e1 Ki\u1ec7t"}], "id": "uit_01__05272_0_1"}, {"question": "Ph\u1ea1m V\u0103n \u0110\u1ed3ng gi\u1eef ch\u1ee9c v\u1ee5 g\u00ec trong b\u1ed9 m\u00e1y Nh\u00e0 n\u01b0\u1edbc C\u1ed9ng h\u00f2a X\u00e3 h\u1ed9i ch\u1ee7 ngh\u0129a Vi\u1ec7t Nam?", "answers": [{"answer_start": 60, "text": "Th\u1ee7 t\u01b0\u1edbng"}], "id": "uit_01__05272_0_2"}, {"question": "Giai \u0111o\u1ea1n n\u0103m 1955-1976, Ph\u1ea1m V\u0103n \u0110\u1ed3ng n\u1eafm gi\u1eef ch\u1ee9c v\u1ee5 g\u00ec?", "answers": [{"answer_start": 245, "text": "Th\u1ee7 t\u01b0\u1edbng Ch\u00ednh ph\u1ee7 Vi\u1ec7t Nam D\u00e2n ch\u1ee7 C\u1ed9ng h\u00f2a"}], "id": "uit_01__05272_0_3"}], "context": "Ph\u1ea1m V\u0103n \u0110\u1ed3ng (1 th\u00e1ng 3 n\u0103m 1906 \u2013 29 th\u00e1ng 4 n\u0103m 2000) l\u00e0 Th\u1ee7 t\u01b0\u1edbng \u0111\u1ea7u ti\u00ean c\u1ee7a n\u01b0\u1edbc C\u1ed9ng h\u00f2a X\u00e3 h\u1ed9i ch\u1ee7 ngh\u0129a Vi\u1ec7t Nam t\u1eeb n\u0103m 1976 (t\u1eeb n\u0103m 1981 g\u1ecdi l\u00e0 Ch\u1ee7 t\u1ecbch H\u1ed9i \u0111\u1ed3ng B\u1ed9 tr\u01b0\u1edfng) cho \u0111\u1ebfn khi ngh\u1ec9 h\u01b0u n\u0103m 1987. Tr\u01b0\u1edbc \u0111\u00f3 \u00f4ng t\u1eebng gi\u1eef ch\u1ee9c v\u1ee5 Th\u1ee7 t\u01b0\u1edbng Ch\u00ednh ph\u1ee7 Vi\u1ec7t Nam D\u00e2n ch\u1ee7 C\u1ed9ng h\u00f2a t\u1eeb n\u0103m 1955 \u0111\u1ebfn n\u0103m 1976. \u00d4ng l\u00e0 v\u1ecb Th\u1ee7 t\u01b0\u1edbng Vi\u1ec7t Nam t\u1ea1i v\u1ecb l\u00e2u nh\u1ea5t (1955\u20131987). \u00d4ng l\u00e0 h\u1ecdc tr\u00f2, c\u1ed9ng s\u1ef1 c\u1ee7a Ch\u1ee7 t\u1ecbch H\u1ed3 Ch\u00ed Minh. \u00d4ng c\u00f3 t\u00ean g\u1ecdi th\u00e2n m\u1eadt l\u00e0 T\u00f4, \u0111\u00e2y t\u1eebng l\u00e0 b\u00ed danh c\u1ee7a \u00f4ng. \u00d4ng c\u00f2n c\u00f3 t\u00ean g\u1ecdi l\u00e0 L\u00e2m B\u00e1 Ki\u1ec7t khi l\u00e0m Ph\u00f3 ch\u1ee7 nhi\u1ec7m c\u01a1 quan Bi\u1ec7n s\u1ef1 x\u1ee9 t\u1ea1i Qu\u1ebf L\u00e2m (Ch\u1ee7 nhi\u1ec7m l\u00e0 H\u1ed3 H\u1ecdc L\u00e3m)."}, {"qas": [{"question": "S\u1ef1 ki\u1ec7n quan tr\u1ecdng n\u00e0o \u0111\u00e3 di\u1ec5n ra v\u00e0o ng\u00e0y 20/7/1954?", "answers": [{"answer_start": 364, "text": "b\u1ea3n Hi\u1ec7p \u0111\u1ecbnh \u0111\u00ecnh ch\u1ec9 chi\u1ebfn s\u1ef1 \u1edf Vi\u1ec7t Nam, Campuchia v\u00e0 L\u00e0o \u0111\u00e3 \u0111\u01b0\u1ee3c k\u00fd k\u1ebft th\u1eeba nh\u1eadn t\u00f4n tr\u1ecdng \u0111\u1ed9c l\u1eadp, ch\u1ee7 quy\u1ec1n, c\u1ee7a n\u01b0\u1edbc Vi\u1ec7t Nam, L\u00e0o v\u00e0 Campuchia"}], "id": "uit_01__05272_1_1"}, {"question": "Ch\u1ee9c v\u1ee5 m\u00e0 Ph\u1ea1m V\u0103n \u0110\u1ed3ng \u0111\u1ea3m nhi\u1ec7m t\u1ea1i H\u1ed9i ngh\u1ecb Gen\u00e8ve v\u1ec1 \u0110\u00f4ng D\u01b0\u01a1ng?", "answers": [{"answer_start": 33, "text": "Tr\u01b0\u1edfng ph\u00e1i \u0111o\u00e0n Ch\u00ednh ph\u1ee7"}], "id": "uit_01__05272_1_2"}, {"question": "H\u1ed9i ngh\u1ecb Gen\u00e8ve v\u1ec1 \u0110\u00f4ng D\u01b0\u01a1ng c\u00f3 t\u00ednh ch\u1ea5t nh\u01b0 th\u1ebf n\u00e0o?", "answers": [{"answer_start": 262, "text": "r\u1ea5t c\u0103ng th\u1eb3ng v\u00e0 ph\u1ee9c t\u1ea1p"}], "id": "uit_01__05272_1_3"}]}]}]}
+ """
+
+ EXPECTED = ['Tên gọi nào được Phạm Văn Đồng sử dụng khi làm Phó chủ nhiệm cơ quan Biện sự xứ tại Quế Lâm?', 'Phạm Văn Đồng giữ chức vụ gì trong bộ máy Nhà nước Cộng hòa Xã hội chủ nghĩa Việt Nam?', 'Giai đoạn năm 1955-1976, Phạm Văn Đồng nắm giữ chức vụ gì?', 'Sự kiện quan trọng nào đã diễn ra vào ngày 20/7/1954?', 'Chức vụ mà Phạm Văn Đồng đảm nhiệm tại Hội nghị Genève về Đông Dương?', 'Hội nghị Genève về Đông Dương có tính chất như thế nào?']
+
+ def test_read_file():
+ results = selftrain_vi_quad.parse_quad(SAMPLE_TEXT)
+ assert results == EXPECTED
stanza/stanza/tests/constituency/test_utils.py ADDED
@@ -0,0 +1,68 @@
+ import pytest
+
+ from stanza import Pipeline
+ from stanza.models.constituency import tree_reader
+ from stanza.models.constituency import utils
+
+ from stanza.tests import *
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+
+ @pytest.fixture(scope="module")
+ def pipeline():
+ return Pipeline(dir=TEST_MODELS_DIR, lang="en", processors="tokenize, pos", tokenize_pretokenized=True)
+
+
+
+ def test_xpos_retag(pipeline):
+ """
+ Test using the English tagger that trees will be correctly retagged by read_trees using xpos
+ """
+ text = "((S (VP (X Find)) (NP (X Mox) (X Opal)))) ((S (NP (X Ragavan)) (VP (X steals) (NP (X important) (X cards)))))"
+ expected = "((S (VP (VB Find)) (NP (NNP Mox) (NNP Opal)))) ((S (NP (NNP Ragavan)) (VP (VBZ steals) (NP (JJ important) (NNS cards)))))"
+
+ trees = tree_reader.read_trees(text)
+
+ new_trees = utils.retag_trees(trees, [pipeline], xpos=True)
+ assert new_trees == tree_reader.read_trees(expected)
+
+
+
+ def test_upos_retag(pipeline):
+ """
+ Test using the English tagger that trees will be correctly retagged by read_trees using upos
+ """
+ text = "((S (VP (X Find)) (NP (X Mox) (X Opal)))) ((S (NP (X Ragavan)) (VP (X steals) (NP (X important) (X cards)))))"
+ expected = "((S (VP (VERB Find)) (NP (PROPN Mox) (PROPN Opal)))) ((S (NP (PROPN Ragavan)) (VP (VERB steals) (NP (ADJ important) (NOUN cards)))))"
+
+ trees = tree_reader.read_trees(text)
+
+ new_trees = utils.retag_trees(trees, [pipeline], xpos=False)
+ assert new_trees == tree_reader.read_trees(expected)
+
+
+ def test_replace_tags():
+ """
+ Test the underlying replace_tags method
+
+ Also tests that the method throws exceptions when it is supposed to
+ """
+ text = "((S (VP (X Find)) (NP (X Mox) (X Opal))))"
+ expected = "((S (VP (A Find)) (NP (B Mox) (C Opal))))"
+
+ trees = tree_reader.read_trees(text)
+
+ new_tags = ["A", "B", "C"]
+ new_tree = trees[0].replace_tags(new_tags)
+
+ assert new_tree == tree_reader.read_trees(expected)[0]
+
+ with pytest.raises(ValueError):
+ new_tags = ["A", "B"]
+ new_tree = trees[0].replace_tags(new_tags)
+
+ with pytest.raises(ValueError):
+ new_tags = ["A", "B", "C", "D"]
+ new_tree = trees[0].replace_tags(new_tags)
+
stanza/stanza/tests/data/example_french.json ADDED
@@ -0,0 +1,22 @@
+ {"sentences":
+ [{"index": 0,
+ "tokens": [
+ {"index": 1, "word": "Cette", "originalText": "Cette", "characterOffsetBegin": 0, "characterOffsetEnd": 5, "pos": "DET", "before": "", "after": " "},
+ {"index": 2, "word": "enquête", "originalText": "enquête", "characterOffsetBegin": 6, "characterOffsetEnd": 13, "pos": "NOUN", "before": " ", "after": " "},
+ {"index": 3, "word": "préliminaire", "originalText": "préliminaire", "characterOffsetBegin": 14, "characterOffsetEnd": 26, "pos": "ADJ", "before": " ", "after": " "},
+ {"index": 4, "word": "fait", "originalText": "fait", "characterOffsetBegin": 27, "characterOffsetEnd": 31, "pos": "VERB", "before": " ", "after": " "},
+ {"index": 5, "word": "suite", "originalText": "suite", "characterOffsetBegin": 32, "characterOffsetEnd": 37, "pos": "NOUN", "before": " ", "after": " "},
+ {"index": 6, "word": "à", "originalText": "à", "characterOffsetBegin": 38, "characterOffsetEnd": 41, "pos": "ADP", "before": " ", "after": " "},
+ {"index": 7, "word": "les", "originalText": "les", "characterOffsetBegin": 38, "characterOffsetEnd": 41, "pos": "DET", "before": " ", "after": " "},
+ {"index": 8, "word": "révélations", "originalText": "révélations", "characterOffsetBegin": 42, "characterOffsetEnd": 53, "pos": "NOUN", "before": " ", "after": " "},
+ {"index": 9, "word": "de", "originalText": "de", "characterOffsetBegin": 54, "characterOffsetEnd": 56, "pos": "ADP", "before": " ", "after": " "},
+ {"index": 10, "word": "l’", "originalText": "l’", "characterOffsetBegin": 57, "characterOffsetEnd": 59, "pos": "NOUN", "before": " ", "after": ""},
+ {"index": 11, "word": "hebdomadaire", "originalText": "hebdomadaire", "characterOffsetBegin": 59, "characterOffsetEnd": 71, "pos": "ADJ", "before": "", "after": " "},
+ {"index": 12, "word": "quelques", "originalText": "quelques", "characterOffsetBegin": 72, "characterOffsetEnd": 80, "pos": "DET", "before": " ", "after": " "},
+ {"index": 13, "word": "jours", "originalText": "jours", "characterOffsetBegin": 81, "characterOffsetEnd": 86, "pos": "NOUN", "before": " ", "after": " "},
+ {"index": 14, "word": "plus", "originalText": "plus", "characterOffsetBegin": 87, "characterOffsetEnd": 91, "pos": "ADV", "before": " ", "after": " "},
+ {"index": 15, "word": "tôt", "originalText": "tôt", "characterOffsetBegin": 92, "characterOffsetEnd": 95, "pos": "ADV", "before": " ", "after": ""},
+ {"index": 16, "word": ".", "originalText": ".", "characterOffsetBegin": 95, "characterOffsetEnd": 96, "pos": "PUNCT", "before": "", "after": ""}
+ ]}
+ ]
+ }
stanza/stanza/tests/data/test.dat ADDED
Binary file (4.24 kB).
 
stanza/stanza/tests/data/tiny_emb.csv ADDED
@@ -0,0 +1,4 @@
+ 3 4
+ unban,1,2,3,4
+ mox,5,6,7,8
+ opal,9,10,11,12
stanza/stanza/tests/datasets/__init__.py ADDED
File without changes
stanza/stanza/tests/datasets/ner/__init__.py ADDED
File without changes
stanza/stanza/tests/datasets/ner/test_prepare_ner_file.py ADDED
@@ -0,0 +1,77 @@
+ """
+ Test some simple conversions of NER bio files
+ """
+
+ import pytest
+
+ import json
+
+ from stanza.models.common.doc import Document
+ from stanza.utils.datasets.ner.prepare_ner_file import process_dataset
+
+ BIO_1 = """
+ Jennifer B-PERSON
+ Sh'reyan I-PERSON
+ has O
+ lovely O
+ antennae O
+ """.strip()
+
+ BIO_2 = """
+ but O
+ I O
+ don't O
+ like O
+ the O
+ way O
+ Jennifer B-PERSON
+ treated O
+ Beckett B-PERSON
+ on O
+ the O
+ Cerritos B-LOCATION
+ """.strip()
+
+ def check_json_file(doc, raw_text, expected_sentences, expected_tokens):
+ raw_sentences = raw_text.strip().split("\n\n")
+ assert len(raw_sentences) == expected_sentences
+ if isinstance(expected_tokens, int):
+ expected_tokens = [expected_tokens]
+ for raw_sentence, expected_len in zip(raw_sentences, expected_tokens):
+ assert len(raw_sentence.strip().split("\n")) == expected_len
+
+ assert len(doc.sentences) == expected_sentences
+ for sentence, expected_len in zip(doc.sentences, expected_tokens):
+ assert len(sentence.tokens) == expected_len
+ for sentence, raw_sentence in zip(doc.sentences, raw_sentences):
+ for token, line in zip(sentence.tokens, raw_sentence.strip().split("\n")):
+ word, tag = line.strip().split()
+ assert token.text == word
+ assert token.ner == tag
+
+ def write_and_convert(tmp_path, raw_text):
+ bio_file = tmp_path / "test.bio"
+ with open(bio_file, "w", encoding="utf-8") as fout:
+ fout.write(raw_text)
+
+ json_file = tmp_path / "json.bio"
+ process_dataset(bio_file, json_file)
+
+ with open(json_file) as fin:
+ doc = Document(json.load(fin))
+
+ return doc
+
+ def run_test(tmp_path, raw_text, expected_sentences, expected_tokens):
+ doc = write_and_convert(tmp_path, raw_text)
+ check_json_file(doc, raw_text, expected_sentences, expected_tokens)
+
+ def test_simple(tmp_path):
+ run_test(tmp_path, BIO_1, 1, 5)
+
+ def test_ner_at_end(tmp_path):
+ run_test(tmp_path, BIO_2, 1, 12)
+
+ def test_two_sentences(tmp_path):
+ raw_text = BIO_1 + "\n\n" + BIO_2
+ run_test(tmp_path, raw_text, 2, [5, 12])
stanza/stanza/tests/datasets/ner/test_utils.py ADDED
@@ -0,0 +1,34 @@
+ """
+ Test the utils file of the NER dataset processing
+ """
+
+ import pytest
+
+ from stanza.utils.datasets.ner.utils import list_doc_entities
+ from stanza.tests.datasets.ner.test_prepare_ner_file import BIO_1, BIO_2, write_and_convert
+
+ def test_list_doc_entities(tmp_path):
+ """
+ Test the function which lists all of the entities in a doc
+ """
+ doc = write_and_convert(tmp_path, BIO_1)
+ entities = list_doc_entities(doc)
+ expected = [(('Jennifer', "Sh'reyan"), 'PERSON')]
+ assert expected == entities
+
+ doc = write_and_convert(tmp_path, BIO_2)
+ entities = list_doc_entities(doc)
+ expected = [(('Jennifer',), 'PERSON'), (('Beckett',), 'PERSON'), (('Cerritos',), 'LOCATION')]
+ assert expected == entities
+
+ doc = write_and_convert(tmp_path, "\n\n".join([BIO_1, BIO_2]))
+ entities = list_doc_entities(doc)
+ expected = [(('Jennifer', "Sh'reyan"), 'PERSON'), (('Jennifer',), 'PERSON'), (('Beckett',), 'PERSON'), (('Cerritos',), 'LOCATION')]
+ assert expected == entities
+
+ doc = write_and_convert(tmp_path, "\n\n".join([BIO_1, BIO_1, BIO_2]))
+ entities = list_doc_entities(doc)
+ expected = [(('Jennifer', "Sh'reyan"), 'PERSON'), (('Jennifer', "Sh'reyan"), 'PERSON'), (('Jennifer',), 'PERSON'), (('Beckett',), 'PERSON'), (('Cerritos',), 'LOCATION')]
+ assert expected == entities
+
+
stanza/stanza/tests/lemma/test_data.py ADDED
@@ -0,0 +1,106 @@
+ """
+ Test a couple basic data functions, such as processing a doc for its lemmas
+ """
+
+ import pytest
+
+ from stanza.models.common.doc import Document
+ from stanza.models.lemma.data import DataLoader
+ from stanza.utils.conll import CoNLL
+
+ TRAIN_DATA = """
+ # sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0003
+ # text = DPA: Iraqi authorities announced that they had busted up 3 terrorist cells operating in Baghdad.
+ 1	DPA	DPA	PROPN	NNP	Number=Sing	0	root	0:root	SpaceAfter=No
+ 2	:	:	PUNCT	:	_	1	punct	1:punct	_
+ 3	Iraqi	Iraqi	ADJ	JJ	Degree=Pos	4	amod	4:amod	_
+ 4	authorities	authority	NOUN	NNS	Number=Plur	5	nsubj	5:nsubj	_
+ 5	announced	announce	VERB	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	1	parataxis	1:parataxis	_
+ 6	that	that	SCONJ	IN	_	9	mark	9:mark	_
+ 7	they	they	PRON	PRP	Case=Nom|Number=Plur|Person=3|PronType=Prs	9	nsubj	9:nsubj	_
+ 8	had	have	AUX	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	9	aux	9:aux	_
+ 9	busted	bust	VERB	VBN	Tense=Past|VerbForm=Part	5	ccomp	5:ccomp	_
+ 10	up	up	ADP	RP	_	9	compound:prt	9:compound:prt	_
+ 11	3	3	NUM	CD	NumForm=Digit|NumType=Card	13	nummod	13:nummod	_
+ 12	terrorist	terrorist	ADJ	JJ	Degree=Pos	13	amod	13:amod	_
+ 13	cells	cell	NOUN	NNS	Number=Plur	9	obj	9:obj	_
+ 14	operating	operate	VERB	VBG	VerbForm=Ger	13	acl	13:acl	_
+ 15	in	in	ADP	IN	_	16	case	16:case	_
+ 16	Baghdad	Baghdad	PROPN	NNP	Number=Sing	14	obl	14:obl:in	SpaceAfter=No
+ 17	.	.	PUNCT	.	_	1	punct	1:punct	_
+
+ # sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0004
+ # text = Two of them were being run by 2 officials of the Ministry of the Interior!
+ 1	Two	two	NUM	CD	NumForm=Word|NumType=Card	6	nsubj:pass	6:nsubj:pass	_
+ 2	of	of	ADP	IN	_	3	case	3:case	_
+ 3	them	they	PRON	PRP	Case=Acc|Number=Plur|Person=3|PronType=Prs	1	nmod	1:nmod:of	_
+ 4	were	be	AUX	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	6	aux	6:aux	_
+ 5	being	be	AUX	VBG	VerbForm=Ger	6	aux:pass	6:aux:pass	_
+ 6	run	run	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root	0:root	_
+ 7	by	by	ADP	IN	_	9	case	9:case	_
+ 8	2	2	NUM	CD	NumForm=Digit|NumType=Card	9	nummod	9:nummod	_
+ 9	officials	official	NOUN	NNS	Number=Plur	6	obl	6:obl:by	_
+ 10	of	of	ADP	IN	_	12	case	12:case	_
+ 11	the	the	DET	DT	Definite=Def|PronType=Art	12	det	12:det	_
+ 12	Ministry	Ministry	PROPN	NNP	Number=Sing	9	nmod	9:nmod:of	_
+ 13	of	of	ADP	IN	_	15	case	15:case	_
+ 14	the	the	DET	DT	Definite=Def|PronType=Art	15	det	15:det	_
+ 15	Interior	Interior	PROPN	NNP	Number=Sing	12	nmod	12:nmod:of	SpaceAfter=No
+ 16	!	!	PUNCT	.	_	6	punct	6:punct	_
+
+ """.lstrip()
+
+ GOESWITH_DATA = """
+ # sent_id = email-enronsent27_01-0041
+ # newpar id = email-enronsent27_01-p0005
+ # text = Ken Rice@ENRON COMMUNICATIONS
+ 1	Ken	kenrice@enroncommunications	X	GW	Typo=Yes	0	root	0:root	_
+ 2	Rice@ENRON	_	X	GW	_	1	goeswith	1:goeswith	_
+ 3	COMMUNICATIONS	_	X	ADD	_	1	goeswith	1:goeswith	_
+
+ """.lstrip()
+
+ CORRECT_FORM_DATA = """
+ # sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0019
+ # text = They are targetting ambulances
+ 1	They	they	PRON	PRP	Case=Nom|Number=Plur|Person=3|PronType=Prs	3	nsubj	3:nsubj	_
+ 2	are	be	AUX	VBP	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	3	aux	3:aux	_
+ 3	targetting	target	VERB	VBG	Tense=Pres|Typo=Yes|VerbForm=Part	0	root	0:root	CorrectForm=targeting
+ 4	ambulances	ambulance	NOUN	NNS	Number=Plur	3	obj	3:obj	SpaceAfter=No
+ """
+
+
+ def test_load_document():
+ train_doc = CoNLL.conll2doc(input_str=TRAIN_DATA)
+ data = DataLoader.load_doc(train_doc, caseless=False, evaluation=True)
+ assert len(data) == 33 # meticulously counted by hand
+ assert all(len(x) == 3 for x in data)
+
+ data = DataLoader.load_doc(train_doc, caseless=False, evaluation=False)
+ assert len(data) == 33
+ assert all(len(x) == 3 for x in data)
+
+ def test_load_goeswith():
+ raw_data = TRAIN_DATA + GOESWITH_DATA
+ train_doc = CoNLL.conll2doc(input_str=raw_data)
+ data = DataLoader.load_doc(train_doc, caseless=False, evaluation=True)
+ assert len(data) == 36 # will be the same as in test_load_document with three additional words
+ assert all(len(x) == 3 for x in data)
+
+ data = DataLoader.load_doc(train_doc, caseless=False, evaluation=False)
+ assert len(data) == 33 # will be the same as in test_load_document, but with the trailing 3 GOESWITH removed
+ assert all(len(x) == 3 for x in data)
+
+ def test_correct_form():
+ raw_data = TRAIN_DATA + CORRECT_FORM_DATA
+ train_doc = CoNLL.conll2doc(input_str=raw_data)
+ data = DataLoader.load_doc(train_doc, caseless=False, evaluation=True)
+ assert len(data) == 37
+ # the 'targeting' correction should not be applied if evaluation=True
+ # when evaluation=False, then the CorrectForms will be applied
+ assert not any(x[0] == 'targeting' for x in data)
+
+ data = DataLoader.load_doc(train_doc, caseless=False, evaluation=False)
+ assert len(data) == 38 # the same, but with an extra row so the model learns both 'targetting' and 'targeting'
+ assert any(x[0] == 'targeting' for x in data)
+ assert any(x[0] == 'targetting' for x in data)
stanza/stanza/tests/lemma/test_lemma_trainer.py ADDED
@@ -0,0 +1,154 @@
+ """
+ Test a couple basic functions - load & save an existing model
+ """
+
+ import pytest
+
+ import glob
+ import os
+ import tempfile
+
+ import torch
+
+ from stanza.models import lemmatizer
+ from stanza.models.lemma import trainer
+ from stanza.tests import *
+ from stanza.utils.training.common import choose_lemma_charlm, build_charlm_args
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+ @pytest.fixture(scope="module")
+ def english_model():
+ models_path = os.path.join(TEST_MODELS_DIR, "en", "lemma", "*")
+ models = glob.glob(models_path)
+ # we expect at least one English model downloaded for the tests
+ assert len(models) >= 1
+ model_file = models[0]
+ return trainer.Trainer(model_file=model_file)
+
+ def test_load_model(english_model):
+ """
+ Does nothing, just tests that loading works
+ """
+
+ def test_save_load_model(english_model):
+ """
+ Load, save, and load again
+ """
+ with tempfile.TemporaryDirectory() as tempdir:
+ save_file = os.path.join(tempdir, "resaved", "lemma.pt")
+ english_model.save(save_file)
+ reloaded = trainer.Trainer(model_file=save_file)
+
+ TRAIN_DATA = """
+ # sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0003
+ # text = DPA: Iraqi authorities announced that they had busted up 3 terrorist cells operating in Baghdad.
+ 1	DPA	DPA	PROPN	NNP	Number=Sing	0	root	0:root	SpaceAfter=No
+ 2	:	:	PUNCT	:	_	1	punct	1:punct	_
+ 3	Iraqi	Iraqi	ADJ	JJ	Degree=Pos	4	amod	4:amod	_
+ 4	authorities	authority	NOUN	NNS	Number=Plur	5	nsubj	5:nsubj	_
+ 5	announced	announce	VERB	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	1	parataxis	1:parataxis	_
+ 6	that	that	SCONJ	IN	_	9	mark	9:mark	_
+ 7	they	they	PRON	PRP	Case=Nom|Number=Plur|Person=3|PronType=Prs	9	nsubj	9:nsubj	_
+ 8	had	have	AUX	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	9	aux	9:aux	_
+ 9	busted	bust	VERB	VBN	Tense=Past|VerbForm=Part	5	ccomp	5:ccomp	_
+ 10	up	up	ADP	RP	_	9	compound:prt	9:compound:prt	_
+ 11	3	3	NUM	CD	NumForm=Digit|NumType=Card	13	nummod	13:nummod	_
+ 12	terrorist	terrorist	ADJ	JJ	Degree=Pos	13	amod	13:amod	_
+ 13	cells	cell	NOUN	NNS	Number=Plur	9	obj	9:obj	_
+ 14	operating	operate	VERB	VBG	VerbForm=Ger	13	acl	13:acl	_
+ 15	in	in	ADP	IN	_	16	case	16:case	_
+ 16	Baghdad	Baghdad	PROPN	NNP	Number=Sing	14	obl	14:obl:in	SpaceAfter=No
+ 17	.	.	PUNCT	.	_	1	punct	1:punct	_
+
+ # sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0004
+ # text = Two of them were being run by 2 officials of the Ministry of the Interior!
+ 1	Two	two	NUM	CD	NumForm=Word|NumType=Card	6	nsubj:pass	6:nsubj:pass	_
+ 2	of	of	ADP	IN	_	3	case	3:case	_
+ 3	them	they	PRON	PRP	Case=Acc|Number=Plur|Person=3|PronType=Prs	1	nmod	1:nmod:of	_
+ 4	were	be	AUX	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	6	aux	6:aux	_
+ 5	being	be	AUX	VBG	VerbForm=Ger	6	aux:pass	6:aux:pass	_
+ 6	run	run	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root	0:root	_
+ 7	by	by	ADP	IN	_	9	case	9:case	_
+ 8	2	2	NUM	CD	NumForm=Digit|NumType=Card	9	nummod	9:nummod	_
+ 9	officials	official	NOUN	NNS	Number=Plur	6	obl	6:obl:by	_
+ 10	of	of	ADP	IN	_	12	case	12:case	_
+ 11	the	the	DET	DT	Definite=Def|PronType=Art	12	det	12:det	_
+ 12	Ministry	Ministry	PROPN	NNP	Number=Sing	9	nmod	9:nmod:of	_
+ 13	of	of	ADP	IN	_	15	case	15:case	_
+ 14	the	the	DET	DT	Definite=Def|PronType=Art	15	det	15:det	_
+ 15	Interior	Interior	PROPN	NNP	Number=Sing	12	nmod	12:nmod:of	SpaceAfter=No
+ 16	!	!	PUNCT	.	_	6	punct	6:punct	_
+
+ """.lstrip()
+
+ DEV_DATA = """
+ 1	From	from	ADP	IN	_	3	case	3:case	_
+ 2	the	the	DET	DT	Definite=Def|PronType=Art	3	det	3:det	_
+ 3	AP	AP	PROPN	NNP	Number=Sing	4	obl	4:obl:from	_
+ 4	comes	come	VERB	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	0:root	_
+ 5	this	this	DET	DT	Number=Sing|PronType=Dem	6	det	6:det	_
+ 6	story	story	NOUN	NN	Number=Sing	4	nsubj	4:nsubj	_
+ 7	:	:	PUNCT	:	_	4	punct	4:punct	_
+
+ """.lstrip()
+
+ class TestLemmatizer:
+ @pytest.fixture(scope="class")
+ def charlm_args(self):
+ charlm = choose_lemma_charlm("en", "test", "default")
+ charlm_args = build_charlm_args("en", charlm, model_dir=TEST_MODELS_DIR)
+ return charlm_args
+
+
+ def run_training(self, tmp_path, train_text, dev_text, extra_args=None):
+ """
+ Run the training for a few iterations, load & return the model
+ """
+ pred_file = str(tmp_path / "pred.conllu")
+
+ save_name = "test_tagger.pt"
+ save_file = str(tmp_path / save_name)
+
+ train_file = str(tmp_path / "train.conllu")
+ with open(train_file, "w", encoding="utf-8") as fout:
+ fout.write(train_text)
+
+ dev_file = str(tmp_path / "dev.conllu")
+ with open(dev_file, "w", encoding="utf-8") as fout:
+ fout.write(dev_text)
+
+ args = ["--train_file", train_file,
+ "--eval_file", dev_file,
+ "--gold_file", dev_file,
+ "--output_file", pred_file,
+ "--num_epoch", "2",
+ "--log_step", "10",
+ "--save_dir", str(tmp_path),
+ "--save_name", save_name,
+ "--shorthand", "en_test"]
+ if extra_args is not None:
+ args = args + extra_args
+ lemmatizer.main(args)
+
+ assert os.path.exists(save_file)
+ saved_model = trainer.Trainer(model_file=save_file)
+ return saved_model
+
+ def test_basic_train(self, tmp_path):
+ """
+ Simple test of a few 'epochs' of lemmatizer training
+ """
+ self.run_training(tmp_path, TRAIN_DATA, DEV_DATA)
+
+ def test_charlm_train(self, tmp_path, charlm_args):
+ """
+ Simple test of a few 'epochs' of lemmatizer training
+ """
+ saved_model = self.run_training(tmp_path, TRAIN_DATA, DEV_DATA, extra_args=charlm_args)
+
+ # check that the charlm wasn't saved in here
+ args = saved_model.args
+ save_name = os.path.join(args['save_dir'], args['save_name'])
+ checkpoint = torch.load(save_name, lambda storage, loc: storage, weights_only=True)
+ assert not any(x.startswith("contextual_embedding") for x in checkpoint['model'].keys())
stanza/stanza/tests/lemma_classifier/test_data_preparation.py ADDED
@@ -0,0 +1,256 @@
+ import os
+
+ import pytest
+
+ import stanza.models.lemma_classifier.utils as utils
+ import stanza.utils.datasets.prepare_lemma_classifier as prepare_lemma_classifier
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+ EWT_ONE_SENTENCE = """
+ # sent_id = weblog-blogspot.com_rigorousintuition_20050518101500_ENG_20050518_101500-0002
+ # newpar id = weblog-blogspot.com_rigorousintuition_20050518101500_ENG_20050518_101500-p0002
+ # text = Here's a Miami Herald interview
+ 1-2 Here's _ _ _ _ _ _ _ _
+ 1 Here here ADV RB PronType=Dem 0 root 0:root _
+ 2 's be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 1 cop 1:cop _
+ 3 a a DET DT Definite=Ind|PronType=Art 6 det 6:det _
+ 4 Miami Miami PROPN NNP Number=Sing 5 compound 5:compound _
+ 5 Herald Herald PROPN NNP Number=Sing 6 compound 6:compound _
+ 6 interview interview NOUN NN Number=Sing 1 nsubj 1:nsubj _
+ """.lstrip()
+
+
+ EWT_TRAIN_SENTENCES = """
+ # sent_id = weblog-blogspot.com_rigorousintuition_20050518101500_ENG_20050518_101500-0002
+ # newpar id = weblog-blogspot.com_rigorousintuition_20050518101500_ENG_20050518_101500-p0002
+ # text = Here's a Miami Herald interview
+ 1-2 Here's _ _ _ _ _ _ _ _
+ 1 Here here ADV RB PronType=Dem 0 root 0:root _
+ 2 's be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 1 cop 1:cop _
+ 3 a a DET DT Definite=Ind|PronType=Art 6 det 6:det _
+ 4 Miami Miami PROPN NNP Number=Sing 5 compound 5:compound _
+ 5 Herald Herald PROPN NNP Number=Sing 6 compound 6:compound _
+ 6 interview interview NOUN NN Number=Sing 1 nsubj 1:nsubj _
+
+ # sent_id = weblog-blogspot.com_rigorousintuition_20050518101500_ENG_20050518_101500-0027
+ # text = But Posada's nearly 80 years old
+ 1 But but CCONJ CC _ 7 cc 7:cc _
+ 2-3 Posada's _ _ _ _ _ _ _ _
+ 2 Posada Posada PROPN NNP Number=Sing 7 nsubj 7:nsubj _
+ 3 's be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 7 cop 7:cop _
+ 4 nearly nearly ADV RB _ 5 advmod 5:advmod _
+ 5 80 80 NUM CD NumForm=Digit|NumType=Card 6 nummod 6:nummod _
+ 6 years year NOUN NNS Number=Plur 7 obl:npmod 7:obl:npmod _
+ 7 old old ADJ JJ Degree=Pos 0 root 0:root SpaceAfter=No
+
+ # sent_id = weblog-blogspot.com_rigorousintuition_20050518101500_ENG_20050518_101500-0067
+ # newpar id = weblog-blogspot.com_rigorousintuition_20050518101500_ENG_20050518_101500-p0011
+ # text = Now that's a post I can relate to.
+ 1 Now now ADV RB _ 5 advmod 5:advmod _
+ 2-3 that's _ _ _ _ _ _ _ _
+ 2 that that PRON DT Number=Sing|PronType=Dem 5 nsubj 5:nsubj _
+ 3 's be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 5 cop 5:cop _
+ 4 a a DET DT Definite=Ind|PronType=Art 5 det 5:det _
+ 5 post post NOUN NN Number=Sing 0 root 0:root _
+ 6 I I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 8 nsubj 8:nsubj _
+ 7 can can AUX MD VerbForm=Fin 8 aux 8:aux _
+ 8 relate relate VERB VB VerbForm=Inf 5 acl:relcl 5:acl:relcl _
+ 9 to to ADP IN _ 8 obl 8:obl SpaceAfter=No
+ 10 . . PUNCT . _ 5 punct 5:punct _
+
+ # sent_id = weblog-blogspot.com_rigorousintuition_20050518101500_ENG_20050518_101500-0073
+ # newpar id = weblog-blogspot.com_rigorousintuition_20050518101500_ENG_20050518_101500-p0012
+ # text = hey that's a great blog
+ 1 hey hey INTJ UH _ 6 discourse 6:discourse _
+ 2-3 that's _ _ _ _ _ _ _ _
+ 2 that that PRON DT Number=Sing|PronType=Dem 6 nsubj 6:nsubj _
+ 3 's be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 cop 6:cop _
+ 4 a a DET DT Definite=Ind|PronType=Art 6 det 6:det _
+ 5 great great ADJ JJ Degree=Pos 6 amod 6:amod _
+ 6 blog blog NOUN NN Number=Sing 0 root 0:root SpaceAfter=No
+
+ # sent_id = weblog-blogspot.com_rigorousintuition_20050518101500_ENG_20050518_101500-0089
+ # text = And It's Not Hard To Do
+ 1 And and CCONJ CC _ 5 cc 5:cc _
+ 2-3 It's _ _ _ _ _ _ _ _
+ 2 It it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 5 expl 5:expl _
+ 3 's be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 5 cop 5:cop _
+ 4 Not not PART RB _ 5 advmod 5:advmod _
+ 5 Hard hard ADJ JJ Degree=Pos 0 root 0:root _
+ 6 To to PART TO _ 7 mark 7:mark _
+ 7 Do do VERB VB VerbForm=Inf 5 csubj 5:csubj SpaceAfter=No
+
+ # sent_id = weblog-blogspot.com_rigorousintuition_20060511134300_ENG_20060511_134300-0029
+ # text = Meanwhile, a decision's been reached
+ 1 Meanwhile meanwhile ADV RB _ 7 advmod 7:advmod SpaceAfter=No
+ 2 , , PUNCT , _ 1 punct 1:punct _
+ 3 a a DET DT Definite=Ind|PronType=Art 4 det 4:det _
+ 4-5 decision's _ _ _ _ _ _ _ _
+ 4 decision decision NOUN NN Number=Sing 7 nsubj:pass 7:nsubj:pass _
+ 5 's have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 7 aux 7:aux _
+ 6 been be AUX VBN Tense=Past|VerbForm=Part 7 aux:pass 7:aux:pass _
+ 7 reached reach VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _
+
+ # sent_id = weblog-blogspot.com_rigorousintuition_20060511134300_ENG_20060511_134300-0138
+ # text = It's become a guardian of morality
+ 1-2 It's _ _ _ _ _ _ _ _
+ 1 It it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 3 nsubj 3:nsubj|5:nsubj:xsubj _
+ 2 's have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 aux 3:aux _
+ 3 become become VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _
+ 4 a a DET DT Definite=Ind|PronType=Art 5 det 5:det _
+ 5 guardian guardian NOUN NN Number=Sing 3 xcomp 3:xcomp _
+ 6 of of ADP IN _ 7 case 7:case _
+ 7 morality morality NOUN NN Number=Sing 5 nmod 5:nmod:of _
+
+ # sent_id = email-enronsent15_01-0018
+ # text = It's got its own bathroom and tv
+ 1-2 It's _ _ _ _ _ _ _ _
+ 1 It it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 3 nsubj 3:nsubj|13:nsubj _
+ 2 's have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 aux 3:aux _
+ 3 got get VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _
+ 4 its its PRON PRP$ Case=Gen|Gender=Neut|Number=Sing|Person=3|Poss=Yes|PronType=Prs 6 nmod:poss 6:nmod:poss _
+ 5 own own ADJ JJ Degree=Pos 6 amod 6:amod _
+ 6 bathroom bathroom NOUN NN Number=Sing 3 obj 3:obj _
+ 7 and and CCONJ CC _ 8 cc 8:cc _
+ 8 tv TV NOUN NN Number=Sing 6 conj 3:obj|6:conj:and SpaceAfter=No
+
+ # sent_id = newsgroup-groups.google.com_alt.animals.cat_01ff709c4bf2c60c_ENG_20040418_040100-0022
+ # text = It's also got the website
+ 1-2 It's _ _ _ _ _ _ _ _
+ 1 It it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 4 nsubj 4:nsubj _
+ 2 's have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 4 aux 4:aux _
+ 3 also also ADV RB _ 4 advmod 4:advmod _
+ 4 got get VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _
+ 5 the the DET DT Definite=Def|PronType=Art 6 det 6:det _
+ 6 website website NOUN NN Number=Sing 4 obj 4:obj|12:obl _
+ """.lstrip()
+
+
+ # from the train set, actually
+ EWT_DEV_SENTENCES = """
+ # sent_id = answers-20111108104724AAuBUR7_ans-0044
+ # text = He's only exhibited weight loss and some muscle atrophy
+ 1-2 He's _ _ _ _ _ _ _ _
+ 1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 4 nsubj 4:nsubj _
+ 2 's have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 4 aux 4:aux _
+ 3 only only ADV RB _ 4 advmod 4:advmod _
+ 4 exhibited exhibit VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _
+ 5 weight weight NOUN NN Number=Sing 6 compound 6:compound _
+ 6 loss loss NOUN NN Number=Sing 4 obj 4:obj _
+ 7 and and CCONJ CC _ 10 cc 10:cc _
+ 8 some some DET DT PronType=Ind 10 det 10:det _
+ 9 muscle muscle NOUN NN Number=Sing 10 compound 10:compound _
+ 10 atrophy atrophy NOUN NN Number=Sing 6 conj 4:obj|6:conj:and SpaceAfter=No
+
+ # sent_id = weblog-blogspot.com_rigorousintuition_20060511134300_ENG_20060511_134300-0097
+ # text = It's a good thing too.
+ 1-2 It's _ _ _ _ _ _ _ _
+ 1 It it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 5 nsubj 5:nsubj _
+ 2 's be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 5 cop 5:cop _
+ 3 a a DET DT Definite=Ind|PronType=Art 5 det 5:det _
+ 4 good good ADJ JJ Degree=Pos 5 amod 5:amod _
+ 5 thing thing NOUN NN Number=Sing 0 root 0:root _
+ 6 too too ADV RB _ 5 advmod 5:advmod SpaceAfter=No
+ 7 . . PUNCT . _ 5 punct 5:punct _
+ """.lstrip()
+
+ # from the train set, actually
+ EWT_TEST_SENTENCES = """
+ # sent_id = reviews-162422-0015
+ # text = He said he's had a long and bad day.
+ 1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 2 nsubj 2:nsubj _
+ 2 said say VERB VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 0 root 0:root _
+ 3-4 he's _ _ _ _ _ _ _ _
+ 3 he he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 5 nsubj 5:nsubj _
+ 4 's have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 5 aux 5:aux _
+ 5 had have VERB VBN Tense=Past|VerbForm=Part 2 ccomp 2:ccomp _
+ 6 a a DET DT Definite=Ind|PronType=Art 10 det 10:det _
+ 7 long long ADJ JJ Degree=Pos 10 amod 10:amod _
+ 8 and and CCONJ CC _ 9 cc 9:cc _
+ 9 bad bad ADJ JJ Degree=Pos 7 conj 7:conj:and|10:amod _
+ 10 day day NOUN NN Number=Sing 5 obj 5:obj SpaceAfter=No
+ 11 . . PUNCT . _ 2 punct 2:punct _
+
+ # sent_id = weblog-blogspot.com_rigorousintuition_20060511134300_ENG_20060511_134300-0100
+ # text = What's a few dead soldiers
+ 1-2 What's _ _ _ _ _ _ _ _
+ 1 What what PRON WP PronType=Int 6 nsubj 6:nsubj _
+ 2 's be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 cop 6:cop _
+ 3 a a DET DT Definite=Ind|PronType=Art 6 det 6:det _
+ 4 few few ADJ JJ Degree=Pos 6 amod 6:amod _
+ 5 dead dead ADJ JJ Degree=Pos 6 amod 6:amod _
+ 6 soldiers soldier NOUN NNS Number=Plur 0 root 0:root _
+ """
+
+ def write_test_dataset(tmp_path, texts, datasets):
+     ud_path = tmp_path / "ud"
+     input_path = ud_path / "UD_English-EWT"
+     output_path = tmp_path / "data" / "lemma_classifier"
+
+     os.makedirs(input_path, exist_ok=True)
+
+     for text, dataset in zip(texts, datasets):
+         sample_file = input_path / ("en_ewt-ud-%s.conllu" % dataset)
+         with open(sample_file, "w", encoding="utf-8") as fout:
+             fout.write(text)
+
+     paths = {"UDBASE": ud_path,
+              "LEMMA_CLASSIFIER_DATA_DIR": output_path}
+
+     return paths
+
+ def write_english_test_dataset(tmp_path):
+     texts = (EWT_TRAIN_SENTENCES, EWT_DEV_SENTENCES, EWT_TEST_SENTENCES)
+     datasets = prepare_lemma_classifier.SECTIONS
+     return write_test_dataset(tmp_path, texts, datasets)
+
+ def convert_english_dataset(tmp_path):
+     paths = write_english_test_dataset(tmp_path)
+     converted_files = prepare_lemma_classifier.process_treebank(paths, "en_ewt", "'s", "AUX", "be|have")
+     assert len(converted_files) == 3
+
+     return converted_files
+
+ def test_convert_one_sentence(tmp_path):
+     texts = [EWT_ONE_SENTENCE]
+     datasets = ["train"]
+     paths = write_test_dataset(tmp_path, texts, datasets)
+
+     converted_files = prepare_lemma_classifier.process_treebank(paths, "en_ewt", "'s", "AUX", "be|have", ["train"])
+     assert len(converted_files) == 1
+
+     dataset = utils.Dataset(converted_files[0], get_counts=True, batch_size=10, shuffle=False)
+
+     assert len(dataset) == 1
+     assert dataset.label_decoder == {'be': 0}
+     id_to_upos = {y: x for x, y in dataset.upos_to_id.items()}
+
+     for text_batches, _, upos_batches, _ in dataset:
+         assert text_batches == [['Here', "'s", 'a', 'Miami', 'Herald', 'interview']]
+         upos = [id_to_upos[x] for x in upos_batches[0]]
+         assert upos == ['ADV', 'AUX', 'DET', 'PROPN', 'PROPN', 'NOUN']
+
+ def test_convert_dataset(tmp_path):
+     converted_files = convert_english_dataset(tmp_path)
+
+     dataset = utils.Dataset(converted_files[0], get_counts=True, batch_size=10, shuffle=False)
+
+     assert len(dataset) == 1
+     label_decoder = dataset.label_decoder
+     assert len(label_decoder) == 2
+     assert "be" in label_decoder
+     assert "have" in label_decoder
+     for text_batches, _, _, _ in dataset:
+         assert len(text_batches) == 9
+
+     dataset = utils.Dataset(converted_files[1], get_counts=True, batch_size=10, shuffle=False)
+     assert len(dataset) == 1
+     for text_batches, _, _, _ in dataset:
+         assert len(text_batches) == 2
+
+     dataset = utils.Dataset(converted_files[2], get_counts=True, batch_size=10, shuffle=False)
+     assert len(dataset) == 1
+     for text_batches, _, _, _ in dataset:
+         assert len(text_batches) == 2
+
stanza/stanza/tests/mwt/test_character_classifier.py ADDED
@@ -0,0 +1,92 @@
+ import os
+ import pytest
+
+ from stanza.models import mwt_expander
+ from stanza.models.mwt.character_classifier import CharacterClassifier
+ from stanza.models.mwt.data import DataLoader
+ from stanza.models.mwt.trainer import Trainer
+ from stanza.utils.conll import CoNLL
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+ ENG_TRAIN = """
+ # text = Elena's motorcycle tour
+ 1-2 Elena's _ _ _ _ _ _ _ _
+ 1 Elena Elena PROPN NNP Number=Sing 4 nmod:poss 4:nmod:poss _
+ 2 's 's PART POS _ 1 case 1:case _
+ 3 motorcycle motorcycle NOUN NN Number=Sing 4 compound 4:compound _
+ 4 tour tour NOUN NN Number=Sing 0 root 0:root _
+
+
+ # text = women's reproductive health
+ 1-2 women's _ _ _ _ _ _ _ _
+ 1 women woman NOUN NNS Number=Plur 4 nmod:poss 4:nmod:poss _
+ 2 's 's PART POS _ 1 case 1:case _
+ 3 reproductive reproductive ADJ JJ Degree=Pos 4 amod 4:amod _
+ 4 health health NOUN NN Number=Sing 0 root 0:root SpaceAfter=No
+
+
+ # text = The Chernobyl Children's Project
+ 1 The the DET DT Definite=Def|PronType=Art 3 det 3:det _
+ 2 Chernobyl Chernobyl PROPN NNP Number=Sing 3 compound 3:compound _
+ 3-4 Children's _ _ _ _ _ _ _ _
+ 3 Children Children PROPN NNP Number=Sing 5 nmod:poss 5:nmod:poss _
+ 4 's 's PART POS _ 3 case 3:case _
+ 5 Project Project PROPN NNP Number=Sing 0 root 0:root _
+
+ """.lstrip()
+
+ ENG_DEV = """
+ # text = The Chernobyl Children's Project
+ 1 The the DET DT Definite=Def|PronType=Art 3 det 3:det _
+ 2 Chernobyl Chernobyl PROPN NNP Number=Sing 3 compound 3:compound _
+ 3-4 Children's _ _ _ _ _ _ _ _
+ 3 Children Children PROPN NNP Number=Sing 5 nmod:poss 5:nmod:poss _
+ 4 's 's PART POS _ 3 case 3:case _
+ 5 Project Project PROPN NNP Number=Sing 0 root 0:root _
+
+ """.lstrip()
+
+ def test_train(tmp_path):
+     test_train = str(os.path.join(tmp_path, "en_test.train.conllu"))
+     with open(test_train, "w") as fout:
+         fout.write(ENG_TRAIN)
+
+     test_dev = str(os.path.join(tmp_path, "en_test.dev.conllu"))
+     with open(test_dev, "w") as fout:
+         fout.write(ENG_DEV)
+
+     test_output = str(os.path.join(tmp_path, "en_test.dev.pred.conllu"))
+     model_name = "en_test_mwt.pt"
+
+     args = [
+         "--data_dir", str(tmp_path),
+         "--train_file", test_train,
+         "--eval_file", test_dev,
+         "--gold_file", test_dev,
+         "--lang", "en",
+         "--shorthand", "en_test",
+         "--output_file", test_output,
+         "--save_dir", str(tmp_path),
+         "--save_name", model_name,
+         "--num_epoch", "10",
+     ]
+
+     mwt_expander.main(args=args)
+
+     model = Trainer(model_file=os.path.join(tmp_path, model_name))
+     assert model.model is not None
+     assert isinstance(model.model, CharacterClassifier)
+
+     doc = CoNLL.conll2doc(input_str=ENG_DEV)
+     dataloader = DataLoader(doc, 10, model.args, vocab=model.vocab, evaluation=True, expand_unk_vocab=True)
+     preds = []
+     for i, batch in enumerate(dataloader.to_loader()):
+         assert i == 0  # there should only be one batch
+         preds += model.predict(batch, never_decode_unk=True, vocab=dataloader.vocab)
+     assert len(preds) == 1
+     # it is possible to make a version of the test where this happens almost every time
+     # for example, running for 100 epochs makes the model succeed 30 times in a row
+     # (never saw a failure)
+     # but the one time that failure happened, it would be really annoying
+     #assert preds[0] == "Children 's"
stanza/stanza/tests/mwt/test_english_corner_cases.py ADDED
@@ -0,0 +1,88 @@
+ """
+ Test a couple English MWT corner cases which might be more widely applicable to other MWT languages
+
+ - unknown English character doesn't result in bizarre splits
+ - Casing or CASING doesn't get lost in the dictionary lookup
+
+ In the English UD datasets, the MWT are composed exactly of the
+ subwords, so the MWT model should be chopping up the input text rather
+ than generating new text.
+
+ Furthermore, SHE'S and She's should be split "SHE 'S" and "She 's" respectively
+ """
+
+ import pytest
+ import stanza
+
+ from stanza.tests import TEST_MODELS_DIR
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+ def test_mwt_unknown_char():
+     pipeline = stanza.Pipeline(processors='tokenize,mwt', dir=TEST_MODELS_DIR, lang='en', download_method=None)
+
+     mwt_trainer = pipeline.processors['mwt']._trainer
+
+     assert mwt_trainer.args['force_exact_pieces']
+
+     # find a letter 'i' which isn't in the training data
+     # the MWT model should still recognize a possessive containing this letter
+     assert "i" in mwt_trainer.vocab
+     for letter in "ĩîíìī":
+         if letter not in mwt_trainer.vocab:
+             break
+     else:
+         raise AssertionError("Need to update the MWT test - all of the non-standard letters 'i' are now in the MWT vocab")
+
+     word = "Jenn" + letter + "fer"
+     possessive = word + "'s"
+     text = "I wanna lick " + possessive + " antennae"
+     doc = pipeline(text)
+     assert doc.sentences[0].tokens[1].text == 'wanna'
+     assert len(doc.sentences[0].tokens[1].words) == 2
+     assert "".join(x.text for x in doc.sentences[0].tokens[1].words) == 'wanna'
+
+     assert doc.sentences[0].tokens[3].text == possessive
+     assert len(doc.sentences[0].tokens[3].words) == 2
+     assert "".join(x.text for x in doc.sentences[0].tokens[3].words) == possessive
+
+
+ def test_english_mwt_casing():
+     """
+     Test that for a word where the lowercase split is known, the correct casing is still used
+
+     Once upon a time, the logic used in the MWT expander would split
+     SHE'S -> she 's
+
+     which is a very surprising tokenization to people expecting
+     the original text in the output document
+     """
+     pipeline = stanza.Pipeline(processors='tokenize,mwt', dir=TEST_MODELS_DIR, lang='en', download_method=None)
+
+     mwt_trainer = pipeline.processors['mwt']._trainer
+     for i in range(1, 20):
+         # many test cases follow this pattern for some reason,
+         # so we should proactively look for a test case which hasn't
+         # made its way into the MWT dictionary
+         unknown_name = "jennife" + "r" * i + "'s"
+         if unknown_name not in mwt_trainer.expansion_dict and unknown_name.upper() not in mwt_trainer.expansion_dict:
+             unknown_name = unknown_name.upper()
+             break
+     else:
+         raise AssertionError("Need a new heuristic for the unknown word in the English MWT!")
+
+     # this SHOULD show up in the expansion dict
+     assert "she's" in mwt_trainer.expansion_dict, "Expected |she's| to be in the English MWT expansion dict... perhaps find a different test case"
+
+     text = [x.text for x in pipeline("JENNIFER HAS NICE ANTENNAE").sentences[0].words]
+     assert text == ['JENNIFER', 'HAS', 'NICE', 'ANTENNAE']
+
+     text = [x.text for x in pipeline(unknown_name + " GOT NICE ANTENNAE").sentences[0].words]
+     assert text == [unknown_name[:-2], "'S", 'GOT', 'NICE', 'ANTENNAE']
+
+     text = [x.text for x in pipeline("SHE'S GOT NICE ANTENNAE").sentences[0].words]
+     assert text == ['SHE', "'S", 'GOT', 'NICE', 'ANTENNAE']
+
+     text = [x.text for x in pipeline("She's GOT NICE ANTENNAE").sentences[0].words]
+     assert text == ['She', "'s", 'GOT', 'NICE', 'ANTENNAE']
+
stanza/stanza/tests/ner/test_bsf_2_iob.py ADDED
@@ -0,0 +1,93 @@
+ """
+ Tests the conversion code for the lang_uk NER dataset
+ """
+
+ import unittest
+ from stanza.utils.datasets.ner.convert_bsf_to_beios import convert_bsf, parse_bsf, BsfInfo
+
+ import pytest
+ pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+ class TestBsf2Iob(unittest.TestCase):
+
+     def test_1line_follow_markup_iob(self):
+         data = 'тележурналіст Василь .'
+         bsf_markup = 'T1 PERS 14 20 Василь'
+         expected = '''тележурналіст O
+ Василь B-PERS
+ . O'''
+         self.assertEqual(expected, convert_bsf(data, bsf_markup, converter='iob'))
+
+     def test_1line_2tok_markup_iob(self):
+         data = 'тележурналіст Василь Нагірний .'
+         bsf_markup = 'T1 PERS 14 29 Василь Нагірний'
+         expected = '''тележурналіст O
+ Василь B-PERS
+ Нагірний I-PERS
+ . O'''
+         self.assertEqual(expected, convert_bsf(data, bsf_markup, converter='iob'))
+
+     def test_1line_Long_tok_markup_iob(self):
+         data = 'А в музеї Гуцульщини і Покуття можна '
+         bsf_markup = 'T12 ORG 4 30 музеї Гуцульщини і Покуття'
+         expected = '''А O
+ в O
+ музеї B-ORG
+ Гуцульщини I-ORG
+ і I-ORG
+ Покуття I-ORG
+ можна O'''
+         self.assertEqual(expected, convert_bsf(data, bsf_markup, converter='iob'))
+
+     def test_2line_2tok_markup_iob(self):
+         data = '''тележурналіст Василь Нагірний .
+ В івано-франківському видавництві «Лілея НВ» вийшла друком'''
+         bsf_markup = '''T1 PERS 14 29 Василь Нагірний
+ T2 ORG 67 75 Лілея НВ'''
+         expected = '''тележурналіст O
+ Василь B-PERS
+ Нагірний I-PERS
+ . O
+
+
+ В O
+ івано-франківському O
+ видавництві O
+ « O
+ Лілея B-ORG
+ НВ I-ORG
+ » O
+ вийшла O
+ друком O'''
+         self.assertEqual(expected, convert_bsf(data, bsf_markup, converter='iob'))
+
+     def test_all_multiline_iob(self):
+         data = '''його книжечка «А .
+ Kubler .
+ Світло і тіні маестро» .
+ Причому'''
+         bsf_markup = '''T4 MISC 15 49 А .
+ Kubler .
+ Світло і тіні маестро
+ '''
+         expected = '''його O
+ книжечка O
+ « O
+ А B-MISC
+ . I-MISC
+ Kubler I-MISC
+ . I-MISC
+ Світло I-MISC
+ і I-MISC
+ тіні I-MISC
+ маестро I-MISC
+ » O
+ . O
+
+
+ Причому O'''
+         self.assertEqual(expected, convert_bsf(data, bsf_markup, converter='iob'))
+
+
+ if __name__ == '__main__':
+     unittest.main()
stanza/stanza/tests/ner/test_convert_amt.py ADDED
@@ -0,0 +1,104 @@
+ """
+ Test some of the functions used for converting an AMT json to a Stanza json
+ """
+
+
+ import os
+
+ import pytest
+
+ import stanza
+ from stanza.utils.datasets.ner import convert_amt
+
+ from stanza.tests import TEST_MODELS_DIR
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+ TEXT = "Jennifer Sh'reyan has lovely antennae."
+
+ def fake_label(label, start_char, end_char):
+     return {'label': label,
+             'startOffset': start_char,
+             'endOffset': end_char}
+
+ LABELS = [
+     fake_label('Person', 0, 8),
+     fake_label('Person', 9, 17),
+     fake_label('Person', 0, 17),
+     fake_label('Andorian', 0, 8),
+     fake_label('Appendage', 29, 37),
+     fake_label('Person', 1, 8),
+     fake_label('Person', 0, 7),
+     fake_label('Person', 0, 9),
+     fake_label('Appendage', 29, 38),
+ ]
+
+ def fake_labels(*indices):
+     return [LABELS[x] for x in indices]
+
+ def fake_docs(*indices):
+     return [(TEXT, fake_labels(*indices))]
+
+ def test_remove_nesting():
+     """
+     Test a few orders on nested items to make sure the desired results are coming back
+     """
+     # this should be unchanged
+     result = convert_amt.remove_nesting(fake_docs(0, 1))
+     assert result == fake_docs(0, 1)
+
+     # this should be returned sorted
+     result = convert_amt.remove_nesting(fake_docs(0, 4, 1))
+     assert result == fake_docs(0, 1, 4)
+
+     # this should just have one copy
+     result = convert_amt.remove_nesting(fake_docs(0, 0))
+     assert result == fake_docs(0)
+
+     # outer one preferred
+     result = convert_amt.remove_nesting(fake_docs(0, 2))
+     assert result == fake_docs(2)
+     result = convert_amt.remove_nesting(fake_docs(1, 2))
+     assert result == fake_docs(2)
+     result = convert_amt.remove_nesting(fake_docs(5, 2))
+     assert result == fake_docs(2)
+     # order doesn't matter
+     result = convert_amt.remove_nesting(fake_docs(0, 4, 2))
+     assert result == fake_docs(2, 4)
+     result = convert_amt.remove_nesting(fake_docs(2, 4, 0))
+     assert result == fake_docs(2, 4)
+
+     # first one preferred
+     result = convert_amt.remove_nesting(fake_docs(0, 3))
+     assert result == fake_docs(0)
+     result = convert_amt.remove_nesting(fake_docs(3, 0))
+     assert result == fake_docs(3)
+
+ def test_process_doc():
+     nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, processors="tokenize", download_method=None)
+
+     def check_results(doc, *expected):
+         ner = [x[1] for x in doc[0]]
+         assert ner == list(expected)
+
+     # test a standard case of all the values lining up
+     doc = convert_amt.process_doc(TEXT, fake_labels(2, 4), nlp)
+     check_results(doc, "B-Person", "I-Person", "O", "O", "B-Appendage", "O")
+
+     # test a slightly wrong start index
+     doc = convert_amt.process_doc(TEXT, fake_labels(5, 1, 4), nlp)
+     check_results(doc, "B-Person", "B-Person", "O", "O", "B-Appendage", "O")
+
+     # test a slightly wrong end index
+     doc = convert_amt.process_doc(TEXT, fake_labels(6, 1, 4), nlp)
+     check_results(doc, "B-Person", "B-Person", "O", "O", "B-Appendage", "O")
+
+     # test a slightly wronger end index
+     doc = convert_amt.process_doc(TEXT, fake_labels(7, 4), nlp)
+     check_results(doc, "B-Person", "O", "O", "O", "B-Appendage", "O")
+
+     # test a period at the end of a text - should not be captured
+     doc = convert_amt.process_doc(TEXT, fake_labels(7, 8), nlp)
+     check_results(doc, "B-Person", "O", "O", "O", "B-Appendage", "O")
+
+
stanza/stanza/tests/ner/test_convert_starlang_ner.py ADDED
@@ -0,0 +1,23 @@
+ """
+ Test a couple different classes of trees to check the output of the Starlang conversion for NER
+ """
+
+ import os
+ import tempfile
+
+ import pytest
+
+ from stanza.utils.datasets.ner import convert_starlang_ner
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+ TREE="( (S (NP (NP {morphologicalAnalysis=bayan+NOUN+A3SG+PNON+NOM}{metaMorphemes=bayan}{turkish=Bayan}{english=Ms.}{semantics=TUR10-0396530}{namedEntity=PERSON}{propBank=ARG0$TUR10-0148580}{englishSemantics=ENG31-06352895-n}) (NP {morphologicalAnalysis=haag+NOUN+PROP+A3SG+PNON+NOM}{metaMorphemes=haag}{turkish=Haag}{english=Haag}{semantics=TUR10-0000000}{namedEntity=PERSON}{propBank=ARG0$TUR10-0148580})) (VP (NP {morphologicalAnalysis=elianti+NOUN+PROP+A3SG+PNON+NOM}{metaMorphemes=elianti}{turkish=Elianti}{english=Elianti}{semantics=TUR10-0000000}{namedEntity=NONE}{propBank=ARG1$TUR10-0148580}) (VP {morphologicalAnalysis=çal+VERB+POS+AOR+A3SG}{metaMorphemes=çal+Ar}{turkish=çalar}{english=plays}{semantics=TUR10-0148580}{namedEntity=NONE}{propBank=PREDICATE$TUR10-0148580}{englishSemantics=ENG31-01730049-v})) (. {morphologicalAnalysis=.+PUNC}{metaMorphemes=.}{metaMorphemesMoved=.}{turkish=.}{english=.}{semantics=TUR10-1081860}{namedEntity=NONE}{propBank=NONE})) )"
+
+ def test_read_tree():
+     """
+     Test a basic tree read
+     """
+     sentence = convert_starlang_ner.read_tree(TREE)
+     expected = [('Bayan', 'PERSON'), ('Haag', 'PERSON'), ('Elianti', 'O'), ('çalar', 'O'), ('.', 'O')]
+     assert sentence == expected
+
stanza/stanza/tests/ner/test_from_conllu.py ADDED
@@ -0,0 +1,30 @@
+ import pytest
+
+ from stanza import Pipeline
+ from stanza.utils.conll import CoNLL
+ from stanza.tests import TEST_MODELS_DIR
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+ def test_from_conllu():
+     """
+     If the doc does not have the entire text available, make sure it still safely processes the text
+
+     Test case supplied from user - see issue #1428
+     """
+     pipe = Pipeline("en", dir=TEST_MODELS_DIR, processors="tokenize,ner", download_method=None)
+     doc = pipe("In February, I traveled to Seattle. Dr. Pritchett gave me a new hip")
+     ents = [x.text for x in doc.ents]
+     # the default NER model ought to find these three
+     assert ents == ['February', 'Seattle', 'Pritchett']
+
+     doc_conllu = "{:C}\n\n".format(doc)
+     doc = CoNLL.conll2doc(input_str=doc_conllu)
+     pipe = Pipeline("en", dir=TEST_MODELS_DIR, processors="tokenize,ner", tokenize_pretokenized=True, download_method=None)
+     pipe(doc)
+     ents = [x.text for x in doc.ents]
+     # this should still work when processed from a CoNLLu document
+     # the bug previously caused a crash because the text to construct
+     # the entities was not available, since the Document wouldn't have
+     # the entire document text available
+     assert ents == ['February', 'Seattle', 'Pritchett']
stanza/stanza/tests/ner/test_ner_utils.py ADDED
@@ -0,0 +1,129 @@
+ import pytest
+
+ from stanza.tests import *
+
+ from stanza.models.common.vocab import EMPTY
+ from stanza.models.ner import utils
+
+ pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+ WORDS = [["Unban", "Mox", "Opal"], ["Ragavan", "is", "red"], ["Urza", "Lord", "High", "Artificer", "goes", "infinite", "with", "Thopter", "Sword"]]
+ BIO_TAGS = [["O", "B-ART", "I-ART"], ["B-MONKEY", "O", "B-COLOR"], ["B-PER", "I-PER", "I-PER", "I-PER", "O", "O", "O", "B-WEAPON", "B-WEAPON"]]
+ BIO_U_TAGS = [["O", "B_ART", "I_ART"], ["B_MONKEY", "O", "B_COLOR"], ["B_PER", "I_PER", "I_PER", "I_PER", "O", "O", "O", "B_WEAPON", "B_WEAPON"]]
+ BIOES_TAGS = [["O", "B-ART", "E-ART"], ["S-MONKEY", "O", "S-COLOR"], ["B-PER", "I-PER", "I-PER", "E-PER", "O", "O", "O", "S-WEAPON", "S-WEAPON"]]
+ # note the problem with not using BIO tags - the consecutive tags for thopter/sword get treated as one item
+ BASIC_TAGS = [["O", "ART", "ART"], ["MONKEY", "O", "COLOR"], ["PER", "PER", "PER", "PER", "O", "O", "O", "WEAPON", "WEAPON"]]
+ BASIC_BIOES = [["O", "B-ART", "E-ART"], ["S-MONKEY", "O", "S-COLOR"], ["B-PER", "I-PER", "I-PER", "E-PER", "O", "O", "O", "B-WEAPON", "E-WEAPON"]]
+ ALT_BIO = [["O", "B-MANA", "I-MANA"], ["B-CRE", "O", "O"], ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"]]
+ ALT_BIOES = [["O", "B-MANA", "E-MANA"], ["S-CRE", "O", "O"], ["B-CRE", "I-CRE", "I-CRE", "E-CRE", "O", "O", "O", "S-ART", "S-ART"]]
+ NONE_BIO = [["O", "B-MANA", "I-MANA"], [None, None, None], ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"]]
+ NONE_BIOES = [["O", "B-MANA", "E-MANA"], [None, None, None], ["B-CRE", "I-CRE", "I-CRE", "E-CRE", "O", "O", "O", "S-ART", "S-ART"]]
+ EMPTY_BIO = [["O", "B-MANA", "I-MANA"], [EMPTY, EMPTY, EMPTY], ["B-CRE", "I-CRE", "I-CRE", "I-CRE", "O", "O", "O", "B-ART", "B-ART"]]
+
+ def test_normalize_empty_tags():
+     sentences = [[(word[0], (word[1],)) for word in zip(*sentence)] for sentence in zip(WORDS, NONE_BIO)]
+     new_sentences = utils.normalize_empty_tags(sentences)
+     expected = [[(word[0], (word[1],)) for word in zip(*sentence)] for sentence in zip(WORDS, EMPTY_BIO)]
+     assert new_sentences == expected
+
+ def check_reprocessed_tags(words, input_tags, expected_tags):
+     sentences = [list(zip(x, y)) for x, y in zip(words, input_tags)]
+     retagged = utils.process_tags(sentences=sentences, scheme="bioes")
+     # process_tags selectively returns tuples or strings based on the input
+     # so we don't need to fiddle with the expected output format here
+     expected_retagged = [list(zip(x, y)) for x, y in zip(words, expected_tags)]
+     assert retagged == expected_retagged
+
+ def test_process_tags_bio():
+     check_reprocessed_tags(WORDS, BIO_TAGS, BIOES_TAGS)
+     # check that the alternate version is correct as well
+     # that way we can independently check the two layer version
+     check_reprocessed_tags(WORDS, ALT_BIO, ALT_BIOES)
+
+ def test_process_tags_with_none():
+     # if there is a block of tags with None in them, the Nones should be skipped over
+     check_reprocessed_tags(WORDS, NONE_BIO, NONE_BIOES)
+
+ def merge_tags(*tags):
+     merged_tags = [[tuple(x) for x in zip(*sentences)]  # combine tags such as ("O", "O"), ("B-ART", "B-MANA"), ...
+                    for sentences in zip(*tags)]         # ... for each set of sentences
+     return merged_tags
+
+ def test_combined_tags_bio():
+     bio_tags = merge_tags(BIO_TAGS, ALT_BIO)
+     expected = merge_tags(BIOES_TAGS, ALT_BIOES)
+     check_reprocessed_tags(WORDS, bio_tags, expected)
+
+ def test_combined_tags_mixed():
+     bio_tags = merge_tags(BIO_TAGS, ALT_BIOES)
+     expected = merge_tags(BIOES_TAGS, ALT_BIOES)
+     check_reprocessed_tags(WORDS, bio_tags, expected)
+
+ def test_process_tags_basic():
+     check_reprocessed_tags(WORDS, BASIC_TAGS, BASIC_BIOES)
+
+ def test_process_tags_bioes():
+     """
+     This one should not change, naturally
+     """
+     check_reprocessed_tags(WORDS, BIOES_TAGS, BIOES_TAGS)
+     check_reprocessed_tags(WORDS, BASIC_BIOES, BASIC_BIOES)
+
+ def run_flattened(fn, tags):
+     return fn([x for y in tags for x in y])
+
+ def test_check_bio():
+     assert utils.is_bio_scheme([x for y in BIO_TAGS for x in y])
+     assert not utils.is_bio_scheme([x for y in BIOES_TAGS for x in y])
+     assert not utils.is_bio_scheme([x for y in BASIC_TAGS for x in y])
+     assert not utils.is_bio_scheme([x for y in BASIC_BIOES for x in y])
+
+ def test_check_basic():
+     assert not utils.is_basic_scheme([x for y in BIO_TAGS for x in y])
+     assert not utils.is_basic_scheme([x for y in BIOES_TAGS for x in y])
+     assert utils.is_basic_scheme([x for y in BASIC_TAGS for x in y])
+     assert not utils.is_basic_scheme([x for y in BASIC_BIOES for x in y])
+
+ def test_underscores():
+     """
+     Check that the methods work if the inputs are underscores instead of dashes
+     """
+     assert not utils.is_basic_scheme([x for y in BIO_U_TAGS for x in y])
+     check_reprocessed_tags(WORDS, BIO_U_TAGS, BIOES_TAGS)
+
+ def test_merge_tags():
+     """
+     Check a few versions of the tag sequence merging
+     """
+     seq1 = [ "O", "O", "O", "B-FOO", "E-FOO", "O"]
+     seq2 = [ "S-FOO", "O", "B-FOO", "E-FOO", "O", "O"]
+     seq3 = [ "B-FOO", "E-FOO", "B-FOO", "E-FOO", "O", "O"]
+     seq_err = [ "O", "B-FOO", "O", "B-FOO", "E-FOO", "O"]
+     seq_err2 = [ "O", "B-FOO", "O", "B-FOO", "B-FOO", "O"]
+     seq_err3 = [ "O", "B-FOO", "O", "B-FOO", "I-FOO", "O"]
+     seq_err4 = [ "O", "B-FOO", "O", "B-FOO", "I-FOO", "I-FOO"]
+
+     result = utils.merge_tags(seq1, seq2)
+     expected = [ "S-FOO", "O", "O", "B-FOO", "E-FOO", "O"]
+     assert result == expected
+
+     result = utils.merge_tags(seq2, seq1)
+     expected = [ "S-FOO", "O", "B-FOO", "E-FOO", "O", "O"]
+     assert result == expected
+
+     result = utils.merge_tags(seq1, seq3)
+     expected = [ "B-FOO", "E-FOO", "O", "B-FOO", "E-FOO", "O"]
+     assert result == expected
+
+     with pytest.raises(ValueError):
+         result = utils.merge_tags(seq1, seq_err)
+
+     with pytest.raises(ValueError):
+         result = utils.merge_tags(seq1, seq_err2)
+
+     with pytest.raises(ValueError):
+         result = utils.merge_tags(seq1, seq_err3)
+
+     with pytest.raises(ValueError):
+         result = utils.merge_tags(seq1, seq_err4)
+
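The BIO-to-BIOES retagging that the tests above exercise can be sketched as follows. This is a minimal, self-contained illustration of the scheme conversion, not the actual `stanza.models.ner.utils.process_tags` implementation:

```python
def bio_to_bioes(tags):
    """Convert one BIO tag sequence to BIOES (illustrative sketch).

    A B- tag that ends its entity becomes S-; an I- tag that ends its
    entity becomes E-; tags followed by a matching I- are unchanged.
    """
    bioes = []
    for i, tag in enumerate(tags):
        if tag == "O":
            bioes.append(tag)
            continue
        prefix, label = tag.split("-", 1)
        next_tag = tags[i + 1] if i + 1 < len(tags) else "O"
        continues = next_tag == "I-" + label
        if prefix == "B":
            bioes.append(("B-" if continues else "S-") + label)
        else:  # an I- tag
            bioes.append(("I-" if continues else "E-") + label)
    return bioes
```

Note how two consecutive B- tags of the same label (the Thopter/Sword case above) stay separate entities, each becoming S-, whereas the basic scheme without B-/I- prefixes cannot make that distinction.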
stanza/stanza/tests/pipeline/__init__.py ADDED
File without changes
stanza/stanza/tests/pipeline/test_arabic_pipeline.py ADDED
@@ -0,0 +1,27 @@
+ """
+ Small test of loading the Arabic pipeline
+
+ The main goal is to check that nothing goes wrong with RtL languages,
+ but incidentally this would have caught a bug where the xpos tags
+ were split into individual pieces instead of reassembled as expected
+ """
+
+ import pytest
+ import stanza
+
+ from stanza.tests import TEST_MODELS_DIR
+
+ pytestmark = pytest.mark.pipeline
+
+ def test_arabic_pos_pipeline():
+     pipe = stanza.Pipeline(**{'processors': 'tokenize,pos', 'dir': TEST_MODELS_DIR, 'download_method': None, 'lang': 'ar'})
+     text = "ولم يتم اعتقال احد بحسب المتحدث باسم الشرطة."
+
+     doc = pipe(text)
+     # the first token translates to "and not", seems common enough
+     # that we should be able to rely on it having a stable MWT and tag
+
+     assert len(doc.sentences) == 1
+     assert doc.sentences[0].tokens[0].text == "ولم"
+     assert doc.sentences[0].words[0].xpos == "C---------"
+     assert doc.sentences[0].words[1].xpos == "F---------"
stanza/stanza/tests/pipeline/test_core.py ADDED
@@ -0,0 +1,248 @@
+ import pytest
+ import shutil
+ import tempfile
+
+ import stanza
+
+ from stanza.tests import *
+
+ from stanza.pipeline import core
+ from stanza.resources.common import get_md5, load_resources_json
+
+ pytestmark = pytest.mark.pipeline
+
+ def test_pretagged():
+     """
+     Test that the pipeline does or doesn't build if pos is left out and pretagged is specified
+     """
+     nlp = stanza.Pipeline(lang='en', dir=TEST_MODELS_DIR, processors="tokenize,pos,lemma,depparse")
+     with pytest.raises(core.PipelineRequirementsException):
+         nlp = stanza.Pipeline(lang='en', dir=TEST_MODELS_DIR, processors="tokenize,lemma,depparse")
+     nlp = stanza.Pipeline(lang='en', dir=TEST_MODELS_DIR, processors="tokenize,lemma,depparse", depparse_pretagged=True)
+     nlp = stanza.Pipeline(lang='en', dir=TEST_MODELS_DIR, processors="tokenize,lemma,depparse", pretagged=True)
+     # test that the module specific flag overrides the general flag
+     nlp = stanza.Pipeline(lang='en', dir=TEST_MODELS_DIR, processors="tokenize,lemma,depparse", depparse_pretagged=True, pretagged=False)
+
+ def test_download_missing_ner_model():
+     """
+     Test that the pipeline will automatically download missing models
+     """
+     with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir:
+         stanza.download("en", model_dir=test_dir, processors="tokenize", package="combined", verbose=False)
+         pipe = stanza.Pipeline("en", model_dir=test_dir, processors="tokenize,ner", package={"ner": "ontonotes_charlm"})
+
+         assert sorted(os.listdir(test_dir)) == ['en', 'resources.json']
+         en_dir = os.path.join(test_dir, 'en')
+         en_dir_listing = sorted(os.listdir(en_dir))
+         assert en_dir_listing == ['backward_charlm', 'forward_charlm', 'mwt', 'ner', 'pretrain', 'tokenize']
+         assert os.listdir(os.path.join(en_dir, 'ner')) == ['ontonotes_charlm.pt']
+
+
+ def test_download_missing_resources():
+     """
+     Test that the pipeline will automatically download missing models
+     """
+     with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir:
+         pipe = stanza.Pipeline("en", model_dir=test_dir, processors="tokenize,ner", package={"tokenize": "combined", "ner": "ontonotes_charlm"})
+
+         assert sorted(os.listdir(test_dir)) == ['en', 'resources.json']
+         en_dir = os.path.join(test_dir, 'en')
+         en_dir_listing = sorted(os.listdir(en_dir))
+         assert en_dir_listing == ['backward_charlm', 'forward_charlm', 'mwt', 'ner', 'pretrain', 'tokenize']
+         assert os.listdir(os.path.join(en_dir, 'ner')) == ['ontonotes_charlm.pt']
+
+
+ def test_download_resources_overwrites():
+     """
+     Test that the DOWNLOAD_RESOURCES method overwrites an existing resources.json
+     """
+     with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir:
+         pipe = stanza.Pipeline("en", model_dir=test_dir, processors="tokenize", package={"tokenize": "combined"})
+
+         assert sorted(os.listdir(test_dir)) == ['en', 'resources.json']
+         resources_path = os.path.join(test_dir, 'resources.json')
+         mod_time = os.path.getmtime(resources_path)
+
+         pipe = stanza.Pipeline("en", model_dir=test_dir, processors="tokenize", package={"tokenize": "combined"})
+         new_mod_time = os.path.getmtime(resources_path)
+         assert mod_time != new_mod_time
+
+ def test_reuse_resources_overwrites():
+     """
+     Test that the REUSE_RESOURCES method does *not* overwrite an existing resources.json
+     """
+     with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir:
+         pipe = stanza.Pipeline("en",
+                                download_method=core.DownloadMethod.REUSE_RESOURCES,
+                                model_dir=test_dir,
+                                processors="tokenize",
+                                package={"tokenize": "combined"})
+
+         assert sorted(os.listdir(test_dir)) == ['en', 'resources.json']
+         resources_path = os.path.join(test_dir, 'resources.json')
+         mod_time = os.path.getmtime(resources_path)
+
+         pipe = stanza.Pipeline("en",
+                                download_method=core.DownloadMethod.REUSE_RESOURCES,
+                                model_dir=test_dir,
+                                processors="tokenize",
+                                package={"tokenize": "combined"})
+         new_mod_time = os.path.getmtime(resources_path)
+         assert mod_time == new_mod_time
+
+
+ def test_download_not_repeated():
+     """
+     Test that a model is only downloaded once if it already matches the expected model from the resources file
+     """
+     with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir:
+         stanza.download("en", model_dir=test_dir, processors="tokenize", package="combined")
+
+         assert sorted(os.listdir(test_dir)) == ['en', 'resources.json']
+         en_dir = os.path.join(test_dir, 'en')
+         en_dir_listing = sorted(os.listdir(en_dir))
+         assert en_dir_listing == ['mwt', 'tokenize']
+         tokenize_path = os.path.join(en_dir, "tokenize", "combined.pt")
+         mod_time = os.path.getmtime(tokenize_path)
+
+         pipe = stanza.Pipeline("en", model_dir=test_dir, processors="tokenize", package={"tokenize": "combined"})
+         assert os.path.getmtime(tokenize_path) == mod_time
+
+ def test_download_none():
+     with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir:
+         stanza.download("it", model_dir=test_dir, processors="tokenize", package="combined")
+         stanza.download("it", model_dir=test_dir, processors="tokenize", package="vit")
+
+         it_dir = os.path.join(test_dir, 'it')
+         it_dir_listing = sorted(os.listdir(it_dir))
+         assert sorted(it_dir_listing) == ['mwt', 'tokenize']
+         combined_path = os.path.join(it_dir, "tokenize", "combined.pt")
+         vit_path = os.path.join(it_dir, "tokenize", "vit.pt")
+
+         assert os.path.exists(combined_path)
+         assert os.path.exists(vit_path)
+
+         combined_md5 = get_md5(combined_path)
+         vit_md5 = get_md5(vit_path)
+         # check that the models are different
+         # otherwise the test is not testing anything
+         assert combined_md5 != vit_md5
+
+         shutil.copyfile(vit_path, combined_path)
+         assert get_md5(combined_path) == vit_md5
+
+         pipe = stanza.Pipeline("it", model_dir=test_dir, processors="tokenize", package={"tokenize": "combined"}, download_method=None)
+         assert get_md5(combined_path) == vit_md5
+
+         pipe = stanza.Pipeline("it", model_dir=test_dir, processors="tokenize", package={"tokenize": "combined"})
+         assert get_md5(combined_path) != vit_md5
+
+
+ def check_download_method_updates(download_method):
+     """
+     Run a single test of creating a pipeline with a given download_method, checking that the model is updated
+     """
+     with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir:
+         stanza.download("en", model_dir=test_dir, processors="tokenize", package="combined")
+
+         assert sorted(os.listdir(test_dir)) == ['en', 'resources.json']
+         en_dir = os.path.join(test_dir, 'en')
+         en_dir_listing = sorted(os.listdir(en_dir))
+         assert en_dir_listing == ['mwt', 'tokenize']
+         tokenize_path = os.path.join(en_dir, "tokenize", "combined.pt")
+
+         with open(tokenize_path, "w") as fout:
+             fout.write("Unban mox opal!")
+         mod_time = os.path.getmtime(tokenize_path)
+
+         pipe = stanza.Pipeline("en", model_dir=test_dir, processors="tokenize", package={"tokenize": "combined"}, download_method=download_method)
+         assert os.path.getmtime(tokenize_path) != mod_time
+
+ def test_download_fixed():
+     """
+     Test that a model is fixed if the existing model doesn't match the md5sum
+     """
+     for download_method in (core.DownloadMethod.REUSE_RESOURCES, core.DownloadMethod.DOWNLOAD_RESOURCES):
+         check_download_method_updates(download_method)
+
+ def test_download_strings():
+     """
+     Same as the test of the download_method, but tests that the pipeline works for string download_method
+     """
+     for download_method in ("reuse_resources", "download_resources"):
+         check_download_method_updates(download_method)
+
+ def test_limited_pipeline():
+     """
+     Test loading a pipeline, but then only using a couple processors
+     """
+     pipe = stanza.Pipeline(processors="tokenize,pos,lemma,depparse,ner", dir=TEST_MODELS_DIR)
+     doc = pipe("John Bauer works at Stanford")
+     assert all(word.upos is not None for sentence in doc.sentences for word in sentence.words)
+     assert all(token.ner is not None for sentence in doc.sentences for token in sentence.tokens)
+
+     doc = pipe("John Bauer works at Stanford", processors=["tokenize","pos"])
+     assert all(word.upos is not None for sentence in doc.sentences for word in sentence.words)
+     assert not any(token.ner is not None for sentence in doc.sentences for token in sentence.tokens)
+
+     doc = pipe("John Bauer works at Stanford", processors="tokenize")
+     assert not any(word.upos is not None for sentence in doc.sentences for word in sentence.words)
+     assert not any(token.ner is not None for sentence in doc.sentences for token in sentence.tokens)
+
+     doc = pipe("John Bauer works at Stanford", processors="tokenize,ner")
+     assert not any(word.upos is not None for sentence in doc.sentences for word in sentence.words)
+     assert all(token.ner is not None for sentence in doc.sentences for token in sentence.tokens)
+
+     with pytest.raises(ValueError):
+         # this should fail
+         doc = pipe("John Bauer works at Stanford", processors="tokenize,depparse")
+
+ @pytest.fixture(scope="module")
+ def unknown_language_name():
+     resources = load_resources_json(model_dir=TEST_MODELS_DIR)
+     name = "en"
+     while name in resources:
+         name = name + "z"
+     assert name != "en"
+     return name
+
+ def test_empty_unknown_language(unknown_language_name):
+     """
+     Check that there is an error for trying to load an unknown language
+     """
+     with pytest.raises(ValueError):
+         pipe = stanza.Pipeline(unknown_language_name, download_method=None)
+
+ def test_unknown_language_tokenizer(unknown_language_name):
+     """
+     Test that loading tokenize works for an unknown language
+     """
+     base_pipe = stanza.Pipeline("en", dir=TEST_MODELS_DIR, processors="tokenize", download_method=None)
+     # even if we one day add MWT to English, the tokenizer by itself should still work
+     tokenize_processor = base_pipe.processors["tokenize"]
+
+     pipe = stanza.Pipeline(unknown_language_name,
+                            processors="tokenize",
+                            allow_unknown_language=True,
+                            tokenize_model_path=tokenize_processor.config['model_path'],
+                            download_method=None)
+     doc = pipe("This is a test")
+     words = [x.text for x in doc.sentences[0].words]
+     assert words == ['This', 'is', 'a', 'test']
+
+
+ def test_unknown_language_mwt(unknown_language_name):
+     """
+     Test that loading tokenize & mwt works for an unknown language
+     """
+     base_pipe = stanza.Pipeline("fr", dir=TEST_MODELS_DIR, processors="tokenize,mwt", download_method=None)
+     assert len(base_pipe.processors) == 2
+     tokenize_processor = base_pipe.processors["tokenize"]
+     mwt_processor = base_pipe.processors["mwt"]
+
+     pipe = stanza.Pipeline(unknown_language_name,
+                            processors="tokenize,mwt",
+                            allow_unknown_language=True,
+                            tokenize_model_path=tokenize_processor.config['model_path'],
+                            mwt_model_path=mwt_processor.config['model_path'],
+                            download_method=None)
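The download tests above all hinge on the same freshness check: a model file is re-fetched only when it is missing or its md5 no longer matches the checksum recorded in resources.json. A minimal sketch of that idea (illustrative only, not stanza's actual `resources.common` implementation) could look like:

```python
import hashlib

def needs_download(path, expected_md5):
    """Return True if the file at path is absent or its md5 differs
    from the expected checksum (so it should be (re)downloaded)."""
    try:
        with open(path, "rb") as fin:
            return hashlib.md5(fin.read()).hexdigest() != expected_md5
    except FileNotFoundError:
        return True
```

Under this scheme, overwriting a model file with garbage (as `check_download_method_updates` does with "Unban mox opal!") changes its md5, so the next pipeline construction triggers a fresh download.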
stanza/stanza/tests/pipeline/test_depparse.py ADDED
@@ -0,0 +1,87 @@
+ """
+ Basic tests of the depparse processor boolean flags
+ """
+ import pytest
+
+ import stanza
+ from stanza.pipeline.core import PipelineRequirementsException
+ from stanza.utils.conll import CoNLL
+ from stanza.tests import *
+
+ pytestmark = pytest.mark.pipeline
+
+ # data for testing
+ EN_DOC = "Barack Obama was born in Hawaii. He was elected president in 2008. Obama attended Harvard."
+
+ EN_DOC_CONLLU_PRETAGGED = """
+ 1 Barack Barack PROPN NNP Number=Sing 0 _ _ _
+ 2 Obama Obama PROPN NNP Number=Sing 1 _ _ _
+ 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 2 _ _ _
+ 4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 3 _ _ _
+ 5 in in ADP IN _ 4 _ _ _
+ 6 Hawaii Hawaii PROPN NNP Number=Sing 5 _ _ _
+ 7 . . PUNCT . _ 6 _ _ _
+
+ 1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 0 _ _ _
+ 2 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 1 _ _ _
+ 3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 2 _ _ _
+ 4 president president PROPN NNP Number=Sing 3 _ _ _
+ 5 in in ADP IN _ 4 _ _ _
+ 6 2008 2008 NUM CD NumType=Card 5 _ _ _
+ 7 . . PUNCT . _ 6 _ _ _
+
+ 1 Obama Obama PROPN NNP Number=Sing 0 _ _ _
+ 2 attended attend VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 _ _ _
+ 3 Harvard Harvard PROPN NNP Number=Sing 2 _ _ _
+ 4 . . PUNCT . _ 3 _ _ _
+
+
+ """.lstrip()
+
+ EN_DOC_DEPENDENCY_PARSES_GOLD = """
+ ('Barack', 4, 'nsubj:pass')
+ ('Obama', 1, 'flat')
+ ('was', 4, 'aux:pass')
+ ('born', 0, 'root')
+ ('in', 6, 'case')
+ ('Hawaii', 4, 'obl')
+ ('.', 4, 'punct')
+
+ ('He', 3, 'nsubj:pass')
+ ('was', 3, 'aux:pass')
+ ('elected', 0, 'root')
+ ('president', 3, 'xcomp')
+ ('in', 6, 'case')
+ ('2008', 3, 'obl')
+ ('.', 3, 'punct')
+
+ ('Obama', 2, 'nsubj')
+ ('attended', 0, 'root')
+ ('Harvard', 2, 'obj')
+ ('.', 2, 'punct')
+ """.strip()
+
+ @pytest.fixture(scope="module")
+ def en_depparse_pipeline():
+     nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en', processors='tokenize,pos,lemma,depparse')
+     return nlp
+
+ def test_depparse(en_depparse_pipeline):
+     doc = en_depparse_pipeline(EN_DOC)
+     assert EN_DOC_DEPENDENCY_PARSES_GOLD == '\n\n'.join([sent.dependencies_string() for sent in doc.sentences])
+
+
+ def test_depparse_with_pretagged_doc():
+     nlp = stanza.Pipeline(**{'processors': 'depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en',
+                              'depparse_pretagged': True})
+
+     doc = CoNLL.conll2doc(input_str=EN_DOC_CONLLU_PRETAGGED)
+     processed_doc = nlp(doc)
+
+     assert EN_DOC_DEPENDENCY_PARSES_GOLD == '\n\n'.join(
+         [sent.dependencies_string() for sent in processed_doc.sentences])
+
+
+ def test_raises_requirements_exception_if_pretagged_not_passed():
+     with pytest.raises(PipelineRequirementsException):
+         stanza.Pipeline(**{'processors': 'depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en'})
stanza/stanza/tests/pipeline/test_english_pipeline.py ADDED
@@ -0,0 +1,279 @@
1
+ """
2
+ Basic testing of the English pipeline
3
+ """
4
+
5
+ import pytest
6
+ import stanza
7
+ from stanza.utils.conll import CoNLL
8
+ from stanza.models.common.doc import Document
9
+
10
+ from stanza.tests import *
11
+ from stanza.tests.pipeline.pipeline_device_tests import check_on_gpu, check_on_cpu
12
+
13
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
14
+
15
+ # data for testing
16
+ EN_DOC = "Barack Obama was born in Hawaii. He was elected president in 2008. Obama attended Harvard."
17
+
18
+ EN_DOCS = ["Barack Obama was born in Hawaii.", "He was elected president in 2008.", "Obama attended Harvard."]
19
+
20
+ EN_DOC_TOKENS_GOLD = """
21
+ <Token id=1;words=[<Word id=1;text=Barack;lemma=Barack;upos=PROPN;xpos=NNP;feats=Number=Sing;head=4;deprel=nsubj:pass>]>
22
+ <Token id=2;words=[<Word id=2;text=Obama;lemma=Obama;upos=PROPN;xpos=NNP;feats=Number=Sing;head=1;deprel=flat>]>
23
+ <Token id=3;words=[<Word id=3;text=was;lemma=be;upos=AUX;xpos=VBD;feats=Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin;head=4;deprel=aux:pass>]>
24
+ <Token id=4;words=[<Word id=4;text=born;lemma=bear;upos=VERB;xpos=VBN;feats=Tense=Past|VerbForm=Part|Voice=Pass;head=0;deprel=root>]>
25
+ <Token id=5;words=[<Word id=5;text=in;lemma=in;upos=ADP;xpos=IN;head=6;deprel=case>]>
26
+ <Token id=6;words=[<Word id=6;text=Hawaii;lemma=Hawaii;upos=PROPN;xpos=NNP;feats=Number=Sing;head=4;deprel=obl>]>
27
+ <Token id=7;words=[<Word id=7;text=.;lemma=.;upos=PUNCT;xpos=.;head=4;deprel=punct>]>
28
+
29
+ <Token id=1;words=[<Word id=1;text=He;lemma=he;upos=PRON;xpos=PRP;feats=Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs;head=3;deprel=nsubj:pass>]>
30
+ <Token id=2;words=[<Word id=2;text=was;lemma=be;upos=AUX;xpos=VBD;feats=Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin;head=3;deprel=aux:pass>]>
31
+ <Token id=3;words=[<Word id=3;text=elected;lemma=elect;upos=VERB;xpos=VBN;feats=Tense=Past|VerbForm=Part|Voice=Pass;head=0;deprel=root>]>
32
+ <Token id=4;words=[<Word id=4;text=president;lemma=president;upos=NOUN;xpos=NN;feats=Number=Sing;head=3;deprel=xcomp>]>
33
+ <Token id=5;words=[<Word id=5;text=in;lemma=in;upos=ADP;xpos=IN;head=6;deprel=case>]>
34
+ <Token id=6;words=[<Word id=6;text=2008;lemma=2008;upos=NUM;xpos=CD;feats=NumForm=Digit|NumType=Card;head=3;deprel=obl>]>
35
+ <Token id=7;words=[<Word id=7;text=.;lemma=.;upos=PUNCT;xpos=.;head=3;deprel=punct>]>
36
+
37
+ <Token id=1;words=[<Word id=1;text=Obama;lemma=Obama;upos=PROPN;xpos=NNP;feats=Number=Sing;head=2;deprel=nsubj>]>
38
+ <Token id=2;words=[<Word id=2;text=attended;lemma=attend;upos=VERB;xpos=VBD;feats=Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin;head=0;deprel=root>]>
39
+ <Token id=3;words=[<Word id=3;text=Harvard;lemma=Harvard;upos=PROPN;xpos=NNP;feats=Number=Sing;head=2;deprel=obj>]>
40
+ <Token id=4;words=[<Word id=4;text=.;lemma=.;upos=PUNCT;xpos=.;head=2;deprel=punct>]>
41
+ """.strip()
42
+
43
+ EN_DOC_WORDS_GOLD = """
44
+ <Word id=1;text=Barack;lemma=Barack;upos=PROPN;xpos=NNP;feats=Number=Sing;head=4;deprel=nsubj:pass>
45
+ <Word id=2;text=Obama;lemma=Obama;upos=PROPN;xpos=NNP;feats=Number=Sing;head=1;deprel=flat>
46
+ <Word id=3;text=was;lemma=be;upos=AUX;xpos=VBD;feats=Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin;head=4;deprel=aux:pass>
47
+ <Word id=4;text=born;lemma=bear;upos=VERB;xpos=VBN;feats=Tense=Past|VerbForm=Part|Voice=Pass;head=0;deprel=root>
48
+ <Word id=5;text=in;lemma=in;upos=ADP;xpos=IN;head=6;deprel=case>
49
+ <Word id=6;text=Hawaii;lemma=Hawaii;upos=PROPN;xpos=NNP;feats=Number=Sing;head=4;deprel=obl>
50
+ <Word id=7;text=.;lemma=.;upos=PUNCT;xpos=.;head=4;deprel=punct>
51
+
52
+ <Word id=1;text=He;lemma=he;upos=PRON;xpos=PRP;feats=Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs;head=3;deprel=nsubj:pass>
53
+ <Word id=2;text=was;lemma=be;upos=AUX;xpos=VBD;feats=Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin;head=3;deprel=aux:pass>
54
+ <Word id=3;text=elected;lemma=elect;upos=VERB;xpos=VBN;feats=Tense=Past|VerbForm=Part|Voice=Pass;head=0;deprel=root>
55
+ <Word id=4;text=president;lemma=president;upos=NOUN;xpos=NN;feats=Number=Sing;head=3;deprel=xcomp>
56
+ <Word id=5;text=in;lemma=in;upos=ADP;xpos=IN;head=6;deprel=case>
57
+ <Word id=6;text=2008;lemma=2008;upos=NUM;xpos=CD;feats=NumForm=Digit|NumType=Card;head=3;deprel=obl>
58
+ <Word id=7;text=.;lemma=.;upos=PUNCT;xpos=.;head=3;deprel=punct>
59
+
60
+ <Word id=1;text=Obama;lemma=Obama;upos=PROPN;xpos=NNP;feats=Number=Sing;head=2;deprel=nsubj>
61
+ <Word id=2;text=attended;lemma=attend;upos=VERB;xpos=VBD;feats=Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin;head=0;deprel=root>
62
+ <Word id=3;text=Harvard;lemma=Harvard;upos=PROPN;xpos=NNP;feats=Number=Sing;head=2;deprel=obj>
63
+ <Word id=4;text=.;lemma=.;upos=PUNCT;xpos=.;head=2;deprel=punct>
64
+ """.strip()
65
+
66
+ EN_DOC_DEPENDENCY_PARSES_GOLD = """
67
+ ('Barack', 4, 'nsubj:pass')
68
+ ('Obama', 1, 'flat')
69
+ ('was', 4, 'aux:pass')
70
+ ('born', 0, 'root')
71
+ ('in', 6, 'case')
72
+ ('Hawaii', 4, 'obl')
73
+ ('.', 4, 'punct')
74
+
75
+ ('He', 3, 'nsubj:pass')
76
+ ('was', 3, 'aux:pass')
77
+ ('elected', 0, 'root')
78
+ ('president', 3, 'xcomp')
79
+ ('in', 6, 'case')
80
+ ('2008', 3, 'obl')
81
+ ('.', 3, 'punct')
82
+
83
+ ('Obama', 2, 'nsubj')
84
+ ('attended', 0, 'root')
85
+ ('Harvard', 2, 'obj')
86
+ ('.', 2, 'punct')
87
+ """.strip()
88
+
89
+ EN_DOC_CONLLU_GOLD = """
90
+ # text = Barack Obama was born in Hawaii.
91
+ # sent_id = 0
92
+ # constituency = (ROOT (S (NP (NNP Barack) (NNP Obama)) (VP (VBD was) (VP (VBN born) (PP (IN in) (NP (NNP Hawaii))))) (. .)))
93
+ # sentiment = 1
94
+ 1 Barack Barack PROPN NNP Number=Sing 4 nsubj:pass _ start_char=0|end_char=6|ner=B-PERSON
95
+ 2 Obama Obama PROPN NNP Number=Sing 1 flat _ start_char=7|end_char=12|ner=E-PERSON
96
+ 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ start_char=13|end_char=16|ner=O
97
+ 4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=17|end_char=21|ner=O
98
+ 5 in in ADP IN _ 6 case _ start_char=22|end_char=24|ner=O
99
+ 6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ start_char=25|end_char=31|ner=S-GPE|SpaceAfter=No
100
+ 7 . . PUNCT . _ 4 punct _ start_char=31|end_char=32|ner=O|SpacesAfter=\\s\\s
101
+
102
+ # text = He was elected president in 2008.
103
+ # sent_id = 1
104
+ # constituency = (ROOT (S (NP (PRP He)) (VP (VBD was) (VP (VBN elected) (S (NP (NN president))) (PP (IN in) (NP (CD 2008))))) (. .)))
105
+ # sentiment = 1
106
+ 1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 3 nsubj:pass _ start_char=34|end_char=36|ner=O
107
+ 2 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 3 aux:pass _ start_char=37|end_char=40|ner=O
108
+ 3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=41|end_char=48|ner=O
109
+ 4 president president NOUN NN Number=Sing 3 xcomp _ start_char=49|end_char=58|ner=O
110
+ 5 in in ADP IN _ 6 case _ start_char=59|end_char=61|ner=O
111
+ 6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ start_char=62|end_char=66|ner=S-DATE|SpaceAfter=No
112
+ 7 . . PUNCT . _ 3 punct _ start_char=66|end_char=67|ner=O|SpacesAfter=\\s\\s
113
+
+ # text = Obama attended Harvard.
+ # sent_id = 2
+ # constituency = (ROOT (S (NP (NNP Obama)) (VP (VBD attended) (NP (NNP Harvard))) (. .)))
+ # sentiment = 1
+ 1 Obama Obama PROPN NNP Number=Sing 2 nsubj _ start_char=69|end_char=74|ner=S-PERSON
+ 2 attended attend VERB VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 0 root _ start_char=75|end_char=83|ner=O
+ 3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ start_char=84|end_char=91|ner=S-ORG|SpaceAfter=No
+ 4 . . PUNCT . _ 2 punct _ start_char=91|end_char=92|ner=O|SpaceAfter=No
+ """.strip()
+
+ EN_DOC_CONLLU_GOLD_MULTIDOC = """
+ # text = Barack Obama was born in Hawaii.
+ # sent_id = 0
+ # constituency = (ROOT (S (NP (NNP Barack) (NNP Obama)) (VP (VBD was) (VP (VBN born) (PP (IN in) (NP (NNP Hawaii))))) (. .)))
+ # sentiment = 1
+ 1 Barack Barack PROPN NNP Number=Sing 4 nsubj:pass _ start_char=0|end_char=6|ner=B-PERSON
+ 2 Obama Obama PROPN NNP Number=Sing 1 flat _ start_char=7|end_char=12|ner=E-PERSON
+ 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ start_char=13|end_char=16|ner=O
+ 4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=17|end_char=21|ner=O
+ 5 in in ADP IN _ 6 case _ start_char=22|end_char=24|ner=O
+ 6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ start_char=25|end_char=31|ner=S-GPE|SpaceAfter=No
+ 7 . . PUNCT . _ 4 punct _ start_char=31|end_char=32|ner=O|SpaceAfter=No
+
+ # text = He was elected president in 2008.
+ # sent_id = 1
+ # constituency = (ROOT (S (NP (PRP He)) (VP (VBD was) (VP (VBN elected) (S (NP (NN president))) (PP (IN in) (NP (CD 2008))))) (. .)))
+ # sentiment = 1
+ 1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 3 nsubj:pass _ start_char=0|end_char=2|ner=O
+ 2 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 3 aux:pass _ start_char=3|end_char=6|ner=O
+ 3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=7|end_char=14|ner=O
+ 4 president president NOUN NN Number=Sing 3 xcomp _ start_char=15|end_char=24|ner=O
+ 5 in in ADP IN _ 6 case _ start_char=25|end_char=27|ner=O
+ 6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ start_char=28|end_char=32|ner=S-DATE|SpaceAfter=No
+ 7 . . PUNCT . _ 3 punct _ start_char=32|end_char=33|ner=O|SpaceAfter=No
+
+ # text = Obama attended Harvard.
+ # sent_id = 2
+ # constituency = (ROOT (S (NP (NNP Obama)) (VP (VBD attended) (NP (NNP Harvard))) (. .)))
+ # sentiment = 1
+ 1 Obama Obama PROPN NNP Number=Sing 2 nsubj _ start_char=0|end_char=5|ner=S-PERSON
+ 2 attended attend VERB VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 0 root _ start_char=6|end_char=14|ner=O
+ 3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ start_char=15|end_char=22|ner=S-ORG|SpaceAfter=No
+ 4 . . PUNCT . _ 2 punct _ start_char=22|end_char=23|ner=O|SpaceAfter=No
+ """.strip()
+
+ class TestEnglishPipeline:
+     @pytest.fixture(scope="class")
+     def pipeline(self):
+         return stanza.Pipeline(dir=TEST_MODELS_DIR)
+
+     @pytest.fixture(scope="class")
+     def processed_doc(self, pipeline):
+         """ Document created by running full English pipeline on a few sentences """
+         return pipeline(EN_DOC)
+
+     def test_text(self, processed_doc):
+         assert processed_doc.text == EN_DOC
+
+     def test_conllu(self, processed_doc):
+         assert "{:C}".format(processed_doc) == EN_DOC_CONLLU_GOLD
+
+     def test_tokens(self, processed_doc):
+         assert "\n\n".join([sent.tokens_string() for sent in processed_doc.sentences]) == EN_DOC_TOKENS_GOLD
+
+     def test_words(self, processed_doc):
+         assert "\n\n".join([sent.words_string() for sent in processed_doc.sentences]) == EN_DOC_WORDS_GOLD
+
+     def test_dependency_parse(self, processed_doc):
+         assert "\n\n".join([sent.dependencies_string() for sent in processed_doc.sentences]) == \
+             EN_DOC_DEPENDENCY_PARSES_GOLD
+
+     def test_empty(self, pipeline):
+         # make sure that various models handle the degenerate empty case
+         pipeline("")
+         pipeline("--")
+
+     def test_bulk_process(self, pipeline):
+         """ Double check that the bulk_process method in Pipeline converts documents as expected """
+         # it should process strings
+         processed = pipeline.bulk_process(EN_DOCS)
+         assert "\n\n".join(["{:C}".format(doc) for doc in processed]) == EN_DOC_CONLLU_GOLD_MULTIDOC
+
+         # it should pass Documents through successfully
+         docs = [Document([], text=t) for t in EN_DOCS]
+         processed = pipeline.bulk_process(docs)
+         assert "\n\n".join(["{:C}".format(doc) for doc in processed]) == EN_DOC_CONLLU_GOLD_MULTIDOC
+
+     def test_empty_bulk_process(self, pipeline):
+         """ Previously we had a bug where an empty document list would cause a crash """
+         processed = pipeline.bulk_process([])
+         assert processed == []
+
+     def test_stream(self, pipeline):
+         """ Test the streaming interface to the Pipeline """
+         # Test all of the documents in one batch
+         # (the default batch size is significantly more than |EN_DOCS|)
+         processed = [doc for doc in pipeline.stream(EN_DOCS)]
+         assert "\n\n".join(["{:C}".format(doc) for doc in processed]) == EN_DOC_CONLLU_GOLD_MULTIDOC
+
+         # It should also work on an iterator rather than an iterable
+         processed = [doc for doc in pipeline.stream(iter(EN_DOCS))]
+         assert "\n\n".join(["{:C}".format(doc) for doc in processed]) == EN_DOC_CONLLU_GOLD_MULTIDOC
+
+         # Stream one at a time
+         processed = [doc for doc in pipeline.stream(EN_DOCS, batch_size=1)]
+         processed = ["{:C}".format(doc) for doc in processed]
+         assert "\n\n".join(processed) == EN_DOC_CONLLU_GOLD_MULTIDOC
+
+     @pytest.fixture(scope="class")
+     def processed_multidoc(self, pipeline):
+         """ Documents created by running the full English pipeline on a few short texts """
+         docs = [Document([], text=t) for t in EN_DOCS]
+         return pipeline(docs)
+
+     def test_conllu_multidoc(self, processed_multidoc):
+         assert "\n\n".join(["{:C}".format(doc) for doc in processed_multidoc]) == EN_DOC_CONLLU_GOLD_MULTIDOC
+
+     def test_tokens_multidoc(self, processed_multidoc):
+         assert "\n\n".join([sent.tokens_string() for processed_doc in processed_multidoc for sent in processed_doc.sentences]) == EN_DOC_TOKENS_GOLD
+
+     def test_words_multidoc(self, processed_multidoc):
+         assert "\n\n".join([sent.words_string() for processed_doc in processed_multidoc for sent in processed_doc.sentences]) == EN_DOC_WORDS_GOLD
+
+     def test_sentence_indices_multidoc(self, processed_multidoc):
+         sentences = [sent for doc in processed_multidoc for sent in doc.sentences]
+         for sent_idx, sentence in enumerate(sentences):
+             assert sent_idx == sentence.index
+
+     def test_dependency_parse_multidoc(self, processed_multidoc):
+         assert "\n\n".join([sent.dependencies_string() for processed_doc in processed_multidoc for sent in processed_doc.sentences]) == \
+             EN_DOC_DEPENDENCY_PARSES_GOLD
+
+     @pytest.fixture(scope="class")
+     def processed_multidoc_variant(self):
+         """ Documents created by running the pipeline with the spacy tokenizer on a few short texts """
+         docs = [Document([], text=t) for t in EN_DOCS]
+         nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, processors={'tokenize': 'spacy'})
+         return nlp(docs)
+
+     def test_dependency_parse_multidoc_variant(self, processed_multidoc_variant):
+         assert "\n\n".join([sent.dependencies_string() for processed_doc in processed_multidoc_variant for sent in processed_doc.sentences]) == \
+             EN_DOC_DEPENDENCY_PARSES_GOLD
+
+     def test_constituency_parser(self):
+         nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, processors="tokenize,pos,constituency")
+         doc = nlp("This is a test")
+         assert str(doc.sentences[0].constituency) == '(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))'
+
+     def test_on_gpu(self, pipeline):
+         """
+         The default pipeline should have all the models on the GPU
+         """
+         check_on_gpu(pipeline)
+
+     def test_on_cpu(self):
+         """
+         Create a pipeline on the CPU, check that all the models are on the CPU
+         """
+         pipeline = stanza.Pipeline("en", dir=TEST_MODELS_DIR, use_gpu=False)
+         check_on_cpu(pipeline)
stanza/stanza/tests/pipeline/test_french_pipeline.py ADDED
@@ -0,0 +1,353 @@
+ """
+ Basic testing of French pipeline
+
+ The benefit of this test is to verify that the bulk processing works
+ for languages with MWT in them
+ """
+
+ import pytest
+ import stanza
+ from stanza.models.common.doc import Document
+
+ from stanza.tests import *
+ from stanza.tests.pipeline.pipeline_device_tests import check_on_gpu, check_on_cpu
+
+ pytestmark = pytest.mark.pipeline
+
+ FR_MWT_SENTENCE = "Alors encore inconnu du grand public, Emmanuel Macron devient en 2014 ministre de l'Économie, de " \
+                   "l'Industrie et du Numérique."
+
+ EXPECTED_RESULT = """
+ [
+   [
+     {
+       "id": 1,
+       "text": "Alors",
+       "lemma": "alors",
+       "upos": "ADV",
+       "head": 3,
+       "deprel": "advmod",
+       "start_char": 0,
+       "end_char": 5
+     },
+     {
+       "id": 2,
+       "text": "encore",
+       "lemma": "encore",
+       "upos": "ADV",
+       "head": 3,
+       "deprel": "advmod",
+       "start_char": 6,
+       "end_char": 12
+     },
+     {
+       "id": 3,
+       "text": "inconnu",
+       "lemma": "inconnu",
+       "upos": "ADJ",
+       "feats": "Gender=Masc|Number=Sing",
+       "head": 11,
+       "deprel": "advcl",
+       "start_char": 13,
+       "end_char": 20
+     },
+     {
+       "id": [
+         4,
+         5
+       ],
+       "text": "du",
+       "start_char": 21,
+       "end_char": 23
+     },
+     {
+       "id": 4,
+       "text": "de",
+       "lemma": "de",
+       "upos": "ADP",
+       "head": 7,
+       "deprel": "case"
+     },
+     {
+       "id": 5,
+       "text": "le",
+       "lemma": "le",
+       "upos": "DET",
+       "feats": "Definite=Def|Gender=Masc|Number=Sing|PronType=Art",
+       "head": 7,
+       "deprel": "det"
+     },
+     {
+       "id": 6,
+       "text": "grand",
+       "lemma": "grand",
+       "upos": "ADJ",
+       "feats": "Gender=Masc|Number=Sing",
+       "head": 7,
+       "deprel": "amod",
+       "start_char": 24,
+       "end_char": 29
+     },
+     {
+       "id": 7,
+       "text": "public",
+       "lemma": "public",
+       "upos": "NOUN",
+       "feats": "Gender=Masc|Number=Sing",
+       "head": 3,
+       "deprel": "obl:arg",
+       "start_char": 30,
+       "end_char": 36,
+       "misc": "SpaceAfter=No"
+     },
+     {
+       "id": 8,
+       "text": ",",
+       "lemma": ",",
+       "upos": "PUNCT",
+       "head": 3,
+       "deprel": "punct",
+       "start_char": 36,
+       "end_char": 37
+     },
+     {
+       "id": 9,
+       "text": "Emmanuel",
+       "lemma": "Emmanuel",
+       "upos": "PROPN",
+       "head": 11,
+       "deprel": "nsubj",
+       "start_char": 38,
+       "end_char": 46
+     },
+     {
+       "id": 10,
+       "text": "Macron",
+       "lemma": "Macron",
+       "upos": "PROPN",
+       "head": 9,
+       "deprel": "flat:name",
+       "start_char": 47,
+       "end_char": 53
+     },
+     {
+       "id": 11,
+       "text": "devient",
+       "lemma": "devenir",
+       "upos": "VERB",
+       "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
+       "head": 0,
+       "deprel": "root",
+       "start_char": 54,
+       "end_char": 61
+     },
+     {
+       "id": 12,
+       "text": "en",
+       "lemma": "en",
+       "upos": "ADP",
+       "head": 13,
+       "deprel": "case",
+       "start_char": 62,
+       "end_char": 64
+     },
+     {
+       "id": 13,
+       "text": "2014",
+       "lemma": "2014",
+       "upos": "NUM",
+       "feats": "Number=Plur",
+       "head": 11,
+       "deprel": "obl:mod",
+       "start_char": 65,
+       "end_char": 69
+     },
+     {
+       "id": 14,
+       "text": "ministre",
+       "lemma": "ministre",
+       "upos": "NOUN",
+       "feats": "Gender=Masc|Number=Sing",
+       "head": 11,
+       "deprel": "xcomp",
+       "start_char": 70,
+       "end_char": 78
+     },
+     {
+       "id": 15,
+       "text": "de",
+       "lemma": "de",
+       "upos": "ADP",
+       "head": 17,
+       "deprel": "case",
+       "start_char": 79,
+       "end_char": 81
+     },
+     {
+       "id": 16,
+       "text": "l'",
+       "lemma": "le",
+       "upos": "DET",
+       "feats": "Definite=Def|Number=Sing|PronType=Art",
+       "head": 17,
+       "deprel": "det",
+       "start_char": 82,
+       "end_char": 84,
+       "misc": "SpaceAfter=No"
+     },
+     {
+       "id": 17,
+       "text": "Économie",
+       "lemma": "économie",
+       "upos": "NOUN",
+       "feats": "Gender=Fem|Number=Sing",
+       "head": 14,
+       "deprel": "nmod",
+       "start_char": 84,
+       "end_char": 92,
+       "misc": "SpaceAfter=No"
+     },
+     {
+       "id": 18,
+       "text": ",",
+       "lemma": ",",
+       "upos": "PUNCT",
+       "head": 21,
+       "deprel": "punct",
+       "start_char": 92,
+       "end_char": 93
+     },
+     {
+       "id": 19,
+       "text": "de",
+       "lemma": "de",
+       "upos": "ADP",
+       "head": 21,
+       "deprel": "case",
+       "start_char": 94,
+       "end_char": 96
+     },
+     {
+       "id": 20,
+       "text": "l'",
+       "lemma": "le",
+       "upos": "DET",
+       "feats": "Definite=Def|Number=Sing|PronType=Art",
+       "head": 21,
+       "deprel": "det",
+       "start_char": 97,
+       "end_char": 99,
+       "misc": "SpaceAfter=No"
+     },
+     {
+       "id": 21,
+       "text": "Industrie",
+       "lemma": "industrie",
+       "upos": "NOUN",
+       "feats": "Gender=Fem|Number=Sing",
+       "head": 17,
+       "deprel": "conj",
+       "start_char": 99,
+       "end_char": 108
+     },
+     {
+       "id": 22,
+       "text": "et",
+       "lemma": "et",
+       "upos": "CCONJ",
+       "head": 25,
+       "deprel": "cc",
+       "start_char": 109,
+       "end_char": 111
+     },
+     {
+       "id": [
+         23,
+         24
+       ],
+       "text": "du",
+       "start_char": 112,
+       "end_char": 114
+     },
+     {
+       "id": 23,
+       "text": "de",
+       "lemma": "de",
+       "upos": "ADP",
+       "head": 25,
+       "deprel": "case"
+     },
+     {
+       "id": 24,
+       "text": "le",
+       "lemma": "le",
+       "upos": "DET",
+       "feats": "Definite=Def|Gender=Masc|Number=Sing|PronType=Art",
+       "head": 25,
+       "deprel": "det"
+     },
+     {
+       "id": 25,
+       "text": "Numérique",
+       "lemma": "numérique",
+       "upos": "NOUN",
+       "feats": "Gender=Masc|Number=Sing",
+       "head": 17,
+       "deprel": "conj",
+       "start_char": 115,
+       "end_char": 124,
+       "misc": "SpaceAfter=No"
+     },
+     {
+       "id": 26,
+       "text": ".",
+       "lemma": ".",
+       "upos": "PUNCT",
+       "head": 11,
+       "deprel": "punct",
+       "start_char": 124,
+       "end_char": 125,
+       "misc": "SpaceAfter=No"
+     }
+   ]
+ ]
+ """
+
+ class TestFrenchPipeline:
+     @pytest.fixture(scope="class")
+     def pipeline(self):
+         """ Create a pipeline with French models """
+         pipeline = stanza.Pipeline(processors='tokenize,mwt,pos,lemma,depparse', dir=TEST_MODELS_DIR, lang='fr')
+         return pipeline
+
+     def test_single(self, pipeline):
+         doc = pipeline(FR_MWT_SENTENCE)
+         compare_ignoring_whitespace(str(doc), EXPECTED_RESULT)
+
+     def test_bulk(self, pipeline):
+         NUM_DOCS = 10
+         raw_text = [FR_MWT_SENTENCE] * NUM_DOCS
+         raw_doc = [Document([], text=doccontent) for doccontent in raw_text]
+
+         result = pipeline(raw_doc)
+
+         assert len(result) == NUM_DOCS
+         for doc in result:
+             compare_ignoring_whitespace(str(doc), EXPECTED_RESULT)
+             assert len(doc.sentences) == 1
+             assert doc.num_words == 26
+             assert doc.num_tokens == 24
+
+     def test_on_gpu(self, pipeline):
+         """
+         The default pipeline should have all the models on the GPU
+         """
+         check_on_gpu(pipeline)
+
+     def test_on_cpu(self):
+         """
+         Create a pipeline on the CPU, check that all the models are on the CPU
+         """
+         pipeline = stanza.Pipeline("fr", dir=TEST_MODELS_DIR, use_gpu=False)
+         check_on_cpu(pipeline)
stanza/stanza/tests/pipeline/test_lemmatizer.py ADDED
@@ -0,0 +1,135 @@
+ """
+ Basic testing of lemmatization
+ """
+
+ import pytest
+ import stanza
+
+ from stanza.tests import *
+ from stanza.models.common.doc import TEXT, UPOS, LEMMA
+
+ pytestmark = pytest.mark.pipeline
+
+ EN_DOC = "Joe Smith was born in California."
+
+ EN_DOC_IDENTITY_GOLD = """
+ Joe Joe
+ Smith Smith
+ was was
+ born born
+ in in
+ California California
+ . .
+ """.strip()
+
+ EN_DOC_LEMMATIZER_MODEL_GOLD = """
+ Joe Joe
+ Smith Smith
+ was be
+ born bear
+ in in
+ California California
+ . .
+ """.strip()
+
+
+ def test_identity_lemmatizer():
+     nlp = stanza.Pipeline(**{'processors': 'tokenize,lemma', 'dir': TEST_MODELS_DIR, 'lang': 'en', 'lemma_use_identity': True}, download_method=None)
+     doc = nlp(EN_DOC)
+     word_lemma_pairs = []
+     for w in doc.iter_words():
+         word_lemma_pairs += [f"{w.text} {w.lemma}"]
+     assert EN_DOC_IDENTITY_GOLD == "\n".join(word_lemma_pairs)
+
+ def test_full_lemmatizer():
+     nlp = stanza.Pipeline(**{'processors': 'tokenize,pos,lemma', 'dir': TEST_MODELS_DIR, 'lang': 'en'}, download_method=None)
+     doc = nlp(EN_DOC)
+     word_lemma_pairs = []
+     for w in doc.iter_words():
+         word_lemma_pairs += [f"{w.text} {w.lemma}"]
+     assert EN_DOC_LEMMATIZER_MODEL_GOLD == "\n".join(word_lemma_pairs)
+
+ def find_unknown_word(lemmatizer, base):
+     for i in range(10):
+         base = base + "z"
+         if base not in lemmatizer.word_dict and all(x[0] != base for x in lemmatizer.composite_dict.keys()):
+             return base
+     raise RuntimeError("Could not construct a word unknown to the lemmatizer from base %s" % base)
+
+ def test_store_results():
+     nlp = stanza.Pipeline(**{'processors': 'tokenize,pos,lemma', 'dir': TEST_MODELS_DIR, 'lang': 'en'}, lemma_store_results=True, download_method=None)
+     lemmatizer = nlp.processors["lemma"]._trainer
+
+     az = find_unknown_word(lemmatizer, "a")
+     bz = find_unknown_word(lemmatizer, "b")
+     cz = find_unknown_word(lemmatizer, "c")
+
+     # try sentences with the order long, short
+     doc = nlp("I found an " + az + " in my " + bz + ". It was a " + cz)
+     stuff = doc.get([TEXT, UPOS, LEMMA])
+     assert len(stuff) == 12
+     assert stuff[3][0] == az
+     assert stuff[6][0] == bz
+     assert stuff[11][0] == cz
+
+     assert lemmatizer.composite_dict[(az, stuff[3][1])] == stuff[3][2]
+     assert lemmatizer.composite_dict[(bz, stuff[6][1])] == stuff[6][2]
+     assert lemmatizer.composite_dict[(cz, stuff[11][1])] == stuff[11][2]
+
+     doc2 = nlp("I found an " + az + " in my " + bz + ". It was a " + cz)
+     stuff2 = doc2.get([TEXT, UPOS, LEMMA])
+
+     assert stuff == stuff2
+
+     dz = find_unknown_word(lemmatizer, "d")
+     ez = find_unknown_word(lemmatizer, "e")
+     fz = find_unknown_word(lemmatizer, "f")
+
+     # try sentences with the order short, long
+     doc = nlp("It was a " + dz + ". I found an " + ez + " in my " + fz)
+     stuff = doc.get([TEXT, UPOS, LEMMA])
+     assert len(stuff) == 12
+     assert stuff[3][0] == dz
+     assert stuff[8][0] == ez
+     assert stuff[11][0] == fz
+
+     assert lemmatizer.composite_dict[(dz, stuff[3][1])] == stuff[3][2]
+     assert lemmatizer.composite_dict[(ez, stuff[8][1])] == stuff[8][2]
+     assert lemmatizer.composite_dict[(fz, stuff[11][1])] == stuff[11][2]
+
+     doc2 = nlp("It was a " + dz + ". I found an " + ez + " in my " + fz)
+     stuff2 = doc2.get([TEXT, UPOS, LEMMA])
+
+     assert stuff == stuff2
+
+     assert az not in lemmatizer.word_dict
+
+ def test_caseless_lemmatizer():
+     """
+     Test that setting the lemmatizer as caseless at Pipeline time lowercases the text
+     """
+     nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma', model_dir=TEST_MODELS_DIR, download_method=None)
+     # the capital letter here could throw off the lemmatizer,
+     # although the current English model *does* lowercase the word
+     doc = nlp("Here is an Excerpt")
+     assert doc.sentences[0].words[-1].lemma == 'excerpt'
+
+     nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma', model_dir=TEST_MODELS_DIR, download_method=None, lemma_caseless=True)
+     # with the model set to lowercasing, the word will be treated as if it were 'excerpt'
+     doc = nlp("Here is an Excerpt")
+     assert doc.sentences[0].words[-1].lemma == 'Excerpt'
+
+ def test_latin_caseless_lemmatizer():
+     """
+     Test the Latin caseless lemmatizer
+     """
+     nlp = stanza.Pipeline('la', package='ittb', processors='tokenize,pos,lemma', model_dir=TEST_MODELS_DIR, download_method=None)
+     lemmatizer = nlp.processors['lemma']
+     assert lemmatizer.config['caseless']
+
+     doc = nlp("Quod Erat Demonstrandum")
+     expected_lemmas = "qui sum demonstro".split()
+     assert len(doc.sentences) == 1
+     assert len(doc.sentences[0].words) == 3
+     for word, expected in zip(doc.sentences[0].words, expected_lemmas):
+         assert word.lemma == expected
stanza/stanza/tests/pipeline/test_pipeline_constituency_processor.py ADDED
@@ -0,0 +1,61 @@
+ import gc
+ import pytest
+ import stanza
+ from stanza.models.common.foundation_cache import FoundationCache
+
+ from stanza.tests import *
+
+ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
+
+ # data for testing
+ TEST_TEXT = "This is a test. Another sentence. Are these sorted?"
+
+ TEST_TOKENS = [["This", "is", "a", "test", "."], ["Another", "sentence", "."], ["Are", "these", "sorted", "?"]]
+
+ @pytest.fixture(scope="module")
+ def foundation_cache():
+     gc.collect()
+     return FoundationCache()
+
+ def check_results(doc):
+     assert len(doc.sentences) == len(TEST_TOKENS)
+     for sentence, expected in zip(doc.sentences, TEST_TOKENS):
+         assert sentence.constituency.leaf_labels() == expected
+
+ def test_sorted_big_batch(foundation_cache):
+     pipe = stanza.Pipeline("en", model_dir=TEST_MODELS_DIR, processors="tokenize,pos,constituency", foundation_cache=foundation_cache, download_method=None)
+     doc = pipe(TEST_TEXT)
+     check_results(doc)
+
+ def test_comments(foundation_cache):
+     """
+     Test that the pipeline is creating constituency comments
+     """
+     pipe = stanza.Pipeline("en", model_dir=TEST_MODELS_DIR, processors="tokenize,pos,constituency", foundation_cache=foundation_cache, download_method=None)
+     doc = pipe(TEST_TEXT)
+     check_results(doc)
+     for sentence in doc.sentences:
+         assert any(x.startswith("# constituency = ") for x in sentence.comments)
+     doc.sentences[0].constituency = "asdf"
+     assert "# constituency = asdf" in doc.sentences[0].comments
+     for sentence in doc.sentences:
+         assert len([x for x in sentence.comments if x.startswith("# constituency")]) == 1
+
+ def test_illegal_batch_size(foundation_cache):
+     # an unparseable batch size is fine as long as the constituency processor is not used...
+     stanza.Pipeline("en", model_dir=TEST_MODELS_DIR, processors="tokenize,pos", constituency_batch_size="zzz", foundation_cache=foundation_cache, download_method=None)
+     # ...but should raise an error when the constituency processor needs it
+     with pytest.raises(ValueError):
+         stanza.Pipeline("en", model_dir=TEST_MODELS_DIR, processors="tokenize,pos,constituency", constituency_batch_size="zzz", foundation_cache=foundation_cache, download_method=None)
+
+ def test_sorted_one_batch(foundation_cache):
+     pipe = stanza.Pipeline("en", model_dir=TEST_MODELS_DIR, processors="tokenize,pos,constituency", constituency_batch_size=1, foundation_cache=foundation_cache, download_method=None)
+     doc = pipe(TEST_TEXT)
+     check_results(doc)
+
+ def test_sorted_two_batch(foundation_cache):
+     pipe = stanza.Pipeline("en", model_dir=TEST_MODELS_DIR, processors="tokenize,pos,constituency", constituency_batch_size=2, foundation_cache=foundation_cache, download_method=None)
+     doc = pipe(TEST_TEXT)
+     check_results(doc)
+
+ def test_get_constituents(foundation_cache):
+     pipe = stanza.Pipeline("en", model_dir=TEST_MODELS_DIR, processors="tokenize,pos,constituency", foundation_cache=foundation_cache, download_method=None)
+     assert "SBAR" in pipe.processors["constituency"].get_constituents()