# File size: 7,226 Bytes
# 19b8775
import os
import pytest
import stanza
import stanza.models.classifier as classifier
import stanza.models.classifiers.data as data
from stanza.models.classifiers.trainer import Trainer
from stanza.tests import TEST_MODELS_DIR
from stanza.tests.classifiers.test_classifier import fake_embeddings
from stanza.tests.classifiers.test_data import train_file_with_trees, dev_file_with_trees
from stanza.models.common import utils
from stanza.tests.constituency.test_trainer import build_trainer, TREEBANK
# Module-level pytest marks: every test in this file is tagged with both `pipeline` and `travis`
pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
class TestConstituencyClassifier:
    """
    Tests for the classifier when it is built with --model_type constituency
    """

    @pytest.fixture(scope="class")
    def constituency_model(self, fake_embeddings, tmp_path_factory):
        """
        Build a small constituency trainer once per class, save it without its optimizer, and return the saved path
        """
        parser_flags = ['--pattn_num_layers', '0',
                        '--lattn_d_proj', '0',
                        '--hidden_size', '20',
                        '--delta_embedding_dim', '10']
        parser_trainer = build_trainer(str(fake_embeddings), *parser_flags, treebank=TREEBANK)
        model_path = str(tmp_path_factory.mktemp("constituency") / "constituency.pt")
        parser_trainer.save(model_path, save_optimizer=False)
        return model_path

    def build_model(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, extra_args=None):
        """
        Create a new constituency-based classifier for use by the tests below

        extra_args, if given, is a list of command line flags appended after the default flags.

        Returns the new trainer, the training set it was built from, and the parsed args
        """
        cli_args = ["--save_dir", str(tmp_path / "classifier"),
                    "--save_name", "model.pt",
                    "--model_type", "constituency",
                    "--constituency_model", constituency_model,
                    "--wordvec_pretrain_file", str(fake_embeddings),
                    "--fc_shapes", "20,10",
                    "--train_file", str(train_file_with_trees),
                    "--dev_file", str(dev_file_with_trees),
                    "--max_epochs", "2",
                    "--batch_size", "60"]
        if extra_args is not None:
            cli_args = cli_args + extra_args

        parsed_args = classifier.parse_args(cli_args)
        training_data = data.read_dataset(parsed_args.train_file, parsed_args.wordvec_type, parsed_args.min_train_len)
        return Trainer.build_new_model(parsed_args, training_data), training_data, parsed_args

    def run_training(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, extra_args=None):
        """
        Build a model, then run the classifier training loop on it

        The number of epochs comes from the --max_epochs flag set in build_model.

        Returns the same trainer, training set, and parsed args that build_model produced
        """
        model, training_data, parsed_args = self.build_model(tmp_path,
                                                             constituency_model,
                                                             fake_embeddings,
                                                             train_file_with_trees,
                                                             dev_file_with_trees,
                                                             extra_args)
        dev_data = data.read_dataset(parsed_args.dev_file, parsed_args.wordvec_type, parsed_args.min_train_len)
        label_set = data.dataset_labels(training_data)

        model_file = os.path.join(parsed_args.save_dir, parsed_args.save_name)
        checkpoint_file = utils.checkpoint_name(parsed_args.save_dir, model_file, parsed_args.checkpoint_save_name)

        classifier.train_model(model, model_file, checkpoint_file, parsed_args, training_data, dev_data, label_set)
        return model, training_data, parsed_args

    def test_build_model(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
        """
        Check that a basic constituency-based model can be constructed
        """
        self.build_model(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees)

    def test_save_load(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
        """
        Check that a constituency-based model can be saved and then loaded again
        """
        model, _, parsed_args = self.build_model(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees)
        model.save(os.path.join(parsed_args.save_dir, parsed_args.save_name))

        # the load is requested with the bare save name, not the joined path
        parsed_args.load_name = parsed_args.save_name
        Trainer.load(parsed_args.load_name, parsed_args)

    def test_train_basic(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
        """
        Check that training runs with the default flags
        """
        self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees)

    def test_train_pipeline(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
        """
        Train a temporary model, write it out, then check that a Pipeline can load and run it
        """
        _, _, parsed_args = self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees)

        sentiment_file = os.path.join(parsed_args.save_dir, parsed_args.save_name)
        assert os.path.exists(sentiment_file)
        assert os.path.exists(parsed_args.constituency_model)

        pipeline_config = {
            "lang": "en",
            "download_method": None,
            "model_dir": TEST_MODELS_DIR,
            "processors": "tokenize,pos,constituency,sentiment",
            "tokenize_pretokenized": True,
            "constituency_model_path": parsed_args.constituency_model,
            "constituency_pretrain_path": parsed_args.wordvec_pretrain_file,
            "constituency_backward_charlm_path": None,
            "constituency_forward_charlm_path": None,
            "sentiment_model_path": sentiment_file,
            "sentiment_pretrain_path": parsed_args.wordvec_pretrain_file,
            "sentiment_backward_charlm_path": None,
            "sentiment_forward_charlm_path": None,
        }
        nlp = stanza.Pipeline(**pipeline_config)
        doc = nlp("This is a test")
        # no particular label is expected here; just check that a sentiment value was filled in
        assert doc.sentences[0].sentiment is not None

    def test_train_all_words(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
        """
        Check that training runs with the all_words flag turned on and turned off
        """
        for flags in (['--constituency_all_words'],
                      ['--no_constituency_all_words']):
            self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, flags)

    def test_train_top_layer(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
        """
        Check that training runs with the top_layer flag turned on and turned off
        """
        for flags in (['--constituency_top_layer'],
                      ['--no_constituency_top_layer']):
            self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, flags)

    def test_train_attn(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
        """
        Check that training runs with node_attn on (with and without all_words) and with node_attn off
        """
        for flags in (['--constituency_node_attn', '--no_constituency_all_words'],
                      ['--constituency_node_attn', '--constituency_all_words'],
                      ['--no_constituency_node_attn']):
            self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, flags)
|