stanza-digphil / stanza /tests /classifiers /test_constituency_classifier.py

Albin Thörn Cleland

Clean initial commit with LFS

19b8775 about 2 months ago

7.23 kB

	import os

	import pytest

	import stanza
	import stanza.models.classifier as classifier
	import stanza.models.classifiers.data as data
	from stanza.models.classifiers.trainer import Trainer
	from stanza.tests import TEST_MODELS_DIR
	from stanza.tests.classifiers.test_classifier import fake_embeddings
	from stanza.tests.classifiers.test_data import train_file_with_trees, dev_file_with_trees
	from stanza.models.common import utils
	from stanza.tests.constituency.test_trainer import build_trainer, TREEBANK

	pytestmark = [pytest.mark.pipeline, pytest.mark.travis]

	class TestConstituencyClassifier:
	@pytest.fixture(scope="class")
	def constituency_model(self, fake_embeddings, tmp_path_factory):
	args = ['--pattn_num_layers', '0', '--lattn_d_proj', '0', '--hidden_size', '20', '--delta_embedding_dim', '10']
	trainer = build_trainer(str(fake_embeddings), *args, treebank=TREEBANK)

	trainer_pt = str(tmp_path_factory.mktemp("constituency") / "constituency.pt")
	trainer.save(trainer_pt, save_optimizer=False)
	return trainer_pt

	def build_model(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, extra_args=None):
	"""
	Build a Constituency Classifier model to be used by one of the later tests
	"""
	save_dir = str(tmp_path / "classifier")
	save_name = "model.pt"
	args = ["--save_dir", save_dir,
	"--save_name", save_name,
	"--model_type", "constituency",
	"--constituency_model", constituency_model,
	"--wordvec_pretrain_file", str(fake_embeddings),
	"--fc_shapes", "20,10",
	"--train_file", str(train_file_with_trees),
	"--dev_file", str(dev_file_with_trees),
	"--max_epochs", "2",
	"--batch_size", "60"]
	if extra_args is not None:
	args = args + extra_args
	args = classifier.parse_args(args)
	train_set = data.read_dataset(args.train_file, args.wordvec_type, args.min_train_len)
	trainer = Trainer.build_new_model(args, train_set)
	return trainer, train_set, args

	def run_training(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, extra_args=None):
	"""
	Iterate a couple times over a model
	"""
	trainer, train_set, args = self.build_model(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, extra_args)
	dev_set = data.read_dataset(args.dev_file, args.wordvec_type, args.min_train_len)
	labels = data.dataset_labels(train_set)

	save_filename = os.path.join(args.save_dir, args.save_name)
	checkpoint_file = utils.checkpoint_name(args.save_dir, save_filename, args.checkpoint_save_name)
	classifier.train_model(trainer, save_filename, checkpoint_file, args, train_set, dev_set, labels)
	return trainer, train_set, args

	def test_build_model(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
	"""
	Test that building a basic constituency-based model works
	"""
	self.build_model(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees)

	def test_save_load(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
	"""
	Test that a constituency model can save & load
	"""
	trainer, _, args = self.build_model(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees)

	save_filename = os.path.join(args.save_dir, args.save_name)
	trainer.save(save_filename)

	args.load_name = args.save_name
	trainer = Trainer.load(args.load_name, args)

	def test_train_basic(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
	self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees)

	def test_train_pipeline(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
	"""
	Test that writing out a temp model, then loading it in the pipeline is a thing that works
	"""
	trainer, _, args = self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees)
	save_filename = os.path.join(args.save_dir, args.save_name)
	assert os.path.exists(save_filename)
	assert os.path.exists(args.constituency_model)

	pipeline_args = {"lang": "en",
	"download_method": None,
	"model_dir": TEST_MODELS_DIR,
	"processors": "tokenize,pos,constituency,sentiment",
	"tokenize_pretokenized": True,
	"constituency_model_path": args.constituency_model,
	"constituency_pretrain_path": args.wordvec_pretrain_file,
	"constituency_backward_charlm_path": None,
	"constituency_forward_charlm_path": None,
	"sentiment_model_path": save_filename,
	"sentiment_pretrain_path": args.wordvec_pretrain_file,
	"sentiment_backward_charlm_path": None,
	"sentiment_forward_charlm_path": None}
	pipeline = stanza.Pipeline(**pipeline_args)
	doc = pipeline("This is a test")
	# since the model is random, we have no expectations for what the result actually is
	assert doc.sentences[0].sentiment is not None


	def test_train_all_words(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
	self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--constituency_all_words'])

	self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--no_constituency_all_words'])

	def test_train_top_layer(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
	self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--constituency_top_layer'])

	self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--no_constituency_top_layer'])

	def test_train_attn(self, tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees):
	self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--constituency_node_attn', '--no_constituency_all_words'])

	self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--constituency_node_attn', '--constituency_all_words'])

	self.run_training(tmp_path, constituency_model, fake_embeddings, train_file_with_trees, dev_file_with_trees, ['--no_constituency_node_attn'])