"""
Run the parser for a couple of iterations on some fake data

Uses a couple of sentences of UD_English-EWT as training/dev data
"""

import os
import zipfile

import pytest
import torch

from stanza.models import parser
from stanza.models.common import pretrain
from stanza.models.depparse.trainer import Trainer
from stanza.tests import TEST_WORKING_DIR

# applied by pytest to every test in this module
pytestmark = [pytest.mark.pipeline, pytest.mark.travis]

TRAIN_DATA = """
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0003
# text = DPA: Iraqi authorities announced that they had busted up 3 terrorist cells operating in Baghdad.
1	DPA	DPA	PROPN	NNP	Number=Sing	0	root	0:root	SpaceAfter=No
2	:	:	PUNCT	:	_	1	punct	1:punct	_
3	Iraqi	Iraqi	ADJ	JJ	Degree=Pos	4	amod	4:amod	_
4	authorities	authority	NOUN	NNS	Number=Plur	5	nsubj	5:nsubj	_
5	announced	announce	VERB	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	1	parataxis	1:parataxis	_
6	that	that	SCONJ	IN	_	9	mark	9:mark	_
7	they	they	PRON	PRP	Case=Nom|Number=Plur|Person=3|PronType=Prs	9	nsubj	9:nsubj	_
8	had	have	AUX	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	9	aux	9:aux	_
9	busted	bust	VERB	VBN	Tense=Past|VerbForm=Part	5	ccomp	5:ccomp	_
10	up	up	ADP	RP	_	9	compound:prt	9:compound:prt	_
11	3	3	NUM	CD	NumForm=Digit|NumType=Card	13	nummod	13:nummod	_
12	terrorist	terrorist	ADJ	JJ	Degree=Pos	13	amod	13:amod	_
13	cells	cell	NOUN	NNS	Number=Plur	9	obj	9:obj	_
14	operating	operate	VERB	VBG	VerbForm=Ger	13	acl	13:acl	_
15	in	in	ADP	IN	_	16	case	16:case	_
16	Baghdad	Baghdad	PROPN	NNP	Number=Sing	14	obl	14:obl:in	SpaceAfter=No
17	.	.	PUNCT	.	_	1	punct	1:punct	_

# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0004
# text = Two of them were being run by 2 officials of the Ministry of the Interior!
1	Two	two	NUM	CD	NumForm=Word|NumType=Card	6	nsubj:pass	6:nsubj:pass	_
2	of	of	ADP	IN	_	3	case	3:case	_
3	them	they	PRON	PRP	Case=Acc|Number=Plur|Person=3|PronType=Prs	1	nmod	1:nmod:of	_
4	were	be	AUX	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	6	aux	6:aux	_
5	being	be	AUX	VBG	VerbForm=Ger	6	aux:pass	6:aux:pass	_
6	run	run	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root	0:root	_
7	by	by	ADP	IN	_	9	case	9:case	_
8	2	2	NUM	CD	NumForm=Digit|NumType=Card	9	nummod	9:nummod	_
9	officials	official	NOUN	NNS	Number=Plur	6	obl	6:obl:by	_
10	of	of	ADP	IN	_	12	case	12:case	_
11	the	the	DET	DT	Definite=Def|PronType=Art	12	det	12:det	_
12	Ministry	Ministry	PROPN	NNP	Number=Sing	9	nmod	9:nmod:of	_
13	of	of	ADP	IN	_	15	case	15:case	_
14	the	the	DET	DT	Definite=Def|PronType=Art	15	det	15:det	_
15	Interior	Interior	PROPN	NNP	Number=Sing	12	nmod	12:nmod:of	SpaceAfter=No
16	!	!	PUNCT	.	_	6	punct	6:punct	_

""".lstrip()

DEV_DATA = """
1	From	from	ADP	IN	_	3	case	3:case	_
2	the	the	DET	DT	Definite=Def|PronType=Art	3	det	3:det	_
3	AP	AP	PROPN	NNP	Number=Sing	4	obl	4:obl:from	_
4	comes	come	VERB	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	0:root	_
5	this	this	DET	DT	Number=Sing|PronType=Dem	6	det	6:det	_
6	story	story	NOUN	NN	Number=Sing	4	nsubj	4:nsubj	_
7	:	:	PUNCT	:	_	4	punct	4:punct	_

""".lstrip()


class TestParser:
    @pytest.fixture(scope="class")
    def wordvec_pretrain_file(self):
        # a tiny word embedding pretrain file from the test resources
        return f'{TEST_WORKING_DIR}/in/tiny_emb.pt'

    def run_training(self, tmp_path, wordvec_pretrain_file, train_text, dev_text, augment_nopunct=False, extra_args=None, zip_train_data=False):
        """
        Run the training for a few iterations, check that the trained model
        can be loaded back in, and return the trainer
        """
        train_file = str(tmp_path / "train.zip") if zip_train_data else str(tmp_path / "train.conllu")
        dev_file = str(tmp_path / "dev.conllu")
        pred_file = str(tmp_path / "pred.conllu")

        save_name = "test_parser.pt"
        save_file = str(tmp_path / save_name)

        # optionally wrap the training data in a zipfile to test that
        # training can read zipped conllu files
        if zip_train_data:
            with zipfile.ZipFile(train_file, "w") as zout:
                with zout.open('train.conllu', 'w') as fout:
                    fout.write(train_text.encode())
        else:
            with open(train_file, "w", encoding="utf-8") as fout:
                fout.write(train_text)

        with open(dev_file, "w", encoding="utf-8") as fout:
            fout.write(dev_text)

        args = ["--wordvec_pretrain_file", wordvec_pretrain_file,
                "--train_file", train_file,
                "--eval_file", dev_file,
                "--output_file", pred_file,
                "--log_step", "10",
                "--eval_interval", "20",
                "--max_steps", "100",
                "--shorthand", "en_test",
                "--save_dir", str(tmp_path),
                "--save_name", save_name,
                "--bert_start_finetuning", "10",
                "--bert_warmup_steps", "10",
                "--lang", "en"]
        if not augment_nopunct:
            args.extend(["--augment_nopunct", "0.0"])
        if extra_args is not None:
            args = args + extra_args
        trainer, _ = parser.main(args)

        # check that the model was saved and can be reloaded
        assert os.path.exists(save_file)
        pt = pretrain.Pretrain(wordvec_pretrain_file)
        saved_model = Trainer(pretrain=pt, model_file=save_file)
        return trainer
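
    # The assertions in the tests below rely on the Trainer keeping its
    # optimizers and schedulers in dicts keyed by name, such as
    # 'bert_optimizer' and 'bert_scheduler' when bert finetuning is active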

    def test_train(self, tmp_path, wordvec_pretrain_file):
        """
        Simple test of a few 'epochs' of parser training
        """
        self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA)

    def test_zipfile_train(self, tmp_path, wordvec_pretrain_file):
        """
        Simple test of a few 'epochs' of parser training with a zipfile
        """
        self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, zip_train_data=True)

    def test_with_bert_nlayers(self, tmp_path, wordvec_pretrain_file):
        """
        Train using a tiny bert model with 2 of its hidden layers
        """
        self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--bert_model', 'hf-internal-testing/tiny-bert', '--bert_hidden_layers', '2'])

    def test_with_bert_finetuning(self, tmp_path, wordvec_pretrain_file):
        """
        Train with bert finetuning turned on
        """
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--bert_model', 'hf-internal-testing/tiny-bert', '--bert_finetune', '--bert_hidden_layers', '2'])
        # finetuning should create a separate optimizer and scheduler for bert
        assert 'bert_optimizer' in trainer.optimizer.keys()
        assert 'bert_scheduler' in trainer.scheduler.keys()

    def test_with_bert_finetuning_resaved(self, tmp_path, wordvec_pretrain_file):
        """
        Check that if we save, then load, then save a model with a finetuned bert, the finetuned weights aren't lost
        """
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--bert_model', 'hf-internal-testing/tiny-bert', '--bert_finetune', '--bert_hidden_layers', '2'])
        assert 'bert_optimizer' in trainer.optimizer.keys()
        assert 'bert_scheduler' in trainer.scheduler.keys()

        # the freshly trained model should contain the finetuned bert weights
        save_name = trainer.args['save_name']
        filename = tmp_path / save_name
        assert os.path.exists(filename)
        checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
        assert any(x.startswith("bert_model") for x in checkpoint['model'].keys())

        # load the model back in with finetuning turned off, then save it again
        pt = pretrain.Pretrain(wordvec_pretrain_file)
        args = {"bert_finetune": False}
        saved_model = Trainer(pretrain=pt, model_file=filename, args=args)
        saved_model.save(filename)

        # the resaved model should still contain the finetuned bert weights
        checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
        assert any(x.startswith("bert_model") for x in checkpoint['model'].keys())

    def test_with_peft(self, tmp_path, wordvec_pretrain_file):
        """
        Finetune the tiny bert model using peft
        """
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--bert_model', 'hf-internal-testing/tiny-bert', '--bert_finetune', '--bert_hidden_layers', '2', '--use_peft'])
        assert 'bert_optimizer' in trainer.optimizer.keys()
        assert 'bert_scheduler' in trainer.scheduler.keys()

    def test_single_optimizer_checkpoint(self, tmp_path, wordvec_pretrain_file):
        """
        Train with a single optimizer and check that it survives a checkpoint reload
        """
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--optim', 'adam'])

        save_dir = trainer.args['save_dir']
        save_name = trainer.args['save_name']
        checkpoint_name = trainer.args['checkpoint_save_name']

        assert os.path.exists(os.path.join(save_dir, save_name))
        assert checkpoint_name is not None
        assert os.path.exists(checkpoint_name)

        # the only optimizer should be the adam requested on the command line
        assert len(trainer.optimizer) == 1
        for opt in trainer.optimizer.values():
            assert isinstance(opt, torch.optim.Adam)

        # reloading the checkpoint should restore the optimizer as well
        pt = pretrain.Pretrain(wordvec_pretrain_file)
        checkpoint = Trainer(args=trainer.args, pretrain=pt, model_file=checkpoint_name)
        assert checkpoint.optimizer is not None
        assert len(checkpoint.optimizer) == 1
        for opt in checkpoint.optimizer.values():
            assert isinstance(opt, torch.optim.Adam)

    def test_two_optimizers_checkpoint(self, tmp_path, wordvec_pretrain_file):
        """
        Train with a second optimizer and check that it survives a checkpoint reload
        """
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--optim', 'adam', '--second_optim', 'sgd', '--second_optim_start_step', '40'])

        save_dir = trainer.args['save_dir']
        save_name = trainer.args['save_name']
        checkpoint_name = trainer.args['checkpoint_save_name']

        assert os.path.exists(os.path.join(save_dir, save_name))
        assert checkpoint_name is not None
        assert os.path.exists(checkpoint_name)

        # with --second_optim_start_step 40 and --max_steps 100, training
        # should have switched to the second optimizer, so only the sgd
        # optimizer is left on the trainer
        assert len(trainer.optimizer) == 1
        for opt in trainer.optimizer.values():
            assert isinstance(opt, torch.optim.SGD)

        # reloading the checkpoint should restore the sgd optimizer as well
        pt = pretrain.Pretrain(wordvec_pretrain_file)
        checkpoint = Trainer(args=trainer.args, pretrain=pt, model_file=checkpoint_name)
        assert checkpoint.optimizer is not None
        assert len(checkpoint.optimizer) == 1
        for opt in checkpoint.optimizer.values():
            assert isinstance(opt, torch.optim.SGD)
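

if __name__ == '__main__':
    # convenience entry point so the module can also be run directly as a
    # script; pytest.main is the standard programmatic pytest runner
    pytest.main(['-v', __file__])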