"""
Run the tagger for a couple iterations on some fake data
Uses a couple sentences of UD_English-EWT as training/dev data
"""
import os
import pytest
import zipfile
import torch
from stanza.models import parser
from stanza.models.common import pretrain
from stanza.models.depparse.trainer import Trainer
from stanza.tests import TEST_WORKING_DIR
pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
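
# Training data: a couple of sentences of UD_English-EWT in CoNLL-U format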
TRAIN_DATA = """
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0003
# text = DPA: Iraqi authorities announced that they had busted up 3 terrorist cells operating in Baghdad.
1 DPA DPA PROPN NNP Number=Sing 0 root 0:root SpaceAfter=No
2 : : PUNCT : _ 1 punct 1:punct _
3 Iraqi Iraqi ADJ JJ Degree=Pos 4 amod 4:amod _
4 authorities authority NOUN NNS Number=Plur 5 nsubj 5:nsubj _
5 announced announce VERB VBD Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin 1 parataxis 1:parataxis _
6 that that SCONJ IN _ 9 mark 9:mark _
7 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 9 nsubj 9:nsubj _
8 had have AUX VBD Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin 9 aux 9:aux _
9 busted bust VERB VBN Tense=Past|VerbForm=Part 5 ccomp 5:ccomp _
10 up up ADP RP _ 9 compound:prt 9:compound:prt _
11 3 3 NUM CD NumForm=Digit|NumType=Card 13 nummod 13:nummod _
12 terrorist terrorist ADJ JJ Degree=Pos 13 amod 13:amod _
13 cells cell NOUN NNS Number=Plur 9 obj 9:obj _
14 operating operate VERB VBG VerbForm=Ger 13 acl 13:acl _
15 in in ADP IN _ 16 case 16:case _
16 Baghdad Baghdad PROPN NNP Number=Sing 14 obl 14:obl:in SpaceAfter=No
17 . . PUNCT . _ 1 punct 1:punct _
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0004
# text = Two of them were being run by 2 officials of the Ministry of the Interior!
1 Two two NUM CD NumForm=Word|NumType=Card 6 nsubj:pass 6:nsubj:pass _
2 of of ADP IN _ 3 case 3:case _
3 them they PRON PRP Case=Acc|Number=Plur|Person=3|PronType=Prs 1 nmod 1:nmod:of _
4 were be AUX VBD Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin 6 aux 6:aux _
5 being be AUX VBG VerbForm=Ger 6 aux:pass 6:aux:pass _
6 run run VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _
7 by by ADP IN _ 9 case 9:case _
8 2 2 NUM CD NumForm=Digit|NumType=Card 9 nummod 9:nummod _
9 officials official NOUN NNS Number=Plur 6 obl 6:obl:by _
10 of of ADP IN _ 12 case 12:case _
11 the the DET DT Definite=Def|PronType=Art 12 det 12:det _
12 Ministry Ministry PROPN NNP Number=Sing 9 nmod 9:nmod:of _
13 of of ADP IN _ 15 case 15:case _
14 the the DET DT Definite=Def|PronType=Art 15 det 15:det _
15 Interior Interior PROPN NNP Number=Sing 12 nmod 12:nmod:of SpaceAfter=No
16 ! ! PUNCT . _ 6 punct 6:punct _
""".lstrip()
DEV_DATA = """
1 From from ADP IN _ 3 case 3:case _
2 the the DET DT Definite=Def|PronType=Art 3 det 3:det _
3 AP AP PROPN NNP Number=Sing 4 obl 4:obl:from _
4 comes come VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _
5 this this DET DT Number=Sing|PronType=Dem 6 det 6:det _
6 story story NOUN NN Number=Sing 4 nsubj 4:nsubj _
7 : : PUNCT : _ 4 punct 4:punct _
""".lstrip()


class TestParser:
    @pytest.fixture(scope="class")
    def wordvec_pretrain_file(self):
        return f'{TEST_WORKING_DIR}/in/tiny_emb.pt'

    def run_training(self, tmp_path, wordvec_pretrain_file, train_text, dev_text, augment_nopunct=False, extra_args=None, zip_train_data=False):
        """
        Run the training for a few iterations, check that the saved model can be loaded, and return the trainer
        """
        train_file = str(tmp_path / "train.zip") if zip_train_data else str(tmp_path / "train.conllu")
        dev_file = str(tmp_path / "dev.conllu")
        pred_file = str(tmp_path / "pred.conllu")
        save_name = "test_parser.pt"
        save_file = str(tmp_path / save_name)

        # write the training data either into a zip archive or as a plain conllu file
        if zip_train_data:
            with zipfile.ZipFile(train_file, "w") as zout:
                with zout.open('train.conllu', 'w') as fout:
                    fout.write(train_text.encode())
        else:
            with open(train_file, "w", encoding="utf-8") as fout:
                fout.write(train_text)

        with open(dev_file, "w", encoding="utf-8") as fout:
            fout.write(dev_text)

        args = ["--wordvec_pretrain_file", wordvec_pretrain_file,
                "--train_file", train_file,
                "--eval_file", dev_file,
                "--output_file", pred_file,
                "--log_step", "10",
                "--eval_interval", "20",
                "--max_steps", "100",
                "--shorthand", "en_test",
                "--save_dir", str(tmp_path),
                "--save_name", save_name,
                # in case we are doing a bert test
                "--bert_start_finetuning", "10",
                "--bert_warmup_steps", "10",
                "--lang", "en"]
        if not augment_nopunct:
            args.extend(["--augment_nopunct", "0.0"])
        if extra_args is not None:
            args = args + extra_args
        trainer, _ = parser.main(args)

        assert os.path.exists(save_file)
        pt = pretrain.Pretrain(wordvec_pretrain_file)
        # test loading the saved model
        saved_model = Trainer(pretrain=pt, model_file=save_file)
        return trainer

    def test_train(self, tmp_path, wordvec_pretrain_file):
        """
        Simple test of a few 'epochs' of parser training
        """
        self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA)

    def test_zipfile_train(self, tmp_path, wordvec_pretrain_file):
        """
        Simple test of a few 'epochs' of parser training with the training data in a zipfile
        """
        self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, zip_train_data=True)

    def test_with_bert_nlayers(self, tmp_path, wordvec_pretrain_file):
        self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--bert_model', 'hf-internal-testing/tiny-bert', '--bert_hidden_layers', '2'])

    def test_with_bert_finetuning(self, tmp_path, wordvec_pretrain_file):
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--bert_model', 'hf-internal-testing/tiny-bert', '--bert_finetune', '--bert_hidden_layers', '2'])
        assert 'bert_optimizer' in trainer.optimizer.keys()
        assert 'bert_scheduler' in trainer.scheduler.keys()

    def test_with_bert_finetuning_resaved(self, tmp_path, wordvec_pretrain_file):
        """
        Check that saving, loading, and resaving a model with a finetuned bert does not lose the bert weights
        """
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--bert_model', 'hf-internal-testing/tiny-bert', '--bert_finetune', '--bert_hidden_layers', '2'])
        assert 'bert_optimizer' in trainer.optimizer.keys()
        assert 'bert_scheduler' in trainer.scheduler.keys()

        save_name = trainer.args['save_name']
        filename = tmp_path / save_name
        assert os.path.exists(filename)
        checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
        assert any(x.startswith("bert_model") for x in checkpoint['model'].keys())

        # Test loading the saved model, saving it, and still having bert in it
        # even if we have set bert_finetune to False for this incarnation
        pt = pretrain.Pretrain(wordvec_pretrain_file)
        args = {"bert_finetune": False}
        saved_model = Trainer(pretrain=pt, model_file=filename, args=args)
        saved_model.save(filename)

        # This is the part that would fail if the force_bert_saved option did not exist
        checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
        assert any(x.startswith("bert_model") for x in checkpoint['model'].keys())

    def test_with_peft(self, tmp_path, wordvec_pretrain_file):
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--bert_model', 'hf-internal-testing/tiny-bert', '--bert_finetune', '--bert_hidden_layers', '2', '--use_peft'])
        assert 'bert_optimizer' in trainer.optimizer.keys()
        assert 'bert_scheduler' in trainer.scheduler.keys()
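
    # The checkpoint tests below verify that the optimizer state is written to the
    # checkpoint file and can be restored by loading the checkpoint into a new Trainer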
    def test_single_optimizer_checkpoint(self, tmp_path, wordvec_pretrain_file):
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--optim', 'adam'])
        save_dir = trainer.args['save_dir']
        save_name = trainer.args['save_name']
        checkpoint_name = trainer.args["checkpoint_save_name"]
        assert os.path.exists(os.path.join(save_dir, save_name))
        assert checkpoint_name is not None
        assert os.path.exists(checkpoint_name)

        assert len(trainer.optimizer) == 1
        for opt in trainer.optimizer.values():
            assert isinstance(opt, torch.optim.Adam)

        pt = pretrain.Pretrain(wordvec_pretrain_file)
        checkpoint = Trainer(args=trainer.args, pretrain=pt, model_file=checkpoint_name)
        assert checkpoint.optimizer is not None
        assert len(checkpoint.optimizer) == 1
        for opt in checkpoint.optimizer.values():
            assert isinstance(opt, torch.optim.Adam)

    def test_two_optimizers_checkpoint(self, tmp_path, wordvec_pretrain_file):
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--optim', 'adam', '--second_optim', 'sgd', '--second_optim_start_step', '40'])
        save_dir = trainer.args['save_dir']
        save_name = trainer.args['save_name']
        checkpoint_name = trainer.args["checkpoint_save_name"]
        assert os.path.exists(os.path.join(save_dir, save_name))
        assert checkpoint_name is not None
        assert os.path.exists(checkpoint_name)

        # by the end of training, the trainer should have switched to the second optimizer
        assert len(trainer.optimizer) == 1
        for opt in trainer.optimizer.values():
            assert isinstance(opt, torch.optim.SGD)

        pt = pretrain.Pretrain(wordvec_pretrain_file)
        checkpoint = Trainer(args=trainer.args, pretrain=pt, model_file=checkpoint_name)
        assert checkpoint.optimizer is not None
        assert len(checkpoint.optimizer) == 1
        for opt in checkpoint.optimizer.values():
            assert isinstance(opt, torch.optim.SGD)