"""
Run the tagger for a couple iterations on some fake data

Uses a couple sentences of UD_English-EWT as training/dev data
"""

import os
import pytest
import zipfile

import torch

from stanza.models import parser
from stanza.models.common import pretrain
from stanza.models.depparse.trainer import Trainer
from stanza.tests import TEST_WORKING_DIR

pytestmark = [pytest.mark.pipeline, pytest.mark.travis]

TRAIN_DATA = """
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0003
# text = DPA: Iraqi authorities announced that they had busted up 3 terrorist cells operating in Baghdad.
1	DPA	DPA	PROPN	NNP	Number=Sing	0	root	0:root	SpaceAfter=No
2	:	:	PUNCT	:	_	1	punct	1:punct	_
3	Iraqi	Iraqi	ADJ	JJ	Degree=Pos	4	amod	4:amod	_
4	authorities	authority	NOUN	NNS	Number=Plur	5	nsubj	5:nsubj	_
5	announced	announce	VERB	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	1	parataxis	1:parataxis	_
6	that	that	SCONJ	IN	_	9	mark	9:mark	_
7	they	they	PRON	PRP	Case=Nom|Number=Plur|Person=3|PronType=Prs	9	nsubj	9:nsubj	_
8	had	have	AUX	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	9	aux	9:aux	_
9	busted	bust	VERB	VBN	Tense=Past|VerbForm=Part	5	ccomp	5:ccomp	_
10	up	up	ADP	RP	_	9	compound:prt	9:compound:prt	_
11	3	3	NUM	CD	NumForm=Digit|NumType=Card	13	nummod	13:nummod	_
12	terrorist	terrorist	ADJ	JJ	Degree=Pos	13	amod	13:amod	_
13	cells	cell	NOUN	NNS	Number=Plur	9	obj	9:obj	_
14	operating	operate	VERB	VBG	VerbForm=Ger	13	acl	13:acl	_
15	in	in	ADP	IN	_	16	case	16:case	_
16	Baghdad	Baghdad	PROPN	NNP	Number=Sing	14	obl	14:obl:in	SpaceAfter=No
17	.	.	PUNCT	.	_	1	punct	1:punct	_

# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0004
# text = Two of them were being run by 2 officials of the Ministry of the Interior!
1	Two	two	NUM	CD	NumForm=Word|NumType=Card	6	nsubj:pass	6:nsubj:pass	_
2	of	of	ADP	IN	_	3	case	3:case	_
3	them	they	PRON	PRP	Case=Acc|Number=Plur|Person=3|PronType=Prs	1	nmod	1:nmod:of	_
4	were	be	AUX	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	6	aux	6:aux	_
5	being	be	AUX	VBG	VerbForm=Ger	6	aux:pass	6:aux:pass	_
6	run	run	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root	0:root	_
7	by	by	ADP	IN	_	9	case	9:case	_
8	2	2	NUM	CD	NumForm=Digit|NumType=Card	9	nummod	9:nummod	_
9	officials	official	NOUN	NNS	Number=Plur	6	obl	6:obl:by	_
10	of	of	ADP	IN	_	12	case	12:case	_
11	the	the	DET	DT	Definite=Def|PronType=Art	12	det	12:det	_
12	Ministry	Ministry	PROPN	NNP	Number=Sing	9	nmod	9:nmod:of	_
13	of	of	ADP	IN	_	15	case	15:case	_
14	the	the	DET	DT	Definite=Def|PronType=Art	15	det	15:det	_
15	Interior	Interior	PROPN	NNP	Number=Sing	12	nmod	12:nmod:of	SpaceAfter=No
16	!	!	PUNCT	.	_	6	punct	6:punct	_

""".lstrip()


DEV_DATA = """
1	From	from	ADP	IN	_	3	case	3:case	_
2	the	the	DET	DT	Definite=Def|PronType=Art	3	det	3:det	_
3	AP	AP	PROPN	NNP	Number=Sing	4	obl	4:obl:from	_
4	comes	come	VERB	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	0:root	_
5	this	this	DET	DT	Number=Sing|PronType=Dem	6	det	6:det	_
6	story	story	NOUN	NN	Number=Sing	4	nsubj	4:nsubj	_
7	:	:	PUNCT	:	_	4	punct	4:punct	_

""".lstrip()



class TestParser:
    @pytest.fixture(scope="class")
    def wordvec_pretrain_file(self):
        return f'{TEST_WORKING_DIR}/in/tiny_emb.pt'

    def run_training(self, tmp_path, wordvec_pretrain_file, train_text, dev_text, augment_nopunct=False, extra_args=None, zip_train_data=False):
        """
        Run the training for a few iterations, load & return the model
        """
        train_file = str(tmp_path / "train.zip") if zip_train_data else str(tmp_path / "train.conllu")
        dev_file = str(tmp_path / "dev.conllu")
        pred_file = str(tmp_path / "pred.conllu")

        save_name = "test_parser.pt"
        save_file = str(tmp_path / save_name)

        if zip_train_data:
            with zipfile.ZipFile(train_file, "w") as zout:
                with zout.open('train.conllu', 'w') as fout:
                    fout.write(train_text.encode())
        else:
            with open(train_file, "w", encoding="utf-8") as fout:
                fout.write(train_text)

        with open(dev_file, "w", encoding="utf-8") as fout:
            fout.write(dev_text)

        args = ["--wordvec_pretrain_file", wordvec_pretrain_file,
                "--train_file", train_file,
                "--eval_file", dev_file,
                "--output_file", pred_file,
                "--log_step", "10",
                "--eval_interval", "20",
                "--max_steps", "100",
                "--shorthand", "en_test",
                "--save_dir", str(tmp_path),
                "--save_name", save_name,
                # in case we are doing a bert test
                "--bert_start_finetuning", "10",
                "--bert_warmup_steps", "10",
                "--lang", "en"]
        if not augment_nopunct:
            args.extend(["--augment_nopunct", "0.0"])
        if extra_args is not None:
            args = args + extra_args
        trainer, _ = parser.main(args)

        assert os.path.exists(save_file)
        pt = pretrain.Pretrain(wordvec_pretrain_file)
        # test loading the saved model
        saved_model = Trainer(pretrain=pt, model_file=save_file)
        return trainer

    def test_train(self, tmp_path, wordvec_pretrain_file):
        """
        Simple test of a few 'epochs' of parser training
        """
        self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA)

    def test_zipfile_train(self, tmp_path, wordvec_pretrain_file):
        """
        Simple test of a few 'epochs' of parser training with the training data in a zipfile
        """
        self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, zip_train_data=True)

    def test_with_bert_nlayers(self, tmp_path, wordvec_pretrain_file):
        self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--bert_model', 'hf-internal-testing/tiny-bert', '--bert_hidden_layers', '2'])

    def test_with_bert_finetuning(self, tmp_path, wordvec_pretrain_file):
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--bert_model', 'hf-internal-testing/tiny-bert', '--bert_finetune', '--bert_hidden_layers', '2'])
        assert 'bert_optimizer' in trainer.optimizer.keys()
        assert 'bert_scheduler' in trainer.scheduler.keys()

    def test_with_bert_finetuning_resaved(self, tmp_path, wordvec_pretrain_file):
        """
        Check that if we save, then load, then save a model with a finetuned bert, that bert isn't lost
        """
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--bert_model', 'hf-internal-testing/tiny-bert', '--bert_finetune', '--bert_hidden_layers', '2'])
        assert 'bert_optimizer' in trainer.optimizer.keys()
        assert 'bert_scheduler' in trainer.scheduler.keys()

        save_name = trainer.args['save_name']
        filename = tmp_path / save_name
        assert os.path.exists(filename)
        # load the raw checkpoint dict to verify the finetuned bert weights were saved
        checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
        assert any(x.startswith("bert_model") for x in checkpoint['model'].keys())

        # Test loading the saved model, saving it, and still having bert in it
        # even if we have set bert_finetune to False for this incarnation
        pt = pretrain.Pretrain(wordvec_pretrain_file)
        args = {"bert_finetune": False}
        saved_model = Trainer(pretrain=pt, model_file=filename, args=args)

        saved_model.save(filename)

        # This is the part that would fail if the force_bert_saved option did not exist
        checkpoint = torch.load(filename, lambda storage, loc: storage, weights_only=True)
        assert any(x.startswith("bert_model") for x in checkpoint['model'].keys())

    def test_with_peft(self, tmp_path, wordvec_pretrain_file):
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--bert_model', 'hf-internal-testing/tiny-bert', '--bert_finetune', '--bert_hidden_layers', '2', '--use_peft'])
        assert 'bert_optimizer' in trainer.optimizer.keys()
        assert 'bert_scheduler' in trainer.scheduler.keys()

    def test_single_optimizer_checkpoint(self, tmp_path, wordvec_pretrain_file):
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--optim', 'adam'])

        save_dir = trainer.args['save_dir']
        save_name = trainer.args['save_name']
        checkpoint_name = trainer.args["checkpoint_save_name"]

        assert os.path.exists(os.path.join(save_dir, save_name))
        assert checkpoint_name is not None
        assert os.path.exists(checkpoint_name)

        assert len(trainer.optimizer) == 1
        for opt in trainer.optimizer.values():
            assert isinstance(opt, torch.optim.Adam)

        pt = pretrain.Pretrain(wordvec_pretrain_file)
        checkpoint = Trainer(args=trainer.args, pretrain=pt, model_file=checkpoint_name)
        assert checkpoint.optimizer is not None
        assert len(checkpoint.optimizer) == 1
        for opt in checkpoint.optimizer.values():
            assert isinstance(opt, torch.optim.Adam)

    def test_two_optimizers_checkpoint(self, tmp_path, wordvec_pretrain_file):
        trainer = self.run_training(tmp_path, wordvec_pretrain_file, TRAIN_DATA, DEV_DATA, extra_args=['--optim', 'adam', '--second_optim', 'sgd', '--second_optim_start_step', '40'])

        save_dir = trainer.args['save_dir']
        save_name = trainer.args['save_name']
        checkpoint_name = trainer.args["checkpoint_save_name"]

        assert os.path.exists(os.path.join(save_dir, save_name))
        assert checkpoint_name is not None
        assert os.path.exists(checkpoint_name)

        assert len(trainer.optimizer) == 1
        for opt in trainer.optimizer.values():
            assert isinstance(opt, torch.optim.SGD)

        pt = pretrain.Pretrain(wordvec_pretrain_file)
        checkpoint = Trainer(args=trainer.args, pretrain=pt, model_file=checkpoint_name)
        assert checkpoint.optimizer is not None
        assert len(checkpoint.optimizer) == 1
        for opt in checkpoint.optimizer.values():
            assert isinstance(opt, torch.optim.SGD)