import glob
import os
import pytest
import numpy as np
import torch
import stanza
import stanza.models.classifier as classifier
import stanza.models.classifiers.data as data
from stanza.models.classifiers.trainer import Trainer
from stanza.models.common import pretrain
from stanza.models.common import utils
from stanza.tests import TEST_MODELS_DIR
from stanza.tests.classifiers.test_data import train_file, dev_file, test_file, DATASET, SENTENCES

pytestmark = [pytest.mark.pipeline, pytest.mark.travis]

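# dimension of each fake word vector; kept small so the tests stay fast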
EMB_DIM = 5

@pytest.fixture(scope="module")
def fake_embeddings(tmp_path_factory):
    """
    Return the path to a fake embeddings file covering all but one of the words in SENTENCES
    """
    # could set np random seed here
    words = sorted(set([x.lower() for y in SENTENCES for x in y]))
    # drop the last word so the embedding does not cover every word
    words = words[:-1]
    embedding_dir = tmp_path_factory.mktemp("data")
    embedding_txt = embedding_dir / "embedding.txt"
    embedding_pt = embedding_dir / "embedding.pt"
    embedding = np.random.random((len(words), EMB_DIM))
    with open(embedding_txt, "w", encoding="utf-8") as fout:
        for word, emb in zip(words, embedding):
            fout.write(word)
            fout.write("\t")
            fout.write("\t".join(str(x) for x in emb))
            fout.write("\n")

    pt = pretrain.Pretrain(str(embedding_pt), str(embedding_txt))
    pt.load()
    assert os.path.exists(embedding_pt)
    return embedding_pt
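
# Each line of the embedding.txt written above is tab separated, for example
# (word and values are illustrative only):
#   puppy<TAB>0.62<TAB>0.11<TAB>0.48<TAB>0.30<TAB>0.95
# Pretrain.load() converts that text file into the embedding.pt the tests use.
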
class TestClassifier:
    def build_model(self, tmp_path, fake_embeddings, train_file, dev_file, extra_args=None, checkpoint_file=None):
        """
        Build a model to be used by one of the later tests
        """
        save_dir = str(tmp_path / "classifier")
        save_name = "model.pt"
        args = ["--save_dir", save_dir,
                "--save_name", save_name,
                "--wordvec_pretrain_file", str(fake_embeddings),
                "--filter_channels", "20",
                "--fc_shapes", "20,10",
                "--train_file", str(train_file),
                "--dev_file", str(dev_file),
                "--max_epochs", "2",
                "--batch_size", "60"]
        if extra_args is not None:
            args = args + extra_args
        args = classifier.parse_args(args)
        train_set = data.read_dataset(args.train_file, args.wordvec_type, args.min_train_len)
        if checkpoint_file:
            trainer = Trainer.load(checkpoint_file, args, load_optimizer=True)
        else:
            trainer = Trainer.build_new_model(args, train_set)
        return trainer, train_set, args
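
    # classifier.train_model (called below) runs the training loop,
    # scoring on the dev set and writing both the checkpoint file
    # and the final model file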
    def run_training(self, tmp_path, fake_embeddings, train_file, dev_file, extra_args=None, checkpoint_file=None):
        """
        Iterate a couple of times over a model
        """
        trainer, train_set, args = self.build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args, checkpoint_file)
        dev_set = data.read_dataset(args.dev_file, args.wordvec_type, args.min_train_len)
        labels = data.dataset_labels(train_set)
        save_filename = os.path.join(args.save_dir, args.save_name)
        if checkpoint_file is None:
            # not resuming: derive the default checkpoint filename
            checkpoint_file = utils.checkpoint_name(args.save_dir, save_filename, args.checkpoint_save_name)
        classifier.train_model(trainer, save_filename, checkpoint_file, args, train_set, dev_set, labels)
        return trainer, save_filename, checkpoint_file

    def test_build_model(self, tmp_path, fake_embeddings, train_file, dev_file):
        """
        Test that building a basic model works
        """
        self.build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])

    def test_save_load(self, tmp_path, fake_embeddings, train_file, dev_file):
        """
        Test that a basic model can save & load
        """
        trainer, _, args = self.build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
        save_filename = os.path.join(args.save_dir, args.save_name)
        trainer.save(save_filename)
        # loading should work with either the bare save name...
        args.load_name = args.save_name
        trainer = Trainer.load(args.load_name, args)
        # ... or the full path to the saved model
        args.load_name = save_filename
        trainer = Trainer.load(args.load_name, args)

    def test_train_basic(self, tmp_path, fake_embeddings, train_file, dev_file):
        self.run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])

    def test_train_bilstm(self, tmp_path, fake_embeddings, train_file, dev_file):
        """
        Test the classifier with and without the bilstm layer
        """
        args = ["--bilstm", "--bilstm_hidden_dim", "20"]
        self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)

        args = ["--no_bilstm"]
        self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)

    def test_train_maxpool_width(self, tmp_path, fake_embeddings, train_file, dev_file):
        """
        Test various maxpool widths

        --filter_channels is set to 20, a multiple of 2 but not of 3, so the
        width 3 case checks that the math works out for a non-divisible width
        """
        args = ["--maxpool_width", "1", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
        self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)

        args = ["--maxpool_width", "2", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
        self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)

        args = ["--maxpool_width", "3", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
        self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
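
    # --filter_sizes is parsed as a python literal: a plain int is a 1d
    # convolution over that many words, while a pair such as (3,2) is
    # (apparently) treated as a 2d filter 3 words tall and 2 embedding
    # dimensions wide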
    def test_train_conv_2d(self, tmp_path, fake_embeddings, train_file, dev_file):
        args = ["--filter_sizes", "(3,4,5)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
        self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)

        args = ["--filter_sizes", "((3,2),)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
        self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)

        args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
        self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)

    def test_train_filter_channels(self, tmp_path, fake_embeddings, train_file, dev_file):
        args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--no_bilstm"]
        trainer, _, _ = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
        # 40 = 20 for the 2d conv + 20 for the 1d conv, the single
        # filter_channels value presumably being split over the 2 positions
        # the (3,2) filter takes across the 5 dim embeddings
        assert trainer.model.fc_input_size == 40

        args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "15,20", "--no_bilstm"]
        trainer, _, _ = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
        # 50 = 2x15 for the 2d conv (over 5 dim embeddings) + 20 for the 1d conv
        assert trainer.model.fc_input_size == 50
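
    # The remaining tests use hf-internal-testing/tiny-bert, a very small
    # transformer checkpoint published for unit testing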
    def test_train_bert(self, tmp_path, fake_embeddings, train_file, dev_file):
        """
        Test on a tiny Bert WITHOUT finetuning, which hopefully does not take up too much disk space or memory
        """
        bert_model = "hf-internal-testing/tiny-bert"
        trainer, save_filename, _ = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20", "--bert_model", bert_model])
        assert os.path.exists(save_filename)

        saved_model = torch.load(save_filename, lambda storage, loc: storage, weights_only=True)
        # check that the bert model wasn't saved as part of the classifier
        assert not saved_model['params']['config']['force_bert_saved']
        assert not any(x.startswith("bert_model") for x in saved_model['params']['model'].keys())

    def test_finetune_bert(self, tmp_path, fake_embeddings, train_file, dev_file):
        """
        Test on a tiny Bert WITH finetuning, which hopefully does not take up too much disk space or memory
        """
        bert_model = "hf-internal-testing/tiny-bert"
        trainer, save_filename, _ = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20", "--bert_model", bert_model, "--bert_finetune"])
        assert os.path.exists(save_filename)

        saved_model = torch.load(save_filename, lambda storage, loc: storage, weights_only=True)
        # after finetuning the bert model, make sure that the save file DOES contain parts of the transformer
        assert saved_model['params']['config']['force_bert_saved']
        assert any(x.startswith("bert_model") for x in saved_model['params']['model'].keys())
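
    # --save_intermediate_models (used below) writes intermediate model files
    # during training; the filenames embed the epoch number, e.g. *E0002* for
    # the model saved after epoch 2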
    def test_finetune_bert_layers(self, tmp_path, fake_embeddings, train_file, dev_file):
        """
        Test on a tiny Bert WITH finetuning, which hopefully does not take up too much disk space or memory, using 2 hidden layers

        As an added bonus (or eager test), load the finished model and
        continue training from there. Then check that the initial model and
        the middle model are different, and that the middle model and the
        final model are different.
        """
        bert_model = "hf-internal-testing/tiny-bert"
        trainer, save_filename, checkpoint_file = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20", "--bert_model", bert_model, "--bert_finetune", "--bert_hidden_layers", "2", "--save_intermediate_models"])
        assert os.path.exists(save_filename)
        save_path = os.path.split(save_filename)[0]

        initial_model = glob.glob(os.path.join(save_path, "*E0000*"))
        assert len(initial_model) == 1
        initial_model = initial_model[0]
        initial_model = torch.load(initial_model, lambda storage, loc: storage, weights_only=True)

        second_model_file = glob.glob(os.path.join(save_path, "*E0002*"))
        assert len(second_model_file) == 1
        second_model_file = second_model_file[0]
        second_model = torch.load(second_model_file, lambda storage, loc: storage, weights_only=True)

        # the finetuned layers should exist in both models, with weights
        # that have moved between epoch 0 and epoch 2
        for layer_idx in range(2):
            bert_names = [x for x in second_model['params']['model'].keys() if x.startswith("bert_model") and "layer.%d." % layer_idx in x]
            assert len(bert_names) > 0
            assert all(x in initial_model['params']['model'] and x in second_model['params']['model'] for x in bert_names)
            assert not all(torch.allclose(initial_model['params']['model'].get(x), second_model['params']['model'].get(x)) for x in bert_names)

        # put some random marker in the file to look for later,
        # to check the continued training didn't clobber the expected file
        assert "asdf" not in second_model
        second_model["asdf"] = 1234
        torch.save(second_model, second_model_file)

        trainer, save_filename, checkpoint_file = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20", "--bert_model", bert_model, "--bert_finetune", "--bert_hidden_layers", "2", "--save_intermediate_models", "--max_epochs", "5"], checkpoint_file=checkpoint_file)
        second_model_file_redo = glob.glob(os.path.join(save_path, "*E0002*"))
        assert len(second_model_file_redo) == 1
        assert second_model_file == second_model_file_redo[0]
        second_model = torch.load(second_model_file, lambda storage, loc: storage, weights_only=True)
        assert "asdf" in second_model

        fifth_model_file = glob.glob(os.path.join(save_path, "*E0005*"))
        assert len(fifth_model_file) == 1
        final_model = torch.load(fifth_model_file[0], lambda storage, loc: storage, weights_only=True)
        for layer_idx in range(2):
            bert_names = [x for x in final_model['params']['model'].keys() if x.startswith("bert_model") and "layer.%d." % layer_idx in x]
            assert len(bert_names) > 0
            assert all(x in final_model['params']['model'] and x in second_model['params']['model'] for x in bert_names)
            assert not all(torch.allclose(final_model['params']['model'].get(x), second_model['params']['model'].get(x)) for x in bert_names)
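
    # With PEFT, the LoRA adapter weights are saved separately under
    # params['bert_lora'] and the base transformer weights stay out of
    # params['model'] entirely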
    def test_finetune_peft(self, tmp_path, fake_embeddings, train_file, dev_file):
        """
        Test on a tiny Bert with PEFT finetuning
        """
        bert_model = "hf-internal-testing/tiny-bert"
        trainer, save_filename, _ = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20", "--bert_model", bert_model, "--bert_finetune", "--use_peft", "--lora_modules_to_save", "pooler"])
        assert os.path.exists(save_filename)

        saved_model = torch.load(save_filename, lambda storage, loc: storage, weights_only=True)
        # after finetuning the bert model, make sure that the save file DOES contain parts of the transformer, but only in peft form
        assert saved_model['params']['config']['bert_model'] == bert_model
        assert saved_model['params']['config']['force_bert_saved']
        assert saved_model['params']['config']['use_peft']
        assert not saved_model['params']['config']['has_charlm_forward']
        assert not saved_model['params']['config']['has_charlm_backward']
        assert len(saved_model['params']['bert_lora']) > 0
        assert any(x.find(".pooler.") >= 0 for x in saved_model['params']['bert_lora'])
        assert any(x.find(".encoder.") >= 0 for x in saved_model['params']['bert_lora'])
        assert not any(x.startswith("bert_model") for x in saved_model['params']['model'].keys())

        # The Pipeline should load and run a PEFT trained model,
        # although obviously we don't expect the results to do
        # anything correct
        pipeline = stanza.Pipeline("en", download_method=None, model_dir=TEST_MODELS_DIR, processors="tokenize,sentiment", sentiment_model_path=save_filename, sentiment_pretrain_path=str(fake_embeddings))
        doc = pipeline("This is a test")
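        # a light sanity check, not in the original test: the sentiment
        # processor should have attached a label to each sentence
        assert all(sentence.sentiment is not None for sentence in doc.sentences)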

    def test_finetune_peft_restart(self, tmp_path, fake_embeddings, train_file, dev_file):
        """
        Test that if we restart training on a peft model, the peft weights change
        """
        bert_model = "hf-internal-testing/tiny-bert"
        trainer, save_file, checkpoint_file = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20", "--bert_model", bert_model, "--bert_finetune", "--use_peft", "--lora_modules_to_save", "pooler", "--save_intermediate_models"])
        assert os.path.exists(save_file)
        saved_model = torch.load(save_file, lambda storage, loc: storage, weights_only=True)
        assert any(x.find(".encoder.") >= 0 for x in saved_model['params']['bert_lora'])

        # restart from the checkpoint and train for three more epochs
        trainer, save_file, checkpoint_file = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20", "--bert_model", bert_model, "--bert_finetune", "--use_peft", "--lora_modules_to_save", "pooler", "--save_intermediate_models", "--max_epochs", "5"], checkpoint_file=checkpoint_file)

        save_path = os.path.split(save_file)[0]
        initial_model_file = glob.glob(os.path.join(save_path, "*E0000*"))
        assert len(initial_model_file) == 1
        initial_model_file = initial_model_file[0]
        initial_model = torch.load(initial_model_file, lambda storage, loc: storage, weights_only=True)

        second_model_file = glob.glob(os.path.join(save_path, "*E0002*"))
        assert len(second_model_file) == 1
        second_model_file = second_model_file[0]
        second_model = torch.load(second_model_file, lambda storage, loc: storage, weights_only=True)

        final_model_file = glob.glob(os.path.join(save_path, "*E0005*"))
        assert len(final_model_file) == 1
        final_model_file = final_model_file[0]
        final_model = torch.load(final_model_file, lambda storage, loc: storage, weights_only=True)

        # params in initial_model & second_model start with "base_model.model."
        # whereas params in final_model start directly with "encoder" or "pooler"
        initial_lora = initial_model['params']['bert_lora']
        second_lora = second_model['params']['bert_lora']
        final_lora = final_model['params']['bert_lora']
        for side in ("_A.", "_B."):
            for layer in (".0.", ".1."):
                initial_params = sorted([x for x in initial_lora if x.find(".encoder.") > 0 and x.find(side) > 0 and x.find(layer) > 0])
                second_params = sorted([x for x in second_lora if x.find(".encoder.") > 0 and x.find(side) > 0 and x.find(layer) > 0])
                final_params = sorted([x for x in final_lora if x.startswith("encoder.") and x.find(side) > 0 and x.find(layer) > 0])
                assert len(initial_params) > 0
                assert len(initial_params) == len(second_params)
                assert len(initial_params) == len(final_params)
                for x, y in zip(second_params, final_params):
                    # the unprefixed name in final_lora is a suffix of the prefixed name
                    assert x.endswith(y)
                    if side != "_A.":  # the A tensors don't move very much, if at all
                        assert not torch.allclose(initial_lora.get(x), second_lora.get(x))
                        assert not torch.allclose(second_lora.get(x), final_lora.get(y))