# Albin Thörn Cleland
# Clean initial commit with LFS
# 19b8775
import dataclasses
import logging
import math
import os
import random
import re
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import stanza.models.classifiers.data as data
from stanza.models.classifiers.base_classifier import BaseClassifier
from stanza.models.classifiers.config import CNNConfig
from stanza.models.classifiers.data import SentimentDatum
from stanza.models.classifiers.utils import ExtraVectors, ModelType, build_output_layers
from stanza.models.common.bert_embedding import extract_bert_embeddings
from stanza.models.common.data import get_long_tensor, sort_all
from stanza.models.common.utils import attach_bert_model
from stanza.models.common.vocab import PAD_ID, UNK_ID
"""
The CNN classifier is based on Yoon Kim's work:

https://arxiv.org/abs/1408.5882

Also included are maxpool 2d, conv 2d, and a bilstm, as in

Text Classification Improved by Integrating Bidirectional LSTM
with Two-dimensional Max Pooling
https://aclanthology.org/C16-1329.pdf

The architecture is simple:

- Embedding at the bottom layer
  - separate learnable entry for UNK, since many of the embeddings we have use 0 for UNK
- maybe a bilstm layer, as per a command line flag
- Some number of conv2d layers over the embedding
- Maxpool layers over small windows, window size being a parameter
- FC layer to the classification layer

One experiment which was run and found to be a bit of a negative was
putting a layer on top of the pretrain.  You would think that might
help, but dev performance went down for each variation of

- trans(emb)
- relu(trans(emb))
- dropout(trans(emb))
- dropout(relu(trans(emb)))
"""
# general stanza logger: used below for the bert flags debug line and the parameter norms
logger = logging.getLogger('stanza')
# classifier training logger: used below for layer sizes and the model configuration
tlogger = logging.getLogger('stanza.classifiers.trainer')
class CNNClassifier(BaseClassifier):
    """
    CNN text classifier over a frozen pretrained word embedding plus optional extra features.

    Each word is represented by its pretrained vector (unknown words get a
    separately trained UNK vector), optionally combined with a trainable
    extra embedding, forward / backward charlm representations, ELMo, and
    transformer features.  The combined features optionally go through a
    bilstm, then conv2d filters with max pooling, then the FC layers,
    which produce the raw classification logits.
    """

    def __init__(self, pretrain, extra_vocab, labels,
                 charmodel_forward, charmodel_backward, elmo_model, bert_model, bert_tokenizer, force_bert_saved, peft_name,
                 args):
        """
        Build the layers of the classifier.

        pretrain is a pretrained word embedding.  should have .emb and .vocab

        extra_vocab is a collection of words in the training data to
        be used for the delta word embedding, if used.  can be set to
        None if delta word embedding is not used.

        labels is the list of labels we expect in the training data.
        Used to derive the number of classes.  Saving it in the model
        will let us check that test data has the same labels

        charmodel_forward and charmodel_backward are optional character
        language models (None to skip).  This class uses their
        hidden_dim(), is_forward_lm and build_char_representation().

        elmo_model is an optional ELMo model.  It must not be None if
        args.use_elmo is set.

        bert_model and bert_tokenizer are an optional transformer and its
        tokenizer.  The tokenizer is required whenever bert_model is given.

        force_bert_saved is or-ed with args.bert_finetune, stored in the
        config, and passed to attach_bert_model.

        peft_name is the adapter name used when extracting transformer
        embeddings and when saving the peft state in get_params.

        args is either the complete arguments when training, or the
        subset of arguments stored in the model save file

        Raises ValueError if the charlms are passed in the wrong
        direction, if extra word vectors are requested without an
        extra_vocab or with a dimension incompatible with SUM, if elmo
        is requested without a model, if a transformer is given without
        a tokenizer, or if a filter size is neither an int nor a 2-tuple.
        """
        super().__init__()
        self.labels = labels
        bert_finetune = args.bert_finetune
        use_peft = args.use_peft
        force_bert_saved = force_bert_saved or bert_finetune
        logger.debug("bert_finetune %s / force_bert_saved %s", bert_finetune, force_bert_saved)

        # this may change when loaded in a new Pipeline, so it's not part of the config
        self.peft_name = peft_name

        # we build a separate config out of the args so that we can easily save it in torch
        self.config = CNNConfig(filter_channels = args.filter_channels,
                                filter_sizes = args.filter_sizes,
                                fc_shapes = args.fc_shapes,
                                dropout = args.dropout,
                                num_classes = len(labels),
                                wordvec_type = args.wordvec_type,
                                extra_wordvec_method = args.extra_wordvec_method,
                                extra_wordvec_dim = args.extra_wordvec_dim,
                                extra_wordvec_max_norm = args.extra_wordvec_max_norm,
                                char_lowercase = args.char_lowercase,
                                charlm_projection = args.charlm_projection,
                                has_charlm_forward = charmodel_forward is not None,
                                has_charlm_backward = charmodel_backward is not None,
                                use_elmo = args.use_elmo,
                                elmo_projection = args.elmo_projection,
                                bert_model = args.bert_model,
                                bert_finetune = bert_finetune,
                                bert_hidden_layers = args.bert_hidden_layers,
                                force_bert_saved = force_bert_saved,
                                use_peft = use_peft,
                                lora_rank = args.lora_rank,
                                lora_alpha = args.lora_alpha,
                                lora_dropout = args.lora_dropout,
                                lora_modules_to_save = args.lora_modules_to_save,
                                lora_target_modules = args.lora_target_modules,
                                bilstm = args.bilstm,
                                bilstm_hidden_dim = args.bilstm_hidden_dim,
                                maxpool_width = args.maxpool_width,
                                model_type = ModelType.CNN)

        self.char_lowercase = args.char_lowercase

        # must exist before the first add_unsaved_module call
        self.unsaved_modules = []

        emb_matrix = pretrain.emb
        self.add_unsaved_module('embedding', nn.Embedding.from_pretrained(emb_matrix, freeze=True))
        self.add_unsaved_module('elmo_model', elmo_model)
        self.vocab_size = emb_matrix.shape[0]
        self.embedding_dim = emb_matrix.shape[1]

        self.add_unsaved_module('forward_charlm', charmodel_forward)
        if charmodel_forward is not None:
            tlogger.debug("Got forward char model of dimension %s", charmodel_forward.hidden_dim())
            if not charmodel_forward.is_forward_lm:
                raise ValueError("Got a backward charlm as a forward charlm!")
        self.add_unsaved_module('backward_charlm', charmodel_backward)
        if charmodel_backward is not None:
            tlogger.debug("Got backward char model of dimension %s", charmodel_backward.hidden_dim())
            if charmodel_backward.is_forward_lm:
                raise ValueError("Got a forward charlm as a backward charlm!")

        attach_bert_model(self, bert_model, bert_tokenizer, self.config.use_peft, force_bert_saved)

        # The Pretrain has PAD and UNK already (indices 0 and 1), but we
        # possibly want to train UNK while freezing the rest of the embedding
        # note that the /10.0 operation has to be inside nn.Parameter unless
        # you want to spend a long time debugging this
        self.unk = nn.Parameter(torch.randn(self.embedding_dim) / np.sqrt(self.embedding_dim) / 10.0)

        # replacing NBSP picks up a whole bunch of words for VI
        self.vocab_map = { word.replace('\xa0', ' '): i for i, word in enumerate(pretrain.vocab) }

        if self.config.extra_wordvec_method is not ExtraVectors.NONE:
            if not extra_vocab:
                raise ValueError("Should have had extra_vocab set for extra_wordvec_method {}".format(self.config.extra_wordvec_method))
            if not args.extra_wordvec_dim:
                # default the extra embedding to the same width as the pretrain
                self.config.extra_wordvec_dim = self.embedding_dim
            if self.config.extra_wordvec_method is ExtraVectors.SUM:
                if self.config.extra_wordvec_dim != self.embedding_dim:
                    raise ValueError("extra_wordvec_dim must equal embedding_dim for {}".format(self.config.extra_wordvec_method))

            self.extra_vocab = list(extra_vocab)
            self.extra_vocab_map = { word: i for i, word in enumerate(self.extra_vocab) }
            # TODO: possibly add regularization specifically on the extra embedding?
            # note: it looks like a bug that this doesn't add UNK or PAD, but actually
            # those are expected to already be the first two entries
            self.extra_embedding = nn.Embedding(num_embeddings = len(extra_vocab),
                                                embedding_dim = self.config.extra_wordvec_dim,
                                                max_norm = self.config.extra_wordvec_max_norm,
                                                padding_idx = 0)
            tlogger.debug("Extra embedding size: %s", self.extra_embedding.weight.shape)
        else:
            self.extra_vocab = None
            self.extra_vocab_map = None
            self.config.extra_wordvec_dim = 0
            self.extra_embedding = None

        # total_embedding_dim accumulates the width of every feature
        # which forward() will concatenate for each word
        if self.config.extra_wordvec_method is ExtraVectors.NONE:
            total_embedding_dim = self.embedding_dim
        elif self.config.extra_wordvec_method is ExtraVectors.SUM:
            total_embedding_dim = self.embedding_dim
        elif self.config.extra_wordvec_method is ExtraVectors.CONCAT:
            total_embedding_dim = self.embedding_dim + self.config.extra_wordvec_dim
        else:
            raise ValueError("unable to handle {}".format(self.config.extra_wordvec_method))

        if charmodel_forward is not None:
            if args.charlm_projection:
                self.charmodel_forward_projection = nn.Linear(charmodel_forward.hidden_dim(), args.charlm_projection)
                total_embedding_dim += args.charlm_projection
            else:
                self.charmodel_forward_projection = None
                total_embedding_dim += charmodel_forward.hidden_dim()

        if charmodel_backward is not None:
            if args.charlm_projection:
                self.charmodel_backward_projection = nn.Linear(charmodel_backward.hidden_dim(), args.charlm_projection)
                total_embedding_dim += args.charlm_projection
            else:
                self.charmodel_backward_projection = None
                total_embedding_dim += charmodel_backward.hidden_dim()

        if self.config.use_elmo:
            if elmo_model is None:
                raise ValueError("Model requires elmo, but elmo_model not passed in")
            # run a one word sentence through elmo to find out its output width
            elmo_dim = elmo_model.sents2elmo([["Test"]])[0].shape[1]

            # this mapping will combine 3 layers of elmo to 1 layer of features
            self.elmo_combine_layers = nn.Linear(in_features=3, out_features=1, bias=False)
            if self.config.elmo_projection:
                self.elmo_projection = nn.Linear(in_features=elmo_dim, out_features=self.config.elmo_projection)
                total_embedding_dim = total_embedding_dim + self.config.elmo_projection
            else:
                total_embedding_dim = total_embedding_dim + elmo_dim

        if bert_model is not None:
            if self.config.bert_hidden_layers:
                # The average will be offset by 1/N so that the default zeros
                # represents an average of the N layers
                if self.config.bert_hidden_layers > bert_model.config.num_hidden_layers:
                    # limit ourselves to the number of layers actually available
                    # note that we can +1 because of the initial embedding layer
                    self.config.bert_hidden_layers = bert_model.config.num_hidden_layers + 1
                self.bert_layer_mix = nn.Linear(self.config.bert_hidden_layers, 1, bias=False)
                nn.init.zeros_(self.bert_layer_mix.weight)
            else:
                # an average of layers 2, 3, 4 will be used
                # (for historic reasons)
                self.bert_layer_mix = None

            if bert_tokenizer is None:
                raise ValueError("Cannot have a bert model without a tokenizer")

            self.bert_dim = self.bert_model.config.hidden_size
            total_embedding_dim += self.bert_dim

        if self.config.bilstm:
            # bidirectional, so the conv layers see twice the hidden dim
            conv_input_dim = self.config.bilstm_hidden_dim * 2
            self.bilstm = nn.LSTM(batch_first=True,
                                  input_size=total_embedding_dim,
                                  hidden_size=self.config.bilstm_hidden_dim,
                                  num_layers=2,
                                  bidirectional=True,
                                  dropout=0.2)
        else:
            conv_input_dim = total_embedding_dim
            self.bilstm = None

        self.fc_input_size = 0
        # Pytorch is "aware" of the existence of the nn.Modules inside
        # an nn.ModuleList in terms of parameters() etc
        self.conv_layers = nn.ModuleList()
        # forward() pads every phrase to at least the widest filter window
        self.max_window = 0
        for filter_idx, filter_size in enumerate(self.config.filter_sizes):
            if isinstance(filter_size, int):
                # an int is a filter which covers the entire width of the features
                self.max_window = max(self.max_window, filter_size)
                if isinstance(self.config.filter_channels, int):
                    filter_channels = self.config.filter_channels
                else:
                    filter_channels = self.config.filter_channels[filter_idx]
                fc_delta = filter_channels // self.config.maxpool_width
                tlogger.debug("Adding full width filter %d. Output channels: %d -> %d", filter_size, filter_channels, fc_delta)
                self.fc_input_size += fc_delta
                self.conv_layers.append(nn.Conv2d(in_channels=1,
                                                  out_channels=filter_channels,
                                                  kernel_size=(filter_size, conv_input_dim)))
            elif isinstance(filter_size, tuple) and len(filter_size) == 2:
                # a 2-tuple is a (height, width) filter which strides across the features
                filter_height, filter_width = filter_size
                self.max_window = max(self.max_window, filter_width)
                if isinstance(self.config.filter_channels, int):
                    filter_channels = max(1, self.config.filter_channels // (conv_input_dim // filter_width))
                else:
                    filter_channels = self.config.filter_channels[filter_idx]
                fc_delta = filter_channels * (conv_input_dim // filter_width) // self.config.maxpool_width
                tlogger.debug("Adding filter %s. Output channels: %d -> %d", filter_size, filter_channels, fc_delta)
                self.fc_input_size += fc_delta
                self.conv_layers.append(nn.Conv2d(in_channels=1,
                                                  out_channels=filter_channels,
                                                  stride=(1, filter_width),
                                                  kernel_size=(filter_height, filter_width)))
            else:
                raise ValueError("Expected int or 2d tuple for conv size")

        tlogger.debug("Input dim to FC layers: %d", self.fc_input_size)
        self.fc_layers = build_output_layers(self.fc_input_size, self.config.fc_shapes, self.config.num_classes)

        self.dropout = nn.Dropout(self.config.dropout)

    def add_unsaved_module(self, name, module):
        """
        Attach module as self.<name> and remember that it should not be saved with the model.

        The charlms, and the transformer when peft is not in use, also
        have requires_grad turned off on all of their parameters.
        """
        self.unsaved_modules.append(name)
        setattr(self, name, module)

        if module is not None and (name in ('forward_charlm', 'backward_charlm') or
                                   (name == 'bert_model' and not self.config.use_peft)):
            # if we are using peft, we should not save the transformer directly
            # instead, the peft parameters only will be saved later
            for _, parameter in module.named_parameters():
                parameter.requires_grad = False

    def is_unsaved_module(self, name):
        """
        Return True if the first dotted component of name was registered with add_unsaved_module.
        """
        return name.split('.')[0] in self.unsaved_modules

    def log_configuration(self):
        """
        Log some essential information about the model configuration to the training logger
        """
        tlogger.info("Filter sizes: %s", self.config.filter_sizes)
        tlogger.info("Filter channels: %s", self.config.filter_channels)
        tlogger.info("Intermediate layers: %s", self.config.fc_shapes)

    def log_norms(self):
        """
        Log the norm of each trainable parameter, skipping the charlms, as a single message.
        """
        lines = ["NORMS FOR MODEL PARAMETERS"]
        for name, param in self.named_parameters():
            if param.requires_grad and name.split(".")[0] not in ('forward_charlm', 'backward_charlm'):
                lines.append("%s %.6g" % (name, torch.norm(param).item()))
        logger.info("\n".join(lines))

    @staticmethod
    def _pad_phrase_reps(reps, num_phrases, max_phrase_len, begin_paddings, device):
        """
        Copy per-phrase representations into one zero padded tensor.

        reps[idx] is written to row idx starting at begin_paddings[idx],
        so the features line up with the padding chosen in forward().
        The result is num_phrases x max_phrase_len x (last dim of reps[0]).
        """
        padded = torch.zeros((num_phrases, max_phrase_len, reps[0].shape[-1]), device=device)
        for idx, rep in enumerate(reps):
            start = begin_paddings[idx]
            end = start + rep.shape[0]
            padded[idx, start:end, :] = rep
        return padded

    def build_char_reps(self, inputs, max_phrase_len, charlm, projection, begin_paddings, device):
        """
        Build the padded charlm features for a batch of phrases.

        projection is applied to each phrase's representation unless it is None.
        """
        char_reps = charlm.build_char_representation(inputs)
        if projection is not None:
            char_reps = [projection(x) for x in char_reps]
        return self._pad_phrase_reps(char_reps, len(inputs), max_phrase_len, begin_paddings, device)

    def extract_bert_embeddings(self, inputs, max_phrase_len, begin_paddings, device):
        """
        Build the padded transformer features for a batch of phrases.

        If a layer mix was built, the layers are combined as a learned
        mix plus their plain average.
        """
        layer_mix = self.bert_layer_mix
        # inside a method this name resolves to the imported module level
        # function, not to this method
        bert_embeddings = extract_bert_embeddings(self.config.bert_model, self.bert_tokenizer, self.bert_model, inputs, device,
                                                  keep_endpoints=False,
                                                  num_layers=layer_mix.in_features if layer_mix is not None else None,
                                                  detach=not self.config.bert_finetune,
                                                  peft_name=self.peft_name)
        if layer_mix is not None:
            # add the average so that the default behavior is to
            # take an average of the N layers, and anything else
            # other than that needs to be learned
            bert_embeddings = [layer_mix(feature).squeeze(2) + feature.sum(axis=2) / layer_mix.in_features
                               for feature in bert_embeddings]
        return self._pad_phrase_reps(bert_embeddings, len(inputs), max_phrase_len, begin_paddings, device)

    def forward(self, inputs):
        """
        Classify a batch of phrases.

        inputs is a list in which each item is either a SentimentDatum
        (its .text is used) or a list of words.

        Returns the raw logits from the final FC layer.  No softmax is applied.
        """
        # assume all pieces are on the same device
        device = next(self.parameters()).device

        vocab_map = self.vocab_map
        def map_word(word):
            # try the word itself, then the word without a trailing
            # apostrophe, then the lowercased word, then give up
            idx = vocab_map.get(word, None)
            if idx is not None:
                return idx
            # endswith instead of word[-1] so that an empty token becomes
            # UNK rather than raising an IndexError
            if word.endswith("'"):
                idx = vocab_map.get(word[:-1], None)
                if idx is not None:
                    return idx
            return vocab_map.get(word.lower(), UNK_ID)

        inputs = [x.text if isinstance(x, SentimentDatum) else x for x in inputs]
        # we will pad each phrase so either it matches the longest
        # conv or the longest phrase in the input, whichever is longer
        max_phrase_len = max(len(x) for x in inputs)
        if self.max_window > max_phrase_len:
            max_phrase_len = self.max_window

        batch_indices = []
        batch_unknowns = []
        extra_batch_indices = []
        begin_paddings = []
        end_paddings = []

        elmo_batch_words = []

        for phrase in inputs:
            # we use random at training time to try to learn different
            # positions of padding. at test time, though, we want to
            # have consistent results, so we set that to 0 begin_pad
            if self.training:
                begin_pad_width = random.randint(0, max_phrase_len - len(phrase))
            else:
                begin_pad_width = 0
            end_pad_width = max_phrase_len - begin_pad_width - len(phrase)

            begin_paddings.append(begin_pad_width)
            end_paddings.append(end_pad_width)

            # the initial lists are the length of the begin padding
            sentence_indices = [PAD_ID] * begin_pad_width
            sentence_indices.extend([map_word(x) for x in phrase])
            sentence_indices.extend([PAD_ID] * end_pad_width)

            # the "unknowns" will be the locations of the unknown words.
            # these locations will get the specially trained unknown vector
            # TODO: split UNK based on part of speech? might be an interesting experiment
            sentence_unknowns = [idx for idx, word in enumerate(sentence_indices) if word == UNK_ID]

            batch_indices.append(sentence_indices)
            batch_unknowns.append(sentence_unknowns)

            if self.extra_vocab:
                extra_sentence_indices = [PAD_ID] * begin_pad_width
                for word in phrase:
                    if word in self.extra_vocab_map:
                        # the extra vocab is initialized from the
                        # words in the training set, which means there
                        # would be no unknown words. to occasionally
                        # train the extra vocab's unknown words, we
                        # replace 1% of the words with UNK
                        # we don't do that for the original embedding
                        # on the assumption that there may be some
                        # unknown words in the training set anyway
                        # TODO: maybe train unk for the original embedding?
                        if self.training and random.random() < 0.01:
                            extra_sentence_indices.append(UNK_ID)
                        else:
                            extra_sentence_indices.append(self.extra_vocab_map[word])
                    else:
                        extra_sentence_indices.append(UNK_ID)
                extra_sentence_indices.extend([PAD_ID] * end_pad_width)
                extra_batch_indices.append(extra_sentence_indices)

            if self.config.use_elmo:
                # elmo gets empty strings in the padded positions
                elmo_phrase_words = [""] * begin_pad_width
                elmo_phrase_words.extend(phrase)
                elmo_phrase_words.extend([""] * end_pad_width)
                elmo_batch_words.append(elmo_phrase_words)

        # creating a single large list with all the indices lets us
        # create a single tensor, which is much faster than creating
        # many tiny tensors
        # we can convert this to the input to the CNN
        # it is padded at one or both ends so that it is now num_phrases x max_len x emb_size
        # there are two ways in which this padding is suboptimal
        # the first is that for short sentences, smaller windows will
        # be padded to the point that some windows are entirely pad
        # the second is that a sentence S will have more or less padding
        # depending on what other sentences are in its batch
        # we assume these effects are pretty minimal
        batch_indices = torch.tensor(batch_indices, requires_grad=False, device=device)
        input_vectors = self.embedding(batch_indices)
        # we use the random unk so that we are not necessarily
        # learning to match 0s for unk
        for phrase_num, sentence_unknowns in enumerate(batch_unknowns):
            input_vectors[phrase_num][sentence_unknowns] = self.unk

        if self.extra_vocab:
            extra_batch_indices = torch.tensor(extra_batch_indices, requires_grad=False, device=device)
            extra_input_vectors = self.extra_embedding(extra_batch_indices)
            if self.config.extra_wordvec_method is ExtraVectors.CONCAT:
                all_inputs = [input_vectors, extra_input_vectors]
            elif self.config.extra_wordvec_method is ExtraVectors.SUM:
                all_inputs = [input_vectors + extra_input_vectors]
            else:
                raise ValueError("unable to handle {}".format(self.config.extra_wordvec_method))
        else:
            all_inputs = [input_vectors]

        if self.forward_charlm is not None:
            char_reps_forward = self.build_char_reps(inputs, max_phrase_len, self.forward_charlm, self.charmodel_forward_projection, begin_paddings, device)
            all_inputs.append(char_reps_forward)

        if self.backward_charlm is not None:
            char_reps_backward = self.build_char_reps(inputs, max_phrase_len, self.backward_charlm, self.charmodel_backward_projection, begin_paddings, device)
            all_inputs.append(char_reps_backward)

        if self.config.use_elmo:
            # this will be N arrays of 3xMx1024 where M is the number of words
            # and N is the number of sentences (and 1024 is actually the number of weights)
            elmo_arrays = self.elmo_model.sents2elmo(elmo_batch_words, output_layer=-2)
            elmo_tensors = [torch.tensor(x).to(device=device) for x in elmo_arrays]
            # elmo_tensor will now be Nx3xMx1024
            elmo_tensor = torch.stack(elmo_tensors)
            # Nx1024xMx3
            elmo_tensor = torch.transpose(elmo_tensor, 1, 3)
            # NxMx1024x3
            elmo_tensor = torch.transpose(elmo_tensor, 1, 2)
            # NxMx1024x1
            elmo_tensor = self.elmo_combine_layers(elmo_tensor)
            # NxMx1024
            elmo_tensor = elmo_tensor.squeeze(3)
            if self.config.elmo_projection:
                elmo_tensor = self.elmo_projection(elmo_tensor)
            all_inputs.append(elmo_tensor)

        if self.bert_model is not None:
            bert_embeddings = self.extract_bert_embeddings(inputs, max_phrase_len, begin_paddings, device)
            all_inputs.append(bert_embeddings)

        # still works even if there's just one item
        input_vectors = torch.cat(all_inputs, dim=2)

        if self.config.bilstm:
            input_vectors, _ = self.bilstm(self.dropout(input_vectors))

        # reshape to fit the input tensors
        x = input_vectors.unsqueeze(1)

        conv_outs = []
        for conv, filter_size in zip(self.conv_layers, self.config.filter_sizes):
            if isinstance(filter_size, int):
                # full width filter: the feature dimension is now size 1, so drop it
                conv_out = conv(x).squeeze(3)
            else:
                # 2d filter: fold the remaining feature positions into the channels
                conv_out = conv(x).transpose(2, 3).flatten(1, 2)
            conv_outs.append(self.dropout(F.relu(conv_out)))
        # pool over the whole phrase length and over maxpool_width channels at a time
        pool_outs = [F.max_pool2d(out, (self.config.maxpool_width, out.shape[2])).squeeze(2) for out in conv_outs]
        pooled = torch.cat(pool_outs, dim=1)

        previous_layer = pooled
        for fc in self.fc_layers[:-1]:
            previous_layer = self.dropout(F.relu(fc(previous_layer)))
        out = self.fc_layers[-1](previous_layer)
        # note that we return the raw logits rather than use a softmax
        # https://discuss.pytorch.org/t/multi-class-cross-entropy-loss-and-softmax-in-pytorch/24920/4
        return out

    def get_params(self, skip_modules=True):
        """
        Return a dict with everything needed to save the model.

        The dict has 'model' (the state dict), 'config' (the config as
        a dict, with the enum fields stored by name), 'labels',
        'extra_vocab', and, if peft is in use, 'bert_lora'.

        If skip_modules is set, modules registered with
        add_unsaved_module are left out of the state dict.
        """
        model_state = self.state_dict()
        # skip saving modules like pretrained embeddings, because they are large and will be saved in a separate file
        if skip_modules:
            skipped = [k for k in model_state.keys() if self.is_unsaved_module(k)]
            for k in skipped:
                del model_state[k]

        config = dataclasses.asdict(self.config)
        # the enums are replaced with their names
        config['wordvec_type'] = config['wordvec_type'].name
        config['extra_wordvec_method'] = config['extra_wordvec_method'].name
        config['model_type'] = config['model_type'].name

        params = {
            'model': model_state,
            'config': config,
            'labels': self.labels,
            'extra_vocab': self.extra_vocab,
        }
        if self.config.use_peft:
            # Hide import so that peft dependency is optional
            from peft import get_peft_model_state_dict
            params["bert_lora"] = get_peft_model_state_dict(self.bert_model, adapter_name=self.peft_name)
        return params

    def preprocess_data(self, sentences):
        """
        Apply data.update_text with this model's wordvec_type to each sentence and return the new list.
        """
        sentences = [data.update_text(s, self.config.wordvec_type) for s in sentences]
        return sentences

    def extract_sentences(self, doc):
        """
        Return the text of the tokens of each sentence in doc, as a list of lists.
        """
        # TODO: tokens or words better here?
        return [[token.text for token in sentence.tokens] for sentence in doc.sentences]