| | import torch |
| |
|
class PoetModelInterface(torch.nn.Module):
    """Abstract base class for all Poet model types.

    Child of torch.nn.Module for integration with torch and huggingface.
    Concrete subclasses must implement forward/generate_forced/save_LM.
    """

    def __init__(self, *args, **kwargs) -> None:
        """Constructor. Forwards every argument to the parent class."""
        super().__init__(*args, **kwargs)

    def forward(self, input_ids=None, labels=None, attention_mask=None, *args, **kwargs):
        """Compute model output and model loss.

        Args:
            input_ids: Model inputs. Defaults to None.
            labels: Language Model labels. Defaults to None.
            attention_mask: Attention mask where padding starts. Defaults to None.

        Raises:
            NotImplementedError: Abstract class.
        """
        raise NotImplementedError()

    def generate_forced(self, *args, **kwargs):
        """Generate model output with restriction on inputs and past generation.

        Raises:
            NotImplementedError: Abstract class.
        """
        raise NotImplementedError()

    @staticmethod
    def rhyme_like(rhyme: str):
        """DEPRECATED: Check whether a string is in rhyme-scheme format.

        Args:
            rhyme (str): String with possible rhyme.

        Returns:
            bool: True when the string is fully upper-case and 4 or 6 characters long.
        """
        scheme_length_ok = len(rhyme) in (4, 6)
        return rhyme.isupper() and scheme_length_ok

    def save_LM(self, LM_path):
        """Save the raw underlying LM.

        Args:
            LM_path (str): Where to store the LM.

        Raises:
            NotImplementedError: Abstract class.
        """
        raise NotImplementedError()
| | |
| |
|
| | from transformers import GPT2Config, GPT2Model |
| | from .poet_utils import StropheParams |
| |
|
class ContextModule(torch.nn.Module):
    """Module for understanding poet context.

    Child of torch.nn.Module for integration with torch and huggingface.
    Context inputs are injected from outside via ``context_ids`` and
    ``context_attention_mask`` before the host model's forward pass.
    """

    def __init__(self, block_count, input_size, n_embd, output_size, *args, **kwargs) -> None:
        """Construct the underlying small LM for context.

        Args:
            block_count: Number of GPT2Block layers of the context LM.
            input_size: Context LM input size (max positions).
            n_embd: Context LM hidden-layer size.
            output_size: Size of the projection added to the host hidden states.
        """
        super().__init__(*args, **kwargs)
        # Head count scales with hidden size, keeping GPT-2's base ratio of
        # 768/12 = 64 dimensions per attention head.
        self.config = GPT2Config(n_positions=input_size, n_head=(n_embd // (768 // 12)), n_embd=n_embd,
                                 n_layer=block_count, output_hidden_states=True, output_attentions=True)
        self.context_model = GPT2Model(self.config)
        self.linear_downscale = torch.nn.Linear(n_embd, output_size)
        self.input_size = input_size
        self.n_embd = n_embd
        self.output_size = output_size

        # Injected from outside before forward(); None disables the context pathway.
        self.context_ids = None
        self.context_attention_mask = None

    def forward(self, hidden_states, layer_past=None, *args, **kwargs):
        """Compute context LM output; data are injected from outside.

        Args:
            hidden_states: Current hidden states of the host model.
            layer_past: Past layer outputs. Defaults to None (unused here).

        Returns:
            tuple: GPT2Block-structured output (hidden states, layer past,
            (attentions, keys)).
        """
        down = torch.zeros_like(hidden_states)
        model_output = None
        # BUG-PRONE FIX: `context_ids != None` relies on the elementwise `__ne__`
        # falling back to identity for tensors; `is not None` states the intent.
        if self.context_ids is not None:
            model_output = self.context_model.forward(input_ids=self.context_ids,
                                                      attention_mask=self.context_attention_mask)
            # Project the first position of the last hidden layer down to the
            # host model's hidden size and broadcast it over the sequence.
            down = self.linear_downscale.forward(
                model_output["hidden_states"][-1][:, 0, :].view(-1, self.n_embd))[:, None, :]
        return (hidden_states + down,
                down[None, :, :, :],
                (None if model_output is None else model_output["attentions"],
                 None))
| | |
class PoetTypeModule(torch.nn.Module):
    """Module to classify poet type (year bucket) from injected context.

    Child of torch.nn.Module for integration with torch and huggingface.
    ``context_ids``, ``context_attention_mask`` and ``type_labels`` are
    injected from outside before the host model's forward pass; the computed
    loss is stored in ``indiv_loss`` for the host model to collect.
    """

    def __init__(self, block_count, input_size, n_embd, output_size, *args, **kwargs) -> None:
        """Construct the LM for poet classification from inputs.

        Args:
            block_count: Number of GPT2Block layers of the classifier LM.
            input_size: LM input size (max positions).
            n_embd: LM hidden-layer size.
            output_size: Size of the projection added to the host hidden states.
        """
        super().__init__(*args, **kwargs)
        # Head count keeps GPT-2's base ratio of 768/12 = 64 dims per head.
        self.config = GPT2Config(n_positions=input_size, n_head=(n_embd // (768 // 12)), n_embd=n_embd,
                                 n_layer=block_count, output_hidden_states=True, output_attentions=True)
        self.type_model = GPT2Model(self.config)
        self.type_predict = torch.nn.Linear(n_embd, len(StropheParams.YEAR))
        # FIX: implicit-dim Softmax() is deprecated; dim=-1 reproduces the
        # previous implicit behavior for the 2-D (batch, classes) input used here.
        self.softmax = torch.nn.Softmax(dim=-1)
        self.linear_scale = torch.nn.Linear(len(StropheParams.YEAR), output_size)
        self.input_size = input_size
        self.n_embd = n_embd
        self.output_size = output_size

        # Injected from outside before forward(); None disables the pathway.
        self.context_ids = None
        self.context_attention_mask = None
        self.type_labels = None

        # Loss of the last forward pass, collected by the host model.
        self.indiv_loss = None

    def forward(self, hidden_states, layer_past=None, *args, **kwargs):
        """Compute classification LM output and loss.

        Args:
            hidden_states: Current hidden states of the host model.
            layer_past: Past layer outputs. Defaults to None (unused here).

        Returns:
            tuple: GPT2Block-structured output (hidden states, layer past,
            (attentions, keys)).
        """
        # FIX: follow the host tensor's device; the previous
        # `"cuda" if torch.cuda.is_available() else "cpu"` check put tensors on
        # GPU even when the model itself ran on CPU on a CUDA-capable machine.
        device = hidden_states.device
        type_prob = torch.zeros((hidden_states.shape[0], len(StropheParams.YEAR)), device=device)
        model_output = None

        # `is not None` instead of `!= None` (tensor comparison pitfall).
        if self.context_ids is not None:
            model_output = self.type_model.forward(input_ids=self.context_ids,
                                                   attention_mask=self.context_attention_mask)
            # Classify from the first position of the last hidden layer.
            poet_type = self.type_predict.forward(
                model_output["hidden_states"][-1][:, 0, :].view(-1, self.n_embd))
            type_prob = self.softmax.forward(poet_type)

        if self.type_labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            # NOTE(review): CrossEntropyLoss expects raw logits but receives the
            # softmax-ed `type_prob` here — kept as-is to preserve training
            # behavior; confirm whether `poet_type` was intended instead.
            self.indiv_loss = loss_fct(type_prob, self.type_labels)
            # Teacher forcing: with labels available, propagate them upward
            # instead of the model's own predictions.
            type_prob = self.type_labels.float().to(device)
        linear_up = self.linear_scale.forward(type_prob)
        return (hidden_states + linear_up[:, None, :],
                linear_up[None, :, None, :],
                (None if model_output is None else model_output["attentions"],
                 None))
| | |
| | from transformers import PreTrainedTokenizerBase |
| |
|
class ModelManipulation:
    """Static class incorporating methods for manipulation with LMs.

    Code inspired by the article: Fine-tuning the English GPT-2 in any language with Hugging Face
    Link: https://github.com/piegu/fastai-projects/blob/master/finetuning-English-GPT2-any-language-Portuguese-HuggingFace-fastaiv2.ipynb
    """

    @staticmethod
    def _transfer_embeddings(old_embed, new_tokenizer, old_tokenizer):
        """Build a new embedding matrix for ``new_tokenizer`` from an old one.

        Rows for tokens present in both vocabularies are copied over; tokens
        unknown to the old tokenizer are initialized with the mean of the old
        embedding matrix.

        Args:
            old_embed: Detached old embedding matrix (old_vocab x n_embd).
            new_tokenizer: Tokenizer whose vocabulary defines the new rows.
            old_tokenizer: Tokenizer whose vocabulary indexes ``old_embed``.

        Returns:
            tuple: (new embedding matrix of shape (new vocab size, n_embd),
            number of rows copied from the old matrix).
        """
        old_mean = old_embed.mean(0)
        # assumes get_vocab() indices are < vocab_size — TODO confirm for
        # tokenizers with added tokens, where get_vocab() can be larger.
        new_embed = old_embed.new_zeros(new_tokenizer.vocab_size, old_embed.size(1))
        old_vocab = old_tokenizer.get_vocab()

        vocab_hit = 0
        for word, idx_new in new_tokenizer.get_vocab().items():
            idx_old = old_vocab.get(word, -1)
            if idx_old >= 0:
                new_embed[idx_new] = old_embed[idx_old]
                vocab_hit += 1
            else:
                new_embed[idx_new] = old_mean
        return new_embed, vocab_hit

    @staticmethod
    def exchange_embedding(poet_model: PoetModelInterface, new_tokenizer: PreTrainedTokenizerBase, old_tokenizer: PreTrainedTokenizerBase, mirror_imbed:bool=False):
        """Exchange embedding matrices for GPT2 models.

        Args:
            poet_model (PoetModelInterface): Model to manipulate with
            new_tokenizer (PreTrainedTokenizerBase): New tokenization
            old_tokenizer (PreTrainedTokenizerBase): Old tokenization
            mirror_imbed (bool, optional): Unused; kept for interface compatibility.
        """
        # Input embeddings: some models wrap the transformer, some are bare.
        if hasattr(poet_model.model, "transformer"):
            old_embed_in = poet_model.model.transformer.get_input_embeddings().weight.clone().detach()
        else:
            old_embed_in = poet_model.model.get_input_embeddings().weight.clone().detach()

        new_embd_in, vocab_hit = ModelManipulation._transfer_embeddings(old_embed_in, new_tokenizer, old_tokenizer)
        # FIX: hits are counted over the NEW vocab, so report against its size
        # (previously divided by old_tokenizer.vocab_size).
        print(f"Vocab hit rate: {vocab_hit}/{new_tokenizer.vocab_size}")

        new_embd_layer_in = torch.nn.Embedding(new_tokenizer.vocab_size, old_embed_in.size(1))
        new_embd_layer_in.weight.data = new_embd_in
        if hasattr(poet_model.model, "transformer"):
            poet_model.model.transformer.set_input_embeddings(new_embd_layer_in)
        else:
            poet_model.model.set_input_embeddings(new_embd_layer_in)

        # Output head: tie its weights to the input embeddings (no bias).
        new_decoder = torch.nn.Linear(old_embed_in.size(1), new_tokenizer.vocab_size, bias=False)
        if hasattr(poet_model.model, "transformer"):
            new_decoder.weight = poet_model.model.transformer.wte.weight
        else:
            # NOTE(review): presumably base_model.embeddings is the embedding
            # updated by set_input_embeddings above — verify for the concrete
            # model class.
            new_decoder.weight = poet_model.model.base_model.embeddings.weight
        if hasattr(poet_model.model, "lm_head"):
            poet_model.model.lm_head = new_decoder
        else:
            poet_model.model.head = new_decoder

        poet_model.model.config.vocab_size = new_tokenizer.vocab_size

    @staticmethod
    def exchange_embedding_roberta(metre_model, new_tokenizer: PreTrainedTokenizerBase, old_tokenizer: PreTrainedTokenizerBase):
        """Exchange embedding matrices for Roberta models.

        Args:
            metre_model: Model to manipulate with
            new_tokenizer (PreTrainedTokenizerBase): New tokenization
            old_tokenizer (PreTrainedTokenizerBase): Old tokenization
        """
        old_embed = metre_model.model.get_input_embeddings().weight.clone().detach()

        new_embd, vocab_hit = ModelManipulation._transfer_embeddings(old_embed, new_tokenizer, old_tokenizer)
        # FIX: hits are counted over the NEW vocab, so report against its size
        # (previously divided by old_tokenizer.vocab_size).
        print(f"Vocab hit rate: {vocab_hit}/{new_tokenizer.vocab_size}")

        new_embd_layer = torch.nn.Embedding(new_tokenizer.vocab_size, old_embed.size(1))
        new_embd_layer.weight.data = new_embd
        metre_model.model.set_input_embeddings(new_embd_layer)
        # NOTE(review): unlike the GPT-2 variant, this Linear keeps a bias that
        # stays randomly initialized after the weight is tied — confirm intent.
        new_decoder = torch.nn.Linear(old_embed.size(1), new_tokenizer.vocab_size)
        new_decoder.weight = metre_model.model.roberta.embeddings.word_embeddings.weight
        metre_model.model.lm_head.decoder = new_decoder

        metre_model.model.config.vocab_size = new_tokenizer.vocab_size
| | |