Model save

85dacbc verified 8 days ago

17.4 kB

	from typing import Optional, Union

	import torch
	from torch import nn
	from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

	from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache
	from transformers.masking_utils import create_bidirectional_mask, create_causal_mask
	from transformers.models.bert.modeling_bert import BertEncoder, BertPooler, BertPreTrainedModel, BertOnlyMLMHead
	from transformers.modeling_outputs import (
	BaseModelOutputWithPoolingAndCrossAttentions,
	MaskedLMOutput,
	SequenceClassifierOutput,
	)
	from transformers.processing_utils import Unpack
	from transformers.utils import TransformersKwargs, auto_docstring, logging
	from transformers.utils.generic import can_return_tuple, merge_with_config_defaults
	from transformers.utils.output_capturing import capture_outputs

	from .configuration_bert_hash import BertHashConfig

	logger = logging.get_logger(__name__)


	class BertHashTokens(nn.Module):
	    """
	    Compact token embedding: a narrow lookup table followed by a linear up-projection.

	    Every token id is first mapped to a vector of `config.projections` floats, which acts like a small
	    hash of the token (5 projections is comparable to a 160-bit hash, i.e. 5 x float32). A linear layer
	    then expands that vector to `config.hidden_size`.

	    This needs far fewer parameters than a full-width embedding matrix.

	    For example:
	        Standard token embeddings:
	            30,522 (vocab size) x 768 (hidden size) = 23,440,896 parameters
	            23,440,896 x 4 (float32) = 93,763,584 bytes

	        Hash token embeddings:
	            30,522 (vocab size) x 5 (hash buckets) + 5 x 768 (projection matrix) = 156,450 parameters
	            156,450 x 4 (float32) = 625,800 bytes
	            (the bias vector of the projection layer is not included in this count)
	    """

	    def __init__(self, config):
	        super().__init__()
	        self.config = config

	        # Narrow per-token table: one row of `projections` values per vocabulary entry
	        self.embeddings = nn.Embedding(
	            num_embeddings=config.vocab_size,
	            embedding_dim=config.projections,
	            padding_idx=config.pad_token_id,
	        )

	        # Expands the narrow token vector to the hidden size
	        self.projections = nn.Linear(in_features=config.projections, out_features=config.hidden_size)

	    def forward(self, input_ids):
	        """Look up the narrow vector for each id in `input_ids` and project it to the hidden size."""
	        hashed = self.embeddings(input_ids)
	        return self.projections(hashed)


	class BertHashEmbeddings(nn.Module):
	"""Construct the embeddings from word, position and token_type embeddings."""

	def __init__(self, config):
	super().__init__()
	self.word_embeddings = BertHashTokens(config)
	self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
	self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

	self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
	self.dropout = nn.Dropout(config.hidden_dropout_prob)
	# position_ids (1, len position emb) is contiguous in memory and exported when serialized
	self.register_buffer(
	"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
	)
	self.register_buffer(
	"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
	)

	def forward(
	self,
	input_ids: torch.LongTensor \| None = None,
	token_type_ids: torch.LongTensor \| None = None,
	position_ids: torch.LongTensor \| None = None,
	inputs_embeds: torch.FloatTensor \| None = None,
	past_key_values_length: int = 0,
	) -> torch.Tensor:
	if input_ids is not None:
	input_shape = input_ids.size()
	else:
	input_shape = inputs_embeds.size()[:-1]

	batch_size, seq_length = input_shape
	device = input_ids.device if input_ids is not None else inputs_embeds.device

	if position_ids is None:
	position_ids = (
	torch.arange(seq_length, dtype=torch.long, device=device)
	.unsqueeze(0)
	.expand(batch_size, seq_length)
	)

	# Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
	# when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
	# issue #5664
	if token_type_ids is None:
	if hasattr(self, "token_type_ids"):
	# NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
	buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
	buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
	token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
	else:
	token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

	if inputs_embeds is None:
	inputs_embeds = self.word_embeddings(input_ids)
	token_type_embeddings = self.token_type_embeddings(token_type_ids)
	embeddings = inputs_embeds + token_type_embeddings

	position_embeddings = self.position_embeddings(position_ids)
	embeddings = embeddings + position_embeddings

	embeddings = self.LayerNorm(embeddings)
	embeddings = self.dropout(embeddings)
	return embeddings


	@auto_docstring(
	    custom_intro="""
	    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
	    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
	    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
	    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

	    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
	    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
	    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
	    """
	)
	class BertHashModel(BertPreTrainedModel):
	    config_class = BertHashConfig

	    # Class names of the modules this model is built from that should not be split across devices.
	    # The embeddings module here is `BertHashEmbeddings` (there is no `BertEmbeddings` in this model).
	    _no_split_modules = ["BertHashEmbeddings", "BertLayer"]

	    def __init__(self, config, add_pooling_layer=True):
	        r"""
	        add_pooling_layer (bool, optional, defaults to `True`):
	            Whether to add a pooling layer
	        """
	        super().__init__(config)
	        self.config = config
	        self.gradient_checkpointing = False

	        self.embeddings = BertHashEmbeddings(config)
	        self.encoder = BertEncoder(config)

	        self.pooler = BertPooler(config) if add_pooling_layer else None

	        # Initialize weights and apply final processing
	        self.post_init()

	    def get_input_embeddings(self):
	        """Return the token lookup table (the `nn.Embedding` inside the hash word-embedding module)."""
	        return self.embeddings.word_embeddings.embeddings

	    def set_input_embeddings(self, value):
	        """Replace the token lookup table returned by `get_input_embeddings`; the projection layer is kept."""
	        # Assign the inner table rather than the whole `word_embeddings` module, so a get/set round trip
	        # does not drop the projection to the hidden size.
	        self.embeddings.word_embeddings.embeddings = value

	    @merge_with_config_defaults
	    @capture_outputs
	    @auto_docstring
	    def forward(
	        self,
	        input_ids: torch.Tensor | None = None,
	        attention_mask: torch.Tensor | None = None,
	        token_type_ids: torch.Tensor | None = None,
	        position_ids: torch.Tensor | None = None,
	        inputs_embeds: torch.Tensor | None = None,
	        encoder_hidden_states: torch.Tensor | None = None,
	        encoder_attention_mask: torch.Tensor | None = None,
	        past_key_values: Cache | None = None,
	        use_cache: bool | None = None,
	        **kwargs: Unpack[TransformersKwargs],
	    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
	        # Raises when both inputs are given and when neither is given
	        if (input_ids is None) ^ (inputs_embeds is not None):
	            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

	        # Caching is only honored in decoder mode; an encoder never uses a cache
	        if self.config.is_decoder:
	            use_cache = use_cache if use_cache is not None else self.config.use_cache
	        else:
	            use_cache = False

	        # Create a cache on first use; a paired self/cross-attention cache is used when encoder states are involved
	        if use_cache and past_key_values is None:
	            past_key_values = (
	                EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
	                if encoder_hidden_states is not None or self.config.is_encoder_decoder
	                else DynamicCache(config=self.config)
	            )

	        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

	        embedding_output = self.embeddings(
	            input_ids=input_ids,
	            position_ids=position_ids,
	            token_type_ids=token_type_ids,
	            inputs_embeds=inputs_embeds,
	            past_key_values_length=past_key_values_length,
	        )

	        attention_mask, encoder_attention_mask = self._create_attention_masks(
	            attention_mask=attention_mask,
	            encoder_attention_mask=encoder_attention_mask,
	            embedding_output=embedding_output,
	            encoder_hidden_states=encoder_hidden_states,
	            past_key_values=past_key_values,
	        )

	        encoder_outputs = self.encoder(
	            embedding_output,
	            attention_mask=attention_mask,
	            encoder_hidden_states=encoder_hidden_states,
	            encoder_attention_mask=encoder_attention_mask,
	            past_key_values=past_key_values,
	            use_cache=use_cache,
	            position_ids=position_ids,
	            **kwargs,
	        )
	        sequence_output = encoder_outputs.last_hidden_state
	        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

	        return BaseModelOutputWithPoolingAndCrossAttentions(
	            last_hidden_state=sequence_output,
	            pooler_output=pooled_output,
	            past_key_values=encoder_outputs.past_key_values,
	        )

	    def _create_attention_masks(
	        self,
	        attention_mask,
	        encoder_attention_mask,
	        embedding_output,
	        encoder_hidden_states,
	        past_key_values,
	    ):
	        """
	        Build the self-attention mask and, if one was supplied, the encoder (cross-attention) mask.

	        A causal mask is created in decoder mode and a bidirectional mask otherwise. The encoder mask
	        is only converted when `encoder_attention_mask` is not None.

	        Returns:
	            Tuple `(attention_mask, encoder_attention_mask)`.
	        """
	        if self.config.is_decoder:
	            attention_mask = create_causal_mask(
	                config=self.config,
	                inputs_embeds=embedding_output,
	                attention_mask=attention_mask,
	                past_key_values=past_key_values,
	            )
	        else:
	            attention_mask = create_bidirectional_mask(
	                config=self.config,
	                inputs_embeds=embedding_output,
	                attention_mask=attention_mask,
	            )

	        if encoder_attention_mask is not None:
	            encoder_attention_mask = create_bidirectional_mask(
	                config=self.config,
	                inputs_embeds=embedding_output,
	                attention_mask=encoder_attention_mask,
	                encoder_hidden_states=encoder_hidden_states,
	            )

	        return attention_mask, encoder_attention_mask


	@auto_docstring
	class BertForMaskedLM(BertPreTrainedModel):
	    # Only the decoder bias is tied. The word embeddings of this model are a `BertHashTokens` module,
	    # which holds `embeddings` and `projections` submodules and has no `weight` parameter of its own,
	    # so there is no `bert.embeddings.word_embeddings.weight` to tie the decoder weight to.
	    _tied_weights_keys = {
	        "cls.predictions.decoder.bias": "cls.predictions.bias",
	    }
	    config_class = BertHashConfig

	    def __init__(self, config):
	        super().__init__(config)

	        if config.is_decoder:
	            logger.warning(
	                "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
	                "bi-directional self-attention."
	            )

	        # The pooler is not needed for token-level prediction
	        self.bert = BertHashModel(config, add_pooling_layer=False)
	        self.cls = BertOnlyMLMHead(config)

	        # Initialize weights and apply final processing
	        self.post_init()

	    def get_output_embeddings(self):
	        """Return the decoder layer of the masked-LM head."""
	        return self.cls.predictions.decoder

	    def set_output_embeddings(self, new_embeddings):
	        """Replace the decoder layer of the masked-LM head and point the head's bias at the new layer's bias."""
	        self.cls.predictions.decoder = new_embeddings
	        self.cls.predictions.bias = new_embeddings.bias

	    @can_return_tuple
	    @auto_docstring
	    def forward(
	        self,
	        input_ids: torch.Tensor | None = None,
	        attention_mask: torch.Tensor | None = None,
	        token_type_ids: torch.Tensor | None = None,
	        position_ids: torch.Tensor | None = None,
	        inputs_embeds: torch.Tensor | None = None,
	        encoder_hidden_states: torch.Tensor | None = None,
	        encoder_attention_mask: torch.Tensor | None = None,
	        labels: torch.Tensor | None = None,
	        **kwargs: Unpack[TransformersKwargs],
	    ) -> tuple[torch.Tensor] | MaskedLMOutput:
	        r"""
	        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, optional):
	            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
	            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
	            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
	        """
	        outputs = self.bert(
	            input_ids,
	            attention_mask=attention_mask,
	            token_type_ids=token_type_ids,
	            position_ids=position_ids,
	            inputs_embeds=inputs_embeds,
	            encoder_hidden_states=encoder_hidden_states,
	            encoder_attention_mask=encoder_attention_mask,
	            return_dict=True,
	            **kwargs,
	        )

	        sequence_output = outputs.last_hidden_state
	        prediction_scores = self.cls(sequence_output)

	        masked_lm_loss = None
	        if labels is not None:
	            loss_fct = CrossEntropyLoss()  # -100 index = padding token
	            # Flatten to (batch * seq, vocab) vs. (batch * seq,) for the token-level loss
	            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

	        return MaskedLMOutput(
	            loss=masked_lm_loss,
	            logits=prediction_scores,
	            hidden_states=outputs.hidden_states,
	            attentions=outputs.attentions,
	        )



	@auto_docstring(
	    custom_intro="""
	    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
	    output) e.g. for GLUE tasks.
	    """
	)
	class BertHashForSequenceClassification(BertPreTrainedModel):
	    config_class = BertHashConfig

	    def __init__(self, config):
	        super().__init__(config)
	        self.num_labels = config.num_labels
	        self.config = config

	        self.bert = BertHashModel(config)
	        # Fall back to the hidden dropout rate when no classifier-specific rate is configured
	        classifier_dropout = (
	            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
	        )
	        self.dropout = nn.Dropout(classifier_dropout)
	        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

	        # Initialize weights and apply final processing
	        self.post_init()

	    @can_return_tuple
	    @auto_docstring
	    def forward(
	        self,
	        input_ids: torch.Tensor | None = None,
	        attention_mask: torch.Tensor | None = None,
	        token_type_ids: torch.Tensor | None = None,
	        position_ids: torch.Tensor | None = None,
	        inputs_embeds: torch.Tensor | None = None,
	        labels: torch.Tensor | None = None,
	        **kwargs: Unpack[TransformersKwargs],
	    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
	        r"""
	        labels (`torch.LongTensor` of shape `(batch_size,)`, optional):
	            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
	            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
	            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
	        """
	        outputs = self.bert(
	            input_ids,
	            attention_mask=attention_mask,
	            token_type_ids=token_type_ids,
	            position_ids=position_ids,
	            inputs_embeds=inputs_embeds,
	            return_dict=True,
	            **kwargs,
	        )

	        pooled_output = outputs.pooler_output

	        pooled_output = self.dropout(pooled_output)
	        logits = self.classifier(pooled_output)

	        loss = None
	        if labels is not None:
	            # Infer the problem type once from the label count and label dtype, and store it on the config
	            if self.config.problem_type is None:
	                if self.num_labels == 1:
	                    self.config.problem_type = "regression"
	                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
	                    self.config.problem_type = "single_label_classification"
	                else:
	                    self.config.problem_type = "multi_label_classification"

	            if self.config.problem_type == "regression":
	                loss_fct = MSELoss()
	                if self.num_labels == 1:
	                    loss = loss_fct(logits.squeeze(), labels.squeeze())
	                else:
	                    loss = loss_fct(logits, labels)
	            elif self.config.problem_type == "single_label_classification":
	                loss_fct = CrossEntropyLoss()
	                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
	            elif self.config.problem_type == "multi_label_classification":
	                loss_fct = BCEWithLogitsLoss()
	                loss = loss_fct(logits, labels)

	        return SequenceClassifierOutput(
	            loss=loss,
	            logits=logits,
	            hidden_states=outputs.hidden_states,
	            attentions=outputs.attentions,
	        )