| """ |
This code is adapted from the EsmModel implementation in the transformers library.
Source: https://github.com/huggingface/transformers/blob/main/src/transformers/models/esm/modeling_esm.py
| """ |
|
|
| from dataclasses import dataclass |
| from functools import partial |
| from typing import Optional, Sequence, Tuple, Union |
|
|
| import torch |
| import torch.utils.checkpoint |
| from torch import nn |
| from torch.nn import CrossEntropyLoss |
| from transformers.modeling_outputs import ( |
| BaseModelOutputWithPastAndCrossAttentions, |
| BaseModelOutputWithPoolingAndCrossAttentions, |
| MaskedLMOutput, |
| ModelOutput, |
| ) |
| from transformers.modeling_utils import PreTrainedModel |
| from transformers.utils import logging |
|
|
| from .config import UniRNAConfig |
|
|
| logger = logging.get_logger(__name__) |
|
|
|
|
| @dataclass |
| class UniRNASSPredictionOutput(ModelOutput): |
| loss: Optional[torch.FloatTensor] = None |
| logits: Optional[torch.FloatTensor] = None |
| hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None |
| attentions: Optional[Tuple[torch.FloatTensor, ...]] = None |
| pair_mask: Optional[torch.BoolTensor] = None |
|
|
|
|
| def rotate_half(x): |
| x1, x2 = x.chunk(2, dim=-1) |
| return torch.cat((-x2, x1), dim=-1) |
|
|
|
|
| def apply_rotary_pos_emb(x, cos, sin): |
| cos = cos[:, :, : x.shape[-2], :] |
| sin = sin[:, :, : x.shape[-2], :] |
|
|
| return (x * cos) + (rotate_half(x) * sin) |
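
# A minimal worked example of the rotary update above (illustrative only, not part of the model):
# for a feature vector x split into halves (x1, x2) and position-dependent angles theta,
#   x_rot = x * cos(theta) + rotate_half(x) * sin(theta)
#         = (x1 * cos - x2 * sin, x2 * cos + x1 * sin)
# i.e. a 2-D rotation applied pairwise to the half-split features, as in RoFormer.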
|
|
|
|
| class RotaryEmbedding(nn.Module): |
| """ |
| Rotary position embeddings based on those in |
| [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation |
| matrices which depend on their relative positions. |
| """ |
|
|
| def __init__(self, dim: int): |
| super().__init__() |
| |
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
|
|
| self._seq_len_cached = None |
| self._cos_cached = None |
| self._sin_cached = None |
|
|
| def _update_cos_sin_tables(self, x, seq_dimension=2): |
| seq_len = x.shape[seq_dimension] |
|
|
| |
| |
        # Recompute the tables when the sequence length or the device changes.
        if seq_len != self._seq_len_cached or self._cos_cached is None or self._cos_cached.device != x.device:
| self._seq_len_cached = seq_len |
| t = torch.arange(x.shape[seq_dimension], device=x.device).type_as(self.inv_freq) |
| freqs = torch.outer(t, self.inv_freq) |
| emb = torch.cat((freqs, freqs), dim=-1).to(x.device) |
|
|
| self._cos_cached = emb.cos()[None, None, :, :] |
| self._sin_cached = emb.sin()[None, None, :, :] |
|
|
| return self._cos_cached, self._sin_cached |
|
|
| def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: |
| self._cos_cached, self._sin_cached = self._update_cos_sin_tables(k, seq_dimension=-2) |
|
|
| return ( |
| apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached), |
| apply_rotary_pos_emb(k, self._cos_cached, self._sin_cached), |
| ) |
|
|
|
|
| class UniRNAEmbedding(nn.Module): |
| """ |
    Same as BertEmbeddings, with an additional ESM-style token dropout.
| """ |
|
|
| def __init__(self, config): |
| super().__init__() |
| self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) |
|
|
| if config.emb_layer_norm_before: |
| self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) |
| else: |
| self.layer_norm = None |
| self.dropout = nn.Dropout(config.hidden_dropout_prob) |
|
|
| self.padding_idx = config.pad_token_id |
| self.token_dropout = config.token_dropout |
| self.mask_token_id = config.mask_token_id |
|
|
| def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None): |
| if inputs_embeds is None: |
| inputs_embeds = self.word_embeddings(input_ids) |
|
|
| embeddings = inputs_embeds |
| if attention_mask is None: |
| attention_mask = torch.ones(embeddings.shape[:2], device=embeddings.device) |
|
|
| |
| if self.layer_norm is not None: |
| embeddings = self.layer_norm(embeddings) |
        # attention_mask is guaranteed to be set above, so apply it directly.
        embeddings = (embeddings * attention_mask.unsqueeze(-1)).to(embeddings.dtype)
|
|
| embeddings = self.dropout(embeddings) |
| if self.token_dropout and input_ids is not None: |
| embeddings = embeddings.masked_fill((input_ids == self.mask_token_id).unsqueeze(-1), 0.0) |
| |
            mask_ratio_train = 0.15 * 0.8  # expected fraction of zeroed (<mask>) tokens during pre-training
| src_lengths = attention_mask.sum(-1).clamp(min=1).to(embeddings.dtype) |
| mask_ratio_observed = (input_ids == self.mask_token_id).sum(-1).to(embeddings.dtype) / src_lengths |
| denom = (1 - mask_ratio_observed).clamp(min=1e-6) |
| embeddings = (embeddings * (1 - mask_ratio_train) / denom[:, None, None]).to(embeddings.dtype) |
|
|
| return embeddings |
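
# Note on the ESM-style token dropout above (a descriptive sketch of the rescaling, not new behavior):
# embeddings at <mask> positions are zeroed, and the remaining embeddings are rescaled by
#   (1 - mask_ratio_train) / (1 - mask_ratio_observed)
# where mask_ratio_train = 0.15 * 0.8 = 0.12 is the expected masking rate during pre-training and
# mask_ratio_observed is the fraction of <mask> tokens actually present in each sequence, so that the
# magnitude of the embedded sequence matches training-time statistics at inference.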
|
|
|
|
| class UniRNASelfAttention(nn.Module): |
| def __init__(self, config): |
| super().__init__() |
| if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): |
| raise ValueError( |
| f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " |
| f"heads ({config.num_attention_heads})" |
| ) |
|
|
| self.num_attention_heads = config.num_attention_heads |
| self.attention_head_size = int(config.hidden_size / config.num_attention_heads) |
| self.all_head_size = self.num_attention_heads * self.attention_head_size |
|
|
| self.query = nn.Linear(config.hidden_size, self.all_head_size) |
| self.key = nn.Linear(config.hidden_size, self.all_head_size) |
| self.value = nn.Linear(config.hidden_size, self.all_head_size) |
|
|
| self.dropout = nn.Dropout(config.attention_probs_dropout_prob) |
|
|
| self.rotary_embeddings = RotaryEmbedding(dim=self.attention_head_size) |
|
|
| self.is_decoder = config.is_decoder |
|
|
| def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: |
| new_x_shape = x.size()[:-1] + ( |
| self.num_attention_heads, |
| self.attention_head_size, |
| ) |
| x = x.view(new_x_shape) |
| return x.permute(0, 2, 1, 3) |
|
|
| def forward( |
| self, |
| hidden_states: torch.Tensor, |
| attention_mask: Optional[torch.FloatTensor] = None, |
| output_attentions: Optional[bool] = False, |
| ) -> Tuple[torch.Tensor]: |
| mixed_query_layer = self.query(hidden_states) |
|
|
| key_layer = self.transpose_for_scores(self.key(hidden_states)) |
| value_layer = self.transpose_for_scores(self.value(hidden_states)) |
| query_layer = self.transpose_for_scores(mixed_query_layer) |
|
|
| |
| query_layer = query_layer * self.attention_head_size**-0.5 |
|
|
| |
| query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer) |
|
|
| |
| |
|
|
| attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) |
|
|
| if attention_mask is not None: |
| |
| attention_scores = attention_scores + attention_mask |
|
|
| |
| attention_probs = nn.functional.softmax(attention_scores, dim=-1) |
|
|
| |
| |
| attention_probs = self.dropout(attention_probs) |
|
|
| context_layer = torch.matmul(attention_probs, value_layer) |
|
|
| context_layer = context_layer.permute(0, 2, 1, 3).contiguous() |
| new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) |
| context_layer = context_layer.view(new_context_layer_shape) |
|
|
| outputs = (context_layer, attention_probs) if output_attentions else (context_layer, None) |
|
|
| return outputs |
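
# Two conventions used by the attention above (for reference; this mirrors the ESM implementation):
# 1. Queries are pre-scaled by head_dim ** -0.5, so the raw Q @ K^T product is already the scaled
#    dot-product score and no further division is applied.
# 2. `attention_mask` is expected in the "extended" additive form produced by
#    `get_extended_attention_mask`: 0.0 for visible positions and a large negative value
#    (the dtype minimum) for masked positions, added to the scores before the softmax.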
|
|
|
|
| class UniRNAFlashSelfAttention(UniRNASelfAttention): |
| """Self-attention using PyTorch's scaled_dot_product_attention (SDPA) backend.""" |
|
|
| def __init__(self, config): |
| super().__init__(config) |
| self.dropout_prob = config.attention_probs_dropout_prob |
|
|
| def forward( |
| self, |
| hidden_states: torch.Tensor, |
| attention_mask: Optional[torch.Tensor] = None, |
| output_attentions: Optional[bool] = False, |
| ) -> Tuple[torch.Tensor]: |
| if output_attentions: |
| raise ValueError("SDPA attention does not support output_attentions=True") |
|
|
| mixed_query_layer = self.query(hidden_states) |
| key_layer = self.transpose_for_scores(self.key(hidden_states)) |
| value_layer = self.transpose_for_scores(self.value(hidden_states)) |
| query_layer = self.transpose_for_scores(mixed_query_layer) |
|
|
| |
| query_layer = query_layer * self.attention_head_size**-0.5 |
|
|
| |
| query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer) |
|
|
| |
| attn_output = torch.nn.functional.scaled_dot_product_attention( |
| query_layer, |
| key_layer, |
| value_layer, |
| attn_mask=attention_mask, |
| dropout_p=self.dropout_prob if self.training else 0.0, |
| scale=1.0, |
| ) |
|
|
| attn_output = attn_output.permute(0, 2, 1, 3).contiguous() |
| new_shape = attn_output.size()[:-2] + (self.all_head_size,) |
| attn_output = attn_output.view(new_shape) |
|
|
| return (attn_output, None) |
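
# Because the queries are already pre-scaled above, `scale=1.0` is passed to
# scaled_dot_product_attention so the scores are not scaled a second time. A rough equivalence sketch:
#
#   scores = q_scaled @ k.transpose(-1, -2) + mask      # same as the eager path
#   probs = scores.softmax(dim=-1)
#   out = probs @ v
#
# Note that the `scale=` keyword requires a reasonably recent PyTorch (added around 2.1).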
|
|
|
|
| class UniRNASelfOutput(nn.Module): |
| def __init__(self, config): |
| super().__init__() |
| self.dense = nn.Linear(config.hidden_size, config.hidden_size) |
| self.dropout = nn.Dropout(config.hidden_dropout_prob) |
|
|
| def forward(self, hidden_states, input_tensor): |
| hidden_states = self.dense(hidden_states) |
| hidden_states = self.dropout(hidden_states) |
| hidden_states = hidden_states + input_tensor |
| return hidden_states |
|
|
|
|
| class UniRNA_Attention(nn.Module): |
| def __init__(self, config): |
| super().__init__() |
|
|
| if getattr(config, "use_flash_attention", False): |
| self.self = UniRNAFlashSelfAttention(config) |
| else: |
| self.self = UniRNASelfAttention(config) |
| self.output = UniRNASelfOutput(config) |
| self.pruned_heads = set() |
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
| self, |
| hidden_states, |
| attention_mask=None, |
| output_attentions=False, |
| ): |
| hidden_states_ln = self.LayerNorm(hidden_states) |
| self_outputs = self.self( |
| hidden_states_ln, |
| attention_mask, |
| output_attentions, |
| ) |
| attention_output = self.output(self_outputs[0], hidden_states) |
| return (attention_output, self_outputs[1]) |
|
|
|
|
| class UniRNAIntermediate(nn.Module): |
| def __init__(self, config): |
| super().__init__() |
| self.dense = nn.Linear(config.hidden_size, config.intermediate_size) |
|
|
| def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: |
| hidden_states = self.dense(hidden_states) |
| hidden_states = nn.functional.gelu(hidden_states) |
| return hidden_states |
|
|
|
|
| class UniRNAOutput(nn.Module): |
| def __init__(self, config): |
| super().__init__() |
| self.dense = nn.Linear(config.intermediate_size, config.hidden_size) |
| self.dropout = nn.Dropout(config.hidden_dropout_prob) |
|
|
| def forward(self, hidden_states, input_tensor): |
| hidden_states = self.dense(hidden_states) |
| hidden_states = self.dropout(hidden_states) |
| hidden_states = hidden_states + input_tensor |
| return hidden_states |
|
|
|
|
| class UniRNALayer(nn.Module): |
| def __init__(self, config): |
| super().__init__() |
| self.chunk_size_feed_forward = config.chunk_size_feed_forward |
| self.seq_len_dim = 1 |
| self.attention = UniRNA_Attention(config) |
| self.intermediate = UniRNAIntermediate(config) |
| self.output = UniRNAOutput(config) |
| self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) |
|
|
| def forward( |
| self, |
| hidden_states, |
| attention_mask=None, |
| output_attentions=False, |
| ): |
| self_attention_outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions) |
| layer_output = self.feed_forward_chunk(self_attention_outputs[0]) |
| return (layer_output, self_attention_outputs[1]) |
|
|
| def feed_forward_chunk(self, attention_output): |
| attention_output_ln = self.LayerNorm(attention_output) |
| intermediate_output = self.intermediate(attention_output_ln) |
| layer_output = self.output(intermediate_output, attention_output) |
| return layer_output |
|
|
|
|
| class UniRNAEncoder(nn.Module): |
| def __init__(self, config): |
| super().__init__() |
| self.config = config |
| self.layer = nn.ModuleList([UniRNALayer(config) for _ in range(config.num_hidden_layers)]) |
| self.emb_layer_norm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) |
| self.gradient_checkpointing = False |
|
|
| def forward( |
| self, |
| hidden_states, |
| attention_mask=None, |
| output_attentions=False, |
| output_hidden_states=False, |
| ): |
|
|
| all_hidden_states = () if output_hidden_states else None |
| all_self_attentions = () if output_attentions else None |
|
|
| for layer_module in self.layer: |
| if output_hidden_states: |
| all_hidden_states = all_hidden_states + (hidden_states,) |
|
|
| if self.gradient_checkpointing and self.training: |
| layer_outputs = self._gradient_checkpointing_func( |
| layer_module.__call__, |
| hidden_states, |
| attention_mask, |
| output_attentions, |
| ) |
| else: |
| layer_outputs = layer_module( |
| hidden_states, |
| attention_mask, |
| output_attentions, |
| ) |
| hidden_states = layer_outputs[0] |
| if output_attentions: |
| all_self_attentions = all_self_attentions + (layer_outputs[1],) |
|
|
        if self.emb_layer_norm_after is not None:
| hidden_states = self.emb_layer_norm_after(hidden_states) |
|
|
| if output_hidden_states: |
| all_hidden_states = all_hidden_states + (hidden_states,) |
|
|
| return BaseModelOutputWithPastAndCrossAttentions( |
| last_hidden_state=hidden_states, |
| hidden_states=all_hidden_states, |
| attentions=all_self_attentions, |
| ) |
|
|
|
|
| |
| class UniRNAPooler(nn.Module): |
| def __init__(self, config): |
| super().__init__() |
| self.dense = nn.Linear(config.hidden_size, config.hidden_size) |
| self.activation = nn.Tanh() |
|
|
| def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: |
| |
| |
| first_token_tensor = hidden_states[:, 0] |
| pooled_output = self.dense(first_token_tensor) |
| pooled_output = self.activation(pooled_output) |
| return pooled_output |
|
|
|
|
| class UniRNAModel(PreTrainedModel): |
    """
    The bare Uni-RNA encoder transformer outputting raw hidden states and an optional pooled output.
    Unlike the ESM/BERT reference implementation, this model is encoder-only: no cross-attention layers
    or decoder-style key/value caching are added.
    """

    config_class = UniRNAConfig
    supports_gradient_checkpointing = True
    main_input_name = "input_ids"
|
|
| def __init__(self, config, add_pooling_layer=True): |
| super().__init__(config) |
| self.config = config |
| self.embeddings = UniRNAEmbedding(config) |
| self.encoder = UniRNAEncoder(config) |
| self.pooler = UniRNAPooler(config) if add_pooling_layer else None |
|
|
| use_flash_attention = getattr(config, "use_flash_attention", False) |
| if use_flash_attention: |
| logger.info("Using Uni-RNA SDPA Attention") |
| else: |
| logger.info("Using Uni-RNA Attention") |
|
|
| |
| self.post_init() |
|
|
| def _set_gradient_checkpointing(self, enable: bool, gradient_checkpointing_func=None): |
| self.encoder.gradient_checkpointing = enable |
| if gradient_checkpointing_func is not None: |
| self.encoder._gradient_checkpointing_func = gradient_checkpointing_func |
|
|
| def get_input_embeddings(self): |
| return self.embeddings.word_embeddings |
|
|
| def set_input_embeddings(self, value): |
| self.embeddings.word_embeddings = value |
|
|
    def _prune_heads(self, heads_to_prune):
        """
        Head pruning is not implemented: UniRNA_Attention does not expose a `prune_heads` method, so the
        interface inherited from `PreTrainedModel` cannot be satisfied here.
        """
        raise NotImplementedError("Head pruning is not supported by UniRNAModel.")
|
|
| def forward( |
| self, |
| input_ids: Optional[torch.Tensor] = None, |
| attention_mask: Optional[torch.Tensor] = None, |
| inputs_embeds: Optional[torch.Tensor] = None, |
| output_attentions: Optional[bool] = None, |
| output_hidden_states: Optional[bool] = None, |
| return_dict: Optional[bool] = None, |
| ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: |
| r""" |
| encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): |
| Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if |
| the model is configured as a decoder. |
| encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): |
| Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in |
| the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: |
| |
| - 1 for tokens that are **not masked**, |
| - 0 for tokens that are **masked**. |
| past_key_values (`Tuple[Tuple[torch.FloatTensor]]`, *optional*): |
| Tuple of length `config.n_layers`. Each tuple has 4 tensors of shape |
| `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`. Contains precomputed key and value |
| hidden states of the attention blocks. Can be used to speed up decoding. |
| |
| If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that |
| don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all |
| `decoder_input_ids` of shape `(batch_size, sequence_length)`. |
| use_cache (`bool`, *optional*): |
| If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see |
| `past_key_values`). |
| """ |
| output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions |
| output_hidden_states = ( |
| output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states |
| ) |
| return_dict = return_dict if return_dict is not None else self.config.use_return_dict |
|
|
| input_shape, attention_mask = self._validate_and_shape_inputs(input_ids, inputs_embeds, attention_mask) |
| extended_attention_mask = self._prepare_attention_mask(attention_mask, input_shape) |
| embedding_output = self._compute_embedding_output(input_ids, attention_mask, inputs_embeds) |
| encoder_outputs = self.encoder( |
| embedding_output, |
| attention_mask=extended_attention_mask, |
| output_attentions=output_attentions, |
| output_hidden_states=output_hidden_states, |
| ) |
| sequence_output, pooled_output = self._pool_outputs(encoder_outputs[0], attention_mask) |
|
|
| if not return_dict: |
| output = (sequence_output, pooled_output) + encoder_outputs[1:] |
| return output |
|
|
| return BaseModelOutputWithPoolingAndCrossAttentions( |
| last_hidden_state=sequence_output, |
| pooler_output=pooled_output, |
| past_key_values=encoder_outputs.past_key_values, |
| hidden_states=encoder_outputs.hidden_states, |
| attentions=encoder_outputs.attentions, |
| cross_attentions=encoder_outputs.cross_attentions, |
| ) |
|
|
| def _validate_and_shape_inputs( |
| self, |
| input_ids: Optional[torch.Tensor], |
| inputs_embeds: Optional[torch.Tensor], |
| attention_mask: Optional[torch.Tensor], |
| ) -> Tuple[Tuple[int, ...], torch.Tensor]: |
| if input_ids is not None and inputs_embeds is not None: |
| raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") |
| if input_ids is None and inputs_embeds is None: |
| raise ValueError("You have to specify either input_ids or inputs_embeds") |
|
|
| if input_ids is not None: |
| self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) |
| input_shape = input_ids.size() |
| device = input_ids.device |
| else: |
| input_shape = inputs_embeds.size()[:-1] |
| device = inputs_embeds.device |
|
|
| batch_size, seq_length = input_shape |
| if attention_mask is None: |
| attention_mask = torch.ones((batch_size, seq_length), device=device) |
| return input_shape, attention_mask |
|
|
| def _prepare_attention_mask(self, attention_mask: torch.Tensor, input_shape: Tuple[int, ...]) -> torch.Tensor: |
| return self.get_extended_attention_mask(attention_mask, input_shape) |
|
|
| def _compute_embedding_output( |
| self, |
| input_ids: Optional[torch.Tensor], |
| attention_mask: torch.Tensor, |
| inputs_embeds: Optional[torch.Tensor], |
| ) -> torch.Tensor: |
| return self.embeddings(input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds) |
|
|
| def _pool_outputs( |
| self, sequence_output: torch.Tensor, attention_mask: torch.Tensor |
| ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: |
| |
        # AvgPooler accepts an attention mask, while the BERT-style UniRNAPooler only takes hidden
        # states; fall back to the single-argument call for the latter.
        try:
            pooled_output = self.pooler(sequence_output, attention_mask) if self.pooler is not None else None
        except TypeError:
            pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
| return sequence_output, pooled_output |
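
# Illustrative forward-pass sketch for UniRNAModel (the exact UniRNAConfig fields and any tokenizer
# are assumptions, not defined in this file):
#
#   config = UniRNAConfig(vocab_size=..., hidden_size=..., num_hidden_layers=..., num_attention_heads=...)
#   model = UniRNAModel(config)
#   input_ids = torch.randint(0, config.vocab_size, (2, 64))
#   attention_mask = torch.ones_like(input_ids)
#   outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#   outputs.last_hidden_state  # (2, 64, hidden_size)
#   outputs.pooler_output      # (2, hidden_size) if a pooling layer was added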
|
|
|
|
| class UniRNAForMaskedLM(PreTrainedModel): |
| _tied_weights_keys = ["lm_head.decoder.weight"] |
| config_class = UniRNAConfig |
| supports_gradient_checkpointing = True |
| main_input_name = "input_ids" |
|
|
| def __init__(self, config): |
| super().__init__(config) |
|
|
| self.config = config |
| self.embeddings = UniRNAEmbedding(config) |
| self.encoder = UniRNAEncoder(config) |
| self.lm_head = UniRNALMHead(config) |
|
|
| self.post_init() |
|
|
| def _set_gradient_checkpointing(self, enable: bool, gradient_checkpointing_func=None): |
| self.encoder.gradient_checkpointing = enable |
| if gradient_checkpointing_func is not None: |
| self.encoder._gradient_checkpointing_func = gradient_checkpointing_func |
|
|
| def get_input_embeddings(self): |
| return self.embeddings.word_embeddings |
|
|
| def set_input_embeddings(self, value): |
| self.embeddings.word_embeddings = value |
|
|
| def get_output_embeddings(self): |
| return self.lm_head.decoder |
|
|
| def set_output_embeddings(self, new_embeddings): |
| self.lm_head.decoder = new_embeddings |
|
|
| def forward( |
| self, |
| input_ids: Optional[torch.Tensor] = None, |
| attention_mask: Optional[torch.Tensor] = None, |
| inputs_embeds: Optional[torch.Tensor] = None, |
| labels: Optional[torch.Tensor] = None, |
| output_attentions: Optional[bool] = None, |
| output_hidden_states: Optional[bool] = None, |
| return_dict: Optional[bool] = None, |
| ) -> Union[Tuple, MaskedLMOutput]: |
| r""" |
| labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): |
| Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., |
| config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the |
| loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` |
| """ |
| output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions |
| output_hidden_states = ( |
| output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states |
| ) |
| return_dict = return_dict if return_dict is not None else self.config.use_return_dict |
|
|
| if input_ids is not None and inputs_embeds is not None: |
| raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") |
| elif input_ids is not None: |
| self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) |
| input_shape = input_ids.size() |
| elif inputs_embeds is not None: |
| input_shape = inputs_embeds.size()[:-1] |
| else: |
| raise ValueError("You have to specify either input_ids or inputs_embeds") |
|
|
| batch_size, seq_length = input_shape |
| device = input_ids.device if input_ids is not None else inputs_embeds.device |
|
|
| if attention_mask is None: |
| attention_mask = torch.ones(((batch_size, seq_length)), device=device) |
|
|
| extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) |
|
|
| embedding_output = self.embeddings( |
| input_ids=input_ids, |
| attention_mask=attention_mask, |
| inputs_embeds=inputs_embeds, |
| ) |
|
|
| encoder_outputs = self.encoder( |
| embedding_output, |
| attention_mask=extended_attention_mask, |
| output_attentions=output_attentions, |
| output_hidden_states=output_hidden_states, |
| ) |
| sequence_output = encoder_outputs[0] |
|
|
| prediction_scores = self.lm_head(sequence_output) |
|
|
| loss = None |
| if labels is not None: |
| loss_fct = CrossEntropyLoss() |
| loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) |
|
|
| if not return_dict: |
| output = (prediction_scores,) + encoder_outputs[1:] |
| return ((loss,) + output) if loss is not None else output |
|
|
| return MaskedLMOutput( |
| loss=loss, |
| logits=prediction_scores, |
| hidden_states=encoder_outputs.hidden_states, |
| attentions=encoder_outputs.attentions, |
| ) |
|
|
|
|
| class UniRNAForSSPredict(PreTrainedModel): |
| """ |
| TODO: make it compatible with transformers, create new 'modeling_outputs' class for SS prediction |
| """ |
|
|
| config_class = UniRNAConfig |
| supports_gradient_checkpointing = True |
| main_input_name = "input_ids" |
|
|
| def __init__(self, config): |
| |
| raise RuntimeError( |
| "UniRNAForSSPredict is disabled and not supported. This head is untrained and must not be called." |
| ) |
|
|
| def _set_gradient_checkpointing(self, enable: bool, gradient_checkpointing_func=None): |
| self.encoder.gradient_checkpointing = enable |
| if gradient_checkpointing_func is not None: |
| self.encoder._gradient_checkpointing_func = gradient_checkpointing_func |
|
|
| def get_input_embeddings(self): |
| return self.embeddings.word_embeddings |
|
|
| def set_input_embeddings(self, value): |
| self.embeddings.word_embeddings = value |
|
|
| def forward( |
| self, |
| input_ids: Optional[torch.LongTensor] = None, |
| attention_mask: Optional[torch.Tensor] = None, |
| inputs_embeds: Optional[torch.Tensor] = None, |
| labels: Optional[torch.Tensor] = None, |
| output_attentions: Optional[bool] = None, |
| output_hidden_states: Optional[bool] = None, |
| return_dict: Optional[bool] = None, |
| ) -> Union[Tuple, UniRNASSPredictionOutput]: |
| r""" |
| labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): |
| Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., |
| config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the |
| loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` |
| kwargs (`Dict[str, any]`, optional, defaults to *{}*): |
| Used to hide legacy arguments that have been deprecated. |
| """ |
|
|
| output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions |
| output_hidden_states = ( |
| output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states |
| ) |
| return_dict = return_dict if return_dict is not None else self.config.use_return_dict |
|
|
| if input_ids is not None and inputs_embeds is not None: |
| raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") |
| elif input_ids is not None: |
| self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) |
| input_shape = input_ids.size() |
| elif inputs_embeds is not None: |
| input_shape = inputs_embeds.size()[:-1] |
| else: |
| raise ValueError("You have to specify either input_ids or inputs_embeds") |
|
|
| batch_size, seq_length = input_shape |
| device = input_ids.device if input_ids is not None else inputs_embeds.device |
|
|
| if attention_mask is None: |
| attention_mask = torch.ones(((batch_size, seq_length)), device=device) |
|
|
| extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) |
|
|
| embedding_output = self.embeddings( |
| input_ids=input_ids, |
| attention_mask=attention_mask, |
| inputs_embeds=inputs_embeds, |
| ) |
|
|
| encoder_outputs = self.encoder( |
| embedding_output, |
| attention_mask=extended_attention_mask, |
| output_attentions=output_attentions, |
| output_hidden_states=output_hidden_states, |
| ) |
|
|
| sequence_output = encoder_outputs[0] |
| logits, pair_mask = self.heads(sequence_output, attention_mask=attention_mask, return_mask=True) |
|
|
| loss = None |
| if labels is not None: |
| if labels.dim() == 3: |
| labels = labels.unsqueeze(-1) |
| if labels.shape[1] == logits.shape[1] + 2 and labels.shape[2] == logits.shape[2] + 2: |
| labels = labels[:, 1:-1, 1:-1, :] |
| labels = labels.to(logits.dtype) |
| loss_fct = nn.BCEWithLogitsLoss() |
| if pair_mask is not None: |
| loss = loss_fct(logits[pair_mask], labels[pair_mask]) |
| else: |
| loss = loss_fct(logits, labels) |
|
|
| if not return_dict: |
| output = (logits, encoder_outputs.hidden_states, encoder_outputs.attentions, pair_mask) |
| return ((loss,) + output) if loss is not None else output |
|
|
| return UniRNASSPredictionOutput( |
| loss=loss, |
| logits=logits, |
| hidden_states=encoder_outputs.hidden_states, |
| attentions=encoder_outputs.attentions, |
| pair_mask=pair_mask, |
| ) |
|
|
|
|
| class UniRNALMHead(nn.Module): |
| """UniRNA Head for masked language modeling.""" |
|
|
| def __init__(self, config): |
| super().__init__() |
| self.dense = nn.Linear(config.hidden_size, config.hidden_size) |
| self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) |
| self.decoder = nn.Linear(config.hidden_size, config.vocab_size) |
|
|
| def forward(self, features): |
| x = self.dense(features) |
| x = nn.functional.gelu(x) |
| x = self.layer_norm(x) |
|
|
| |
        # Project back to the vocabulary size.
        x = self.decoder(x)
| return x |
|
|
|
|
| class Dense(nn.Module): |
| def __init__( |
| self, |
| in_features: int, |
| out_features: int, |
| norm: str = "LayerNorm", |
| activation: str = "ReLU", |
| dropout: float = 0.1, |
| pool: str = "AdaptiveAvgPool1d", |
| bias: bool = True, |
| residual: bool = True, |
| ) -> None: |
| super().__init__() |
| self.residual = residual |
| self.linear = nn.Linear(in_features, out_features, bias=bias) |
| self.norm = getattr(nn, norm)(out_features) if norm else nn.Identity() |
| self.activation = getattr(nn, activation)() if activation else nn.Identity() |
| self.dropout = nn.Dropout(dropout) |
        # Pooling applied to the residual branch so its last dimension matches `out_features`.
        self.pool = getattr(nn, pool)(out_features) if pool else (nn.Identity() if self.residual else None)
|
|
| def forward(self, x): |
| out = self.linear(x) |
| out = self.norm(out) |
| out = self.activation(out) |
| out = self.dropout(out) |
| if self.residual: |
| out = out + self.pool(x) |
| return out |
|
|
|
|
| class MLP(nn.Module): |
| def __init__( |
| self, |
| *features: Sequence[int], |
| norm: str = "LayerNorm", |
| activation: str = "ReLU", |
| dropout: float = 0.1, |
| pool: str = "AdaptiveAvgPool1d", |
| bias: bool = True, |
| residual: bool = True, |
| linear_output: bool = True |
| ) -> None: |
| super().__init__() |
        # Allow calling MLP([d0, d1, ..., dn]) with a single sequence argument as well as MLP(d0, d1, ..., dn).
        if len(features) == 1 and isinstance(features[0], Sequence):
            features = features[0]
        if len(features) < 2:
            raise ValueError(f"`features` of MLP should have at least 2 elements, but got {len(features)}")
| dense = partial( |
| Dense, |
| norm=norm, |
| activation=activation, |
| dropout=dropout, |
| pool=pool, |
| bias=bias, |
| residual=residual, |
| ) |
| if linear_output: |
| layers = [dense(in_features, out_features) for in_features, out_features in zip(features, features[1:-1])] |
| layers.append(nn.Linear(features[-2], features[-1], bias=bias)) |
| else: |
| layers = [dense(in_features, out_features) for in_features, out_features in zip(features, features[1:])] |
| self.layers = nn.Sequential(*layers) |
|
|
| def forward(self, x): |
| return self.layers(x) |
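
# Example of how the constructor arguments expand (a sketch based on the logic above):
#
#   MLP(128, 64, 2)                       -> Dense(128 -> 64), then nn.Linear(64, 2)   (linear_output=True)
#   MLP(128, 64, 2, linear_output=False)  -> Dense(128 -> 64), Dense(64 -> 2)
#   MLP([128, 64, 2])                     -> same as MLP(128, 64, 2); a single sequence argument is unpacked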
|
|
|
|
| class UniRNASSHead(nn.Module): |
| """UniRNA head for Secondary Structure Prediction""" |
|
|
| def __init__(self, config) -> None: |
| super().__init__() |
|
|
| self.qk_proj = nn.Linear(config.hidden_size, 2 * config.hidden_size) |
| self.ffn = MLP(1, config.hidden_size, residual=False) |
| self.linear = nn.Linear(config.hidden_size, 1) |
|
|
| def forward(self, features, attention_mask: Optional[torch.Tensor] = None, return_mask: bool = False): |
| x = features[:, 1:-1] |
| q, k = self.qk_proj(x).chunk(2, dim=-1) |
| contact_map = (q @ k.transpose(-2, -1)).unsqueeze(-1) |
| contact_map = contact_map + self.ffn(contact_map) |
| logits = self.linear(contact_map) |
|
|
| pair_mask = None |
| if attention_mask is not None: |
| core_mask = attention_mask[:, 1:-1].bool() |
| pair_mask = core_mask.unsqueeze(-1) & core_mask.unsqueeze(-2) |
| pair_mask = pair_mask.unsqueeze(-1) |
| logits = logits.masked_fill(~pair_mask, 0.0) |
|
|
| return (logits, pair_mask) if return_mask else logits |
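
# Shape sketch for UniRNASSHead (illustrative): given hidden states of shape (batch, seq_len, hidden),
# the special tokens at both ends are stripped, a pairwise score q_i . k_j is formed for every position
# pair, and the head returns logits of shape (batch, seq_len - 2, seq_len - 2, 1). `pair_mask` marks the
# pairs where both positions are real (non-padding) tokens; padded pairs have their logits zeroed.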
|
|
|
|
| class AvgPooler(nn.Module): |
| def __init__(self): |
| super().__init__() |
|
|
| def forward(self, hidden_states, attention_mask=None): |
| if attention_mask is None: |
| attention_mask = torch.ones(hidden_states.shape[:2], device=hidden_states.device, dtype=torch.bool) |
| else: |
| attention_mask = attention_mask.bool() |
|
|
| if hidden_states.size(1) > 2: |
| core_states = hidden_states[:, 1:-1, :] |
| core_mask = attention_mask[:, 1:-1] |
| else: |
| core_states = hidden_states |
| core_mask = attention_mask |
|
|
| core_mask = core_mask.unsqueeze(-1) |
| masked_states = core_states * core_mask |
| denom = core_mask.sum(dim=1).clamp(min=1).to(hidden_states.dtype) |
| return masked_states.sum(dim=1) / denom |
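
# AvgPooler sketch: mean over real tokens, excluding the first and last positions (assumed to be the
# <cls>/<eos>-style special tokens) whenever the sequence is long enough to contain them.
#
#   hidden = torch.randn(2, 10, 16)
#   mask = torch.ones(2, 10)
#   mask[1, 6:] = 0                      # second sequence is padded after position 5
#   pooled = AvgPooler()(hidden, mask)   # (2, 16), averaged over non-padding, non-special positions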
|
|
|
|
| class UniRNAModels(UniRNAModel): |
| config_class = UniRNAConfig |
| supports_gradient_checkpointing = True |
|
|
| def __init__(self, *args, **kwargs): |
| super().__init__(*args, **kwargs) |
|
|
| |
        # Swap the BERT-style first-token pooler for masked mean pooling over non-special tokens.
        del self.pooler
        self.pooler = AvgPooler()
|
|
|
|
| class UniRNAForMLM(UniRNAForMaskedLM): |
| config_class = UniRNAConfig |
| supports_gradient_checkpointing = True |
|
|
| def __init__(self, *args, **kwargs): |
| super().__init__(*args, **kwargs) |
|
|
| |
        # Mean pooler over non-special tokens; not used inside forward().
        self.pooler = AvgPooler()
|
|
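
# End-to-end usage sketch (commented out; the checkpoint path is hypothetical and the exact
# UniRNAConfig fields are assumptions):
#
#   model = UniRNAModels.from_pretrained("path/to/uni-rna-checkpoint")
#   out = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
#   per_token = out.last_hidden_state           # (batch, seq_len, hidden_size)
#   sequence_embedding = out.pooler_output      # masked mean over non-special tokens (AvgPooler)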