"""
This code is modified from the EsmModel implementation in the transformers library.
Source: https://github.com/huggingface/transformers/blob/main/src/transformers/models/esm/modeling_esm.py
"""
from dataclasses import dataclass
from functools import partial
from typing import Optional, Sequence, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MaskedLMOutput,
ModelOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from .config import UniRNAConfig
logger = logging.get_logger(__name__)
@dataclass
class UniRNASSPredictionOutput(ModelOutput):
loss: Optional[torch.FloatTensor] = None
logits: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
pair_mask: Optional[torch.BoolTensor] = None
def rotate_half(x):
x1, x2 = x.chunk(2, dim=-1)
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(x, cos, sin):
cos = cos[:, :, : x.shape[-2], :]
sin = sin[:, :, : x.shape[-2], :]
return (x * cos) + (rotate_half(x) * sin)
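# Note on the rotary scheme used above: rotate_half pairs channel i with channel i + dim/2 (the ESM/GPT-NeoX
# convention) rather than interleaving even/odd channels. Position m rotates the i-th channel pair by an angle
# m * inv_freq[i] = m / 10000^(2i/dim), and because queries and keys receive the same rotation, the position-dependent
# part of a query-key dot product depends only on the relative offset between the two positions.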
class RotaryEmbedding(nn.Module):
"""
Rotary position embeddings based on those in
[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
matrices which depend on their relative positions.
"""
def __init__(self, dim: int):
super().__init__()
# Generate and save the inverse frequency buffer (non trainable)
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
self._seq_len_cached = None
self._cos_cached = None
self._sin_cached = None
def _update_cos_sin_tables(self, x, seq_dimension=2):
seq_len = x.shape[seq_dimension]
# Reset the tables if the sequence length has changed,
# or if we're on a new device (possibly due to tracing for instance)
if seq_len != self._seq_len_cached or self._cos_cached.device != x.device:
self._seq_len_cached = seq_len
t = torch.arange(x.shape[seq_dimension], device=x.device).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
self._cos_cached = emb.cos()[None, None, :, :]
self._sin_cached = emb.sin()[None, None, :, :]
return self._cos_cached, self._sin_cached
def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
self._cos_cached, self._sin_cached = self._update_cos_sin_tables(k, seq_dimension=-2)
return (
apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached),
apply_rotary_pos_emb(k, self._cos_cached, self._sin_cached),
)
class UniRNAEmbedding(nn.Module):
"""
    Same as BertEmbeddings, with an additional token_dropout.
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
if config.emb_layer_norm_before:
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
else:
self.layer_norm = None
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.padding_idx = config.pad_token_id
self.token_dropout = config.token_dropout
self.mask_token_id = config.mask_token_id
def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None):
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
embeddings = inputs_embeds
if attention_mask is None:
attention_mask = torch.ones(embeddings.shape[:2], device=embeddings.device)
        # By default, token dropout is applied (see below), following the ESM-style masked-token rescaling.
if self.layer_norm is not None:
embeddings = self.layer_norm(embeddings)
if attention_mask is not None:
embeddings = (embeddings * attention_mask.unsqueeze(-1)).to(embeddings.dtype)
embeddings = self.dropout(embeddings)
if self.token_dropout and input_ids is not None:
embeddings = embeddings.masked_fill((input_ids == self.mask_token_id).unsqueeze(-1), 0.0)
            # 0.15 is the default MLM masking probability; 0.8 is the fraction of selected tokens actually replaced by the mask token
mask_ratio_train = 0.15 * 0.8
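            # Rescale the (mask-zeroed) embeddings so their expected magnitude matches training statistics:
            # multiply by (1 - expected train mask ratio) / (1 - observed mask ratio in each sequence).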
src_lengths = attention_mask.sum(-1).clamp(min=1).to(embeddings.dtype)
mask_ratio_observed = (input_ids == self.mask_token_id).sum(-1).to(embeddings.dtype) / src_lengths
denom = (1 - mask_ratio_observed).clamp(min=1e-6)
embeddings = (embeddings * (1 - mask_ratio_train) / denom[:, None, None]).to(embeddings.dtype)
return embeddings
class UniRNASelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.rotary_embeddings = RotaryEmbedding(dim=self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (
self.num_attention_heads,
self.attention_head_size,
)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
        # Pre-scale the query by 1/sqrt(head_size), as in the EsmModel implementation in transformers
query_layer = query_layer * self.attention_head_size**-0.5
# Apply rotary embeddings
query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
        # For faster computation, you can use torch.nn.functional.scaled_dot_product_attention (see UniRNAFlashSelfAttention below)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the UniRNAModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer, None)
return outputs
class UniRNAFlashSelfAttention(UniRNASelfAttention):
"""Self-attention using PyTorch's scaled_dot_product_attention (SDPA) backend."""
def __init__(self, config):
super().__init__(config)
self.dropout_prob = config.attention_probs_dropout_prob
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
if output_attentions:
raise ValueError("SDPA attention does not support output_attentions=True")
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
# Same manual scaling as UniRNASelfAttention
query_layer = query_layer * self.attention_head_size**-0.5
# Apply rotary embeddings
query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)
# Use PyTorch SDPA; scale=1.0 because we already scaled query above
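        # `attention_mask` here is the additive float mask produced by get_extended_attention_mask
        # (0 for visible positions, a large negative value for padding); SDPA adds a float mask to the raw scores.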
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_layer,
key_layer,
value_layer,
attn_mask=attention_mask,
dropout_p=self.dropout_prob if self.training else 0.0,
scale=1.0,
)
attn_output = attn_output.permute(0, 2, 1, 3).contiguous()
new_shape = attn_output.size()[:-2] + (self.all_head_size,)
attn_output = attn_output.view(new_shape)
return (attn_output, None)
class UniRNASelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
return hidden_states
class UniRNA_Attention(nn.Module):
def __init__(self, config):
super().__init__()
if getattr(config, "use_flash_attention", False):
self.self = UniRNAFlashSelfAttention(config)
else:
self.self = UniRNASelfAttention(config)
self.output = UniRNASelfOutput(config)
self.pruned_heads = set()
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# TODO: add pruning heads
# def prune_heads(self, heads):
# if len(heads) == 0:
# return
# heads, index = find_pruneable_heads_and_indices(
# heads,
# self.self.num_attention_heads,
# self.self.attention_head_size,
# self.pruned_heads,
# )
# # Prune linear layers
# self.self.query = prune_linear_layer(self.self.query, index)
# self.self.key = prune_linear_layer(self.self.key, index)
# self.self.value = prune_linear_layer(self.self.value, index)
# self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# # Update hyper params and store pruned heads
# self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
# self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
# self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
):
hidden_states_ln = self.LayerNorm(hidden_states)
self_outputs = self.self(
hidden_states_ln,
attention_mask,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
return (attention_output, self_outputs[1])
class UniRNAIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = nn.functional.gelu(hidden_states)
return hidden_states
class UniRNAOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
return hidden_states
class UniRNALayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = UniRNA_Attention(config)
self.intermediate = UniRNAIntermediate(config)
self.output = UniRNAOutput(config)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
):
self_attention_outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions)
layer_output = self.feed_forward_chunk(self_attention_outputs[0])
return (layer_output, self_attention_outputs[1])
def feed_forward_chunk(self, attention_output):
attention_output_ln = self.LayerNorm(attention_output)
intermediate_output = self.intermediate(attention_output_ln)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class UniRNAEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([UniRNALayer(config) for _ in range(config.num_hidden_layers)])
self.emb_layer_norm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
output_hidden_states=False,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for layer_module in self.layer:
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.emb_layer_norm_after:
hidden_states = self.emb_layer_norm_after(hidden_states)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
# Copied from transformers.models.bert.modeling_bert.BertPooler
class UniRNAPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class UniRNAModel(PreTrainedModel):
config_class = UniRNAConfig
supports_gradient_checkpointing = True
main_input_name = "input_ids"
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embeddings = UniRNAEmbedding(config)
self.encoder = UniRNAEncoder(config)
self.pooler = UniRNAPooler(config) if add_pooling_layer else None
use_flash_attention = getattr(config, "use_flash_attention", False)
if use_flash_attention:
logger.info("Using Uni-RNA SDPA Attention")
else:
logger.info("Using Uni-RNA Attention")
# Initialize weights and apply final processing
self.post_init()
def _set_gradient_checkpointing(self, enable: bool, gradient_checkpointing_func=None):
self.encoder.gradient_checkpointing = enable
if gradient_checkpointing_func is not None:
self.encoder._gradient_checkpointing_func = gradient_checkpointing_func
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`Tuple[Tuple[torch.FloatTensor]]`, *optional*):
Tuple of length `config.n_layers`. Each tuple has 4 tensors of shape
`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`. Contains precomputed key and value
hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
input_shape, attention_mask = self._validate_and_shape_inputs(input_ids, inputs_embeds, attention_mask)
extended_attention_mask = self._prepare_attention_mask(attention_mask, input_shape)
embedding_output = self._compute_embedding_output(input_ids, attention_mask, inputs_embeds)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
sequence_output, pooled_output = self._pool_outputs(encoder_outputs[0], attention_mask)
if not return_dict:
output = (sequence_output, pooled_output) + encoder_outputs[1:]
return output
return BaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
past_key_values=encoder_outputs.past_key_values,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
cross_attentions=encoder_outputs.cross_attentions,
)
def _validate_and_shape_inputs(
self,
input_ids: Optional[torch.Tensor],
inputs_embeds: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor],
) -> Tuple[Tuple[int, ...], torch.Tensor]:
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
if input_ids is None and inputs_embeds is None:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
device = input_ids.device
else:
input_shape = inputs_embeds.size()[:-1]
device = inputs_embeds.device
batch_size, seq_length = input_shape
if attention_mask is None:
attention_mask = torch.ones((batch_size, seq_length), device=device)
return input_shape, attention_mask
def _prepare_attention_mask(self, attention_mask: torch.Tensor, input_shape: Tuple[int, ...]) -> torch.Tensor:
return self.get_extended_attention_mask(attention_mask, input_shape)
def _compute_embedding_output(
self,
input_ids: Optional[torch.Tensor],
attention_mask: torch.Tensor,
inputs_embeds: Optional[torch.Tensor],
) -> torch.Tensor:
return self.embeddings(input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds)
def _pool_outputs(
self, sequence_output: torch.Tensor, attention_mask: torch.Tensor
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Keep compatibility with deepprotein, which wraps the model with a different pooler that may or may not accept an attention mask
try:
pooled_output = self.pooler(sequence_output, attention_mask) if self.pooler is not None else None
except TypeError:
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
return sequence_output, pooled_output
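# Note: when this model is distributed as custom code on the Hugging Face Hub (i.e. the repository's config registers
# these classes via `auto_map`, which is defined in the accompanying config rather than in this file), it can
# typically be loaded with `AutoModel.from_pretrained(repo_id, trust_remote_code=True)`.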
class UniRNAForMaskedLM(PreTrainedModel):
_tied_weights_keys = ["lm_head.decoder.weight"]
config_class = UniRNAConfig
supports_gradient_checkpointing = True
main_input_name = "input_ids"
def __init__(self, config):
super().__init__(config)
self.config = config
self.embeddings = UniRNAEmbedding(config)
self.encoder = UniRNAEncoder(config)
self.lm_head = UniRNALMHead(config)
self.post_init()
def _set_gradient_checkpointing(self, enable: bool, gradient_checkpointing_func=None):
self.encoder.gradient_checkpointing = enable
if gradient_checkpointing_func is not None:
self.encoder._gradient_checkpointing_func = gradient_checkpointing_func
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def get_output_embeddings(self):
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
batch_size, seq_length = input_shape
device = input_ids.device if input_ids is not None else inputs_embeds.device
if attention_mask is None:
attention_mask = torch.ones(((batch_size, seq_length)), device=device)
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
embedding_output = self.embeddings(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
sequence_output = encoder_outputs[0]
prediction_scores = self.lm_head(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + encoder_outputs[1:]
return ((loss,) + output) if loss is not None else output
return MaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class UniRNAForSSPredict(PreTrainedModel):
"""
TODO: make it compatible with transformers, create new 'modeling_outputs' class for SS prediction
"""
config_class = UniRNAConfig
supports_gradient_checkpointing = True
main_input_name = "input_ids"
def __init__(self, config):
# Explicitly block usage until this head is trained and validated.
raise RuntimeError(
"UniRNAForSSPredict is disabled and not supported. This head is untrained and must not be called."
)
def _set_gradient_checkpointing(self, enable: bool, gradient_checkpointing_func=None):
self.encoder.gradient_checkpointing = enable
if gradient_checkpointing_func is not None:
self.encoder._gradient_checkpointing_func = gradient_checkpointing_func
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, UniRNASSPredictionOutput]:
r"""
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, sequence_length)` or
            `(batch_size, sequence_length, sequence_length, 1)`, *optional*):
            Binary base-pairing targets for secondary structure prediction. Labels that still include the CLS/EOS
            positions are trimmed to match the pairwise logits. The loss is a `BCEWithLogitsLoss` computed over the
            valid (non-padding) position pairs when a pair mask is available.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
batch_size, seq_length = input_shape
device = input_ids.device if input_ids is not None else inputs_embeds.device
if attention_mask is None:
attention_mask = torch.ones(((batch_size, seq_length)), device=device)
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
embedding_output = self.embeddings(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
sequence_output = encoder_outputs[0]
logits, pair_mask = self.heads(sequence_output, attention_mask=attention_mask, return_mask=True)
loss = None
if labels is not None:
if labels.dim() == 3:
labels = labels.unsqueeze(-1)
if labels.shape[1] == logits.shape[1] + 2 and labels.shape[2] == logits.shape[2] + 2:
labels = labels[:, 1:-1, 1:-1, :]
labels = labels.to(logits.dtype)
loss_fct = nn.BCEWithLogitsLoss()
if pair_mask is not None:
loss = loss_fct(logits[pair_mask], labels[pair_mask])
else:
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits, encoder_outputs.hidden_states, encoder_outputs.attentions, pair_mask)
return ((loss,) + output) if loss is not None else output
return UniRNASSPredictionOutput(
loss=loss,
logits=logits,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
pair_mask=pair_mask,
)
class UniRNALMHead(nn.Module):
"""UniRNA Head for masked language modeling."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
def forward(self, features):
x = self.dense(features)
x = nn.functional.gelu(x)
x = self.layer_norm(x)
# project back to size of vocabulary with bias
x = self.decoder(x)
return x
class Dense(nn.Module):
def __init__(
self,
in_features: int,
out_features: int,
norm: str = "LayerNorm",
activation: str = "ReLU",
dropout: float = 0.1,
pool: str = "AdaptiveAvgPool1d",
bias: bool = True,
residual: bool = True,
) -> None:
super().__init__()
self.residual = residual
self.linear = nn.Linear(in_features, out_features, bias=bias)
self.norm = getattr(nn, norm)(out_features) if norm else nn.Identity()
self.activation = getattr(nn, activation)() if activation else nn.Identity()
self.dropout = nn.Dropout(dropout)
        if pool:
            self.pool = getattr(nn, pool)(out_features)
        elif self.residual:
            self.pool = nn.Identity()
        else:
            self.pool = None
def forward(self, x):
out = self.linear(x)
out = self.norm(out)
out = self.activation(out)
out = self.dropout(out)
if self.residual:
out = out + self.pool(x)
return out
class MLP(nn.Module):
    """Stack of `Dense` blocks over consecutive feature sizes; if `linear_output=True`, the final layer is a plain `nn.Linear`."""
def __init__(
self,
*features: Sequence[int],
norm: str = "LayerNorm",
activation: str = "ReLU",
dropout: float = 0.1,
pool: str = "AdaptiveAvgPool1d",
bias: bool = True,
residual: bool = True,
linear_output: bool = True
) -> None:
super().__init__()
        if len(features) == 1 and isinstance(features[0], Sequence):
            features = features[0]  # type: ignore[assignment]
        if len(features) < 2:
            raise ValueError(f"`features` of MLP should have at least 2 elements, but got {len(features)}")
dense = partial(
Dense,
norm=norm,
activation=activation,
dropout=dropout,
pool=pool,
bias=bias,
residual=residual,
)
if linear_output:
layers = [dense(in_features, out_features) for in_features, out_features in zip(features, features[1:-1])]
layers.append(nn.Linear(features[-2], features[-1], bias=bias))
else:
layers = [dense(in_features, out_features) for in_features, out_features in zip(features, features[1:])]
self.layers = nn.Sequential(*layers)
def forward(self, x):
return self.layers(x)
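# Example (construction only): MLP(64, 128, 1) with the default linear_output=True builds one Dense(64 -> 128) block
# followed by a plain nn.Linear(128 -> 1); with linear_output=False the last projection would also be a Dense block.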
class UniRNASSHead(nn.Module):
"""UniRNA head for Secondary Structure Prediction"""
def __init__(self, config) -> None:
super().__init__()
self.qk_proj = nn.Linear(config.hidden_size, 2 * config.hidden_size)
self.ffn = MLP(1, config.hidden_size, residual=False)
self.linear = nn.Linear(config.hidden_size, 1)
def forward(self, features, attention_mask: Optional[torch.Tensor] = None, return_mask: bool = False):
x = features[:, 1:-1] # remove CLS and EOS tokens
q, k = self.qk_proj(x).chunk(2, dim=-1)
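        # Pairwise scores via an outer product over positions: (B, L', H) @ (B, H, L') -> (B, L', L'), then a channel
        # dim is added, where L' = L - 2 after stripping the CLS/EOS positions above.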
contact_map = (q @ k.transpose(-2, -1)).unsqueeze(-1)
contact_map = contact_map + self.ffn(contact_map)
logits = self.linear(contact_map)
pair_mask = None
if attention_mask is not None:
core_mask = attention_mask[:, 1:-1].bool()
pair_mask = core_mask.unsqueeze(-1) & core_mask.unsqueeze(-2)
pair_mask = pair_mask.unsqueeze(-1)
logits = logits.masked_fill(~pair_mask, 0.0)
return (logits, pair_mask) if return_mask else logits
class AvgPooler(nn.Module):
    """Masked mean pooling over token states; when the sequence has more than two tokens, the first and last (CLS/EOS) positions are excluded."""
    def __init__(self):
        super().__init__()
def forward(self, hidden_states, attention_mask=None):
if attention_mask is None:
attention_mask = torch.ones(hidden_states.shape[:2], device=hidden_states.device, dtype=torch.bool)
else:
attention_mask = attention_mask.bool()
if hidden_states.size(1) > 2:
core_states = hidden_states[:, 1:-1, :]
core_mask = attention_mask[:, 1:-1]
else:
core_states = hidden_states
core_mask = attention_mask
core_mask = core_mask.unsqueeze(-1)
masked_states = core_states * core_mask
denom = core_mask.sum(dim=1).clamp(min=1).to(hidden_states.dtype)
return masked_states.sum(dim=1) / denom
class UniRNAModels(UniRNAModel):
config_class = UniRNAConfig
supports_gradient_checkpointing = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
        # The released weights do not include the original pooler, so we replace it with a simple average pooler.
del self.pooler
self.pooler = AvgPooler()
class UniRNAForMLM(UniRNAForMaskedLM):
config_class = UniRNAConfig
supports_gradient_checkpointing = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
        # The released weights do not include the original pooler, so we add a simple average pooler instead.
self.pooler = AvgPooler()
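if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the library. It assumes UniRNAConfig accepts the standard
    # transformers-style keyword arguments referenced throughout this file (vocab_size, hidden_size,
    # num_hidden_layers, num_attention_heads, intermediate_size, pad_token_id, mask_token_id, token_dropout,
    # emb_layer_norm_before, ...); the actual signature lives in .config and may differ. Because of the relative
    # import at the top of this file, run it as a module from the package root (e.g. `python -m <package>.model`,
    # package name hypothetical) rather than as a standalone script.
    config = UniRNAConfig(
        vocab_size=32,
        hidden_size=64,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=128,
        pad_token_id=0,
        mask_token_id=1,
        emb_layer_norm_before=False,
        token_dropout=False,
    )
    model = UniRNAForMaskedLM(config)
    input_ids = torch.randint(2, config.vocab_size, (2, 16))  # avoid pad/mask ids in this toy batch
    attention_mask = torch.ones_like(input_ids)
    out = model(input_ids=input_ids, attention_mask=attention_mask)
    print(out.logits.shape)  # expected: (2, 16, vocab_size)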