"""
    The code is modified from the EsmModel in the transformers library.
    Sources: https://github.com/huggingface/transformers/blob/main/src/transformers/models/esm/modeling_esm.py
"""

from dataclasses import dataclass
from functools import partial
from typing import Optional, Sequence, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    ModelOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging

from .config import UniRNAConfig

logger = logging.get_logger(__name__)


@dataclass
class UniRNASSPredictionOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    pair_mask: Optional[torch.BoolTensor] = None


def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(x, cos, sin):
    cos = cos[:, :, : x.shape[-2], :]
    sin = sin[:, :, : x.shape[-2], :]

    return (x * cos) + (rotate_half(x) * sin)


class RotaryEmbedding(nn.Module):
    """
    Rotary position embeddings based on those in
    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
    matrices which depend on their relative positions.
    """

    def __init__(self, dim: int):
        super().__init__()
        # Generate and save the inverse frequency buffer (non trainable)
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        inv_freq = inv_freq
        self.register_buffer("inv_freq", inv_freq)

        self._seq_len_cached = None
        self._cos_cached = None
        self._sin_cached = None

    def _update_cos_sin_tables(self, x, seq_dimension=2):
        seq_len = x.shape[seq_dimension]

        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if seq_len != self._seq_len_cached or self._cos_cached.device != x.device:
            self._seq_len_cached = seq_len
            t = torch.arange(x.shape[seq_dimension], device=x.device).type_as(self.inv_freq)
            freqs = torch.outer(t, self.inv_freq)
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)

            self._cos_cached = emb.cos()[None, None, :, :]
            self._sin_cached = emb.sin()[None, None, :, :]

        return self._cos_cached, self._sin_cached

    def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        self._cos_cached, self._sin_cached = self._update_cos_sin_tables(k, seq_dimension=-2)

        return (
            apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached),
            apply_rotary_pos_emb(k, self._cos_cached, self._sin_cached),
        )


class UniRNAEmbedding(nn.Module):
    """
    Same as BertEmbeddings with a additional token_dropout.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)

        if config.emb_layer_norm_before:
            self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        else:
            self.layer_norm = None
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.padding_idx = config.pad_token_id
        self.token_dropout = config.token_dropout
        self.mask_token_id = config.mask_token_id

    def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None):
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds
        if attention_mask is None:
            attention_mask = torch.ones(embeddings.shape[:2], device=embeddings.device)

        # By default, we use token dropout, similar to UniRNA.
        if self.layer_norm is not None:
            embeddings = self.layer_norm(embeddings)
        if attention_mask is not None:
            embeddings = (embeddings * attention_mask.unsqueeze(-1)).to(embeddings.dtype)

        embeddings = self.dropout(embeddings)
        if self.token_dropout and input_ids is not None:
            embeddings = embeddings.masked_fill((input_ids == self.mask_token_id).unsqueeze(-1), 0.0)
            # 0.15 is MaskedLM's default mask probability, and 0.8 is the default keep probability
            mask_ratio_train = 0.15 * 0.8
            src_lengths = attention_mask.sum(-1).clamp(min=1).to(embeddings.dtype)
            mask_ratio_observed = (input_ids == self.mask_token_id).sum(-1).to(embeddings.dtype) / src_lengths
            denom = (1 - mask_ratio_observed).clamp(min=1e-6)
            embeddings = (embeddings * (1 - mask_ratio_train) / denom[:, None, None]).to(embeddings.dtype)

        return embeddings


class UniRNASelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        self.rotary_embeddings = RotaryEmbedding(dim=self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (
            self.num_attention_heads,
            self.attention_head_size,
        )
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Hardcoded from EsmModel provided by transformers
        query_layer = query_layer * self.attention_head_size**-0.5

        # Apply rotary embeddings
        query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        # For faster computation, you can used torch.nn.functional.scaled_dot_product_attention

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if attention_mask is not None:
            # Apply the attention mask is (precomputed for all layers in UniRNAModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer, None)

        return outputs


class UniRNAFlashSelfAttention(UniRNASelfAttention):
    """Self-attention using PyTorch's scaled_dot_product_attention (SDPA) backend."""

    def __init__(self, config):
        super().__init__(config)
        self.dropout_prob = config.attention_probs_dropout_prob

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        if output_attentions:
            raise ValueError("SDPA attention does not support output_attentions=True")

        mixed_query_layer = self.query(hidden_states)
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Same manual scaling as UniRNASelfAttention
        query_layer = query_layer * self.attention_head_size**-0.5

        # Apply rotary embeddings
        query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)

        # Use PyTorch SDPA; scale=1.0 because we already scaled query above
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_layer,
            key_layer,
            value_layer,
            attn_mask=attention_mask,
            dropout_p=self.dropout_prob if self.training else 0.0,
            scale=1.0,
        )

        attn_output = attn_output.permute(0, 2, 1, 3).contiguous()
        new_shape = attn_output.size()[:-2] + (self.all_head_size,)
        attn_output = attn_output.view(new_shape)

        return (attn_output, None)


class UniRNASelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class UniRNA_Attention(nn.Module):
    def __init__(self, config):
        super().__init__()

        if getattr(config, "use_flash_attention", False):
            self.self = UniRNAFlashSelfAttention(config)
        else:
            self.self = UniRNASelfAttention(config)
        self.output = UniRNASelfOutput(config)
        self.pruned_heads = set()
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    # TODO: add pruning heads
    # def prune_heads(self, heads):
    #     if len(heads) == 0:
    #         return
    #     heads, index = find_pruneable_heads_and_indices(
    #         heads,
    #         self.self.num_attention_heads,
    #         self.self.attention_head_size,
    #         self.pruned_heads,
    #     )

    #     # Prune linear layers
    #     self.self.query = prune_linear_layer(self.self.query, index)
    #     self.self.key = prune_linear_layer(self.self.key, index)
    #     self.self.value = prune_linear_layer(self.self.value, index)
    #     self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

    #     # Update hyper params and store pruned heads
    #     self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
    #     self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
    #     self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
    ):
        hidden_states_ln = self.LayerNorm(hidden_states)
        self_outputs = self.self(
            hidden_states_ln,
            attention_mask,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        return (attention_output, self_outputs[1])


class UniRNAIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = nn.functional.gelu(hidden_states)
        return hidden_states


class UniRNAOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class UniRNALayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = UniRNA_Attention(config)
        self.intermediate = UniRNAIntermediate(config)
        self.output = UniRNAOutput(config)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
    ):
        self_attention_outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions)
        layer_output = self.feed_forward_chunk(self_attention_outputs[0])
        return (layer_output, self_attention_outputs[1])

    def feed_forward_chunk(self, attention_output):
        attention_output_ln = self.LayerNorm(attention_output)
        intermediate_output = self.intermediate(attention_output_ln)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class UniRNAEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([UniRNALayer(config) for _ in range(config.num_hidden_layers)])
        self.emb_layer_norm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
    ):

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for layer_module in self.layer:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if self.emb_layer_norm_after:
            hidden_states = self.emb_layer_norm_after(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


# Copied from transformers.models.bert.modeling_bert.BertPooler
class UniRNAPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class UniRNAModel(PreTrainedModel):
    config_class = UniRNAConfig
    supports_gradient_checkpointing = True
    main_input_name = "input_ids"
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config
        self.embeddings = UniRNAEmbedding(config)
        self.encoder = UniRNAEncoder(config)
        self.pooler = UniRNAPooler(config) if add_pooling_layer else None

        use_flash_attention = getattr(config, "use_flash_attention", False)
        if use_flash_attention:
            logger.info("Using Uni-RNA SDPA Attention")
        else:
            logger.info("Using Uni-RNA Attention")

        # Initialize weights and apply final processing
        self.post_init()

    def _set_gradient_checkpointing(self, enable: bool, gradient_checkpointing_func=None):
        self.encoder.gradient_checkpointing = enable
        if gradient_checkpointing_func is not None:
            self.encoder._gradient_checkpointing_func = gradient_checkpointing_func

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`Tuple[Tuple[torch.FloatTensor]]`, *optional*):
            Tuple of length `config.n_layers`. Each tuple has 4 tensors of shape
            `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`. Contains precomputed key and value
            hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_shape, attention_mask = self._validate_and_shape_inputs(input_ids, inputs_embeds, attention_mask)
        extended_attention_mask = self._prepare_attention_mask(attention_mask, input_shape)
        embedding_output = self._compute_embedding_output(input_ids, attention_mask, inputs_embeds)
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output, pooled_output = self._pool_outputs(encoder_outputs[0], attention_mask)

        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]
            return output

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )

    def _validate_and_shape_inputs(
        self,
        input_ids: Optional[torch.Tensor],
        inputs_embeds: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor],
    ) -> Tuple[Tuple[int, ...], torch.Tensor]:
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            device = input_ids.device
        else:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device

        batch_size, seq_length = input_shape
        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)
        return input_shape, attention_mask

    def _prepare_attention_mask(self, attention_mask: torch.Tensor, input_shape: Tuple[int, ...]) -> torch.Tensor:
        return self.get_extended_attention_mask(attention_mask, input_shape)

    def _compute_embedding_output(
        self,
        input_ids: Optional[torch.Tensor],
        attention_mask: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor],
    ) -> torch.Tensor:
        return self.embeddings(input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds)

    def _pool_outputs(
        self, sequence_output: torch.Tensor, attention_mask: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # make it compatible with deepprotein which wraps the model with different pooler
        try:
            pooled_output = self.pooler(sequence_output, attention_mask) if self.pooler is not None else None
        except TypeError:
            pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
        return sequence_output, pooled_output


class UniRNAForMaskedLM(PreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.weight"]
    config_class = UniRNAConfig
    supports_gradient_checkpointing = True
    main_input_name = "input_ids"

    def __init__(self, config):
        super().__init__(config)

        self.config = config
        self.embeddings = UniRNAEmbedding(config)
        self.encoder = UniRNAEncoder(config)
        self.lm_head = UniRNALMHead(config)

        self.post_init()

    def _set_gradient_checkpointing(self, enable: bool, gradient_checkpointing_func=None):
        self.encoder.gradient_checkpointing = enable
        if gradient_checkpointing_func is not None:
            self.encoder._gradient_checkpointing_func = gradient_checkpointing_func

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
            Used to hide legacy arguments that have been deprecated.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length)), device=device)

        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
        )

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = encoder_outputs[0]

        prediction_scores = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + encoder_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return MaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class UniRNAForSSPredict(PreTrainedModel):
    """
    TODO: make it compatible with transformers, create new 'modeling_outputs' class for SS prediction
    """

    config_class = UniRNAConfig
    supports_gradient_checkpointing = True
    main_input_name = "input_ids"

    def __init__(self, config):
        # Explicitly block usage until this head is trained and validated.
        raise RuntimeError(
            "UniRNAForSSPredict is disabled and not supported. This head is untrained and must not be called."
        )

    def _set_gradient_checkpointing(self, enable: bool, gradient_checkpointing_func=None):
        self.encoder.gradient_checkpointing = enable
        if gradient_checkpointing_func is not None:
            self.encoder._gradient_checkpointing_func = gradient_checkpointing_func

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, UniRNASSPredictionOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
            Used to hide legacy arguments that have been deprecated.
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length)), device=device)

        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
        )

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        sequence_output = encoder_outputs[0]
        logits, pair_mask = self.heads(sequence_output, attention_mask=attention_mask, return_mask=True)

        loss = None
        if labels is not None:
            if labels.dim() == 3:
                labels = labels.unsqueeze(-1)
            if labels.shape[1] == logits.shape[1] + 2 and labels.shape[2] == logits.shape[2] + 2:
                labels = labels[:, 1:-1, 1:-1, :]
            labels = labels.to(logits.dtype)
            loss_fct = nn.BCEWithLogitsLoss()
            if pair_mask is not None:
                loss = loss_fct(logits[pair_mask], labels[pair_mask])
            else:
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits, encoder_outputs.hidden_states, encoder_outputs.attentions, pair_mask)
            return ((loss,) + output) if loss is not None else output

        return UniRNASSPredictionOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            pair_mask=pair_mask,
        )


class UniRNALMHead(nn.Module):
    """UniRNA Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, features):
        x = self.dense(features)
        x = nn.functional.gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)
        return x


class Dense(nn.Module):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        norm: str = "LayerNorm",
        activation: str = "ReLU",
        dropout: float = 0.1,
        pool: str = "AdaptiveAvgPool1d",
        bias: bool = True,
        residual: bool = True,
    ) -> None:
        super().__init__()
        self.residual = residual
        self.linear = nn.Linear(in_features, out_features, bias=bias)
        self.norm = getattr(nn, norm)(out_features) if norm else nn.Identity()
        self.activation = getattr(nn, activation)() if activation else nn.Identity()
        self.dropout = nn.Dropout(dropout)
        self.pool = getattr(nn, pool)(out_features) if pool else nn.Identity() if self.residual else None

    def forward(self, x):
        out = self.linear(x)
        out = self.norm(out)
        out = self.activation(out)
        out = self.dropout(out)
        if self.residual:
            out = out + self.pool(x)
        return out


class MLP(nn.Module):
    def __init__(
        self,
        *features: Sequence[int],
        norm: str = "LayerNorm",
        activation: str = "ReLU",
        dropout: float = 0.1,
        pool: str = "AdaptiveAvgPool1d",
        bias: bool = True,
        residual: bool = True,
        linear_output: bool = True
    ) -> None:
        super().__init__()
        if len(features) == 0 and isinstance(features, Sequence):
            features = features[0]  # type: ignore[assignment]
        if not len(features) > 1:
            raise ValueError(f"`features` of MLP should have at least 2 elements, but got {len(features)}")
        dense = partial(
            Dense,
            norm=norm,
            activation=activation,
            dropout=dropout,
            pool=pool,
            bias=bias,
            residual=residual,
        )
        if linear_output:
            layers = [dense(in_features, out_features) for in_features, out_features in zip(features, features[1:-1])]
            layers.append(nn.Linear(features[-2], features[-1], bias=bias))
        else:
            layers = [dense(in_features, out_features) for in_features, out_features in zip(features, features[1:])]
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)


class UniRNASSHead(nn.Module):
    """UniRNA head for Secondary Structure Prediction"""

    def __init__(self, config) -> None:
        super().__init__()

        self.qk_proj = nn.Linear(config.hidden_size, 2 * config.hidden_size)
        self.ffn = MLP(1, config.hidden_size, residual=False)
        self.linear = nn.Linear(config.hidden_size, 1)

    def forward(self, features, attention_mask: Optional[torch.Tensor] = None, return_mask: bool = False):
        x = features[:, 1:-1]  # remove CLS and EOS tokens
        q, k = self.qk_proj(x).chunk(2, dim=-1)
        contact_map = (q @ k.transpose(-2, -1)).unsqueeze(-1)
        contact_map = contact_map + self.ffn(contact_map)
        logits = self.linear(contact_map)

        pair_mask = None
        if attention_mask is not None:
            core_mask = attention_mask[:, 1:-1].bool()
            pair_mask = core_mask.unsqueeze(-1) & core_mask.unsqueeze(-2)
            pair_mask = pair_mask.unsqueeze(-1)
            logits = logits.masked_fill(~pair_mask, 0.0)

        return (logits, pair_mask) if return_mask else logits


class AvgPooler(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, hidden_states, attention_mask=None):
        if attention_mask is None:
            attention_mask = torch.ones(hidden_states.shape[:2], device=hidden_states.device, dtype=torch.bool)
        else:
            attention_mask = attention_mask.bool()

        if hidden_states.size(1) > 2:
            core_states = hidden_states[:, 1:-1, :]
            core_mask = attention_mask[:, 1:-1]
        else:
            core_states = hidden_states
            core_mask = attention_mask

        core_mask = core_mask.unsqueeze(-1)
        masked_states = core_states * core_mask
        denom = core_mask.sum(dim=1).clamp(min=1).to(hidden_states.dtype)
        return masked_states.sum(dim=1) / denom


class UniRNAModels(UniRNAModel):
    config_class = UniRNAConfig
    supports_gradient_checkpointing = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # We didn't include weight for original pooler, so we replace it with a simple cls pooler
        del self.pooler
        self.pooler = AvgPooler()


class UniRNAForMLM(UniRNAForMaskedLM):
    config_class = UniRNAConfig
    supports_gradient_checkpointing = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # We didn't include weight for original pooler, so we replace it with a simple cls pooler
        self.pooler = AvgPooler()