from typing import Optional, Tuple, Union, Dict, Literal, Callable

import math
import os
from contextlib import nullcontext
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PreTrainedModel
from transformers.utils import ModelOutput, logging
from transformers.activations import ACT2FN
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
from transformers.configuration_utils import PretrainedConfig
from transformers.utils.doc import add_code_sample_docstrings, add_start_docstrings
from transformers.utils.import_utils import is_triton_available, is_flash_attn_2_available
from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutput, MaskedLMOutput

try:
    from transformers.utils.hub import cached_file
except ImportError:  # transformers<5 compatibility
    from transformers.utils import cached_file  # type: ignore

try:
    from transformers.modeling_rope_utils import RopeParameters
except ImportError:
    RopeParameters = object


if is_flash_attn_2_available():
    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
    from flash_attn.layers.rotary import RotaryEmbedding
    from flash_attn.ops.triton.rotary import apply_rotary
else:
    RotaryEmbedding = object

logger = logging.get_logger(__name__)


_HF_CONFIG_LOAD_KWARGS = {
    "cache_dir",
    "force_download",
    "local_files_only",
    "token",
    "revision",
    "subfolder",
    "proxies",
}

_HF_NON_MODEL_INIT_KWARGS = {
    "trust_remote_code",
    "_from_auto",
    "adapter_kwargs",
}

_HF_MODEL_INIT_BLACKLIST = {
    "device_map",
    "low_cpu_mem_usage",
    "offload_folder",
    "offload_state_dict",
    "max_memory",
    "quantization_config",
    "tp_plan",
    "tp_size",
    "weights_only",
    #"use_flash_attention_2",
    #"attn_implementation",
    #"torch_dtype"
}

logger = logging.get_logger(__name__)


def _split_pretrained_kwargs(kwargs):
    kwargs = dict(kwargs)

    for k in _HF_NON_MODEL_INIT_KWARGS:
        kwargs.pop(k, None)

    config_load_kwargs = {
        k: kwargs.pop(k) for k in list(kwargs) if k in _HF_CONFIG_LOAD_KWARGS
    }

    use_safetensors = kwargs.pop("use_safetensors", None)
    weights_only = kwargs.pop("weights_only", True)
    return config_load_kwargs, use_safetensors, weights_only, kwargs


def _resolve_weights_file(pretrained_model_name_or_path, use_safetensors=None, **load_kwargs) -> str:
    pretrained_model_name_or_path = os.fspath(pretrained_model_name_or_path)

    if use_safetensors is True:
        candidates = ("model.safetensors",)
    elif use_safetensors is False:
        candidates = ("pytorch_model.bin",)
    else:
        candidates = ("model.safetensors", "pytorch_model.bin")

    subfolder = load_kwargs.get("subfolder")

    if os.path.isdir(pretrained_model_name_or_path):
        base_dir = (
            os.path.join(pretrained_model_name_or_path, subfolder)
            if subfolder
            else pretrained_model_name_or_path
        )
        for name in candidates:
            path = os.path.join(base_dir, name)
            if os.path.exists(path):
                return path

    for name in candidates:
        try:
            path = cached_file(pretrained_model_name_or_path, name, **load_kwargs)
            if path is not None:
                return path
        except Exception:
            pass

    raise FileNotFoundError(
        f"No checkpoint file found in {pretrained_model_name_or_path!r} "
        f"(candidates: {', '.join(candidates)})"
    )


def _read_state_dict(weights_path, weights_only: bool = True) -> dict[str, torch.Tensor]:
    weights_path = os.fspath(weights_path)

    if weights_path.endswith(".safetensors"):
        from safetensors.torch import load_file as safe_load_file

        return safe_load_file(weights_path, device="cpu")

    try:
        return torch.load(weights_path, map_location="cpu", weights_only=weights_only)
    except TypeError:  # older torch versions
        return torch.load(weights_path, map_location="cpu")


def _align_state_dict_with_base_prefix(model: nn.Module, state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    state_dict = dict(state_dict)
    base_prefix = getattr(model, "base_model_prefix", None)
    if not base_prefix:
        return state_dict

    prefix = f"{base_prefix}."
    model_keys = set(model.state_dict().keys())
    model_has_prefix = any(k.startswith(prefix) for k in model_keys)
    ckpt_has_prefix = any(k.startswith(prefix) for k in state_dict.keys())

    # Loading a wrapped checkpoint (model.*) into the bare encoder.
    if ckpt_has_prefix and not model_has_prefix:
        stripped = {k[len(prefix):]: v for k, v in state_dict.items() if k.startswith(prefix)}
        return stripped if stripped else state_dict

    # Loading a bare encoder checkpoint into a wrapped task model.
    if model_has_prefix and not ckpt_has_prefix:
        remapped = {}
        for k, v in state_dict.items():
            prefixed = f"{prefix}{k}"
            remapped[prefixed if prefixed in model_keys else k] = v
        return remapped

    return state_dict


class _SafeFromPretrainedMixin:
    @classmethod
    def _adapt_state_dict(cls, model, state_dict):
        return state_dict

    @staticmethod
    def _filter_keys_with_patterns(keys, patterns):
        if not patterns:
            return list(keys)

        import re

        compiled = [re.compile(p) if isinstance(p, str) else p for p in patterns]
        return [k for k in keys if not any(p.search(k) for p in compiled)]

    @classmethod
    def _resolve_config_and_init_kwargs(
        cls,
        pretrained_model_name_or_path,
        config,
        config_load_kwargs,
        other_kwargs,
    ):
        if isinstance(config, PretrainedConfig):
            return config, other_kwargs

        if config is None:
            config_source = pretrained_model_name_or_path
        elif isinstance(config, (str, os.PathLike)):
            config_source = config
        else:
            raise TypeError(
                "`config` must be None, a path-like object, or an instance of PretrainedConfig"
            )

        config, init_kwargs = cls.config_class.from_pretrained(
            config_source,
            return_unused_kwargs=True,
            **config_load_kwargs,
            **other_kwargs,
        )
        return config, init_kwargs

    @staticmethod
    def _remove_mismatched_keys(model, state_dict):
        state_dict = dict(state_dict)
        model_state = model.state_dict()
        mismatched_keys = []

        for key in list(state_dict.keys()):
            if key not in model_state:
                continue

            loaded_value = state_dict[key]
            model_value = model_state[key]

            if not isinstance(loaded_value, torch.Tensor) or not isinstance(model_value, torch.Tensor):
                continue

            if tuple(loaded_value.shape) != tuple(model_value.shape):
                mismatched_keys.append((key, tuple(loaded_value.shape), tuple(model_value.shape)))
                state_dict.pop(key)

        return state_dict, mismatched_keys

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        output_loading_info = kwargs.pop("output_loading_info", False)
        state_dict = kwargs.pop("state_dict", None)
        config = kwargs.pop("config", None)
        ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
        strict = kwargs.pop("strict", False)

        config_load_kwargs, use_safetensors, weights_only, other_kwargs = _split_pretrained_kwargs(kwargs)

        config, init_kwargs = cls._resolve_config_and_init_kwargs(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            config=config,
            config_load_kwargs=config_load_kwargs,
            other_kwargs=other_kwargs,
        )
        '''
        config = cls._autoset_attn_implementation(
            config,
            use_flash_attention_2=bool(init_kwargs.get("use_flash_attention_2", False)),
            torch_dtype=init_kwargs.get("torch_dtype", None),
            device_map=init_kwargs.get("device_map", None),
        )
        ''' 

        for k in _HF_MODEL_INIT_BLACKLIST:
            init_kwargs.pop(k, None)

        model = cls(config, *model_args, **init_kwargs)

        if state_dict is None:
            weights_path = _resolve_weights_file(
                pretrained_model_name_or_path,
                use_safetensors=use_safetensors,
                **config_load_kwargs,
            )
            state_dict = _read_state_dict(
                weights_path,
                weights_only=True if weights_only is None else bool(weights_only),
            )

        if not isinstance(state_dict, dict):
            raise TypeError(f"Expected a state dict, got {type(state_dict).__name__}")

        state_dict = _align_state_dict_with_base_prefix(model, state_dict)
        state_dict = cls._adapt_state_dict(model, state_dict)

        mismatched_keys = []
        if ignore_mismatched_sizes:
            state_dict, mismatched_keys = cls._remove_mismatched_keys(model, state_dict)

        incompatible = model.load_state_dict(state_dict, strict=strict)

        if hasattr(model, "tie_weights"):
            model.tie_weights()

        model.eval()

        missing_keys = cls._filter_keys_with_patterns(
            list(incompatible.missing_keys),
            getattr(model, "_keys_to_ignore_on_load_missing", None),
        )
        unexpected_keys = cls._filter_keys_with_patterns(
            list(incompatible.unexpected_keys),
            getattr(model, "_keys_to_ignore_on_load_unexpected", None),
        )

        info = {
            "missing_keys": missing_keys,
            "unexpected_keys": unexpected_keys,
            "mismatched_keys": mismatched_keys,
            "error_msgs": [],
        }
        return (model, info) if output_loading_info else model


def l2_norm(input, axis=1, epsilon=1e-12):
    norm = torch.norm(input, 2, axis, True)
    norm = torch.clamp(norm, min=epsilon)  # Avoid zero division
    output = torch.div(input, norm)
    return output


def initialize_linear_kaiming(layer: nn.Linear):
    if isinstance(layer, nn.Linear):
        nn.init.kaiming_uniform_(layer.weight, nonlinearity='linear')
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)


def _autocast_disabled(device_type: str):
    try:
        return torch.amp.autocast(device_type=device_type, enabled=False)
    except (AttributeError, TypeError):
        if device_type == "cuda":
            return torch.cuda.amp.autocast(enabled=False)
        if device_type == "cpu" and hasattr(torch, "cpu") and hasattr(torch.cpu, "amp"):
            return torch.cpu.amp.autocast(enabled=False)
        return nullcontext()


def _normalize_token_attention_mask(
    attention_mask: Optional[torch.Tensor],
    *,
    batch_size: Optional[int] = None,
    seq_len: Optional[int] = None,
    device: Optional[torch.device] = None,
) -> Optional[torch.Tensor]:
    """Convert common attention-mask layouts to a `(batch, seq_len)` boolean keep-mask.

    Supported inputs are 2D token masks, 3D masks of shape `(batch, 1, seq_len)`, and 4D masks
    of shape `(batch, 1, query_len, key_len)` used by eager/SDPA attention. Returned values use
    `True` for valid (non-padding) tokens.
    """
    if attention_mask is None:
        if batch_size is None or seq_len is None:
            return None
        return torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)

    mask = attention_mask

    if mask.dim() == 4:
        if mask.dtype == torch.bool:
            mask = mask[:, 0].any(dim=-2)
        else:
            mask = mask[:, 0].amax(dim=-2) >= 0
        return mask.to(dtype=torch.bool)

    if mask.dim() == 3:
        if mask.size(1) != 1:
            raise ValueError(
                "3D attention masks must have shape (batch, 1, seq_len) when used in ProkBert."
            )
        mask = mask[:, 0, :]

    if mask.dim() != 2:
        raise ValueError(
            "Attention masks for ProkBert must be 2D, 3D `(batch, 1, seq_len)`, or 4D "
            "`(batch, 1, query_len, key_len)`."
        )

    if mask.dtype == torch.bool:
        return mask

    if mask.is_floating_point():
        if mask.numel() == 0:
            return mask.to(dtype=torch.bool)
        return (mask >= 0) if torch.any(mask < 0) else (mask > 0)

    return mask != 0


def _to_additive_attention_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    """Convert a boolean keep-mask to the additive attention-bias format expected by eager attention."""
    min_dtype = torch.finfo(dtype).min
    bias = torch.zeros(mask.shape, device=mask.device, dtype=dtype)
    return bias.masked_fill(~mask, min_dtype)


def _build_bidirectional_attention_biases(
    attention_mask: torch.Tensor,
    dtype: torch.dtype,
    sliding_window: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Create full-attention and sliding-window additive masks for eager/SDPA attention."""
    if attention_mask.dim() == 4:
        if attention_mask.dtype == torch.bool:
            global_attention_mask = _to_additive_attention_bias(attention_mask, dtype)
        else:
            global_attention_mask = attention_mask.to(dtype=dtype)
        query_length = global_attention_mask.shape[-2]
        key_length = global_attention_mask.shape[-1]
    else:
        token_mask = _normalize_token_attention_mask(attention_mask)
        if token_mask is None:
            raise ValueError("`attention_mask` cannot be None when creating additive attention biases.")
        query_length = token_mask.shape[-1]
        key_length = token_mask.shape[-1]
        expanded_token_mask = token_mask[:, None, None, :].expand(-1, 1, query_length, -1)
        global_attention_mask = _to_additive_attention_bias(expanded_token_mask, dtype)

    row_positions = torch.arange(query_length, device=global_attention_mask.device)[:, None]
    col_positions = torch.arange(key_length, device=global_attention_mask.device)[None, :]
    local_window = (row_positions - col_positions).abs() <= sliding_window
    min_dtype = torch.finfo(dtype).min
    sliding_window_mask = global_attention_mask.masked_fill(
        ~local_window.view(1, 1, query_length, key_length),
        min_dtype,
    )
    return global_attention_mask, sliding_window_mask


class ProkBertConfig(PretrainedConfig):
    r"""
    Configuration for the standalone ProkBERT ModernBERT-style encoder stack.

    Canonical config names follow the current ModernBERT conventions:
      - `layer_types` instead of `global_attn_every_n_layers`
      - nested `rope_parameters` keyed by attention type instead of a single flat RoPE dict
      - `tie_word_embeddings` explicitly tracked in the config

    `classifier_pooling` is also standardized as a single field and extended with the custom
    `"attention"` option used by the standalone ProkBERT sequence-classification head.

    The tokenizer metadata fields `kmer` and `shift` are also stored on the config so the model artifact keeps
    the sequence-tokenization contract alongside the architectural settings.

    Legacy names are still accepted for backward compatibility when loading older checkpoints/configs.
    """

    model_type = "prokbert"
    keys_to_ignore_at_inference = ["past_key_values"]
    default_theta = {"full_attention": 160_000.0, "sliding_attention": 10_000.0}

    attribute_map = {
        "classification_dropout_rate": "classifier_dropout",
        "num_class_labels": "num_labels",
        "curricular_num_labels": "num_labels",
        "curricular_face_m": "curricular_margin",
        "curricular_face_s": "curricular_scale",
        "curriculum_hidden_size": "curricular_embedding_size",
    }


    @classmethod
    def _build_layer_types(
        cls,
        num_hidden_layers: int,
        global_attn_every_n_layers: int,
    ) -> list[str]:
        if global_attn_every_n_layers <= 0:
            raise ValueError("`global_attn_every_n_layers` must be a positive integer.")
        return [
            "sliding_attention" if bool(i % global_attn_every_n_layers) else "full_attention"
            for i in range(num_hidden_layers)
        ]

    @classmethod
    def _normalize_rope_parameters(
        cls,
        rope_parameters: RopeParameters | dict | None,
        global_rope_theta: float,
        local_rope_theta: float,
    ) -> dict[str, dict]:
        default_rope_parameters = {
            "full_attention": {"rope_type": "default", "rope_theta": float(global_rope_theta)},
            "sliding_attention": {"rope_type": "default", "rope_theta": float(local_rope_theta)},
        }

        if rope_parameters is None:
            return default_rope_parameters

        rope_parameters = dict(rope_parameters)

        # Backward compatibility: older configs used a single flat dict plus `global_rope_theta` / `local_rope_theta`.
        if "rope_type" in rope_parameters or "rope_theta" in rope_parameters:
            shared_rope_type = rope_parameters.get("rope_type", "default")
            full_theta = float(rope_parameters.get("rope_theta", global_rope_theta))
            return {
                "full_attention": {
                    **{k: v for k, v in rope_parameters.items() if k != "rope_theta"},
                    "rope_type": shared_rope_type,
                    "rope_theta": full_theta,
                },
                "sliding_attention": {
                    **{k: v for k, v in rope_parameters.items() if k != "rope_theta"},
                    "rope_type": shared_rope_type,
                    "rope_theta": float(local_rope_theta),
                },
            }

        normalized_rope_parameters = {}
        for layer_type in ("full_attention", "sliding_attention"):
            layer_params = rope_parameters.get(layer_type)
            if layer_params is None:
                layer_params = {"rope_type": "default"}
            else:
                layer_params = dict(layer_params)
            layer_params.setdefault("rope_type", "default")
            layer_params.setdefault("rope_theta", cls.default_theta[layer_type])
            normalized_rope_parameters[layer_type] = layer_params

        return normalized_rope_parameters

    def __init__(
        self,
        vocab_size: int = 4608,
        hidden_size: int = 384,
        intermediate_size: int = 1152,
        num_hidden_layers: int = 6,
        num_attention_heads: int = 6,
        hidden_activation: str = "gelu",
        max_position_embeddings: int = 16384,
        initializer_range: float = 0.02,
        initializer_cutoff_factor: float = 2.0,
        norm_eps: float = 1e-6,
        norm_bias: bool = False,
        kmer: int = 6,
        shift: int = 1,
        pad_token_id: int = 0,
        eos_token_id: int = 3,
        bos_token_id: int = 2,
        cls_token_id: int = 2,
        sep_token_id: int = 3,
        attention_bias: bool = False,
        attention_dropout: float = 0.0,
        layer_types: list[str] | None = None,
        rope_parameters: RopeParameters | dict | None = None,
        local_attention: int = 256,
        embedding_dropout: float = 0.0,
        mlp_bias: bool = False,
        mlp_dropout: float = 0.0,
        decoder_bias: bool = True,
        classifier_pooling: Literal["attention", "cls", "mean"] = "attention",
        classifier_dropout: float = 0.0,
        classifier_bias: bool = False,
        classifier_activation: str = "gelu",
        deterministic_flash_attn: bool = False,
        sparse_prediction: bool = False,
        sparse_pred_ignore_index: int = -100,
        reference_compile: bool | None = None,
        repad_logits_with_grad: bool = False,
        norm_type: str = "rms",
        tie_word_embeddings: bool = True,
        num_labels: int = 2,
        problem_type: str | None = None,
        curricular_margin: float = 0.5,
        curricular_scale: float = 64.0,
        curricular_embedding_size: int | None = None,
        **kwargs,
    ):
        legacy_global_attn_every_n_layers = int(kwargs.pop("global_attn_every_n_layers", 1))
        legacy_global_rope_theta = float(kwargs.pop("global_rope_theta", self.default_theta["full_attention"]))
        legacy_local_rope_theta = float(kwargs.pop("local_rope_theta", self.default_theta["sliding_attention"]))

        legacy_num_class_labels = kwargs.pop("num_class_labels", None)
        legacy_curricular_num_labels = kwargs.pop("curricular_num_labels", None)
        legacy_classifier_dropout = kwargs.pop("classification_dropout_rate", None)
        legacy_curricular_margin = kwargs.pop("curricular_face_m", None)
        legacy_curricular_scale = kwargs.pop("curricular_face_s", None)
        legacy_curricular_embedding_size = kwargs.pop("curriculum_hidden_size", None)
        kwargs.pop("bert_base_model", None)

        loaded_id2label = kwargs.get("id2label")
        if loaded_id2label is not None:
            num_labels = len(loaded_id2label)
        elif legacy_curricular_num_labels is not None:
            num_labels = int(legacy_curricular_num_labels)
        elif legacy_num_class_labels is not None:
            num_labels = int(legacy_num_class_labels)

        if legacy_classifier_dropout is not None:
            classifier_dropout = float(legacy_classifier_dropout)
        if legacy_curricular_margin is not None:
            curricular_margin = float(legacy_curricular_margin)
        if legacy_curricular_scale is not None:
            curricular_scale = float(legacy_curricular_scale)
        if curricular_embedding_size is None and legacy_curricular_embedding_size not in (None, -1):
            curricular_embedding_size = int(legacy_curricular_embedding_size)

        if layer_types is None:
            layer_types = self._build_layer_types(
                num_hidden_layers=num_hidden_layers,
                global_attn_every_n_layers=legacy_global_attn_every_n_layers,
            )
        else:
            layer_types = list(layer_types)

        rope_parameters = self._normalize_rope_parameters(
            rope_parameters=rope_parameters,
            global_rope_theta=legacy_global_rope_theta,
            local_rope_theta=legacy_local_rope_theta,
        )

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            cls_token_id=cls_token_id,
            sep_token_id=sep_token_id,
            tie_word_embeddings=tie_word_embeddings,
            num_labels=num_labels,
            problem_type=problem_type,
            **kwargs,
        )

        self.kmer = int(kmer)
        self.shift = int(shift)
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_activation = hidden_activation
        self.initializer_range = initializer_range
        self.initializer_cutoff_factor = initializer_cutoff_factor
        self.norm_eps = norm_eps
        self.norm_bias = norm_bias
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.layer_types = layer_types
        self.rope_parameters = rope_parameters
        self.local_attention = local_attention
        self.embedding_dropout = embedding_dropout
        self.mlp_bias = mlp_bias
        self.mlp_dropout = mlp_dropout
        self.decoder_bias = decoder_bias
        self.classifier_pooling = classifier_pooling
        self.classifier_dropout = classifier_dropout
        self.classifier_bias = classifier_bias
        self.classifier_activation = classifier_activation
        self.deterministic_flash_attn = deterministic_flash_attn
        self.sparse_prediction = sparse_prediction
        self.sparse_pred_ignore_index = sparse_pred_ignore_index
        self.reference_compile = reference_compile
        self.repad_logits_with_grad = repad_logits_with_grad
        self.norm_type = norm_type
        self.tie_word_embeddings = tie_word_embeddings
        self.num_labels = num_labels
        self.problem_type = problem_type
        self.curricular_margin = curricular_margin
        self.curricular_scale = curricular_scale
        self.curricular_embedding_size = curricular_embedding_size

        if self.kmer <= 0:
            raise ValueError(f"`kmer` must be a positive integer, got {self.kmer}.")

        if self.shift <= 0:
            raise ValueError(f"`shift` must be a positive integer, got {self.shift}.")

        if len(self.layer_types) != self.num_hidden_layers:
            raise ValueError(
                "`layer_types` must contain one entry per hidden layer: "
                f"expected {self.num_hidden_layers}, got {len(self.layer_types)}."
            )

        invalid_layer_types = sorted(set(self.layer_types) - {"full_attention", "sliding_attention"})
        if invalid_layer_types:
            raise ValueError(
                f"Unsupported values in `layer_types`: {invalid_layer_types}. "
                'Allowed values are ["full_attention", "sliding_attention"].'
            )

        if self.classifier_pooling not in ["attention", "cls", "mean"]:
            raise ValueError(
                f'Invalid value for `classifier_pooling`, should be one of ["attention", "cls", "mean"], but is {self.classifier_pooling}.'
            )

        if self.norm_type not in {"rms", "layernorm"}:
            raise ValueError(
                f'Invalid value for `norm_type`, should be either "rms" or "layernorm", but is {self.norm_type}.'
            )

    def get_rope_parameters(self, layer_type: str) -> dict:
        if layer_type not in {"full_attention", "sliding_attention"}:
            raise ValueError(
                f"Unsupported `layer_type`={layer_type!r}. Expected 'full_attention' or 'sliding_attention'."
            )
        rope_params = self.rope_parameters.get(layer_type)
        if rope_params is None:
            rope_params = {"rope_type": "default", "rope_theta": self.default_theta[layer_type]}
        return rope_params

    @property
    def sliding_window(self) -> int:
        return self.local_attention // 2

    @sliding_window.setter
    def sliding_window(self, value: int):
        self.local_attention = int(value) * 2

    def to_dict(self):
        output = super().to_dict()
        output.pop("reference_compile", None)
        return output


_CHECKPOINT_FOR_DOC = "example/prokbert-base"
_CONFIG_FOR_DOC = "ProkBertConfig"

PROK_BERT_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods
    the library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads, etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch module and refer to the PyTorch documentation for general usage and behavior.

    Parameters:
        config ([`ProkBertConfig`]):
            Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the model weights; see [`PreTrainedModel.from_pretrained`] for weight loading.
"""

class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6, bias: bool = False):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))
        self.bias = nn.Parameter(torch.zeros(dim)) if bias else None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # root-mean-square normalization over last dim
        rms = x.pow(2).mean(dim=-1, keepdim=True).add(self.eps).rsqrt()
        x = x * rms
        if self.bias is not None:
            x = x + self.bias
        return self.weight * x


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (torch.Tensor): The query tensor.
        k (torch.Tensor): The key tensor.
        cos (torch.Tensor): The cosine part of the rotary embedding.
        sin (torch.Tensor): The sine part of the rotary embedding.
        position_ids (torch.Tensor, optional): Deprecated and unused.
        unsqueeze_dim (int, optional): The dimension along which to unsqueeze cos and sin.
    Returns:
        tuple(torch.Tensor): The rotated query and key tensors.
    """
    original_dtype = q.dtype
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q.float() * cos) + (rotate_half(q.float()) * sin)
    k_embed = (k.float() * cos) + (rotate_half(k.float()) * sin)
    return q_embed.to(original_dtype), k_embed.to(original_dtype)


def eager_attention_forward(
    module: "ProkBertAttention",
    qkv: torch.Tensor,
    attention_mask: torch.Tensor,
    sliding_window_mask: torch.Tensor,
    position_ids: Optional[torch.LongTensor],
    local_attention: Tuple[int, int],
    bs: int,
    dim: int,
    output_attentions: Optional[bool] = False,
    **_kwargs,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
    # qkv: [batch_size, seqlen, 3, nheads, headdim]
    cos, sin = module.rotary_emb(qkv, position_ids=position_ids)
    query, key, value = qkv.transpose(3, 1).unbind(dim=2)
    # Apply rotary positional embedding to query and key.
    query, key = apply_rotary_pos_emb(query, key, cos, sin)

    scale = module.head_dim ** -0.5
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scale

    if local_attention != (-1, -1):
        attention_mask = sliding_window_mask

    attn_weights = attn_weights + attention_mask

    # Upcast attention to fp32 for stability.
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=module.attention_dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()
    attn_output = attn_output.view(bs, -1, dim)
    if output_attentions:
        return (attn_output, attn_weights)
    return (attn_output,)


def flash_attention_forward(
    module: "ProkBertAttention",
    qkv: torch.Tensor,
    rotary_emb: "ProkBertUnpaddedRotaryEmbedding",
    cu_seqlens: torch.Tensor,
    max_seqlen: int,
    local_attention: Tuple[int, int],
    bs: int,
    dim: int,
    target_dtype: torch.dtype = torch.bfloat16,
    **_kwargs,
) -> Tuple[torch.Tensor]:
    # qkv: (total_seqlen, 3, nheads, headdim)
    qkv = rotary_emb(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
    convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
    if convert_dtype:
        orig_dtype = qkv.dtype
        qkv = qkv.to(target_dtype)
        attn = flash_attn_varlen_qkvpacked_func(
            qkv,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            dropout_p=module.attention_dropout if module.training else 0.0,
            deterministic=module.deterministic_flash_attn,
            window_size=local_attention,
        )
        attn = attn.to(orig_dtype)
    else:
        attn = flash_attn_varlen_qkvpacked_func(
            qkv,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            dropout_p=module.attention_dropout if module.training else 0.0,
            deterministic=module.deterministic_flash_attn,
            window_size=local_attention,
        )
    return (attn.view(bs, dim),)


def sdpa_attention_forward(
    module: "ProkBertAttention",
    qkv: torch.Tensor,
    attention_mask: torch.Tensor,
    sliding_window_mask: torch.Tensor,
    position_ids: Optional[torch.LongTensor],
    local_attention: Tuple[int, int],
    bs: int,
    dim: int,
    **_kwargs,
) -> Tuple[torch.Tensor]:
    # qkv: [batch_size, seqlen, 3, nheads, headdim]
    cos, sin = module.rotary_emb(qkv, position_ids=position_ids)
    query, key, value = qkv.transpose(3, 1).unbind(dim=2)
    query, key = apply_rotary_pos_emb(query, key, cos, sin)

    if local_attention != (-1, -1):
        attention_mask = sliding_window_mask

    attn_output = (
        F.scaled_dot_product_attention(
            query,
            key,
            value,
            dropout_p=module.attention_dropout if module.training else 0.0,
            attn_mask=attention_mask,
        )
        .transpose(1, 2)
        .contiguous()
    )
    attn_output = attn_output.view(bs, -1, dim)
    return (attn_output,)


PROK_BERT_ATTENTION_FUNCTION = {
    "flash_attention_2": flash_attention_forward,
    "eager": eager_attention_forward,
    "sdpa": sdpa_attention_forward,
}


def _unpad_prokbert_input(
    inputs: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, Optional[torch.Tensor], Optional[torch.Tensor]]:
    """
    Remove padding from input sequences.

    Args:
        inputs: (batch, seqlen, ...) or (batch, seqlen)
        attention_mask: (batch, seqlen), where 1 means valid and 0 means padding.
        position_ids: (batch, seqlen), optional position ids.
        labels: (batch, seqlen), optional labels.

    Returns:
        unpadded_inputs: Tensor of shape (total_nnz, ...) containing only valid tokens.
        indices: Tensor of indices corresponding to valid tokens.
        cu_seqlens: Cumulative sequence lengths of the unpadded tokens (shape: batch + 1).
        max_seqlen_in_batch: Maximum sequence length among all sequences (excluding padding).
        unpadded_position_ids: (total_nnz,) or None.
        unpadded_labels: (total_nnz,) or None.
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = int(seqlens_in_batch.max().item())
    cu_seqlens = torch.nn.functional.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

    if inputs.dim() == 2:
        unpadded_inputs = inputs.flatten()[indices]
    else:
        batch, seqlen, *rest = inputs.shape
        shape = batch * seqlen
        unpadded_inputs = inputs.view(shape, *rest)[indices]

    unpadded_position_ids = position_ids.flatten()[indices] if position_ids is not None else None
    unpadded_labels = labels.flatten()[indices] if labels is not None else None

    return unpadded_inputs, indices, cu_seqlens, max_seqlen_in_batch, unpadded_position_ids, unpadded_labels


def _pad_prokbert_output(
    inputs: torch.Tensor,
    indices: torch.Tensor,
    batch: int,
    seqlen: int,
) -> torch.Tensor:
    """
    Add padding back to the output tensor.

    Args:
        inputs: Tensor of shape (total_nnz, ...) containing outputs for only valid tokens.
        indices: Tensor of indices indicating positions of valid tokens.
        batch: Batch size.
        seqlen: Maximum sequence length (including padding).

    Returns:
        Tensor of shape (batch, seqlen, ...) with outputs in their original padded positions.
    """
    if inputs.dim() == 1:
        output = torch.zeros(batch * seqlen, dtype=inputs.dtype, device=inputs.device)
        output[indices] = inputs
        padded_inputs = output.view(batch, seqlen)
    else:
        _, *rest = inputs.shape
        output = torch.zeros(batch * seqlen, *rest, dtype=inputs.dtype, device=inputs.device)
        output[indices] = inputs
        padded_inputs = output.view(batch, seqlen, *rest)
    return padded_inputs


class ApplyRotaryEmbUnpad(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        qkv,
        cos,
        sin,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
    ):
        # qkv: (total_nnz, 3, nheads, headdim)
        qkv = qkv.contiguous()
        total_nnz, _three, _nheads, headdim = qkv.shape
        # Combine the (3, nheads) dimensions for the first two channels to create a (total_nnz, 2*nheads, headdim) tensor.
        qk = qkv[:, :2].view(total_nnz, -1, headdim)
        apply_rotary(
            qk,
            cos,
            sin,
            seqlen_offsets=0,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            interleaved=False,
            inplace=True,
        )

        ctx.save_for_backward(cos, sin, cu_seqlens)
        ctx.max_seqlen = max_seqlen
        return qkv

    @staticmethod
    def backward(ctx, do):
        cos, sin, cu_seqlens = ctx.saved_tensors
        do = do.contiguous()
        total_nnz, _three, _nheads, headdim = do.shape
        dqk = do[:, :2].view(total_nnz, -1, headdim)
        apply_rotary(
            dqk,
            cos,
            sin,
            seqlen_offsets=0,
            cu_seqlens=cu_seqlens,
            max_seqlen=ctx.max_seqlen,
            interleaved=False,
            inplace=True,
            conjugate=True,
        )
        return do, None, None, None, None


def apply_rotary_unpadded(
    qkv,
    cos,
    sin,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
):
    """
    Apply rotary embeddings to an unpadded (packed) QKV tensor.

    Args:
        qkv: Tensor of shape (total_nnz, 3, nheads, headdim) for packed QKV.
        cos, sin: Precomputed cosine and sine caches.
        cu_seqlens: Cumulative sequence lengths (batch + 1,).
        max_seqlen: Maximum sequence length in the batch.
    Returns:
        Tensor with rotary embeddings applied.
    """
    return ApplyRotaryEmbUnpad.apply(qkv, cos, sin, cu_seqlens, max_seqlen)


class ProkBertUnpaddedRotaryEmbedding(RotaryEmbedding):
    """
    Rotary embeddings for unpadded (packed) sequences used in ProkBERT.
    """

    def __init__(
        self,
        dim: int,
        base: float = 16000.0,
        max_seqlen: Optional[int] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        """
        Args:
            dim: Dimension of each head.
            base: Base for the rotary frequency computation.
            max_seqlen: Maximum sequence length to precompute the cosine and sine cache.
            device: Device on which to create the cache.
            dtype: Data type for the cache.
        """
        #super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
        super().__init__(dim=dim, base=base, device=device, interleaved=False)

        self.max_seqlen = max_seqlen

        if max_seqlen is not None and device is not None and dtype is not None:
            self._update_cos_sin_cache(max_seqlen, device=device, dtype=dtype)

    def forward(
        self,
        qkv: torch.Tensor,
        cu_seqlens: torch.Tensor,
        max_seqlen: Optional[int] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Apply rotary embeddings *inplace* to a packed QKV tensor.

        Args:
            qkv: Tensor of shape (total_nnz, 3, nheads, headdim).
            cu_seqlens: Cumulative sequence lengths tensor (batch + 1,).
            max_seqlen: Maximum sequence length in the current batch.
        Returns:
            Tensor with rotary embeddings applied.
        """
        if max_seqlen is not None:
            self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)

        qkv = apply_rotary_unpadded(
            qkv,
            self._cos_cached,
            self._sin_cached,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )
        return qkv

    def extra_repr(self) -> str:
        return f"dim={self.dim}, base={self.base}, scale_base={self.scale_base}"


class ProkBertEmbeddings(nn.Module):
    """
    Construct the embeddings from token embeddings, layer normalization, and dropout.
    """

    def __init__(self, config: ProkBertConfig):
        super().__init__()
        self.config = config
        self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        if config.norm_type == "rms":
            self.norm = RMSNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        else:
            self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.drop = nn.Dropout(config.embedding_dropout)

    def forward(
        self, input_ids: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Forward pass for the embeddings layer.
        Args:
            input_ids: Tensor of input token ids.
            inputs_embeds: Alternatively, a pre-computed embedding tensor.
        Returns:
            Tensor of embeddings with normalization and dropout applied.
        """
        if inputs_embeds is not None:
            hidden_states = self.drop(self.norm(inputs_embeds))
        else:
            hidden_states = self.drop(self.norm(self.tok_embeddings(input_ids)))
        return hidden_states


class ProkBertRotaryEmbedding(nn.Module):
    def __init__(self, config: ProkBertConfig, layer_type: str, device: Optional[torch.device] = None):
        super().__init__()

        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.layer_type = layer_type

        rope_params = config.get_rope_parameters(layer_type)
        self.rope_type = rope_params["rope_type"]
        self.rope_init_fn: Callable = self.compute_default_rope_parameters
        if self.rope_type != "default":
            self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(config, device, layer_type=layer_type)

        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    @staticmethod
    def compute_default_rope_parameters(
        config: ProkBertConfig,
        device: Optional["torch.device"] = None,
        seq_len: Optional[int] = None,
        layer_type: Optional[str] = None,
    ) -> tuple["torch.Tensor", float]:
        """
        Computes the inverse frequencies according to the original RoPE implementation.
        """
        current_layer_type = layer_type or "full_attention"
        rope_params = config.get_rope_parameters(current_layer_type)
        base = rope_params["rope_theta"]
        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads

        attention_factor = 1.0
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim))
        return inv_freq, attention_factor

    def _dynamic_frequency_update(self, position_ids, device):
        """
        Dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - Growing beyond the cached sequence length (allow scaling)
        2 - The current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config,
                device,
                seq_len=seq_len,
                layer_type=self.layer_type,
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            self.max_seq_len_cached = seq_len

        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:
            self.original_inv_freq = self.original_inv_freq.to(device)
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


class ProkBertMLP(nn.Module):
    """Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    """

    def __init__(self, config: ProkBertConfig):
        super().__init__()
        self.config = config
        self.Wi = nn.Linear(config.hidden_size, int(config.intermediate_size) * 2, bias=config.mlp_bias)
        self.act = ACT2FN[config.hidden_activation]
        self.drop = nn.Dropout(config.mlp_dropout)
        self.Wo = nn.Linear(config.intermediate_size, config.hidden_size, bias=config.mlp_bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input, gate = self.Wi(hidden_states).chunk(2, dim=-1)
        return self.Wo(self.drop(self.act(input) * gate))


class ProkBertAttention(nn.Module):
    """Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is available, this module uses it to improve throughput.
    Otherwise, it falls back on PyTorch's SDPA (or eager) implementation.
    """

    def __init__(self, config: ProkBertConfig, layer_id: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_id = layer_id

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads ({config.num_attention_heads})"
            )

        self.attention_dropout = config.attention_dropout
        self.deterministic_flash_attn = config.deterministic_flash_attn
        self.num_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.head_dim * self.num_heads
        self.Wqkv = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias=config.attention_bias)

        if layer_id is None:
            self.attention_type = "full_attention"
        else:
            self.attention_type = config.layer_types[layer_id]

        if self.attention_type == "sliding_attention":
            local_window = config.sliding_window
            if config._attn_implementation == "flash_attention_2":
                # FlashAttention uses inclusive local-attention boundaries.
                local_window = local_window + 1
            self.local_attention = (local_window, local_window)
            max_position_embeddings = config.local_attention
        elif self.attention_type == "full_attention":
            self.local_attention = (-1, -1)
            max_position_embeddings = config.max_position_embeddings
        else:
            raise ValueError(
                f"Unsupported attention type {self.attention_type!r}. "
                "Expected 'full_attention' or 'sliding_attention'."
            )

        rope_theta = float(config.get_rope_parameters(self.attention_type)["rope_theta"])

        if config._attn_implementation == "flash_attention_2":
            self.rotary_emb = ProkBertUnpaddedRotaryEmbedding(
                dim=self.head_dim,
                max_seqlen=max_position_embeddings,
                base=rope_theta,
            )
        else:
            self.rotary_emb = ProkBertRotaryEmbedding(config=config, layer_type=self.attention_type)

        self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)
        self.out_drop = nn.Dropout(config.attention_dropout) if config.attention_dropout > 0.0 else nn.Identity()
        self.pruned_heads = set()

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
        **kwargs,
    ) -> torch.Tensor:
        qkv = self.Wqkv(hidden_states)
        bs = hidden_states.shape[0]
        if self.config._attn_implementation == "flash_attention_2":
            qkv = qkv.view(-1, 3, self.num_heads, self.head_dim)
        else:
            qkv = qkv.view(bs, -1, 3, self.num_heads, self.head_dim)

        attn_outputs = PROK_BERT_ATTENTION_FUNCTION[self.config._attn_implementation](
            self,
            qkv=qkv,
            rotary_emb=self.rotary_emb,
            local_attention=self.local_attention,
            bs=bs,
            dim=self.all_head_size,
            output_attentions=output_attentions,
            **kwargs,
        )
        hidden_states = attn_outputs[0]
        hidden_states = self.out_drop(self.Wo(hidden_states))
        return (hidden_states,) + attn_outputs[1:]


class ProkBertEncoderLayer(nn.Module):
    def __init__(self, config: ProkBertConfig, layer_id: Optional[int] = None):
        super().__init__()
        self.config = config
        self.attention_type = "full_attention" if layer_id is None else config.layer_types[layer_id]

        Norm = RMSNorm if config.norm_type == "rms" else nn.LayerNorm

        self.attn_norm = Norm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.mlp_norm = Norm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)

        self.attn = ProkBertAttention(config=config, layer_id=layer_id)
        self.mlp = ProkBertMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        output_attentions: Optional[bool] = False,
    ) -> torch.Tensor:
        attn_outputs = self.attn(
            self.attn_norm(hidden_states),
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + attn_outputs[0]
        mlp_output = self.mlp(self.mlp_norm(hidden_states))
        hidden_states = hidden_states + mlp_output

        return (hidden_states,) + attn_outputs[1:]


PROK_BERT_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods
    the library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads, etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch module and refer to the PyTorch documentation for general usage and behavior.

    Parameters:
        config ([`ProkBertConfig`]):
            Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the model weights; see [`PreTrainedModel.from_pretrained`] for weight loading.
"""


@add_start_docstrings(
    "The bare ProkBert Model outputting raw hidden-states without any specific head on top.",
    PROK_BERT_START_DOCSTRING,
)
class ProkBertPreTrainedModel(PreTrainedModel):
    config_class = ProkBertConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ProkBertEmbeddings", "ProkBertEncoderLayer"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = False

    def _init_weights(self, module: nn.Module):
        cutoff_factor = self.config.initializer_cutoff_factor
        if cutoff_factor is None:
            cutoff_factor = 3

        def init_weight(module: nn.Module, std: float):
            nn.init.trunc_normal_(
                module.weight,
                mean=0.0,
                std=std,
                a=-cutoff_factor * std,
                b=cutoff_factor * std,
            )
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

        stds = {
            "in": self.config.initializer_range,
            "out": self.config.initializer_range / math.sqrt(2.0 * self.config.num_hidden_layers),
            "embedding": self.config.initializer_range,
            "final_out": self.config.hidden_size ** -0.5,
        }

        if isinstance(module, ProkBertEmbeddings):
            init_weight(module.tok_embeddings, stds["embedding"])
        elif isinstance(module, ProkBertMLP):
            init_weight(module.Wi, stds["in"])
            init_weight(module.Wo, stds["out"])
        elif isinstance(module, ProkBertAttention):
            init_weight(module.Wqkv, stds["in"])
            init_weight(module.Wo, stds["out"])
        elif isinstance(module, ProkBertPredictionHead):
            init_weight(module.dense, stds["out"])
        elif isinstance(module, (ProkBertForMaskedLM, ProkBertForMaskedLM2)):
            init_weight(module.decoder, stds["out"])


    @classmethod
    def _autoset_attn_implementation(
        cls,
        config,
        use_flash_attention_2: bool = False,
        torch_dtype: Optional[torch.dtype] = None,
        device_map: Optional[Union[str, Dict[str, int]]] = None,
        check_device_map: bool = True,
    ):
        if config._attn_implementation_internal is None:
            config._attn_implementation_internal = "flash_attention_2"
            try:
                return cls._check_and_enable_flash_attn_2(
                    config,
                    torch_dtype=torch.float16,
                    device_map=device_map,
                    hard_check_only=False,
                    check_device_map=check_device_map,
                )
            except (ValueError, ImportError):
                config._attn_implementation_internal = None
        return super()._autoset_attn_implementation(
            config,
            use_flash_attention_2=use_flash_attention_2,
            torch_dtype=torch.float16,
            device_map=device_map,
            check_device_map=check_device_map,
        )

    def resize_token_embeddings(self, *args, **kwargs):
        model_embeds = super().resize_token_embeddings(*args, **kwargs)
        return model_embeds

@add_start_docstrings(
    "The bare ProkBert Model outputting raw hidden-states without any specific head on top.",
    PROK_BERT_START_DOCSTRING,
)
class ProkBertModel(_SafeFromPretrainedMixin, ProkBertPreTrainedModel):
    def __init__(self, config: ProkBertConfig):
        #config = self._autoset_attn_implementation(config)
        super().__init__(config)
        self.config = config
        self.embeddings = ProkBertEmbeddings(config)
        self.layers = nn.ModuleList(
            [ProkBertEncoderLayer(config, layer_id) for layer_id in range(config.num_hidden_layers)]
        )
        if config.norm_type == "rms":
            self.final_norm = RMSNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        else:
            self.final_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.tok_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.tok_embeddings = value

    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if output_attentions and self.config._attn_implementation == "flash_attention_2":
            logger.warning_once(
                "`output_attentions=True` is not supported with flash_attention_2 in ProkBertModel. "
                "Falling back to `output_attentions=False`."
            )
            output_attentions = False

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)

        if batch_size is None or seq_len is None:
            if inputs_embeds is not None:
                batch_size, seq_len = inputs_embeds.shape[:2]
            else:
                batch_size, seq_len = input_ids.shape[:2]
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)

        repad = False
        restore_attn_implementation = None
        if self.config._attn_implementation == "flash_attention_2":
            if indices is None and cu_seqlens is None and max_seqlen is None:
                repad = True
                if inputs_embeds is None:
                    with torch.no_grad():
                        input_ids, indices, cu_seqlens, max_seqlen, *_ = _unpad_prokbert_input(
                            inputs=input_ids,
                            attention_mask=attention_mask,
                        )
                else:
                    inputs_embeds, indices, cu_seqlens, max_seqlen, *_ = _unpad_prokbert_input(
                        inputs=inputs_embeds,
                        attention_mask=attention_mask,
                    )
        else:
            if output_attentions and self.config._attn_implementation == "sdpa":
                restore_attn_implementation = self.config._attn_implementation
            if position_ids is None:
                position_ids = torch.arange(seq_len, device=device).unsqueeze(0)
            attention_mask, sliding_window_mask = self._update_attention_mask(
                attention_mask,
                output_attentions=output_attentions,
            )

        hidden_states = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds)

        for encoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    sliding_window_mask,
                    position_ids,
                    cu_seqlens,
                    max_seqlen,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    sliding_window_mask=sliding_window_mask,
                    position_ids=position_ids,
                    cu_seqlens=cu_seqlens,
                    max_seqlen=max_seqlen,
                    output_attentions=output_attentions,
                )
            hidden_states = layer_outputs[0]
            if output_attentions and len(layer_outputs) > 1:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        hidden_states = self.final_norm(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if repad:
            hidden_states = _pad_prokbert_output(inputs=hidden_states, indices=indices, batch=batch_size, seqlen=seq_len)
            if all_hidden_states is not None:
                all_hidden_states = tuple(
                    _pad_prokbert_output(inputs=hs, indices=indices, batch=batch_size, seqlen=seq_len)
                    for hs in all_hidden_states
                )

        if restore_attn_implementation is not None:
            self.config._attn_implementation = restore_attn_implementation

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _update_attention_mask(
        self,
        attention_mask: torch.Tensor,
        output_attentions: bool,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if output_attentions:
            if self.config._attn_implementation == "sdpa":
                logger.warning_once(
                    "Outputting attentions is only supported with the 'eager' attention implementation, "
                    'not with "sdpa". Falling back to `attn_implementation="eager"`.'
                )
                self.config._attn_implementation = "eager"
            elif self.config._attn_implementation != "eager":
                logger.warning_once(
                    "Outputting attentions is only supported with the eager attention implementation, "
                    f'not with {self.config._attn_implementation}. Consider setting `attn_implementation="eager"`. '
                    "Setting `output_attentions=False`."
                )

        return _build_bidirectional_attention_biases(
            attention_mask=attention_mask,
            dtype=self.dtype,
            sliding_window=self.config.sliding_window,
        )


class ProkBertPredictionHead(nn.Module):
    def __init__(self, config: ProkBertConfig):
        super().__init__()
        Norm = RMSNorm if getattr(config, "norm_type", "layernorm") == "rms" else nn.LayerNorm
        self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.classifier_bias)
        self.act = ACT2FN[config.classifier_activation]
        self.norm = Norm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.norm(self.act(self.dense(hidden_states)))


@add_start_docstrings(
    "The ProkBert Model with a decoder head on top that is used for masked language modeling.",
    PROK_BERT_START_DOCSTRING,
)

class ProkBertForMaskedLM(_SafeFromPretrainedMixin, ProkBertPreTrainedModel):
    _tied_weights_keys = {"decoder.weight": "model.embeddings.tok_embeddings.weight"}

    def __init__(self, config: ProkBertConfig):
        super().__init__(config)
        self.config = config
        self.model = ProkBertModel(config)
        self.head = ProkBertPredictionHead(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=config.decoder_bias)

        self.sparse_prediction = self.config.sparse_prediction
        self.sparse_pred_ignore_index = self.config.sparse_pred_ignore_index

        self.post_init()

    def get_output_embeddings(self):
        return self.decoder

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def set_output_embeddings(self, new_embeddings: nn.Linear):
        self.decoder = new_embeddings

    def _prediction_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.decoder(self.head(hidden_states))

    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config._attn_implementation == "flash_attention_2":
            if indices is None and cu_seqlens is None and max_seqlen is None:
                if batch_size is None or seq_len is None:
                    if inputs_embeds is not None:
                        batch_size, seq_len = inputs_embeds.shape[:2]
                    else:
                        batch_size, seq_len = input_ids.shape[:2]
                device = input_ids.device if input_ids is not None else inputs_embeds.device

                if attention_mask is None:
                    attention_mask = torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)

                if inputs_embeds is None:
                    with torch.no_grad():
                        input_ids, indices, cu_seqlens, max_seqlen, position_ids, labels = _unpad_prokbert_input(
                            inputs=input_ids,
                            attention_mask=attention_mask,
                            position_ids=position_ids,
                            labels=labels,
                        )
                else:
                    inputs_embeds, indices, cu_seqlens, max_seqlen, position_ids, labels = _unpad_prokbert_input(
                        inputs=inputs_embeds,
                        attention_mask=attention_mask,
                        position_ids=position_ids,
                        labels=labels,
                    )

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        logits_are_sparse = False
        if self.sparse_prediction and labels is not None:
            labels = labels.view(-1)
            last_hidden_state = last_hidden_state.view(labels.shape[0], -1)
            mask_tokens = labels != self.sparse_pred_ignore_index
            last_hidden_state = last_hidden_state[mask_tokens]
            labels = labels[mask_tokens]
            logits_are_sparse = True

        logits = self._prediction_logits(last_hidden_state)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, vocab_size=self.config.vocab_size, **kwargs)

        should_repad_logits = (
            self.config._attn_implementation == "flash_attention_2"
            and indices is not None
            and batch_size is not None
            and seq_len is not None
            and not logits_are_sparse
        )
        if should_repad_logits:
            with nullcontext() if self.config.repad_logits_with_grad or labels is None else torch.no_grad():
                logits = _pad_prokbert_output(inputs=logits, indices=indices, batch=batch_size, seqlen=seq_len)

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class ProkBertForSequenceClassification(_SafeFromPretrainedMixin, ProkBertPreTrainedModel):
    """Standalone ProkBERT sequence classifier with mask-aware pooling."""

    def __init__(self, config: ProkBertConfig):
        super().__init__(config)
        self.num_labels = int(config.num_labels)
        self.config = config

        self.model = ProkBertModel(config)
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps)

        # Kept for backward compatibility with existing sequence-classification checkpoints.
        self.weighting_layer = nn.Linear(config.hidden_size, 1)
        self.dropout = nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.post_init()

    def _init_weights(self, module: nn.Module):
        super()._init_weights(module)

        if module is self.weighting_layer:
            nn.init.zeros_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

        if module is self.classifier:
            nn.init.xavier_uniform_(module.weight, gain=1.0)
            module.weight.data /= math.sqrt(self.classifier.in_features)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

    def _pool_cls(self, sequence_output: torch.Tensor, token_mask: Optional[torch.Tensor]) -> torch.Tensor:
        if token_mask is None:
            return sequence_output[:, 0]

        first_token_indices = token_mask.to(dtype=torch.long).argmax(dim=-1)
        batch_indices = torch.arange(sequence_output.shape[0], device=sequence_output.device)
        return sequence_output[batch_indices, first_token_indices]

    def _pool_mean(self, sequence_output: torch.Tensor, token_mask: Optional[torch.Tensor]) -> torch.Tensor:
        if token_mask is None:
            return sequence_output.mean(dim=1)

        weights = token_mask.unsqueeze(-1).to(dtype=sequence_output.dtype)
        denom = weights.sum(dim=1).clamp(min=1.0)
        return (sequence_output * weights).sum(dim=1) / denom

    def _pool_attention(self, sequence_output: torch.Tensor, token_mask: Optional[torch.Tensor]) -> torch.Tensor:
        scores = self.weighting_layer(sequence_output)

        if token_mask is not None:
            empty_rows = token_mask.sum(dim=1) == 0
            if empty_rows.any():
                token_mask = token_mask.clone()
                token_mask[empty_rows, 0] = True
            scores = scores.masked_fill(~token_mask.unsqueeze(-1), torch.finfo(scores.dtype).min)

        weights = torch.softmax(scores.float(), dim=1).to(dtype=sequence_output.dtype)
        return torch.sum(weights * sequence_output, dim=1)

    def _pool_sequence(self, sequence_output: torch.Tensor, attention_mask: Optional[torch.Tensor]) -> torch.Tensor:
        token_mask = _normalize_token_attention_mask(
            attention_mask,
            batch_size=sequence_output.shape[0],
            seq_len=sequence_output.shape[1],
            device=sequence_output.device,
        )

        pooling = self.config.classifier_pooling
        if pooling == "attention":
            return self._pool_attention(sequence_output, token_mask)
        if pooling == "mean":
            return self._pool_mean(sequence_output, token_mask)
        if pooling == "cls":
            return self._pool_cls(sequence_output, token_mask)
        raise ValueError(
            f"Unsupported `classifier_pooling`={pooling!r}. Expected one of ['attention', 'cls', 'mean']."
        )

    def _compute_loss(self, logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        if self.config.problem_type is None:
            if self.num_labels == 1:
                self.config.problem_type = "regression"
            elif labels.dtype in (torch.int8, torch.int16, torch.int32, torch.int64, torch.long, torch.uint8):
                self.config.problem_type = "single_label_classification"
            else:
                self.config.problem_type = "multi_label_classification"

        if self.config.problem_type == "regression":
            loss_fct = nn.MSELoss()
            if self.num_labels == 1:
                return loss_fct(logits.squeeze(), labels.squeeze().to(logits.dtype))
            return loss_fct(logits, labels.to(logits.dtype))

        if self.config.problem_type == "single_label_classification":
            loss_fct = nn.CrossEntropyLoss()
            return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if self.config.problem_type == "multi_label_classification":
            loss_fct = nn.BCEWithLogitsLoss()
            return loss_fct(logits, labels.to(logits.dtype))

        raise ValueError(
            f"Unsupported `problem_type`={self.config.problem_type!r}. "
            "Expected 'regression', 'single_label_classification', or 'multi_label_classification'."
        )

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: torch.Tensor = None,
        sliding_window_mask: torch.Tensor = None,
        position_ids: torch.LongTensor = None,
        inputs_embeds: torch.Tensor = None,
        labels: torch.Tensor = None,
        indices: torch.Tensor = None,
        cu_seqlens: torch.Tensor = None,
        max_seqlen: int = None,
        batch_size: int = None,
        seq_len: int = None,
        output_attentions: bool = None,
        output_hidden_states: bool = None,
        return_dict: bool = None,
        **kwargs,
    ) -> SequenceClassifierOutput:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        if sequence_output.dim() == 2:
            if indices is None or batch_size is None or seq_len is None:
                raise ValueError(
                    "Received unpadded hidden states from `ProkBertModel`, but `indices`, `batch_size`, and "
                    "`seq_len` were not provided to repad them for sequence classification."
                )
            sequence_output = _pad_prokbert_output(
                inputs=sequence_output,
                indices=indices,
                batch=batch_size,
                seqlen=seq_len,
            )

        pooled_output = self._pool_sequence(sequence_output, attention_mask)
        pooled_output = self.norm(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self._compute_loss(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@dataclass
class CurricularSequenceClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    embeddings: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


class CurricularFace(nn.Module):
    def __init__(self, in_features: int, out_features: int, m: float = 0.5, s: float = 64.0, ema_alpha: float = 0.01):
        super().__init__()
        self.in_features = int(in_features)
        self.out_features = int(out_features)
        self.m = float(m)
        self.s = float(s)
        self.ema_alpha = float(ema_alpha)

        self.cos_m = math.cos(self.m)
        self.sin_m = math.sin(self.m)
        self.threshold = math.cos(math.pi - self.m)
        self.mm = math.sin(math.pi - self.m) * self.m

        self.kernel = nn.Parameter(torch.empty(self.in_features, self.out_features))
        self.register_buffer("t", torch.zeros(1, dtype=torch.float32))
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.kernel)
        self.t.zero_()

    def cosine(self, embeddings: torch.Tensor) -> torch.Tensor:
        with _autocast_disabled(embeddings.device.type):
            x = F.normalize(embeddings.float(), p=2.0, dim=1, eps=1e-12)
            w = F.normalize(self.kernel.float(), p=2.0, dim=0, eps=1e-12)
            cos_theta = F.linear(x, w.t()).clamp(-1.0, 1.0)
        return cos_theta

    def inference_logits(self, embeddings: torch.Tensor) -> torch.Tensor:
        return self.cosine(embeddings) * self.s

    def margin_logits_from_cosine(
        self,
        cos_theta: torch.Tensor,
        labels: torch.LongTensor,
        update_t: bool = False,
    ) -> torch.Tensor:
        labels = labels.reshape(-1).long()
        target = cos_theta.gather(1, labels.unsqueeze(1))

        sin_theta = torch.sqrt((1.0 - target.square()).clamp(min=0.0))
        cos_theta_m = target * self.cos_m - sin_theta * self.sin_m

        hard_mask = cos_theta > cos_theta_m
        final_target = torch.where(
            target > self.threshold,
            cos_theta_m,
            target - self.mm,
        )

        if update_t:
            with torch.no_grad():
                target_mean = target.mean().to(dtype=self.t.dtype).view_as(self.t)
                self.t.lerp_(target_mean, self.ema_alpha)

        t = self.t.to(device=cos_theta.device, dtype=cos_theta.dtype)
        adjusted = torch.where(hard_mask, cos_theta * (t + cos_theta), cos_theta)
        adjusted = adjusted.scatter(1, labels.unsqueeze(1), final_target)

        return adjusted * self.s


class ProkBertForCurricularClassification(_SafeFromPretrainedMixin, ProkBertPreTrainedModel):
    """ProkBERT sequence classifier with CurricularFace logits for single-label classification."""

    def __init__(self, config: ProkBertConfig):
        super().__init__(config)
        self.config = config
        self.num_labels = int(config.num_labels)

        if self.num_labels < 2:
            raise ValueError(
                "`ProkBertForCurricularClassification` requires `config.num_labels >= 2`. "
                "CurricularFace is intended for single-label classification."
            )
        if self.config.problem_type is None:
            self.config.problem_type = "single_label_classification"
        elif self.config.problem_type != "single_label_classification":
            raise ValueError(
                "`ProkBertForCurricularClassification` only supports `problem_type='single_label_classification'`."
            )

        self.model = ProkBertModel(config)
        self.weighting_layer = nn.Linear(config.hidden_size, 1)
        self.dropout = nn.Dropout(config.classifier_dropout)

        use_projection = config.curricular_embedding_size not in (None, -1)
        embedding_dim = config.hidden_size if not use_projection else int(config.curricular_embedding_size)
        self.linear = nn.Linear(config.hidden_size, embedding_dim) if use_projection else nn.Identity()

        self.curricular_face = CurricularFace(
            in_features=embedding_dim,
            out_features=self.num_labels,
            m=float(config.curricular_margin),
            s=float(config.curricular_scale),
        )
        self.loss_fct = nn.CrossEntropyLoss()

        self.post_init()

        with torch.no_grad():
            nn.init.zeros_(self.weighting_layer.weight)
            if self.weighting_layer.bias is not None:
                nn.init.zeros_(self.weighting_layer.bias)
            if isinstance(self.linear, nn.Linear):
                initialize_linear_kaiming(self.linear)

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def _pool_cls(self, sequence_output: torch.Tensor, token_mask: Optional[torch.Tensor]) -> torch.Tensor:
        if token_mask is None:
            return sequence_output[:, 0]

        first_token_indices = token_mask.to(dtype=torch.long).argmax(dim=-1)
        batch_indices = torch.arange(sequence_output.shape[0], device=sequence_output.device)
        return sequence_output[batch_indices, first_token_indices]

    def _pool_mean(self, sequence_output: torch.Tensor, token_mask: Optional[torch.Tensor]) -> torch.Tensor:
        if token_mask is None:
            return sequence_output.mean(dim=1)

        weights = token_mask.unsqueeze(-1).to(dtype=sequence_output.dtype)
        denom = weights.sum(dim=1).clamp(min=1.0)
        return (sequence_output * weights).sum(dim=1) / denom

    def _pool_attention(self, sequence_output: torch.Tensor, token_mask: Optional[torch.Tensor]) -> torch.Tensor:
        scores = self.weighting_layer(sequence_output)

        if token_mask is not None:
            empty_rows = token_mask.sum(dim=1) == 0
            if empty_rows.any():
                token_mask = token_mask.clone()
                token_mask[empty_rows, 0] = True
            scores = scores.masked_fill(~token_mask.unsqueeze(-1), torch.finfo(scores.dtype).min)

        weights = torch.softmax(scores.float(), dim=1).to(dtype=sequence_output.dtype)
        return torch.sum(weights * sequence_output, dim=1)

    def _pool_sequence(self, sequence_output: torch.Tensor, attention_mask: Optional[torch.Tensor]) -> torch.Tensor:
        token_mask = _normalize_token_attention_mask(
            attention_mask,
            batch_size=sequence_output.shape[0],
            seq_len=sequence_output.shape[1],
            device=sequence_output.device,
        )

        pooling = self.config.classifier_pooling
        if pooling == "attention":
            return self._pool_attention(sequence_output, token_mask)
        if pooling == "mean":
            return self._pool_mean(sequence_output, token_mask)
        if pooling == "cls":
            return self._pool_cls(sequence_output, token_mask)
        raise ValueError(
            f"Unsupported `classifier_pooling`={pooling!r}. Expected one of ['attention', 'cls', 'mean']."
        )

    def _repad_sequence_output_if_needed(
        self,
        sequence_output: torch.Tensor,
        indices: Optional[torch.Tensor],
        batch_size: Optional[int],
        seq_len: Optional[int],
    ) -> torch.Tensor:
        if sequence_output.dim() != 2:
            return sequence_output

        if indices is None or batch_size is None or seq_len is None:
            raise ValueError(
                "Received unpadded hidden states from `ProkBertModel`, but `indices`, `batch_size`, and "
                "`seq_len` were not provided to repad them for curricular classification."
            )

        return _pad_prokbert_output(
            inputs=sequence_output,
            indices=indices,
            batch=batch_size,
            seqlen=seq_len,
        )

    def _compute_embeddings(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        apply_dropout: bool = True,
    ) -> tuple[torch.Tensor, BaseModelOutput]:
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        sequence_output = self._repad_sequence_output_if_needed(
            outputs.last_hidden_state,
            indices=indices,
            batch_size=batch_size,
            seq_len=seq_len,
        )
        pooled_output = self._pool_sequence(sequence_output, attention_mask)

        if apply_dropout:
            pooled_output = self.dropout(pooled_output)

        embeddings = self.linear(pooled_output)
        return embeddings, outputs

    def encode(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        normalize: bool = True,
    ) -> torch.Tensor:
        embeddings, _ = self._compute_embeddings(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            apply_dropout=False,
        )
        return l2_norm(embeddings, axis=1) if normalize else embeddings

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        return_embeddings: bool = False,
        normalize_embeddings: bool = True,
        **kwargs,
    ) -> Union[Tuple, CurricularSequenceClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        embeddings, outputs = self._compute_embeddings(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            apply_dropout=self.training,
        )

        exported_embeddings = None
        if return_embeddings:
            exported_embeddings = l2_norm(embeddings, axis=1) if normalize_embeddings else embeddings

        cos_theta = self.curricular_face.cosine(embeddings)
        logits = cos_theta * self.curricular_face.s

        loss = None
        if labels is not None:
            labels = labels.view(-1).long()
            train_logits = self.curricular_face.margin_logits_from_cosine(
                cos_theta,
                labels,
                update_t=self.training,
            )
            loss = self.loss_fct(train_logits, labels)

        if not return_dict:
            out = (logits,)
            if return_embeddings:
                out = out + (exported_embeddings,)
            if output_hidden_states:
                out = out + (outputs.hidden_states,)
            if output_attentions:
                out = out + (outputs.attentions,)
            return ((loss,) + out) if loss is not None else out

        return CurricularSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            embeddings=exported_embeddings,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class ProkBertForMaskedLM2(_SafeFromPretrainedMixin, ProkBertPreTrainedModel):
    _tied_weights_keys = {"decoder.weight": "model.embeddings.tok_embeddings.weight"}

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.model   = ProkBertModel(config)
        self.head    = ProkBertPredictionHead(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=config.decoder_bias)

        # for sparse‐integer masking (legacy)
        self.sparse_prediction       = config.sparse_prediction
        self.sparse_pred_ignore_index = config.sparse_pred_ignore_index

        self.post_init()

    def get_output_embeddings(self):
        return self.decoder

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def set_output_embeddings(self, new_embeddings: nn.Linear):
        self.decoder = new_embeddings

    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor]    = None,
        attention_mask: Optional[torch.Tensor]    = None,
        sliding_window_mask: Optional[torch.Tensor]= None,
        position_ids: Optional[torch.Tensor]      = None,
        inputs_embeds: Optional[torch.Tensor]     = None,
        labels: Optional[torch.LongTensor]        = None,
        labels_dist: Optional[torch.FloatTensor]  = None,
        loss_mask: Optional[torch.BoolTensor]     = None,
        indices: Optional[torch.Tensor]           = None,
        cu_seqlens: Optional[torch.Tensor]        = None,
        max_seqlen: Optional[int]                 = None,
        batch_size: Optional[int]                 = None,
        seq_len: Optional[int]                    = None,
        output_attentions: Optional[bool]         = None,
        output_hidden_states: Optional[bool]      = None,
        return_dict: Optional[bool]               = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 1) Optional unpad for flash_attention_2
        if self.config._attn_implementation == "flash_attention_2" \
        and indices is None and cu_seqlens is None and max_seqlen is None:
            # infer batch_size, seq_len
            if batch_size is None or seq_len is None:
                if inputs_embeds is not None:
                    batch_size, seq_len = inputs_embeds.shape[:2]
                else:
                    batch_size, seq_len = input_ids.shape[:2]
            # EXPLICIT device pick
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            attention_mask = attention_mask if attention_mask is not None else \
                            torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)
            if inputs_embeds is None:
                with torch.no_grad():
                    input_ids, indices, cu_seqlens, max_seqlen, position_ids, labels = \
                        _unpad_prokbert_input(
                            inputs=input_ids,
                            attention_mask=attention_mask,
                            position_ids=position_ids,
                            labels=labels
                        )
            else:
                inputs_embeds, indices, cu_seqlens, max_seqlen, position_ids, labels = \
                    _unpad_prokbert_input(
                        inputs=inputs_embeds,
                        attention_mask=attention_mask,
                        position_ids=position_ids,
                        labels=labels
                    )

        # 2) Core encoder
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]  # (B,L,H) or packed (N,H)

        # 3) Legacy sparse integer mask
        if self.sparse_prediction and labels is not None:
            flat_labels = labels.view(-1)
            flat_hidden = sequence_output.view(flat_labels.shape[0], -1)
            mask_tokens = flat_labels != self.sparse_pred_ignore_index
            sequence_output = flat_hidden[mask_tokens]
            labels = flat_labels[mask_tokens]

        hidden = self.head(sequence_output)
        logits = self.decoder(hidden)

        loss = None
        V = self.config.vocab_size

        # 5a) Integer‐label MLM
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, V), labels.view(-1))

        # 5b) Soft‐distribution MLM (no re‐pad)
        elif labels_dist is not None and loss_mask is not None:
            B, L = loss_mask.shape
            flat_mask = loss_mask.view(-1)        # (B*L,)
            flat_dist = labels_dist.view(-1, V)   # (B*L, V)

            # packed by attention_mask
            if logits.dim() == 2 and logits.shape[0] != flat_mask.sum().item():
                full_attn    = attention_mask.view(-1)     # (B*L,)
                assert logits.shape[0] == full_attn.sum().item()
                dist_attn    = flat_dist[full_attn]        # (Natt, V)
                mask_in_attn = flat_mask[full_attn]        # (Natt,)
                pred = logits[mask_in_attn]                # (N_mask, V)
                targ = dist_attn[mask_in_attn]             # (N_mask, V)

            # packed exactly by loss_mask
            elif logits.dim() == 2 and logits.shape[0] == flat_mask.sum().item():
                pred = logits
                targ = flat_dist[flat_mask]

            # full (B,L,V)
            else:
                flat_logits = logits.view(-1, V)           # (B*L, V)
                pred        = flat_logits[flat_mask]       # (N_mask, V)
                targ        = flat_dist[flat_mask]         # (N_mask, V)

            eps  = 1e-8
            targ = targ.clamp_min(eps)
            targ = targ / targ.sum(dim=-1, keepdim=True)
            targ = targ.to(pred.dtype).detach()

            logp = F.log_softmax(pred, dim=-1)
            loss = F.kl_div(logp, targ, reduction="batchmean")

        should_repad_logits = (
            self.config._attn_implementation == "flash_attention_2"
            and indices is not None
            and batch_size is not None
            and seq_len is not None
            and not (self.sparse_prediction and labels is not None)
        )
        if should_repad_logits:
            with nullcontext() if self.config.repad_logits_with_grad or labels is None else torch.no_grad():
                logits = _pad_prokbert_output(inputs=logits, indices=indices, batch=batch_size, seqlen=seq_len)

        # 6) Return
        if not return_dict:
            out = (logits,) + outputs[1:]
            return ((loss,) + out) if loss is not None else out

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )